File indexing completed on 2025-04-19 09:06:21
0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035 #ifndef EIGEN_INVERSE_SIZE_4_H
0036 #define EIGEN_INVERSE_SIZE_4_H
0037
0038 namespace RivetEigen
0039 {
0040 namespace internal
0041 {
0042 template <typename MatrixType, typename ResultType>
0043 struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType>
0044 {
0045 enum
0046 {
0047 MatrixAlignment = traits<MatrixType>::Alignment,
0048 ResultAlignment = traits<ResultType>::Alignment,
0049 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
0050 };
0051 typedef typename conditional<(MatrixType::Flags & LinearAccessBit), MatrixType const &, typename MatrixType::PlainObject>::type ActualMatrixType;
0052
0053 static void run(const MatrixType &mat, ResultType &result)
0054 {
0055 ActualMatrixType matrix(mat);
0056
0057 const float* data = matrix.data();
0058 const Index stride = matrix.innerStride();
0059 Packet4f _L1 = ploadt<Packet4f,MatrixAlignment>(data);
0060 Packet4f _L2 = ploadt<Packet4f,MatrixAlignment>(data + stride*4);
0061 Packet4f _L3 = ploadt<Packet4f,MatrixAlignment>(data + stride*8);
0062 Packet4f _L4 = ploadt<Packet4f,MatrixAlignment>(data + stride*12);
0063
0064
0065
0066
0067 Packet4f A, B, C, D;
0068
0069 if (!StorageOrdersMatch)
0070 {
0071 A = vec4f_unpacklo(_L1, _L2);
0072 B = vec4f_unpacklo(_L3, _L4);
0073 C = vec4f_unpackhi(_L1, _L2);
0074 D = vec4f_unpackhi(_L3, _L4);
0075 }
0076 else
0077 {
0078 A = vec4f_movelh(_L1, _L2);
0079 B = vec4f_movehl(_L2, _L1);
0080 C = vec4f_movelh(_L3, _L4);
0081 D = vec4f_movehl(_L4, _L3);
0082 }
0083
0084 Packet4f AB, DC;
0085
0086
0087 AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
0088 AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
0089
0090
0091 DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
0092 DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
0093
0094
0095 Packet4f dA, dB, dC, dD;
0096
0097 dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
0098 dA = psub(dA, vec4f_movehl(dA, dA));
0099
0100 dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
0101 dB = psub(dB, vec4f_movehl(dB, dB));
0102
0103 dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
0104 dC = psub(dC, vec4f_movehl(dC, dC));
0105
0106 dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
0107 dD = psub(dD, vec4f_movehl(dD, dD));
0108
0109 Packet4f d, d1, d2;
0110
0111 d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
0112 d = padd(d, vec4f_movehl(d, d));
0113 d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
0114 d1 = pmul(dA, dD);
0115 d2 = pmul(dB, dC);
0116
0117
0118 Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
0119
0120
0121 Packet4f rd = pdiv(pset1<Packet4f>(1.0f), det);
0122
0123
0124 Packet4f iA, iB, iC, iD;
0125
0126
0127 iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
0128 iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
0129 iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
0130
0131
0132 iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
0133 iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
0134 iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
0135
0136
0137 iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
0138 iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
0139 iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
0140
0141
0142 iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
0143 iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
0144 iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
0145
0146 const float sign_mask[4] = {0.0f, numext::bit_cast<float>(0x80000000u), numext::bit_cast<float>(0x80000000u), 0.0f};
0147 const Packet4f p4f_sign_PNNP = ploadu<Packet4f>(sign_mask);
0148 rd = pxor(rd, p4f_sign_PNNP);
0149 iA = pmul(iA, rd);
0150 iB = pmul(iB, rd);
0151 iC = pmul(iC, rd);
0152 iD = pmul(iD, rd);
0153
0154 Index res_stride = result.outerStride();
0155 float *res = result.data();
0156
0157 pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
0158 pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
0159 pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
0160 pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
0161 }
0162 };
0163
0164 #if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
0165
0166
0167 template <typename MatrixType, typename ResultType>
0168 struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType>
0169 {
0170 enum
0171 {
0172 MatrixAlignment = traits<MatrixType>::Alignment,
0173 ResultAlignment = traits<ResultType>::Alignment,
0174 StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
0175 };
0176 typedef typename conditional<(MatrixType::Flags & LinearAccessBit),
0177 MatrixType const &,
0178 typename MatrixType::PlainObject>::type
0179 ActualMatrixType;
0180
0181 static void run(const MatrixType &mat, ResultType &result)
0182 {
0183 ActualMatrixType matrix(mat);
0184
0185
0186
0187
0188
0189
0190
0191
0192 Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
0193
0194 const double* data = matrix.data();
0195 const Index stride = matrix.innerStride();
0196 if (StorageOrdersMatch)
0197 {
0198 A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
0199 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
0200 A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
0201 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
0202 C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
0203 D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
0204 C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
0205 D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
0206 }
0207 else
0208 {
0209 Packet2d temp;
0210 A1 = ploadt<Packet2d,MatrixAlignment>(data + stride*0);
0211 C1 = ploadt<Packet2d,MatrixAlignment>(data + stride*2);
0212 A2 = ploadt<Packet2d,MatrixAlignment>(data + stride*4);
0213 C2 = ploadt<Packet2d,MatrixAlignment>(data + stride*6);
0214 temp = A1;
0215 A1 = vec2d_unpacklo(A1, A2);
0216 A2 = vec2d_unpackhi(temp, A2);
0217
0218 temp = C1;
0219 C1 = vec2d_unpacklo(C1, C2);
0220 C2 = vec2d_unpackhi(temp, C2);
0221
0222 B1 = ploadt<Packet2d,MatrixAlignment>(data + stride*8);
0223 D1 = ploadt<Packet2d,MatrixAlignment>(data + stride*10);
0224 B2 = ploadt<Packet2d,MatrixAlignment>(data + stride*12);
0225 D2 = ploadt<Packet2d,MatrixAlignment>(data + stride*14);
0226
0227 temp = B1;
0228 B1 = vec2d_unpacklo(B1, B2);
0229 B2 = vec2d_unpackhi(temp, B2);
0230
0231 temp = D1;
0232 D1 = vec2d_unpacklo(D1, D2);
0233 D2 = vec2d_unpackhi(temp, D2);
0234 }
0235
0236
0237 Packet2d dA, dB, dC, dD;
0238
0239 dA = vec2d_swizzle2(A2, A2, 1);
0240 dA = pmul(A1, dA);
0241 dA = psub(dA, vec2d_duplane(dA, 1));
0242
0243 dB = vec2d_swizzle2(B2, B2, 1);
0244 dB = pmul(B1, dB);
0245 dB = psub(dB, vec2d_duplane(dB, 1));
0246
0247 dC = vec2d_swizzle2(C2, C2, 1);
0248 dC = pmul(C1, dC);
0249 dC = psub(dC, vec2d_duplane(dC, 1));
0250
0251 dD = vec2d_swizzle2(D2, D2, 1);
0252 dD = pmul(D1, dD);
0253 dD = psub(dD, vec2d_duplane(dD, 1));
0254
0255 Packet2d DC1, DC2, AB1, AB2;
0256
0257
0258 AB1 = pmul(B1, vec2d_duplane(A2, 1));
0259 AB2 = pmul(B2, vec2d_duplane(A1, 0));
0260 AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
0261 AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
0262
0263
0264 DC1 = pmul(C1, vec2d_duplane(D2, 1));
0265 DC2 = pmul(C2, vec2d_duplane(D1, 0));
0266 DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
0267 DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
0268
0269 Packet2d d1, d2;
0270
0271
0272 Packet2d det;
0273
0274
0275 Packet2d rd;
0276
0277 d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
0278 d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
0279 rd = padd(d1, d2);
0280 rd = padd(rd, vec2d_duplane(rd, 1));
0281
0282 d1 = pmul(dA, dD);
0283 d2 = pmul(dB, dC);
0284
0285 det = padd(d1, d2);
0286 det = psub(det, rd);
0287 det = vec2d_duplane(det, 0);
0288 rd = pdiv(pset1<Packet2d>(1.0), det);
0289
0290
0291 Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
0292
0293
0294 iD1 = pmul(AB1, vec2d_duplane(C1, 0));
0295 iD2 = pmul(AB1, vec2d_duplane(C2, 0));
0296 iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
0297 iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
0298 dA = vec2d_duplane(dA, 0);
0299 iD1 = psub(pmul(D1, dA), iD1);
0300 iD2 = psub(pmul(D2, dA), iD2);
0301
0302
0303 iA1 = pmul(DC1, vec2d_duplane(B1, 0));
0304 iA2 = pmul(DC1, vec2d_duplane(B2, 0));
0305 iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
0306 iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
0307 dD = vec2d_duplane(dD, 0);
0308 iA1 = psub(pmul(A1, dD), iA1);
0309 iA2 = psub(pmul(A2, dD), iA2);
0310
0311
0312 iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
0313 iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
0314 iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
0315 iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
0316 dB = vec2d_duplane(dB, 0);
0317 iB1 = psub(pmul(C1, dB), iB1);
0318 iB2 = psub(pmul(C2, dB), iB2);
0319
0320
0321 iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
0322 iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
0323 iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
0324 iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
0325 dC = vec2d_duplane(dC, 0);
0326 iC1 = psub(pmul(B1, dC), iC1);
0327 iC2 = psub(pmul(B2, dC), iC2);
0328
0329 const double sign_mask1[2] = {0.0, numext::bit_cast<double>(0x8000000000000000ull)};
0330 const double sign_mask2[2] = {numext::bit_cast<double>(0x8000000000000000ull), 0.0};
0331 const Packet2d sign_PN = ploadu<Packet2d>(sign_mask1);
0332 const Packet2d sign_NP = ploadu<Packet2d>(sign_mask2);
0333 d1 = pxor(rd, sign_PN);
0334 d2 = pxor(rd, sign_NP);
0335
0336 Index res_stride = result.outerStride();
0337 double *res = result.data();
0338 pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
0339 pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
0340 pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
0341 pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
0342 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
0343 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
0344 pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
0345 pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
0346 }
0347 };
0348 #endif
0349 }
0350 }
0351 #endif