9 #ifndef QXS_VSIMD_WILSON_SU3_INCLUDED
10 #define QXS_VSIMD_WILSON_SU3_INCLUDED
31 u0 = svld1(pg, &u[
VLEN * 0]);
32 u1 = svld1(pg, &u[
VLEN * 1]);
33 u2 = svld1(pg, &u[
VLEN * (0 +
NVC)]);
34 u3 = svld1(pg, &u[
VLEN * (1 +
NVC)]);
35 u4 = svld1(pg, &u[
VLEN * (0 + 2 *
NVC)]);
36 u5 = svld1(pg, &u[
VLEN * (1 + 2 *
NVC)]);
46 u0 = svld1(pg, &u[
VLEN * 0]);
47 u1 = svld1(pg, &u[
VLEN * 1]);
48 u2 = svld1(pg, &u[
VLEN * 2]);
49 u3 = svld1(pg, &u[
VLEN * 3]);
50 u4 = svld1(pg, &u[
VLEN * 4]);
51 u5 = svld1(pg, &u[
VLEN * 5]);
61 load_vec_gather(pg, u0, &u[
VLEN * 0], index);
62 load_vec_gather(pg, u1, &u[
VLEN * 1], index);
63 load_vec_gather(pg, u2, &u[
VLEN * 2], index);
64 load_vec_gather(pg, u3, &u[
VLEN * 3], index);
65 load_vec_gather(pg, u4, &u[
VLEN * 4], index);
66 load_vec_gather(pg, u5, &u[
VLEN * 5], index);
79 yr = svmul_m(pg, u0, v0);
80 yr = svmls_m(pg, yr, u1, v1);
81 yr = svmla_m(pg, yr, u2, v2);
82 yr = svmls_m(pg, yr, u3, v3);
83 yr = svmla_m(pg, yr, u4, v4);
84 yr = svmls_m(pg, yr, u5, v5);
85 yi = svmul_m(pg, u0, v1);
86 yi = svmla_m(pg, yi, u1, v0);
87 yi = svmla_m(pg, yi, u2, v3);
88 yi = svmla_m(pg, yi, u3, v2);
89 yi = svmla_m(pg, yi, u4, v5);
90 yi = svmla_m(pg, yi, u5, v4);
103 yr = svmul_m(pg, u0, v0);
104 yr = svmla_m(pg, yr, u1, v1);
105 yr = svmla_m(pg, yr, u2, v2);
106 yr = svmla_m(pg, yr, u3, v3);
107 yr = svmla_m(pg, yr, u4, v4);
108 yr = svmla_m(pg, yr, u5, v5);
109 yi = svmul_m(pg, u0, v1);
110 yi = svmls_m(pg, yi, u1, v0);
111 yi = svmla_m(pg, yi, u2, v3);
112 yi = svmls_m(pg, yi, u3, v2);
113 yi = svmla_m(pg, yi, u4, v5);
114 yi = svmls_m(pg, yi, u5, v4);
120 for (
int k = 0; k <
VLEN; ++k) {
122 u[0].
v[k] * w[0].
v[k] - u[1].
v[k] * w[1].
v[k]
123 + u[6].
v[k] * w[2 *
ND].
v[k] - u[7].
v[k] * w[2 *
ND + 1].
v[k]
124 + u[12].
v[k] * w[4 *
ND].
v[k] - u[13].
v[k] * w[4 *
ND + 1].
v[k];
126 u[0].
v[k] * w[1].
v[k] + u[1].
v[k] * w[0].
v[k]
127 + u[6].
v[k] * w[2 *
ND + 1].
v[k] + u[7].
v[k] * w[2 *
ND].
v[k]
128 + u[12].
v[k] * w[4 *
ND + 1].
v[k] + u[13].
v[k] * w[4 *
ND].
v[k];
135 for (
int k = 0; k <
VLEN; ++k) {
137 u[0].
v[k] * w[0].
v[k] - u[1].
v[k] * w[1].
v[k]
138 + u[6].
v[k] * w[2].
v[k] - u[7].
v[k] * w[3].
v[k]
139 + u[12].
v[k] * w[4].
v[k] - u[13].
v[k] * w[5].
v[k];
141 u[0].
v[k] * w[1].
v[k] + u[1].
v[k] * w[0].
v[k]
142 + u[6].
v[k] * w[3].
v[k] + u[7].
v[k] * w[2].
v[k]
143 + u[12].
v[k] * w[5].
v[k] + u[13].
v[k] * w[4].
v[k];
150 for (
int k = 0; k <
VLEN; ++k) {
152 u[0].
v[k] * w[0].
v[k] + u[1].
v[k] * w[1].
v[k]
153 + u[2].
v[k] * w[2].
v[k] + u[3].
v[k] * w[3].
v[k]
154 + u[4].
v[k] * w[4].
v[k] + u[5].
v[k] * w[5].
v[k];
156 u[0].
v[k] * w[1].
v[k] - u[1].
v[k] * w[0].
v[k]
157 + u[2].
v[k] * w[3].
v[k] - u[3].
v[k] * w[2].
v[k]
158 + u[4].
v[k] * w[5].
v[k] - u[5].
v[k] * w[4].
v[k];
163 template<
typename REALTYPE>
171 int ic2 =
ND * 2 * ic;
172 load_vec(pg, vt1r, &v[
VLEN * (ic2 +
ID1)]);
173 load_vec(pg, vt1i, &v[
VLEN * (ic2 + 1 +
ID1)]);
174 load_vec(pg, vt2r, &v[
VLEN * (ic2 +
ID2)]);
175 load_vec(pg, vt2i, &v[
VLEN * (ic2 + 1 +
ID2)]);
176 load_vec(pg, vt3r, &v[
VLEN * (ic2 +
ID3)]);
177 load_vec(pg, vt3i, &v[
VLEN * (ic2 + 1 +
ID3)]);
178 load_vec(pg, vt4r, &v[
VLEN * (ic2 +
ID4)]);
179 load_vec(pg, vt4i, &v[
VLEN * (ic2 + 1 +
ID4)]);
// set_sp4 (SVE variant): writes eight vector registers (vt1r/vt1i ... vt4r/vt4i)
// into the array v under predicate pg.
// NOTE(review): the parameter list is truncated in this view after `REALTYPE *v,`;
// `ic` and the vt* operands are presumably the remaining parameters -- confirm
// against the full header.
183 template<
typename REALTYPE>
184 inline void set_sp4(
svbool_t pg, REALTYPE *v,
// Base slot for index ic: consecutive ic values are 2*ND slots apart.
191 int ic2 =
ND * 2 * ic;
// Each slot is VLEN elements wide. For component IDk the "r" register goes to
// slot (ic2 + IDk) and the "i" register to the slot right after it
// (ic2 + 1 + IDk). The r/i suffixes presumably mean real/imaginary part.
192 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vt1r);
193 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vt1i);
194 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vt2r);
195 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vt2i);
196 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vt3r);
197 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vt3i);
198 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vt4r);
199 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vt4i);
203 template<
typename REALTYPE>
206 for (
int ic = 0; ic <
NC; ++ic) {
207 int icr =
ND * 2 * ic;
208 int ici =
ND * 2 * ic + 1;
209 for (
int k = 0; k <
VLEN; ++k) {
210 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (ici +
ID4)];
211 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (icr +
ID4)];
212 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (ici +
ID3)];
213 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (icr +
ID3)];
224 int icr =
ND * 2 * ic;
225 int ici =
ND * 2 * ic + 1;
226 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
227 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
228 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
229 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
230 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
231 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
232 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
233 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
234 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
236 vt1r = svadd_m(pg, w1r, w4i);
237 vt1i = svsub_m(pg, w1i, w4r);
238 vt2r = svadd_m(pg, w2r, w3i);
239 vt2i = svsub_m(pg, w2i, w3r);
248 int icr =
ND * 2 * ic;
249 int ici =
ND * 2 * ic + 1;
250 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
251 load_vec_gather(pg, w1r, &v[
VLEN * (icr +
ID1)], index);
252 load_vec_gather(pg, w1i, &v[
VLEN * (ici +
ID1)], index);
253 load_vec_gather(pg, w2r, &v[
VLEN * (icr +
ID2)], index);
254 load_vec_gather(pg, w2i, &v[
VLEN * (ici +
ID2)], index);
255 load_vec_gather(pg, w3r, &v[
VLEN * (icr +
ID3)], index);
256 load_vec_gather(pg, w3i, &v[
VLEN * (ici +
ID3)], index);
257 load_vec_gather(pg, w4r, &v[
VLEN * (icr +
ID4)], index);
258 load_vec_gather(pg, w4i, &v[
VLEN * (ici +
ID4)], index);
260 vt1r = svadd_m(pg, w1r, w4i);
261 vt1i = svsub_m(pg, w1i, w4r);
262 vt2r = svadd_m(pg, w2r, w3i);
263 vt2i = svsub_m(pg, w2i, w3r);
267 template<
typename REALTYPE>
268 inline void set_sp2_xp1(REALTYPE *vt, REALTYPE *w)
270 for (
int ic = 0; ic <
NC; ++ic) {
271 int icr =
ND * 2 * ic;
272 int ici =
ND * 2 * ic + 1;
273 for (
int ky = 0; ky <
VLENY; ++ky) {
287 template<
typename REALTYPE>
290 for (
int ic = 0; ic <
NC; ++ic) {
291 int icr =
ND * 2 * ic;
292 int ici =
ND * 2 * ic + 1;
293 for (
int k = 0; k <
VLEN; ++k) {
294 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (ici +
ID4)];
295 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (icr +
ID4)];
296 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (ici +
ID3)];
297 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (icr +
ID3)];
308 int icr =
ND * 2 * ic;
309 int ici =
ND * 2 * ic + 1;
310 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
311 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
312 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
313 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
314 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
315 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
316 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
317 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
318 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
320 vt1r = svsub_m(pg, w1r, w4i);
321 vt1i = svadd_m(pg, w1i, w4r);
322 vt2r = svsub_m(pg, w2r, w3i);
323 vt2i = svadd_m(pg, w2i, w3r);
332 int icr =
ND * 2 * ic;
333 int ici =
ND * 2 * ic + 1;
334 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
335 load_vec_gather(pg, w1r, &v[
VLEN * (icr +
ID1)], index);
336 load_vec_gather(pg, w1i, &v[
VLEN * (ici +
ID1)], index);
337 load_vec_gather(pg, w2r, &v[
VLEN * (icr +
ID2)], index);
338 load_vec_gather(pg, w2i, &v[
VLEN * (ici +
ID2)], index);
339 load_vec_gather(pg, w3r, &v[
VLEN * (icr +
ID3)], index);
340 load_vec_gather(pg, w3i, &v[
VLEN * (ici +
ID3)], index);
341 load_vec_gather(pg, w4r, &v[
VLEN * (icr +
ID4)], index);
342 load_vec_gather(pg, w4i, &v[
VLEN * (ici +
ID4)], index);
344 vt1r = svsub_m(pg, w1r, w4i);
345 vt1i = svadd_m(pg, w1i, w4r);
346 vt2r = svsub_m(pg, w2r, w3i);
347 vt2i = svadd_m(pg, w2i, w3r);
351 template<
typename REALTYPE>
354 for (
int ic = 0; ic <
NC; ++ic) {
355 int icr =
ND * 2 * ic;
356 int ici =
ND * 2 * ic + 1;
357 for (
int k = 0; k <
VLEN; ++k) {
358 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (icr +
ID4)];
359 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (ici +
ID4)];
360 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (icr +
ID3)];
361 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (ici +
ID3)];
372 int icr =
ND * 2 * ic;
373 int ici =
ND * 2 * ic + 1;
374 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
375 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
376 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
377 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
378 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
379 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
380 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
381 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
382 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
384 vt1r = svsub_m(pg, w1r, w4r);
385 vt1i = svsub_m(pg, w1i, w4i);
386 vt2r = svadd_m(pg, w2r, w3r);
387 vt2i = svadd_m(pg, w2i, w3i);
396 int icr =
ND * 2 * ic;
397 int ici =
ND * 2 * ic + 1;
398 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
399 load_vec_gather(pg, w1r, &v[
VLEN * (icr +
ID1)], index);
400 load_vec_gather(pg, w1i, &v[
VLEN * (ici +
ID1)], index);
401 load_vec_gather(pg, w2r, &v[
VLEN * (icr +
ID2)], index);
402 load_vec_gather(pg, w2i, &v[
VLEN * (ici +
ID2)], index);
403 load_vec_gather(pg, w3r, &v[
VLEN * (icr +
ID3)], index);
404 load_vec_gather(pg, w3i, &v[
VLEN * (ici +
ID3)], index);
405 load_vec_gather(pg, w4r, &v[
VLEN * (icr +
ID4)], index);
406 load_vec_gather(pg, w4i, &v[
VLEN * (ici +
ID4)], index);
408 vt1r = svsub_m(pg, w1r, w4r);
409 vt1i = svsub_m(pg, w1i, w4i);
410 vt2r = svadd_m(pg, w2r, w3r);
411 vt2i = svadd_m(pg, w2i, w3i);
415 template<
typename REALTYPE>
416 inline void set_sp2_yp1(REALTYPE *vt, REALTYPE *w)
418 for (
int ic = 0; ic <
NC; ++ic) {
419 int icr =
ND * 2 * ic;
420 int ici =
ND * 2 * ic + 1;
421 for (
int kx = 0; kx <
VLENX; ++kx) {
435 template<
typename REALTYPE>
438 for (
int ic = 0; ic <
NC; ++ic) {
439 int icr =
ND * 2 * ic;
440 int ici =
ND * 2 * ic + 1;
441 for (
int k = 0; k <
VLEN; ++k) {
442 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (icr +
ID4)];
443 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (ici +
ID4)];
444 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (icr +
ID3)];
445 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (ici +
ID3)];
456 int icr =
ND * 2 * ic;
457 int ici =
ND * 2 * ic + 1;
458 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
459 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
460 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
461 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
462 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
463 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
464 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
465 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
466 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
468 vt1r = svadd_m(pg, w1r, w4r);
469 vt1i = svadd_m(pg, w1i, w4i);
470 vt2r = svsub_m(pg, w2r, w3r);
471 vt2i = svsub_m(pg, w2i, w3i);
480 int icr =
ND * 2 * ic;
481 int ici =
ND * 2 * ic + 1;
482 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
483 load_vec_gather(pg, w1r, &v[
VLEN * (icr +
ID1)], index);
484 load_vec_gather(pg, w1i, &v[
VLEN * (ici +
ID1)], index);
485 load_vec_gather(pg, w2r, &v[
VLEN * (icr +
ID2)], index);
486 load_vec_gather(pg, w2i, &v[
VLEN * (ici +
ID2)], index);
487 load_vec_gather(pg, w3r, &v[
VLEN * (icr +
ID3)], index);
488 load_vec_gather(pg, w3i, &v[
VLEN * (ici +
ID3)], index);
489 load_vec_gather(pg, w4r, &v[
VLEN * (icr +
ID4)], index);
490 load_vec_gather(pg, w4i, &v[
VLEN * (ici +
ID4)], index);
492 vt1r = svadd_m(pg, w1r, w4r);
493 vt1i = svadd_m(pg, w1i, w4i);
494 vt2r = svsub_m(pg, w2r, w3r);
495 vt2i = svsub_m(pg, w2i, w3i);
499 template<
typename REALTYPE>
502 for (
int ic = 0; ic <
NC; ++ic) {
503 int icr =
ND * 2 * ic;
504 int ici =
ND * 2 * ic + 1;
505 for (
int k = 0; k <
VLEN; ++k) {
506 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (ici +
ID3)];
507 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (icr +
ID3)];
508 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (ici +
ID4)];
509 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (icr +
ID4)];
520 int icr =
ND * 2 * ic;
521 int ici =
ND * 2 * ic + 1;
522 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
523 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
524 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
525 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
526 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
527 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
528 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
529 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
530 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
532 vt1r = svadd_m(pg, w1r, w3i);
533 vt1i = svsub_m(pg, w1i, w3r);
534 vt2r = svsub_m(pg, w2r, w4i);
535 vt2i = svadd_m(pg, w2i, w4r);
539 template<
typename REALTYPE>
542 for (
int ic = 0; ic <
NC; ++ic) {
543 int icr =
ND * 2 * ic;
544 int ici =
ND * 2 * ic + 1;
545 for (
int k = 0; k <
VLEN; ++k) {
546 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (ici +
ID3)];
547 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (icr +
ID3)];
548 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (ici +
ID4)];
549 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (icr +
ID4)];
560 int icr =
ND * 2 * ic;
561 int ici =
ND * 2 * ic + 1;
562 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
563 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
564 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
565 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
566 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
567 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
568 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
569 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
570 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
572 vt1r = svsub_m(pg, w1r, w3i);
573 vt1i = svadd_m(pg, w1i, w3r);
574 vt2r = svadd_m(pg, w2r, w4i);
575 vt2i = svsub_m(pg, w2i, w4r);
// set_sp2_tp_dirac (scalar / Vsimd_t variant).
// For every ic in [0, NC) and every lane k in [0, VLEN) this fills
//   vt1[2*ic], vt1[2*ic+1]  with 2.0 * (ID3 component of w, slots icr / ici)
//   vt2[2*ic], vt2[2*ic+1]  with 2.0 * (ID4 component of w, slots icr / ici)
// The ID1 and ID2 components of w are not read here.
// vt1, vt2: output arrays, indexed up to 2*NC-1.
// w:        input array, read as w[k + VLEN * slot].
579 template<
typename REALTYPE>
580 inline void set_sp2_tp_dirac(
Vsimd_t *vt1,
Vsimd_t *vt2, REALTYPE *w)
582 for (
int ic = 0; ic <
NC; ++ic) {
// icr / ici: adjacent slots for index ic (presumably real / imaginary part);
// consecutive ic values are 2*ND slots apart.
583 int icr =
ND * 2 * ic;
584 int ici =
ND * 2 * ic + 1;
585 for (
int k = 0; k <
VLEN; ++k) {
586 vt1[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID3)];
587 vt1[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID3)];
588 vt2[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID4)];
589 vt2[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID4)];
// set_sp2_tp_dirac (SVE variant): same arithmetic as the Vsimd_t overload, for a
// single index ic -- loads the ID3 and ID4 components from v and doubles them.
// NOTE(review): the parameter list is truncated in this view after `svbool_t pg,`;
// vt1r/vt1i/vt2r/vt2i, v and ic are presumably parameters, and the declaration
// of w3r/w3i/w4r/w4i is on a line not shown -- confirm against the full header.
595 inline void set_sp2_tp_dirac(
svbool_t pg,
600 int icr =
ND * 2 * ic;
601 int ici =
ND * 2 * ic + 1;
// Only components ID3 and ID4 are fetched; ID1/ID2 are not used.
603 load_vec(pg, w3r, &v[
VLEN * (icr +
ID3)]);
604 load_vec(pg, w3i, &v[
VLEN * (ici +
ID3)]);
605 load_vec(pg, w4r, &v[
VLEN * (icr +
ID4)]);
606 load_vec(pg, w4i, &v[
VLEN * (ici +
ID4)]);
// Multiply by 2 under predicate pg. svmul_m is the merging form: lanes that
// are inactive in pg take the value of the first vector operand.
608 vt1r = svmul_m(pg, w3r,
real_t(2.0));
609 vt1i = svmul_m(pg, w3i,
real_t(2.0));
610 vt2r = svmul_m(pg, w4r,
real_t(2.0));
611 vt2i = svmul_m(pg, w4i,
real_t(2.0));
// set_sp2_tm_dirac (scalar / Vsimd_t variant).
// Counterpart of set_sp2_tp_dirac that reads the other two components:
// for every ic in [0, NC) and every lane k in [0, VLEN) this fills
//   vt1[2*ic], vt1[2*ic+1]  with 2.0 * (ID1 component of w, slots icr / ici)
//   vt2[2*ic], vt2[2*ic+1]  with 2.0 * (ID2 component of w, slots icr / ici)
// The ID3 and ID4 components of w are not read here.
615 template<
typename REALTYPE>
616 inline void set_sp2_tm_dirac(
Vsimd_t *vt1,
Vsimd_t *vt2, REALTYPE *w)
618 for (
int ic = 0; ic <
NC; ++ic) {
// icr / ici: adjacent slots for index ic (presumably real / imaginary part).
619 int icr =
ND * 2 * ic;
620 int ici =
ND * 2 * ic + 1;
621 for (
int k = 0; k <
VLEN; ++k) {
622 vt1[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID1)];
623 vt1[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID1)];
624 vt2[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID2)];
625 vt2[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID2)];
// set_sp2_tm_dirac (SVE variant): same arithmetic as the Vsimd_t overload, for a
// single index ic -- loads the ID1 and ID2 components from v and doubles them.
// NOTE(review): the parameter list is truncated in this view after `svbool_t pg,`;
// vt1r/vt1i/vt2r/vt2i, v and ic are presumably parameters, and the declaration
// of w1r/w1i/w2r/w2i is on a line not shown -- confirm against the full header.
631 inline void set_sp2_tm_dirac(
svbool_t pg,
636 int icr =
ND * 2 * ic;
637 int ici =
ND * 2 * ic + 1;
// Only components ID1 and ID2 are fetched; ID3/ID4 are not used.
639 load_vec(pg, w1r, &v[
VLEN * (icr +
ID1)]);
640 load_vec(pg, w1i, &v[
VLEN * (ici +
ID1)]);
641 load_vec(pg, w2r, &v[
VLEN * (icr +
ID2)]);
642 load_vec(pg, w2i, &v[
VLEN * (ici +
ID2)]);
// Multiply by 2 under predicate pg. svmul_m is the merging form: lanes that
// are inactive in pg take the value of the first vector operand.
644 vt1r = svmul_m(pg, w1r,
real_t(2.0));
645 vt1i = svmul_m(pg, w1i,
real_t(2.0));
646 vt2r = svmul_m(pg, w2r,
real_t(2.0));
647 vt2i = svmul_m(pg, w2i,
real_t(2.0));
653 for (
int k = 0; k <
VLEN; ++k) {
654 x[
ID1].
v[k] += wt1[0].
v[k];
655 x[1 +
ID1].
v[k] += wt1[1].
v[k];
656 x[
ID2].
v[k] += wt2[0].
v[k];
657 x[1 +
ID2].
v[k] += wt2[1].
v[k];
658 x[
ID3].
v[k] += -wt2[1].
v[k];
659 x[1 +
ID3].
v[k] += wt2[0].
v[k];
660 x[
ID4].
v[k] += -wt1[1].
v[k];
661 x[1 +
ID4].
v[k] += wt1[0].
v[k];
670 int ic2 =
ND * 2 * ic;
672 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
673 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
674 vtr = svadd_m(pg, vtr, wt1r);
675 vti = svadd_m(pg, vti, wt1i);
676 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
677 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
679 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
680 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
681 vtr = svadd_m(pg, vtr, wt2r);
682 vti = svadd_m(pg, vti, wt2i);
683 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
684 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
686 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
687 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
688 vtr = svsub_m(pg, vtr, wt2i);
689 vti = svadd_m(pg, vti, wt2r);
690 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
691 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
693 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
694 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
695 vtr = svsub_m(pg, vtr, wt1i);
696 vti = svadd_m(pg, vti, wt1r);
697 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
698 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
706 int ic2 =
ND * 2 * ic;
708 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
709 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
710 vtr = svadd_m(pg, vtr, wt1r);
711 vti = svadd_m(pg, vti, wt1i);
712 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
713 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
715 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
716 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
717 vtr = svadd_m(pg, vtr, wt2r);
718 vti = svadd_m(pg, vti, wt2i);
719 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
720 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
722 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
723 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
724 vtr = svsub_m(pg, vtr, wt2i);
725 vti = svadd_m(pg, vti, wt2r);
726 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
727 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
729 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
730 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
731 vtr = svsub_m(pg, vtr, wt1i);
732 vti = svadd_m(pg, vti, wt1r);
733 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
734 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
740 for (
int k = 0; k <
VLEN; ++k) {
741 x[
ID1].
v[k] += wt1[0].
v[k];
742 x[1 +
ID1].
v[k] += wt1[1].
v[k];
743 x[
ID2].
v[k] += wt2[0].
v[k];
744 x[1 +
ID2].
v[k] += wt2[1].
v[k];
745 x[
ID3].
v[k] += wt2[1].
v[k];
746 x[1 +
ID3].
v[k] += -wt2[0].
v[k];
747 x[
ID4].
v[k] += wt1[1].
v[k];
748 x[1 +
ID4].
v[k] += -wt1[0].
v[k];
757 int ic2 =
ND * 2 * ic;
759 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
760 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
761 vtr = svadd_m(pg, vtr, wt1r);
762 vti = svadd_m(pg, vti, wt1i);
763 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
764 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
766 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
767 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
768 vtr = svadd_m(pg, vtr, wt2r);
769 vti = svadd_m(pg, vti, wt2i);
770 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
771 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
773 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
774 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
775 vtr = svadd_m(pg, vtr, wt2i);
776 vti = svsub_m(pg, vti, wt2r);
777 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
778 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
780 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
781 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
782 vtr = svadd_m(pg, vtr, wt1i);
783 vti = svsub_m(pg, vti, wt1r);
784 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
785 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
793 int ic2 =
ND * 2 * ic;
795 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
796 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
797 vtr = svadd_m(pg, vtr, wt1r);
798 vti = svadd_m(pg, vti, wt1i);
799 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
800 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
802 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
803 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
804 vtr = svadd_m(pg, vtr, wt2r);
805 vti = svadd_m(pg, vti, wt2i);
806 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
807 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
809 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
810 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
811 vtr = svadd_m(pg, vtr, wt2i);
812 vti = svsub_m(pg, vti, wt2r);
813 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
814 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
816 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
817 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
818 vtr = svadd_m(pg, vtr, wt1i);
819 vti = svsub_m(pg, vti, wt1r);
820 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
821 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
827 for (
int k = 0; k <
VLEN; ++k) {
828 v[
ID1].
v[k] += wt1[0].
v[k];
829 v[1 +
ID1].
v[k] += wt1[1].
v[k];
830 v[
ID2].
v[k] += wt2[0].
v[k];
831 v[1 +
ID2].
v[k] += wt2[1].
v[k];
832 v[
ID3].
v[k] += wt2[0].
v[k];
833 v[1 +
ID3].
v[k] += wt2[1].
v[k];
834 v[
ID4].
v[k] += -wt1[0].
v[k];
835 v[1 +
ID4].
v[k] += -wt1[1].
v[k];
844 int ic2 =
ND * 2 * ic;
846 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
847 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
848 vtr = svadd_m(pg, vtr, wt1r);
849 vti = svadd_m(pg, vti, wt1i);
850 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
851 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
853 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
854 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
855 vtr = svadd_m(pg, vtr, wt2r);
856 vti = svadd_m(pg, vti, wt2i);
857 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
858 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
860 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
861 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
862 vtr = svadd_m(pg, vtr, wt2r);
863 vti = svadd_m(pg, vti, wt2i);
864 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
865 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
867 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
868 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
869 vtr = svsub_m(pg, vtr, wt1r);
870 vti = svsub_m(pg, vti, wt1i);
871 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
872 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
880 int ic2 =
ND * 2 * ic;
883 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
884 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
885 vtr = svadd_m(pg, vtr, wt1r);
886 vti = svadd_m(pg, vti, wt1i);
887 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
888 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
890 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
891 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
892 vtr = svadd_m(pg, vtr, wt2r);
893 vti = svadd_m(pg, vti, wt2i);
894 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
895 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
897 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
898 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
899 vtr = svadd_m(pg, vtr, wt2r);
900 vti = svadd_m(pg, vti, wt2i);
901 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
902 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
904 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
905 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
906 vtr = svsub_m(pg, vtr, wt1r);
907 vti = svsub_m(pg, vti, wt1i);
908 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
909 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
915 for (
int k = 0; k <
VLEN; ++k) {
916 v[
ID1].
v[k] += wt1[0].
v[k];
917 v[1 +
ID1].
v[k] += wt1[1].
v[k];
918 v[
ID2].
v[k] += wt2[0].
v[k];
919 v[1 +
ID2].
v[k] += wt2[1].
v[k];
920 v[
ID3].
v[k] += -wt2[0].
v[k];
921 v[1 +
ID3].
v[k] += -wt2[1].
v[k];
922 v[
ID4].
v[k] += wt1[0].
v[k];
923 v[1 +
ID4].
v[k] += wt1[1].
v[k];
932 int ic2 =
ND * 2 * ic;
934 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
935 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
936 vtr = svadd_m(pg, vtr, wt1r);
937 vti = svadd_m(pg, vti, wt1i);
938 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
939 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
941 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
942 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
943 vtr = svadd_m(pg, vtr, wt2r);
944 vti = svadd_m(pg, vti, wt2i);
945 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
946 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
948 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
949 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
950 vtr = svsub_m(pg, vtr, wt2r);
951 vti = svsub_m(pg, vti, wt2i);
952 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
953 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
955 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
956 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
957 vtr = svadd_m(pg, vtr, wt1r);
958 vti = svadd_m(pg, vti, wt1i);
959 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
960 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
968 int ic2 =
ND * 2 * ic;
970 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
971 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
972 vtr = svadd_m(pg, vtr, wt1r);
973 vti = svadd_m(pg, vti, wt1i);
974 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
975 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
977 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
978 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
979 vtr = svadd_m(pg, vtr, wt2r);
980 vti = svadd_m(pg, vti, wt2i);
981 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
982 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
984 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
985 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
986 vtr = svsub_m(pg, vtr, wt2r);
987 vti = svsub_m(pg, vti, wt2i);
988 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
989 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
991 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
992 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
993 vtr = svadd_m(pg, vtr, wt1r);
994 vti = svadd_m(pg, vti, wt1i);
995 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
996 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
1002 for (
int k = 0; k <
VLEN; ++k) {
1003 v[
ID1].
v[k] += wt1[0].
v[k];
1004 v[1 +
ID1].
v[k] += wt1[1].
v[k];
1005 v[
ID2].
v[k] += wt2[0].
v[k];
1006 v[1 +
ID2].
v[k] += wt2[1].
v[k];
1007 v[
ID3].
v[k] += -wt1[1].
v[k];
1008 v[1 +
ID3].
v[k] += wt1[0].
v[k];
1009 v[
ID4].
v[k] += wt2[1].
v[k];
1010 v[1 +
ID4].
v[k] += -wt2[0].
v[k];
1019 int ic2 =
ND * 2 * ic;
1021 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
1022 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
1023 vtr = svadd_m(pg, vtr, wt1r);
1024 vti = svadd_m(pg, vti, wt1i);
1025 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
1026 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
1028 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
1029 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
1030 vtr = svadd_m(pg, vtr, wt2r);
1031 vti = svadd_m(pg, vti, wt2i);
1032 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
1033 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
1035 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
1036 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
1037 vtr = svsub_m(pg, vtr, wt1i);
1038 vti = svadd_m(pg, vti, wt1r);
1039 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
1040 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
1042 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
1043 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
1044 vtr = svadd_m(pg, vtr, wt2i);
1045 vti = svsub_m(pg, vti, wt2r);
1046 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
1047 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
1055 int ic2 =
ND * 2 * ic;
1057 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
1058 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
1059 vtr = svadd_m(pg, vtr, wt1r);
1060 vti = svadd_m(pg, vti, wt1i);
1061 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
1062 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
1064 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
1065 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
1066 vtr = svadd_m(pg, vtr, wt2r);
1067 vti = svadd_m(pg, vti, wt2i);
1068 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
1069 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
1071 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
1072 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
1073 vtr = svsub_m(pg, vtr, wt1i);
1074 vti = svadd_m(pg, vti, wt1r);
1075 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
1076 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
1078 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
1079 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
1080 vtr = svadd_m(pg, vtr, wt2i);
1081 vti = svsub_m(pg, vti, wt2r);
1082 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
1083 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
1089 for (
int k = 0; k <
VLEN; ++k) {
1090 v[
ID1].
v[k] += wt1[0].
v[k];
1091 v[1 +
ID1].
v[k] += wt1[1].
v[k];
1092 v[
ID2].
v[k] += wt2[0].
v[k];
1093 v[1 +
ID2].
v[k] += wt2[1].
v[k];
1094 v[
ID3].
v[k] += wt1[1].
v[k];
1095 v[1 +
ID3].
v[k] += -wt1[0].
v[k];
1096 v[
ID4].
v[k] += -wt2[1].
v[k];
1097 v[1 +
ID4].
v[k] += wt2[0].
v[k];
1106 int ic2 =
ND * 2 * ic;
1108 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
1109 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
1110 vtr = svadd_m(pg, vtr, wt1r);
1111 vti = svadd_m(pg, vti, wt1i);
1112 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
1113 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
1115 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
1116 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
1117 vtr = svadd_m(pg, vtr, wt2r);
1118 vti = svadd_m(pg, vti, wt2i);
1119 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
1120 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
1122 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
1123 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
1124 vtr = svadd_m(pg, vtr, wt1i);
1125 vti = svsub_m(pg, vti, wt1r);
1126 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
1127 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
1129 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
1130 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
1131 vtr = svsub_m(pg, vtr, wt2i);
1132 vti = svadd_m(pg, vti, wt2r);
1133 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
1134 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
1142 int ic2 =
ND * 2 * ic;
1144 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
1145 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
1146 vtr = svadd_m(pg, vtr, wt1r);
1147 vti = svadd_m(pg, vti, wt1i);
1148 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
1149 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
1151 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
1152 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
1153 vtr = svadd_m(pg, vtr, wt2r);
1154 vti = svadd_m(pg, vti, wt2i);
1155 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
1156 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
1158 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
1159 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
1160 vtr = svadd_m(pg, vtr, wt1i);
1161 vti = svsub_m(pg, vti, wt1r);
1162 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
1163 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
1165 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
1166 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
1167 vtr = svsub_m(pg, vtr, wt2i);
1168 vti = svadd_m(pg, vti, wt2r);
1169 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
1170 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
1176 for (
int k = 0; k <
VLEN; ++k) {
1177 v[
ID3].
v[k] += wt1[0].
v[k];
1178 v[1 +
ID3].
v[k] += wt1[1].
v[k];
1179 v[
ID4].
v[k] += wt2[0].
v[k];
1180 v[1 +
ID4].
v[k] += wt2[1].
v[k];
1189 int ic2 =
ND * 2 * ic;
1191 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID3)]);
1192 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID3)]);
1193 vtr = svadd_m(pg, vtr, wt1r);
1194 vti = svadd_m(pg, vti, wt1i);
1195 svst1(pg, &v[
VLEN * (ic2 +
ID3)], vtr);
1196 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID3)], vti);
1198 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID4)]);
1199 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID4)]);
1200 vtr = svadd_m(pg, vtr, wt2r);
1201 vti = svadd_m(pg, vti, wt2i);
1202 svst1(pg, &v[
VLEN * (ic2 +
ID4)], vtr);
1203 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID4)], vti);
1211 int ic2 =
ND * 2 * ic;
1213 load_vec(pg, vtr, &v[ic2 +
ID3].v[0]);
1214 load_vec(pg, vti, &v[ic2 + 1 +
ID3].v[0]);
1215 vtr = svadd_m(pg, vtr, wt1r);
1216 vti = svadd_m(pg, vti, wt1i);
1217 svst1(pg, &v[ic2 +
ID3].v[0], vtr);
1218 svst1(pg, &v[ic2 + 1 +
ID3].v[0], vti);
1220 load_vec(pg, vtr, &v[ic2 +
ID4].v[0]);
1221 load_vec(pg, vti, &v[ic2 + 1 +
ID4].v[0]);
1222 vtr = svadd_m(pg, vtr, wt2r);
1223 vti = svadd_m(pg, vti, wt2i);
1224 svst1(pg, &v[ic2 +
ID4].v[0], vtr);
1225 svst1(pg, &v[ic2 + 1 +
ID4].v[0], vti);
1231 for (
int k = 0; k <
VLEN; ++k) {
1232 v[
ID1].
v[k] += wt1[0].
v[k];
1233 v[1 +
ID1].
v[k] += wt1[1].
v[k];
1234 v[
ID2].
v[k] += wt2[0].
v[k];
1235 v[1 +
ID2].
v[k] += wt2[1].
v[k];
1244 int ic2 =
ND * 2 * ic;
1246 load_vec(pg, vtr, &v[ic2 +
ID1].v[0]);
1247 load_vec(pg, vti, &v[ic2 + 1 +
ID1].v[0]);
1248 vtr = svadd_m(pg, vtr, wt1r);
1249 vti = svadd_m(pg, vti, wt1i);
1250 svst1(pg, &v[ic2 +
ID1].v[0], vtr);
1251 svst1(pg, &v[ic2 + 1 +
ID1].v[0], vti);
1253 load_vec(pg, vtr, &v[ic2 +
ID2].v[0]);
1254 load_vec(pg, vti, &v[ic2 + 1 +
ID2].v[0]);
1255 vtr = svadd_m(pg, vtr, wt2r);
1256 vti = svadd_m(pg, vti, wt2i);
1257 svst1(pg, &v[ic2 +
ID2].v[0], vtr);
1258 svst1(pg, &v[ic2 + 1 +
ID2].v[0], vti);
1266 int ic2 =
ND * 2 * ic;
1268 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID1)]);
1269 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID1)]);
1270 vtr = svadd_m(pg, vtr, wt1r);
1271 vti = svadd_m(pg, vti, wt1i);
1272 svst1(pg, &v[
VLEN * (ic2 +
ID1)], vtr);
1273 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID1)], vti);
1275 load_vec(pg, vtr, &v[
VLEN * (ic2 +
ID2)]);
1276 load_vec(pg, vti, &v[
VLEN * (ic2 + 1 +
ID2)]);
1277 vtr = svadd_m(pg, vtr, wt2r);
1278 vti = svadd_m(pg, vti, wt2i);
1279 svst1(pg, &v[
VLEN * (ic2 +
ID2)], vtr);
1280 svst1(pg, &v[
VLEN * (ic2 + 1 +
ID2)], vti);
1286 for (
int ivc = 0; ivc <
NVC; ++ivc) {
1287 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
1288 for (
int k = 0; k <
VLEN; ++k) {
1289 v[
ID1 + ivc2].
v[k] = -w[
ID3 + ivc2].
v[k];
1290 v[
ID2 + ivc2].
v[k] = -w[
ID4 + ivc2].
v[k];
1291 v[
ID3 + ivc2].
v[k] = -w[
ID1 + ivc2].
v[k];
1292 v[
ID4 + ivc2].
v[k] = -w[
ID2 + ivc2].
v[k];
// mult_gm5_dirac_vec (SVE variant).
// Reads four two-slot components from w, passes each register through
// flip_sign, and writes them to v with the component pairs exchanged:
//   v[ID1] <- flip_sign(w[ID3])    v[ID2] <- flip_sign(w[ID4])
//   v[ID3] <- flip_sign(w[ID1])    v[ID4] <- flip_sign(w[ID2])
// For each component the first slot is VLEN * IDk and the second VLEN * (IDk + 1).
// v: output array; w: input array. Both are declared __restrict, so the caller
//    must not pass overlapping buffers.
// NOTE(review): flip_sign is defined elsewhere; presumably it negates the active
// lanes in place, which would make this match the scalar overload's
// `v[...] = -w[...]` -- confirm. No per-colour offset is applied in the visible
// lines, so the caller presumably advances v and w itself.
1298 template<
typename REALTYPE>
1299 inline void mult_gm5_dirac_vec(
svbool_t pg,
1300 REALTYPE *__restrict v,
1301 REALTYPE *__restrict w)
// w[ID1] -> vt3 (will be stored to v[ID3]).
1307 load_vec(pg, vt3r, &w[
VLEN * (
ID1)]);
1308 load_vec(pg, vt3i, &w[
VLEN * (
ID1 + 1)]);
1309 flip_sign(pg, vt3r);
1310 flip_sign(pg, vt3i);
// w[ID2] -> vt4 (will be stored to v[ID4]).
1312 load_vec(pg, vt4r, &w[
VLEN * (
ID2)]);
1313 load_vec(pg, vt4i, &w[
VLEN * (
ID2 + 1)]);
1314 flip_sign(pg, vt4r);
1315 flip_sign(pg, vt4i);
// w[ID3] -> vt1 (will be stored to v[ID1]).
1317 load_vec(pg, vt1r, &w[
VLEN * (
ID3)]);
1318 load_vec(pg, vt1i, &w[
VLEN * (
ID3 + 1)]);
1319 flip_sign(pg, vt1r);
1320 flip_sign(pg, vt1i);
// w[ID4] -> vt2 (will be stored to v[ID2]).
1322 load_vec(pg, vt2r, &w[
VLEN * (
ID4)]);
1323 load_vec(pg, vt2i, &w[
VLEN * (
ID4 + 1)]);
1324 flip_sign(pg, vt2r);
1325 flip_sign(pg, vt2i);
// All loads from w are issued before any store to v.
1327 save_vec(pg, &v[
VLEN * (
ID1)], vt1r);
1328 save_vec(pg, &v[
VLEN * (
ID1 + 1)], vt1i);
1329 save_vec(pg, &v[
VLEN * (
ID2)], vt2r);
1330 save_vec(pg, &v[
VLEN * (
ID2 + 1)], vt2i);
1331 save_vec(pg, &v[
VLEN * (
ID3)], vt3r);
1332 save_vec(pg, &v[
VLEN * (
ID3 + 1)], vt3i);
1333 save_vec(pg, &v[
VLEN * (
ID4)], vt4r);
1334 save_vec(pg, &v[
VLEN * (
ID4 + 1)], vt4i);
1338 template<
typename REALTYPE>
1339 inline void load_mult_gm5_dirac_vec(
Vsimd_t *v, REALTYPE *w,
int Nc)
1341 for (
int ivc = 0; ivc <
NVC; ++ivc) {
1342 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
1343 for (
int k = 0; k <
VLEN; ++k) {
// load_mult_gm5_dirac_vec (SVE variant).
// Loads four two-slot components from w and passes each register through
// flip_sign, with the same component exchange as mult_gm5_dirac_vec:
//   vt1 <- w[ID3], vt2 <- w[ID4], vt3 <- w[ID1], vt4 <- w[ID2].
// No store is visible in this excerpt, so the results stay in vt1r..vt4i.
// NOTE(review): the parameter list is truncated in this view after `svbool_t pg,`;
// the vt* registers are presumably by-reference outputs and w an input pointer.
// flip_sign is defined elsewhere and presumably negates in place -- confirm.
1353 template<
typename REALTYPE>
1354 inline void load_mult_gm5_dirac_vec(
svbool_t pg,
// w[ID1] -> vt3
1363 load_vec(pg, vt3r, &w[
VLEN * (
ID1)]);
1364 load_vec(pg, vt3i, &w[
VLEN * (
ID1 + 1)]);
1365 flip_sign(pg, vt3r);
1366 flip_sign(pg, vt3i);
// w[ID2] -> vt4
1368 load_vec(pg, vt4r, &w[
VLEN * (
ID2)]);
1369 load_vec(pg, vt4i, &w[
VLEN * (
ID2 + 1)]);
1370 flip_sign(pg, vt4r);
1371 flip_sign(pg, vt4i);
// w[ID3] -> vt1
1373 load_vec(pg, vt1r, &w[
VLEN * (
ID3)]);
1374 load_vec(pg, vt1i, &w[
VLEN * (
ID3 + 1)]);
1375 flip_sign(pg, vt1r);
1376 flip_sign(pg, vt1i);
// w[ID4] -> vt2
1378 load_vec(pg, vt2r, &w[
VLEN * (
ID4)]);
1379 load_vec(pg, vt2i, &w[
VLEN * (
ID4 + 1)]);
1380 flip_sign(pg, vt2r);
1381 flip_sign(pg, vt2i);
1385 template<
typename REALTYPE>
1386 inline void load_mult_gm5_dirac_vec(
Vsimd_t *v,
1387 REALTYPE a, REALTYPE *w,
int Nc)
1389 for (
int ivc = 0; ivc <
NVC; ++ivc) {
1390 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
1391 for (
int k = 0; k <
VLEN; ++k) {
1392 v[
ID1 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID3 + ivc2)];
1393 v[
ID2 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID4 + ivc2)];
1394 v[
ID3 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID1 + ivc2)];
1395 v[
ID4 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID2 + ivc2)];