9 #ifndef QXS_VSIMD_WILSON_SU3_INCLUDED
10 #define QXS_VSIMD_WILSON_SU3_INCLUDED
31 load_vec(pg, u0, &u[
VLEN * 0]);
32 load_vec(pg, u1, &u[
VLEN * 1]);
33 load_vec(pg, u2, &u[
VLEN * (0 +
NVC)]);
34 load_vec(pg, u3, &u[
VLEN * (1 +
NVC)]);
35 load_vec(pg, u4, &u[
VLEN * (0 + 2 *
NVC)]);
36 load_vec(pg, u5, &u[
VLEN * (1 + 2 *
NVC)]);
46 load_vec(pg, u0, &u[
VLEN * 0]);
47 load_vec(pg, u1, &u[
VLEN * 1]);
48 load_vec(pg, u2, &u[
VLEN * 2]);
49 load_vec(pg, u3, &u[
VLEN * 3]);
50 load_vec(pg, u4, &u[
VLEN * 4]);
51 load_vec(pg, u5, &u[
VLEN * 5]);
61 load_vec_gather(pg, u0, &u[
VLEN * 0], index);
62 load_vec_gather(pg, u1, &u[
VLEN * 1], index);
63 load_vec_gather(pg, u2, &u[
VLEN * 2], index);
64 load_vec_gather(pg, u3, &u[
VLEN * 3], index);
65 load_vec_gather(pg, u4, &u[
VLEN * 4], index);
66 load_vec_gather(pg, u5, &u[
VLEN * 5], index);
73 for (
int k = 0; k <
VLEN; ++k) {
75 u[0].
v[k] * w[0].
v[k] - u[1].
v[k] * w[1].
v[k]
76 + u[6].
v[k] * w[2].
v[k] - u[7].
v[k] * w[3].
v[k]
77 + u[12].
v[k] * w[4].
v[k] - u[13].
v[k] * w[5].
v[k];
79 u[0].
v[k] * w[1].
v[k] + u[1].
v[k] * w[0].
v[k]
80 + u[6].
v[k] * w[3].
v[k] + u[7].
v[k] * w[2].
v[k]
81 + u[12].
v[k] * w[5].
v[k] + u[13].
v[k] * w[4].
v[k];
95 for (
int k = 0; k <
VLEN; ++k) {
97 u0.
v[k] * v0.
v[k] - u1.
v[k] * v1.
v[k]
98 + u2.
v[k] * v2.
v[k] - u3.
v[k] * v3.
v[k]
99 + u4.
v[k] * v4.
v[k] - u5.
v[k] * v5.
v[k];
101 u0.
v[k] * v1.
v[k] + u1.
v[k] * v0.
v[k]
102 + u2.
v[k] * v3.
v[k] + u3.
v[k] * v2.
v[k]
103 + u4.
v[k] * v5.
v[k] + u5.
v[k] * v4.
v[k];
111 for (
int k = 0; k <
VLEN; ++k) {
113 u[0].
v[k] * w[0].
v[k] + u[1].
v[k] * w[1].
v[k]
114 + u[2].
v[k] * w[2].
v[k] + u[3].
v[k] * w[3].
v[k]
115 + u[4].
v[k] * w[4].
v[k] + u[5].
v[k] * w[5].
v[k];
117 u[0].
v[k] * w[1].
v[k] - u[1].
v[k] * w[0].
v[k]
118 + u[2].
v[k] * w[3].
v[k] - u[3].
v[k] * w[2].
v[k]
119 + u[4].
v[k] * w[5].
v[k] - u[5].
v[k] * w[4].
v[k];
133 for (
int k = 0; k <
VLEN; ++k) {
135 u0.
v[k] * v0.
v[k] + u1.
v[k] * v1.
v[k]
136 + u2.
v[k] * v2.
v[k] + u3.
v[k] * v3.
v[k]
137 + u4.
v[k] * v4.
v[k] + u5.
v[k] * v5.
v[k];
139 u0.
v[k] * v1.
v[k] - u1.
v[k] * v0.
v[k]
140 + u2.
v[k] * v3.
v[k] - u3.
v[k] * v2.
v[k]
141 + u4.
v[k] * v5.
v[k] - u5.
v[k] * v4.
v[k];
148 for (
int k = 0; k <
VLEN; ++k) {
150 u[0].
v[k] * w[0].
v[k] - u[1].
v[k] * w[1].
v[k]
151 + u[6].
v[k] * w[2 *
ND].
v[k] - u[7].
v[k] * w[2 *
ND + 1].
v[k]
152 + u[12].
v[k] * w[4 *
ND].
v[k] - u[13].
v[k] * w[4 *
ND + 1].
v[k];
154 u[0].
v[k] * w[1].
v[k] + u[1].
v[k] * w[0].
v[k]
155 + u[6].
v[k] * w[2 *
ND + 1].
v[k] + u[7].
v[k] * w[2 *
ND].
v[k]
156 + u[12].
v[k] * w[4 *
ND + 1].
v[k] + u[13].
v[k] * w[4 *
ND].
v[k];
161 template<
typename REALTYPE>
164 for (
int ic = 0; ic <
NC; ++ic) {
165 int icr =
ND * 2 * ic;
166 int ici =
ND * 2 * ic + 1;
167 for (
int k = 0; k <
VLEN; ++k) {
168 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (ici +
ID4)];
169 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (icr +
ID4)];
170 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (ici +
ID3)];
171 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (icr +
ID3)];
177 template<
typename REALTYPE>
183 int icr =
ND * 2 * ic;
184 int ici =
ND * 2 * ic + 1;
185 for (
int k = 0; k <
VLEN; ++k) {
194 template<
typename REALTYPE>
195 inline void set_sp2_xp1(REALTYPE *vt, REALTYPE *w)
197 for (
int ic = 0; ic <
NC; ++ic) {
198 int icr =
ND * 2 * ic;
199 int ici =
ND * 2 * ic + 1;
200 for (
int ky = 0; ky <
VLENY; ++ky) {
214 template<
typename REALTYPE>
220 int icr =
ND * 2 * ic;
221 int ici =
ND * 2 * ic + 1;
222 for (
int k = 0; k <
VLEN; ++k) {
231 template<
typename REALTYPE>
234 for (
int ic = 0; ic <
NC; ++ic) {
235 int icr =
ND * 2 * ic;
236 int ici =
ND * 2 * ic + 1;
237 for (
int k = 0; k <
VLEN; ++k) {
238 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (ici +
ID4)];
239 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (icr +
ID4)];
240 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (ici +
ID3)];
241 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (icr +
ID3)];
247 template<
typename REALTYPE>
253 int icr =
ND * 2 * ic;
254 int ici =
ND * 2 * ic + 1;
255 for (
int k = 0; k <
VLEN; ++k) {
264 template<
typename REALTYPE>
267 for (
int ic = 0; ic <
NC; ++ic) {
268 int icr =
ND * 2 * ic;
269 int ici =
ND * 2 * ic + 1;
270 for (
int k = 0; k <
VLEN; ++k) {
271 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (icr +
ID4)];
272 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (ici +
ID4)];
273 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (icr +
ID3)];
274 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (ici +
ID3)];
280 template<
typename REALTYPE>
281 inline void set_sp2_yp1(REALTYPE *vt, REALTYPE *w)
283 for (
int ic = 0; ic <
NC; ++ic) {
284 int icr =
ND * 2 * ic;
285 int ici =
ND * 2 * ic + 1;
286 for (
int kx = 0; kx <
VLENX; ++kx) {
300 template<
typename REALTYPE>
306 int icr =
ND * 2 * ic;
307 int ici =
ND * 2 * ic + 1;
308 for (
int k = 0; k <
VLEN; ++k) {
317 template<
typename REALTYPE>
320 for (
int ic = 0; ic <
NC; ++ic) {
321 int icr =
ND * 2 * ic;
322 int ici =
ND * 2 * ic + 1;
323 for (
int k = 0; k <
VLEN; ++k) {
324 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (icr +
ID4)];
325 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (ici +
ID4)];
326 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (icr +
ID3)];
327 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (ici +
ID3)];
333 template<
typename REALTYPE>
339 int icr =
ND * 2 * ic;
340 int ici =
ND * 2 * ic + 1;
341 for (
int k = 0; k <
VLEN; ++k) {
350 template<
typename REALTYPE>
353 for (
int ic = 0; ic <
NC; ++ic) {
354 int icr =
ND * 2 * ic;
355 int ici =
ND * 2 * ic + 1;
356 for (
int k = 0; k <
VLEN; ++k) {
357 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] + w[k +
VLEN * (ici +
ID3)];
358 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] - w[k +
VLEN * (icr +
ID3)];
359 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] - w[k +
VLEN * (ici +
ID4)];
360 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] + w[k +
VLEN * (icr +
ID4)];
366 template<
typename REALTYPE>
372 int icr =
ND * 2 * ic;
373 int ici =
ND * 2 * ic + 1;
374 for (
int k = 0; k <
VLEN; ++k) {
383 template<
typename REALTYPE>
386 for (
int ic = 0; ic <
NC; ++ic) {
387 int icr =
ND * 2 * ic;
388 int ici =
ND * 2 * ic + 1;
389 for (
int k = 0; k <
VLEN; ++k) {
390 vt1[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID1)] - w[k +
VLEN * (ici +
ID3)];
391 vt1[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID1)] + w[k +
VLEN * (icr +
ID3)];
392 vt2[2 * ic].
v[k] = w[k +
VLEN * (icr +
ID2)] + w[k +
VLEN * (ici +
ID4)];
393 vt2[2 * ic + 1].
v[k] = w[k +
VLEN * (ici +
ID2)] - w[k +
VLEN * (icr +
ID4)];
399 template<
typename REALTYPE>
400 inline void set_sp2_tp_dirac(
svbool_t pg,
405 int icr =
ND * 2 * ic;
406 int ici =
ND * 2 * ic + 1;
407 for (
int k = 0; k <
VLEN; ++k) {
408 vt1r.
v[k] = 2.0 * w[k +
VLEN * (icr +
ID3)];
409 vt1i.
v[k] = 2.0 * w[k +
VLEN * (ici +
ID3)];
410 vt2r.
v[k] = 2.0 * w[k +
VLEN * (icr +
ID4)];
411 vt2i.
v[k] = 2.0 * w[k +
VLEN * (ici +
ID4)];
416 template<
typename REALTYPE>
417 inline void set_sp2_tp_dirac(
Vsimd_t *vt1,
Vsimd_t *vt2, REALTYPE *w)
419 for (
int ic = 0; ic <
NC; ++ic) {
420 int icr =
ND * 2 * ic;
421 int ici =
ND * 2 * ic + 1;
422 for (
int k = 0; k <
VLEN; ++k) {
423 vt1[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID3)];
424 vt1[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID3)];
425 vt2[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID4)];
426 vt2[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID4)];
432 template<
typename REALTYPE>
433 inline void set_sp2_tm_dirac(
svbool_t pg,
438 int icr =
ND * 2 * ic;
439 int ici =
ND * 2 * ic + 1;
440 for (
int k = 0; k <
VLEN; ++k) {
441 vt1r.
v[k] = 2.0 * w[k +
VLEN * (icr +
ID1)];
442 vt1i.
v[k] = 2.0 * w[k +
VLEN * (ici +
ID1)];
443 vt2r.
v[k] = 2.0 * w[k +
VLEN * (icr +
ID2)];
444 vt2i.
v[k] = 2.0 * w[k +
VLEN * (ici +
ID2)];
449 template<
typename REALTYPE>
450 inline void set_sp2_tm_dirac(
Vsimd_t *vt1,
Vsimd_t *vt2, REALTYPE *w)
452 for (
int ic = 0; ic <
NC; ++ic) {
453 int icr =
ND * 2 * ic;
454 int ici =
ND * 2 * ic + 1;
455 for (
int k = 0; k <
VLEN; ++k) {
456 vt1[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID1)];
457 vt1[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID1)];
458 vt2[2 * ic].
v[k] = 2.0 * w[k +
VLEN * (icr +
ID2)];
459 vt2[2 * ic + 1].
v[k] = 2.0 * w[k +
VLEN * (ici +
ID2)];
469 int ic2 =
ND * 2 * ic;
470 for (
int k = 0; k <
VLEN; ++k) {
471 x[ic2 +
ID1].
v[k] += wt1r.
v[k];
472 x[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
473 x[ic2 +
ID2].
v[k] += wt2r.
v[k];
474 x[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
475 x[ic2 +
ID3].
v[k] += -wt2i.
v[k];
476 x[ic2 + 1 +
ID3].
v[k] += wt2r.
v[k];
477 x[ic2 +
ID4].
v[k] += -wt1i.
v[k];
478 x[ic2 + 1 +
ID4].
v[k] += wt1r.
v[k];
483 template<
typename REALTYPE>
484 inline void set_sp4_xp(
svbool_t pg, REALTYPE *x,
488 int ic2 =
ND * 2 * ic;
489 for (
int k = 0; k <
VLEN; ++k) {
490 x[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
491 x[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
492 x[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
493 x[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
494 x[k +
VLEN * (ic2 +
ID3)] += -wt2i.
v[k];
495 x[k +
VLEN * (ic2 + 1 +
ID3)] += wt2r.
v[k];
496 x[k +
VLEN * (ic2 +
ID4)] += -wt1i.
v[k];
497 x[k +
VLEN * (ic2 + 1 +
ID4)] += wt1r.
v[k];
504 for (
int k = 0; k <
VLEN; ++k) {
505 x[
ID1].
v[k] += wt1[0].
v[k];
506 x[1 +
ID1].
v[k] += wt1[1].
v[k];
507 x[
ID2].
v[k] += wt2[0].
v[k];
508 x[1 +
ID2].
v[k] += wt2[1].
v[k];
509 x[
ID3].
v[k] += -wt2[1].
v[k];
510 x[1 +
ID3].
v[k] += wt2[0].
v[k];
511 x[
ID4].
v[k] += -wt1[1].
v[k];
512 x[1 +
ID4].
v[k] += wt1[0].
v[k];
519 for (
int k = 0; k <
VLEN; ++k) {
520 x[
ID1].
v[k] += wt1[0].
v[k];
521 x[1 +
ID1].
v[k] += wt1[1].
v[k];
522 x[
ID2].
v[k] += wt2[0].
v[k];
523 x[1 +
ID2].
v[k] += wt2[1].
v[k];
524 x[
ID3].
v[k] += wt2[1].
v[k];
525 x[1 +
ID3].
v[k] += -wt2[0].
v[k];
526 x[
ID4].
v[k] += wt1[1].
v[k];
527 x[1 +
ID4].
v[k] += -wt1[0].
v[k];
536 int ic2 =
ND * 2 * ic;
537 for (
int k = 0; k <
VLEN; ++k) {
538 x[ic2 +
ID1].
v[k] += wt1r.
v[k];
539 x[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
540 x[ic2 +
ID2].
v[k] += wt2r.
v[k];
541 x[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
542 x[ic2 +
ID3].
v[k] += wt2i.
v[k];
543 x[ic2 + 1 +
ID3].
v[k] += -wt2r.
v[k];
544 x[ic2 +
ID4].
v[k] += wt1i.
v[k];
545 x[ic2 + 1 +
ID4].
v[k] += -wt1r.
v[k];
550 template<
typename REALTYPE>
551 inline void set_sp4_xm(
svbool_t pg, REALTYPE *x,
555 int ic2 =
ND * 2 * ic;
556 for (
int k = 0; k <
VLEN; ++k) {
557 x[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
558 x[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
559 x[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
560 x[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
561 x[k +
VLEN * (ic2 +
ID3)] += wt2i.
v[k];
562 x[k +
VLEN * (ic2 + 1 +
ID3)] += -wt2r.
v[k];
563 x[k +
VLEN * (ic2 +
ID4)] += wt1i.
v[k];
564 x[k +
VLEN * (ic2 + 1 +
ID4)] += -wt1r.
v[k];
571 for (
int k = 0; k <
VLEN; ++k) {
572 v[
ID1].
v[k] += wt1[0].
v[k];
573 v[1 +
ID1].
v[k] += wt1[1].
v[k];
574 v[
ID2].
v[k] += wt2[0].
v[k];
575 v[1 +
ID2].
v[k] += wt2[1].
v[k];
576 v[
ID3].
v[k] += wt2[0].
v[k];
577 v[1 +
ID3].
v[k] += wt2[1].
v[k];
578 v[
ID4].
v[k] += -wt1[0].
v[k];
579 v[1 +
ID4].
v[k] += -wt1[1].
v[k];
586 for (
int k = 0; k <
VLEN; ++k) {
587 v[
ID1].
v[k] += wt1[0].
v[k];
588 v[1 +
ID1].
v[k] += wt1[1].
v[k];
589 v[
ID2].
v[k] += wt2[0].
v[k];
590 v[1 +
ID2].
v[k] += wt2[1].
v[k];
591 v[
ID3].
v[k] += -wt2[0].
v[k];
592 v[1 +
ID3].
v[k] += -wt2[1].
v[k];
593 v[
ID4].
v[k] += wt1[0].
v[k];
594 v[1 +
ID4].
v[k] += wt1[1].
v[k];
603 int ic2 =
ND * 2 * ic;
604 for (
int k = 0; k <
VLEN; ++k) {
605 v[ic2 +
ID1].
v[k] += wt1r.
v[k];
606 v[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
607 v[ic2 +
ID2].
v[k] += wt2r.
v[k];
608 v[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
609 v[ic2 +
ID3].
v[k] += wt2r.
v[k];
610 v[ic2 + 1 +
ID3].
v[k] += wt2i.
v[k];
611 v[ic2 +
ID4].
v[k] += -wt1r.
v[k];
612 v[ic2 + 1 +
ID4].
v[k] += -wt1i.
v[k];
621 int ic2 =
ND * 2 * ic;
622 for (
int k = 0; k <
VLEN; ++k) {
623 v[ic2 +
ID1].
v[k] += wt1r.
v[k];
624 v[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
625 v[ic2 +
ID2].
v[k] += wt2r.
v[k];
626 v[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
627 v[ic2 +
ID3].
v[k] += -wt2r.
v[k];
628 v[ic2 + 1 +
ID3].
v[k] += -wt2i.
v[k];
629 v[ic2 +
ID4].
v[k] += wt1r.
v[k];
630 v[ic2 + 1 +
ID4].
v[k] += wt1i.
v[k];
635 template<
typename REALTYPE>
636 inline void set_sp4_yp(
svbool_t pg, REALTYPE *v,
640 int ic2 =
ND * 2 * ic;
641 for (
int k = 0; k <
VLEN; ++k) {
642 v[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
643 v[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
644 v[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
645 v[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
646 v[k +
VLEN * (ic2 +
ID3)] += wt2r.
v[k];
647 v[k +
VLEN * (ic2 + 1 +
ID3)] += wt2i.
v[k];
648 v[k +
VLEN * (ic2 +
ID4)] += -wt1r.
v[k];
649 v[k +
VLEN * (ic2 + 1 +
ID4)] += -wt1i.
v[k];
654 template<
typename REALTYPE>
655 inline void set_sp4_ym(
svbool_t pg, REALTYPE *v,
659 int ic2 =
ND * 2 * ic;
660 for (
int k = 0; k <
VLEN; ++k) {
661 v[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
662 v[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
663 v[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
664 v[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
665 v[k +
VLEN * (ic2 +
ID3)] += -wt2r.
v[k];
666 v[k +
VLEN * (ic2 + 1 +
ID3)] += -wt2i.
v[k];
667 v[k +
VLEN * (ic2 +
ID4)] += wt1r.
v[k];
668 v[k +
VLEN * (ic2 + 1 +
ID4)] += wt1i.
v[k];
675 for (
int k = 0; k <
VLEN; ++k) {
676 v[
ID1].
v[k] += wt1[0].
v[k];
677 v[1 +
ID1].
v[k] += wt1[1].
v[k];
678 v[
ID2].
v[k] += wt2[0].
v[k];
679 v[1 +
ID2].
v[k] += wt2[1].
v[k];
680 v[
ID3].
v[k] += -wt1[1].
v[k];
681 v[1 +
ID3].
v[k] += wt1[0].
v[k];
682 v[
ID4].
v[k] += wt2[1].
v[k];
683 v[1 +
ID4].
v[k] += -wt2[0].
v[k];
690 for (
int k = 0; k <
VLEN; ++k) {
691 v[
ID1].
v[k] += wt1[0].
v[k];
692 v[1 +
ID1].
v[k] += wt1[1].
v[k];
693 v[
ID2].
v[k] += wt2[0].
v[k];
694 v[1 +
ID2].
v[k] += wt2[1].
v[k];
695 v[
ID3].
v[k] += wt1[1].
v[k];
696 v[1 +
ID3].
v[k] += -wt1[0].
v[k];
697 v[
ID4].
v[k] += -wt2[1].
v[k];
698 v[1 +
ID4].
v[k] += wt2[0].
v[k];
707 int ic2 =
ND * 2 * ic;
708 for (
int k = 0; k <
VLEN; ++k) {
709 v[ic2 +
ID1].
v[k] += wt1r.
v[k];
710 v[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
711 v[ic2 +
ID2].
v[k] += wt2r.
v[k];
712 v[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
713 v[ic2 +
ID3].
v[k] += -wt1i.
v[k];
714 v[ic2 + 1 +
ID3].
v[k] += wt1r.
v[k];
715 v[ic2 +
ID4].
v[k] += wt2i.
v[k];
716 v[ic2 + 1 +
ID4].
v[k] += -wt2r.
v[k];
725 int ic2 =
ND * 2 * ic;
726 for (
int k = 0; k <
VLEN; ++k) {
727 v[ic2 +
ID1].
v[k] += wt1r.
v[k];
728 v[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
729 v[ic2 +
ID2].
v[k] += wt2r.
v[k];
730 v[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
731 v[ic2 +
ID3].
v[k] += wt1i.
v[k];
732 v[ic2 + 1 +
ID3].
v[k] += -wt1r.
v[k];
733 v[ic2 +
ID4].
v[k] += -wt2i.
v[k];
734 v[ic2 + 1 +
ID4].
v[k] += wt2r.
v[k];
739 template<
typename REALTYPE>
740 inline void set_sp4_zp(
svbool_t pg, REALTYPE *v,
744 int ic2 =
ND * 2 * ic;
745 for (
int k = 0; k <
VLEN; ++k) {
746 v[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
747 v[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
748 v[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
749 v[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
750 v[k +
VLEN * (ic2 +
ID3)] += -wt1i.
v[k];
751 v[k +
VLEN * (ic2 + 1 +
ID3)] += wt1r.
v[k];
752 v[k +
VLEN * (ic2 +
ID4)] += wt2i.
v[k];
753 v[k +
VLEN * (ic2 + 1 +
ID4)] += -wt2r.
v[k];
758 template<
typename REALTYPE>
759 inline void set_sp4_zm(
svbool_t pg, REALTYPE *v,
763 int ic2 =
ND * 2 * ic;
764 for (
int k = 0; k <
VLEN; ++k) {
765 v[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
766 v[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
767 v[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
768 v[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
769 v[k +
VLEN * (ic2 +
ID3)] += wt1i.
v[k];
770 v[k +
VLEN * (ic2 + 1 +
ID3)] += -wt1r.
v[k];
771 v[k +
VLEN * (ic2 +
ID4)] += -wt2i.
v[k];
772 v[k +
VLEN * (ic2 + 1 +
ID4)] += wt2r.
v[k];
779 for (
int k = 0; k <
VLEN; ++k) {
780 v[
ID3].
v[k] += wt1[0].
v[k];
781 v[1 +
ID3].
v[k] += wt1[1].
v[k];
782 v[
ID4].
v[k] += wt2[0].
v[k];
783 v[1 +
ID4].
v[k] += wt2[1].
v[k];
790 for (
int k = 0; k <
VLEN; ++k) {
791 v[
ID1].
v[k] += wt1[0].
v[k];
792 v[1 +
ID1].
v[k] += wt1[1].
v[k];
793 v[
ID2].
v[k] += wt2[0].
v[k];
794 v[1 +
ID2].
v[k] += wt2[1].
v[k];
803 int ic2 =
ND * 2 * ic;
804 for (
int k = 0; k <
VLEN; ++k) {
805 v[ic2 +
ID3].
v[k] += wt1r.
v[k];
806 v[ic2 + 1 +
ID3].
v[k] += wt1i.
v[k];
807 v[ic2 +
ID4].
v[k] += wt2r.
v[k];
808 v[ic2 + 1 +
ID4].
v[k] += wt2i.
v[k];
817 int ic2 =
ND * 2 * ic;
818 for (
int k = 0; k <
VLEN; ++k) {
819 v[ic2 +
ID1].
v[k] += wt1r.
v[k];
820 v[ic2 + 1 +
ID1].
v[k] += wt1i.
v[k];
821 v[ic2 +
ID2].
v[k] += wt2r.
v[k];
822 v[ic2 + 1 +
ID2].
v[k] += wt2i.
v[k];
827 template<
typename REALTYPE>
828 inline void set_sp4_tp_dirac(
svbool_t pg, REALTYPE *v,
832 int ic2 =
ND * 2 * ic;
833 for (
int k = 0; k <
VLEN; ++k) {
834 v[k +
VLEN * (ic2 +
ID3)] += wt1r.
v[k];
835 v[k +
VLEN * (ic2 + 1 +
ID3)] += wt1i.
v[k];
836 v[k +
VLEN * (ic2 +
ID4)] += wt2r.
v[k];
837 v[k +
VLEN * (ic2 + 1 +
ID4)] += wt2i.
v[k];
842 template<
typename REALTYPE>
843 inline void set_sp4_tm_dirac(
svbool_t pg, REALTYPE *v,
847 int ic2 =
ND * 2 * ic;
848 for (
int k = 0; k <
VLEN; ++k) {
849 v[k +
VLEN * (ic2 +
ID1)] += wt1r.
v[k];
850 v[k +
VLEN * (ic2 + 1 +
ID1)] += wt1i.
v[k];
851 v[k +
VLEN * (ic2 +
ID2)] += wt2r.
v[k];
852 v[k +
VLEN * (ic2 + 1 +
ID2)] += wt2i.
v[k];
857 template<
typename REALTYPE>
858 inline void mult_gm5_dirac_vec(
svbool_t pg,
859 REALTYPE *v, REALTYPE *w)
861 for (
int k = 0; k <
VLEN; ++k) {