18 #ifndef MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED
19 #define MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED
30 int *Nsize,
int *do_comm)
36 int Nst2v = Nx2v * Ny * Nz * Nt;
41 int ith, nth, site0, site1;
42 set_threadtask(ith, nth, site0, site1, Nst2v);
44 for (
int site = site0; site < site1; ++site) {
45 real_t *wp2 = &wp[Nin5 * site];
46 real_t *yp2 = &yp[Nin5 * site];
49 for (
int is = 0; is < Ns; ++is) {
50 for (
int ic = 0; ic <
NC; ++ic) {
51 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
52 int offset = 2 *
ND * ic +
NVCD * is;
54 real_t bb = b[is] * factor;
56 load_vec(pg, vt1r, &wp2[
VLEN * (offset + 0)]);
57 load_vec(pg, vt1i, &wp2[
VLEN * (offset + 1)]);
58 load_vec(pg, vt2r, &wp2[
VLEN * (offset + 2)]);
59 load_vec(pg, vt2i, &wp2[
VLEN * (offset + 3)]);
60 load_vec(pg, vt3r, &wp2[
VLEN * (offset + 4)]);
61 load_vec(pg, vt3i, &wp2[
VLEN * (offset + 5)]);
62 load_vec(pg, vt4r, &wp2[
VLEN * (offset + 6)]);
63 load_vec(pg, vt4i, &wp2[
VLEN * (offset + 7)]);
64 scal_vec(pg, vt1r, bb);
65 scal_vec(pg, vt1i, bb);
66 scal_vec(pg, vt2r, bb);
67 scal_vec(pg, vt2i, bb);
68 scal_vec(pg, vt3r, bb);
69 scal_vec(pg, vt3i, bb);
70 scal_vec(pg, vt4r, bb);
71 scal_vec(pg, vt4i, bb);
73 int is_up = (is + 1) % Ns;
74 real_t Fup = 0.5 * c[is] * factor;
75 if (is == Ns - 1) Fup *= -mq;
76 add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
77 vt3r, vt3i, vt4r, vt4i,
80 int is_dn = (is - 1 + Ns) % Ns;
81 real_t Fdn = 0.5 * c[is] * factor;
82 if (is == 0) Fdn *= -mq;
83 add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
84 vt3r, vt3i, vt4r, vt4i,
86 save_vec(pg, &yp2[
VLEN * (offset + 0)], vt1r);
87 save_vec(pg, &yp2[
VLEN * (offset + 1)], vt1i);
88 save_vec(pg, &yp2[
VLEN * (offset + 2)], vt2r);
89 save_vec(pg, &yp2[
VLEN * (offset + 3)], vt2i);
90 save_vec(pg, &yp2[
VLEN * (offset + 4)], vt3r);
91 save_vec(pg, &yp2[
VLEN * (offset + 5)], vt3i);
92 save_vec(pg, &yp2[
VLEN * (offset + 6)], vt4r);
93 save_vec(pg, &yp2[
VLEN * (offset + 7)], vt4i);
110 int Nstv = Nxv * Ny * Nz * Nt;
113 int Nin5 = Nin4 * Ns;
115 int ith, nth, site0, site1;
116 set_threadtask(ith, nth, site0, site1, Nstv);
118 for (
int site = site0; site < site1; ++site) {
119 real_t *vp2 = &vp[Nin5 * site];
120 real_t *wp2 = &wp[Nin5 * site];
122 for (
int is = 0; is < Ns; ++is) {
123 for (
int ic = 0; ic <
NC; ++ic) {
126 int offset = 2 *
ND * ic +
NVCD * is;
128 load_mult_gm5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i,
129 vt3r, vt3i, vt4r, vt4i, &wp2[
VLEN * offset]);
130 save_vec(pg, &vp2[
VLEN * (offset + 0)], vt1r);
131 save_vec(pg, &vp2[
VLEN * (offset + 1)], vt1i);
132 save_vec(pg, &vp2[
VLEN * (offset + 2)], vt2r);
133 save_vec(pg, &vp2[
VLEN * (offset + 3)], vt2i);
134 save_vec(pg, &vp2[
VLEN * (offset + 4)], vt3r);
135 save_vec(pg, &vp2[
VLEN * (offset + 5)], vt3i);
136 save_vec(pg, &vp2[
VLEN * (offset + 6)], vt4r);
137 save_vec(pg, &vp2[
VLEN * (offset + 7)], vt4i);
151 int Nstv = Nxv * Ny * Nz * Nt;
154 int Nin5 = Nin4 * Ns;
156 int ith, nth, site0, site1;
157 set_threadtask(ith, nth, site0, site1, Nstv);
161 pg = set_predicate();
163 for (
int site = site0; site < site1; ++site) {
164 real_t *vp2 = &vp[Nin5 * site];
166 for (
int is = 0; is < Ns; ++is) {
167 for (
int i = 0; i <
NVCD; ++i) {
168 save_vec(pg, &vp2[
VLEN * (is *
NVCD + i)], y);
181 int *Nsize,
int *do_comm)
187 int Nst2v = Nx2v * Ny * Nz * Nt;
188 int Nst2 = Nst2v *
VLEN;
191 int Nin5 = Nin4 * Ns;
193 int ith, nth, site0, site1;
194 set_threadtask(ith, nth, site0, site1, Nst2v);
196 for (
int site = site0; site < site1; ++site) {
197 real_t *vp2 = &vp[Nin5 * site];
198 real_t *yp2 = &yp[Nin5 * site];
201 for (
int is = 0; is < Ns; ++is) {
202 for (
int ic = 0; ic <
NC; ++ic) {
203 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
204 int offset = 2 *
ND * ic +
NVCD * is;
207 load_vec(pg, vt3r, &yp2[
VLEN * (offset + 0)]);
208 load_vec(pg, vt3i, &yp2[
VLEN * (offset + 1)]);
209 load_vec(pg, vt4r, &yp2[
VLEN * (offset + 2)]);
210 load_vec(pg, vt4i, &yp2[
VLEN * (offset + 3)]);
211 load_vec(pg, vt1r, &yp2[
VLEN * (offset + 4)]);
212 load_vec(pg, vt1i, &yp2[
VLEN * (offset + 5)]);
213 load_vec(pg, vt2r, &yp2[
VLEN * (offset + 6)]);
214 load_vec(pg, vt2i, &yp2[
VLEN * (offset + 7)]);
215 scal_vec(pg, vt1r, bb);
216 scal_vec(pg, vt1i, bb);
217 scal_vec(pg, vt2r, bb);
218 scal_vec(pg, vt2i, bb);
219 scal_vec(pg, vt3r, bb);
220 scal_vec(pg, vt3i, bb);
221 scal_vec(pg, vt4r, bb);
222 scal_vec(pg, vt4i, bb);
224 int is_up = (is + 1) % Ns;
225 real_t Fup = 0.5 * (-0.5) * c[is_up];
226 if (is == Ns - 1) Fup *= -mq;
227 add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
228 vt3r, vt3i, vt4r, vt4i,
229 Fup, yp2, is_up, ic);
231 int is_dn = (is - 1 + Ns) % Ns;
232 real_t Fdn = -0.5 * (-0.5) * c[is_dn];
233 if (is == 0) Fdn *= -mq;
234 add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
235 vt3r, vt3i, vt4r, vt4i,
236 Fdn, yp2, is_dn, ic);
237 save_vec(pg, &vp2[
VLEN * (offset + 0)], vt1r);
238 save_vec(pg, &vp2[
VLEN * (offset + 1)], vt1i);
239 save_vec(pg, &vp2[
VLEN * (offset + 2)], vt2r);
240 save_vec(pg, &vp2[
VLEN * (offset + 3)], vt2i);
241 save_vec(pg, &vp2[
VLEN * (offset + 4)], vt3r);
242 save_vec(pg, &vp2[
VLEN * (offset + 5)], vt3i);
243 save_vec(pg, &vp2[
VLEN * (offset + 6)], vt4r);
244 save_vec(pg, &vp2[
VLEN * (offset + 7)], vt4i);
256 int *Leo,
int *Nsize,
int *do_comm,
263 int Nst2v = Nx2v * Ny * Nz * Nt;
264 int Nst2 = Nst2v *
VLEN;
267 int Nin5 = Nin4 * Ns;
269 int NvU2 =
NDF * Nst2;
271 int Nxy2 = Nx2v * Ny;
272 int Nxyz2 = Nx2v * Ny * Nz;
274 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
275 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
276 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
277 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
278 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
279 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
280 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
281 set_predicate_yp(pg1_yp, pg2_yp);
282 set_predicate_ym(pg1_ym, pg2_ym);
284 int ith, nth, site0, site1;
285 set_threadtask(ith, nth, site0, site1, Nst2v);
289 for (
int site = site0; site < site1; ++site) {
290 int ix = site % Nx2v;
291 int iyzt = site / Nx2v;
293 int izt = site / Nxy2;
296 int ixy = ix + Nx2v * iy;
297 int ixyz = ixy + Nxy2 * iz;
298 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
301 int ix_p1 = (site + 1) % Nx2v;
302 int iyzt_p1 = (site + 1) / Nx2v;
303 int iy_p1 = iyzt_p1 % Ny;
304 int izt_p1 = (site + 1) / Nxy2;
305 int iz_p1 = izt_p1 % Nz;
306 int it_p1 = izt_p1 / Nz;
307 int ixy_p1 = ix_p1 + Nx2v * iy_p1;
308 int ixyz_p1 = ixy_p1 + Nxy2 * iz_p1;
309 int jeo_p1 = (ieo + Leo[
VLENY * iyzt_p1]) % 2;
313 real_t *wp2 = &wp[Nin5 * site];
314 real_t *vp2 = &vp[Nin5 * site];
325 if ((ix < Nx2v - 1) || (do_comm[0] == 0)) {
326 int nei = ix + 1 + Nx2v * iyzt;
328 nei_p1 = ix - 1 + Nx2v * iyzt;
329 if (ix == Nx2v - 1) nei = 0 + Nx2v * iyzt;
331 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
332 real_t *wpn = &wp[Nin5 * nei];
335 for (
int is = 0; is < Ns; ++is) {
340 mult_wilson_eo_xpb(pg1e_xp, pg2e_xp, pg3e_xp,
341 &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
344 for (
int is = 0; is < Ns; ++is) {
349 mult_wilson_eo_xpb(pg1o_xp, pg2o_xp, pg3o_xp,
350 &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
358 if ((ix > 0) || (do_comm[0] == 0)) {
359 int nei = ix - 1 + Nx2v * iyzt;
361 int iy2_p1 = (iy + 1) % Ny;
362 nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
364 if (ix == 0) nei = Nx2v - 1 + Nx2v * iyzt;
366 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
367 real_t *un = &up[
VLEN *
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
368 real_t *wpn = &wp[Nin5 * nei];
370 for (
int is = 0; is < Ns; ++is) {
375 mult_wilson_eo_xmb(pg1e_xm, pg2e_xm, pg3e_xm, &vp2[Nin4 * is],
376 u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
379 for (
int is = 0; is < Ns; ++is) {
384 mult_wilson_eo_xmb(pg1o_xm, pg2o_xm, pg3o_xm, &vp2[Nin4 * is],
385 u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
395 if ((iy < Ny - 1) || (do_comm[idir] == 0)) {
396 int iy2 = (iy + 1) % Ny;
397 int nei = ix + Nx2v * (iy2 + Ny * izt);
399 int iy2_p1 = (iy - 1 + Ny) % Ny;
400 nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
403 real_t *wpn = &wp[Nin5 * nei];
404 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
405 for (
int is = 0; is < Ns; ++is) {
408 mult_wilson_ypb(pg1_yp, pg2_yp,
409 &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
415 if ((iy > 0) || (do_comm[idir] == 0)) {
416 int iy2 = (iy - 1 + Ny) % Ny;
417 int nei = ix + Nx2v * (iy2 + Ny * izt);
419 int iz2_p1 = (iz + 1) % Nz;
420 nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
422 real_t *wpn = &wp[Nin5 * nei];
423 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
424 real_t *un = &up[
VLEN *
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
425 for (
int is = 0; is < Ns; ++is) {
427 mult_wilson_ymb(pg1_ym, pg2_ym,
428 &vp2[Nin4 * is], u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
436 if ((iz < Nz - 1) || (do_comm[idir] == 0)) {
437 int iz2 = (iz + 1) % Nz;
438 int nei = ixy + Nxy2 * (iz2 + Nz * it);
440 int iz2_p1 = (iz - 1 + Nz) % Nz;
441 nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
443 real_t *wpn = &wp[Nin5 * nei];
444 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
445 for (
int is = 0; is < Ns; ++is) {
448 mult_wilson_zpb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
454 if ((iz > 0) || (do_comm[idir] == 0)) {
455 int iz2 = (iz - 1 + Nz) % Nz;
456 int nei = ixy + Nxy2 * (iz2 + Nz * it);
458 int it2_p1 = (it + 1) % Nt;
459 nei_p1 = ixyz + Nxyz2 * it2_p1;
461 real_t *wpn = &wp[Nin5 * nei];
462 real_t *u2 = &up[
VLEN *
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
463 for (
int is = 0; is < Ns; ++is) {
466 mult_wilson_zmb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
474 if ((it < Nt - 1) || (do_comm[idir] == 0)) {
475 int it2 = (it + 1) % Nt;
476 int nei = ixyz + Nxyz2 * it2;
478 int it2_p1 = (it - 1 + Nt) % Nt;
479 nei_p1 = ixyz + Nxyz2 * it2_p1;
481 real_t *wpn = &wp[Nin5 * nei];
482 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
483 for (
int is = 0; is < Ns; ++is) {
486 mult_wilson_tpb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
492 if ((it > 0) || (do_comm[idir] == 0)) {
493 int it2 = (it - 1 + Nt) % Nt;
494 int nei = ixyz + Nxyz2 * it2;
496 nei_p1 = ix_p1 + 1 + Nx2v * iyzt_p1;
498 real_t *wpn = &wp[Nin5 * nei];
499 real_t *u2 = &up[
VLEN *
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
500 for (
int is = 0; is < Ns; ++is) {
503 mult_wilson_tmb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
518 int *Leo,
int *Nsize,
int *do_comm,
525 int Nst2v = Nx2v * Ny * Nz * Nt;
526 int Nst2 = Nst2v *
VLEN;
529 int Nin5 = Nin4 * Ns;
531 int Nin5H = Nin4H * Ns;
532 int NvU2 =
NDF * Nst2;
534 int Nxy2 = Nx2v * Ny;
535 int Nxyz2 = Nx2v * Ny * Nz;
537 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
538 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
539 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
540 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
541 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
542 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
543 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
544 set_predicate_yp(pg1_yp, pg2_yp);
545 set_predicate_ym(pg1_ym, pg2_ym);
547 set_index_xp_eo(svidx_xp);
548 set_index_xm_eo(svidx_xm);
551 if (do_comm[idir] == 1) {
552 int Nyzt = Ny * Nz * Nt;
554 int ith, nth, site0, site1;
555 set_threadtask(ith, nth, site0, site1, Nyzt);
557 for (
int iyzt = site0; iyzt < site1; ++iyzt) {
559 int iz = (iyzt / Ny) % Nz;
560 int it = iyzt / (Ny * Nz);
562 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
564 int Nskipx = (
VLENY + 1) / 2;
568 int ibf_up_xp1 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
569 int ibf_dn_xp1 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
570 int ibf_dn_xp1_2 = Nskipx *
NVC *
ND2 * Ns * iyzt;
571 int ibf_up_xp1_2 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
573 int ibf_up_xp1 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
574 int ibf_dn_xp1 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
575 int ibf_dn_xp1_2 = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
576 int ibf_up_xp1_2 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
583 int ibf_up = Nskipx *
NVC *
ND2 * Ns * iyzt;
584 int ibf_dn = Nskipx *
NVC *
ND2 * Ns * iyzt;
586 int ibf_up = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
587 int ibf_dn = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
592 int site = ix + Nx2v * iyzt;
593 real_t *wp2 = &wp[Nin5 * site];
596 int site_xp1 = ix + Nx2v * (iyzt + 1);
597 int site_xp1_2 = Nx2v - 1 + Nx2v * (iyzt + 1);
599 set_index_xm_eo(svidx_xm);
602 for (
int is = 0; is < Ns; ++is) {
607 if ((!((it == 0) || (it == Nt - 1))) &&
608 (!((iz == 0) || (iz == Nz - 1))) &&
609 (!((iy == 0) || (iy == Ny - 1)))) {
615 mult_wilson_eo_xp1(pg2o_xm, svidx_xm,
616 &buf1_xp[ibf_up + Nskipx *
NVC *
ND2 * is],
621 for (
int is = 0; is < Ns; ++is) {
626 if ((!((it == 0) || (it == Nt - 1))) &&
627 (!((iz == 0) || (iz == Nz - 1))) &&
628 (!((iy == 0) || (iy == Ny - 1)))) {
634 mult_wilson_eo_xp1(pg2e_xm, svidx_xm,
635 &buf1_xp[ibf_up + Nskipx *
NVC *
ND2 * is],
643 int site = ix + Nx2v * iyzt;
644 real_t *wp2 = &wp[Nin5 * site];
645 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
647 int site_xp1 = ix + Nx2v * (iyzt + 1);
648 int site_xp1_2 = 0 + Nx2v * (iyzt + 1);
653 set_index_xp_eo(svidx_xp);
655 for (
int is = 0; is < Ns; ++is) {
660 if ((!((it == 0) || (it == Nt - 1))) &&
661 (!((iz == 0) || (iz == Nz - 1))) &&
662 (!((iy == 0) || (iy == Ny - 1)))) {
668 mult_wilson_eo_xm1(pg2o_xp, svidx_xp,
669 &buf1_xm[ibf_dn + Nskipx *
NVC *
ND2 * is],
670 u2, &wp2[Nin4 * is]);
674 for (
int is = 0; is < Ns; ++is) {
679 if ((!((it == 0) || (it == Nt - 1))) &&
680 (!((iz == 0) || (iz == Nz - 1))) &&
681 (!((iy == 0) || (iy == Ny - 1)))) {
687 mult_wilson_eo_xm1(pg2e_xp, svidx_xp,
688 &buf1_xm[ibf_dn + Nskipx *
NVC *
ND2 * is],
689 u2, &wp2[Nin4 * is]);
698 if (do_comm[idir] == 1) {
699 int Nxzt2 = Nx2v * Nz * Nt;
701 int ith, nth, site0, site1;
702 set_threadtask(ith, nth, site0, site1, Nxzt2);
704 for (
int ixzt = site0; ixzt < site1; ++ixzt) {
705 int ix = ixzt % Nx2v;
706 int izt = ixzt / Nx2v;
715 ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
717 ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
721 int site = ix + Nx2v * (iy + Ny * izt);
723 real_t *wp2 = &wp[Nin5 * site];
725 int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
726 int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
727 int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
728 int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
729 int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
730 int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
732 for (
int is = 0; is < Ns; ++is) {
736 if ((!((it == 0) || (it == Nt - 1))) &&
737 (!((iz == 0) || (iz == Nz - 1)))) {
742 mult_wilson_yp1(pg2_ym,
743 &buf1_yp[ibf +
VLENX *
NVC *
ND2 * is], &wp2[Nin4 * is]);
748 int site = ix + Nx2v * (iy + Ny * izt);
750 real_t *wp2 = &wp[Nin5 * site];
751 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
753 int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
754 int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
755 int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
756 int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
757 int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
758 int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
762 for (
int is = 0; is < Ns; ++is) {
766 if ((!((it == 0) || (it == Nt - 1))) &&
767 (!((iz == 0) || (iz == Nz - 1)))) {
773 mult_wilson_ym1(pg2_yp,
774 &buf1_ym[ibf +
VLENX *
NVC *
ND2 * is], u2, &wp2[Nin4 * is]);
781 if (do_comm[idir] == 1) {
782 int Nxyt2 = Nxy2 * Nt;
784 int ith, nth, site0, site1;
785 set_threadtask(ith, nth, site0, site1, Nxyt2);
787 for (
int ixyt = site0; ixyt < site1; ++ixyt) {
788 int ixy = ixyt % Nxy2;
789 int it = ixyt / Nxy2;
793 int ibf_zp1 = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
794 int ibf_zp2 = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
796 int ibf = Nin5H * (ixy + Nxy2 * it);
800 int site = ixy + Nxy2 * (iz + Nz * it);
801 real_t *wp2 = &wp[Nin5 * site];
803 int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
804 int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
805 int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
806 int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
807 int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
808 int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
810 for (
int is = 0; is < Ns; ++is) {
814 if (!((it == 0) || (it == Nt - 1))) {
819 mult_wilson_zp1(&buf1_zp[ibf + Nin4H * is], &wp2[Nin4 * is]);
825 int site = ixy + Nxy2 * (iz + Nz * it);
826 real_t *wp2 = &wp[Nin5 * site];
827 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
829 int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
830 int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
831 int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
832 int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
833 int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
834 int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
838 for (
int is = 0; is < Ns; ++is) {
842 if (!((it == 0) || (it == Nt - 1))) {
847 mult_wilson_zm1(&buf1_zm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
854 if (do_comm[idir] == 1) {
855 int ith, nth, site0, site1;
856 set_threadtask(ith, nth, site0, site1, Nxyz2);
858 for (
int ixyz = site0; ixyz < site1; ++ixyz) {
859 int ibf = Nin5H * ixyz;
865 int ibf_tp1 = Nin5H * (ixyz + i_p1);
866 int ibf_tp2 = Nin5H * (ixyz + i_p2);
870 int site = ixyz + Nxyz2 * it;
871 real_t *wp2 = &wp[Nin5 * site];
874 int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
875 int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
877 for (
int is = 0; is < Ns; ++is) {
884 mult_wilson_tp1_dirac(&buf1_tp[ibf + Nin4H * is], &wp2[Nin4 * is]);
890 int site = ixyz + Nxyz2 * it;
891 real_t *wp2 = &wp[Nin5 * site];
894 int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
895 int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
899 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
900 for (
int is = 0; is < Ns; ++is) {
907 mult_wilson_tm1_dirac(&buf1_tm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
923 int *Leo,
int *Nsize,
int *do_comm,
930 int Nst2v = Nx2v * Ny * Nz * Nt;
931 int Nst2 = Nst2v *
VLEN;
934 int Nin5 = Nin4 * Ns;
936 int Nin5H = Nin4H * Ns;
937 int NvU2 =
NDF * Nst2;
939 int Nxy2 = Nx2v * Ny;
940 int Nxyz2 = Nx2v * Ny * Nz;
942 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
943 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
944 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
945 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
946 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
947 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
948 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
949 set_predicate_yp(pg1_yp, pg2_yp);
950 set_predicate_ym(pg1_ym, pg2_ym);
952 set_index_xp_eo(svidx_xp);
953 set_index_xm_eo(svidx_xm);
955 int Nskipx = (
VLENY + 1) / 2;
957 int ith, nth, site0, site1;
958 set_threadtask(ith, nth, site0, site1, Nst2v);
964 for (
int site = site0; site < site1; ++site) {
965 int ix = site % Nx2v;
966 int iyzt = site / Nx2v;
968 int izt = site / Nxy2;
971 int ixy = ix + Nx2v * iy;
972 int ixyz = ixy + Nxy2 * iz;
973 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
976 int ibf_up_xp1, ibf_dn_xp1, site_xp1, site_xp1_2, ibf_up_xp1_2, ibf_dn_xp1_2;
978 ibf_up_xp1 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
980 ibf_up_xp1 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
982 ibf_dn_xp1 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
984 ibf_dn_xp1 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
986 site_xp1 = (iyzt + 1) * Nx2v + ix;
989 site_xp1_2 = iyzt * Nx2v + Nx2v - 1;
991 ibf_dn_xp1_2 = Nskipx *
NVC *
ND2 * Ns * iyzt;
993 ibf_dn_xp1_2 = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
995 }
else if (ix == Nx2v - 1) {
996 site_xp1_2 = (iyzt + 1) * Nx2v + 0;
998 ibf_up_xp1_2 = Nskipx *
NVC *
ND2 * Ns * (iyzt + 1);
1000 ibf_up_xp1_2 = Nskipx *
NVC *
ND2 * Ns * ((iyzt + 1) / 2);
1007 int ibf_yp1 =
VLENX *
NVC *
ND2 * Ns * ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
1008 int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
1009 int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
1010 int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
1012 int ibf_yp2 =
VLENX *
NVC *
ND2 * Ns * ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
1013 int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
1014 int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
1015 int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
1017 int ibf_zp1 = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
1018 int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
1019 int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
1020 int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
1022 int ibf_zp2 = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
1023 int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
1024 int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
1025 int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
1027 int ibf_tp1 = Nin5H * (ixyz + i_p1);
1028 int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
1030 int ibf_tp2 = Nin5H * (ixyz + i_p2);
1031 int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
1036 real_t *wp2 = &wp[Nin5 * site];
1037 real_t *vp2 = &vp[Nin5 * site];
1040 if (do_comm[idir] == 1) {
1041 if (ix == Nx2v - 1) {
1042 int ibf_up = Nskipx *
NVC *
ND2 * Ns * iyzt;
1044 ibf_up = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
1049 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 0)];
1050 set_index_xp_eo(svidx_xp);
1052 for (
int is = 0; is < Ns; ++is) {
1058 if ((!((it == 0) || (it == Nt - 1))) &&
1059 (!((iz == 0) || (iz == Nz - 1))) &&
1060 (!((iy == 0) || (iy == Ny - 1)))) {
1066 mult_wilson_eo_xp2(pg1e_xp, pg2e_xp, pg3e_xp, svidx_xp,
1067 &vp2[Nin4 * is], &u[
VLEN *
NDF * site], &wp2[Nin4 * is],
1068 &buf2_xp[ibf_up + Nskipx *
NVC *
ND2 * is]);
1071 for (
int is = 0; is < Ns; ++is) {
1076 if ((!((it == 0) || (it == Nt - 1))) &&
1077 (!((iz == 0) || (iz == Nz - 1))) &&
1078 (!((iy == 0) || (iy == Ny - 1)))) {
1084 mult_wilson_eo_xp2(pg1o_xp, pg2o_xp, pg3o_xp, svidx_xp,
1085 &vp2[Nin4 * is], &u[
VLEN *
NDF * site], &wp2[Nin4 * is],
1086 &buf2_xp[ibf_up + Nskipx *
NVC *
ND2 * is]);
1092 int ibf_dn = Nskipx *
NVC *
ND2 * Ns * iyzt;
1094 ibf_dn = Nskipx *
NVC *
ND2 * Ns * (iyzt / 2);
1099 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 0)];
1101 set_index_xm_eo(svidx_xm);
1103 for (
int is = 0; is < Ns; ++is) {
1108 if ((!((it == 0) || (it == Nt - 1))) &&
1109 (!((iz == 0) || (iz == Nz - 1))) &&
1110 (!((iy == 0) || (iy == Ny - 1)))) {
1116 mult_wilson_eo_xm2(pg1e_xm, pg2e_xm, pg3e_xm, svidx_xm,
1117 &vp2[Nin4 * is], &u[
VLEN *
NDF * site], &wp2[Nin4 * is],
1118 &buf2_xm[ibf_dn + Nskipx *
NVC *
ND2 * is]);
1121 for (
int is = 0; is < Ns; ++is) {
1126 if ((!((it == 0) || (it == Nt - 1))) &&
1127 (!((iz == 0) || (iz == Nz - 1))) &&
1128 (!((iy == 0) || (iy == Ny - 1)))) {
1134 mult_wilson_eo_xm2(pg1o_xm, pg2o_xm, pg3o_xm, svidx_xm,
1135 &vp2[Nin4 * is], &u[
VLEN *
NDF * site], &wp2[Nin4 * is],
1136 &buf2_xm[ibf_dn + Nskipx *
NVC *
ND2 * is]);
1143 if (do_comm[idir] == 1) {
1145 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nx2v * izt);
1149 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 1)];
1150 for (
int is = 0; is < Ns; ++is) {
1155 if ((!((it == 0) || (it == Nt - 1))) &&
1156 (!((iz == 0) || (iz == Nz - 1)))) {
1162 mult_wilson_yp2(pg1_yp, pg2_yp,
1163 &vp2[Nin4 * is], &u[
VLEN *
NDF * site],
1164 &wp2[Nin4 * is], &buf2_yp[ibf +
VLENX *
NVC *
ND2 * is]);
1169 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nx2v * izt);
1172 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 1)];
1173 for (
int is = 0; is < Ns; ++is) {
1178 if ((!((it == 0) || (it == Nt - 1))) &&
1179 (!((iz == 0) || (iz == Nz - 1)))) {
1186 mult_wilson_ym2(pg1_ym, pg2_ym,
1187 &vp2[Nin4 * is], &u[
VLEN *
NDF * site],
1188 &wp2[Nin4 * is], &buf2_ym[ibf +
VLENX *
NVC *
ND2 * is]);
1194 if (do_comm[idir] == 1) {
1196 int ibf = Nin5H * (ixy + Nxy2 * it);
1199 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
1200 for (
int is = 0; is < Ns; ++is) {
1204 if (!((it == 0) || (it == Nt - 1))) {
1209 mult_wilson_zp2(&vp2[Nin4 * is], u2, &buf2_zp[ibf + Nin4H * is]);
1214 int ibf = Nin5H * (ixy + Nxy2 * it);
1216 for (
int is = 0; is < Ns; ++is) {
1220 if (!((it == 0) || (it == Nt - 1))) {
1225 mult_wilson_zm2(&vp2[Nin4 * is], &buf2_zm[ibf + Nin4H * is]);
1231 if (do_comm[idir] == 1) {
1233 int ibf = Nin5H * ixyz;
1237 real_t *u2 = &up[
VLEN *
NDF * site + NvU2 * (ieo + 2 * idir)];
1238 for (
int is = 0; is < Ns; ++is) {
1245 mult_wilson_tp2_dirac(&vp2[Nin4 * is], u2, &buf2_tp[ibf + Nin4H * is]);
1250 int ibf = Nin5H * ixyz;
1253 for (
int is = 0; is < Ns; ++is) {
1260 mult_wilson_tm2_dirac(&vp2[Nin4 * is], &buf2_tm[ibf + Nin4H * is]);
1274 int *Leo,
int *Nsize,
int *do_comm,
1283 Leo, Nsize, do_comm, ieo);
1294 int Nx2v = Nsize[0];
1298 int Nst2v = Nx2v * Ny * Nz * Nt;
1299 int Nst2 = Nst2v *
VLEN;
1302 int Nin5 = Nin4 * Ns;
1305 int ith, nth, site0, site1;
1306 set_threadtask(ith, nth, site0, site1, Nst2v);
1308 for (
int site = site0; site < site1; ++site) {
1309 real_t *vp2 = &vp[Nin5 * site];
1310 real_t *wp2 = &wp[Nin5 * site];
1313 for (
int ic = 0; ic <
NC; ++ic) {
1314 svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1315 svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1316 svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1317 int offset0 = 2 *
ND * ic;
1322 load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
1323 load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
1324 load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
1325 load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
1326 load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
1327 load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
1328 load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
1329 load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
1330 save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
1331 save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
1332 save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
1333 save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
1334 save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
1335 save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
1336 save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
1337 save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
1339 scal_vec(pg, y1r, e[0]);
1341 scal_vec(pg, y1i, e[0]);
1343 scal_vec(pg, y2r, e[0]);
1345 scal_vec(pg, y2i, e[0]);
1347 scal_vec(pg, y3r, e[0]);
1349 scal_vec(pg, y3i, e[0]);
1351 scal_vec(pg, y4r, e[0]);
1353 scal_vec(pg, y4i, e[0]);
1355 for (
int is = 1; is < Ns - 1; ++is) {
1365 int offset = 2 *
ND * ic +
NVCD * is;
1371 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1372 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1373 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1374 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1375 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1376 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1377 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1378 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1382 add_aPp5_dirac_vec(pg,
1383 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1385 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1386 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1387 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1388 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1389 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1390 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1391 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1392 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1393 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
1394 axpy_vec(pg, y1r, e[is], v1r);
1395 axpy_vec(pg, y1i, e[is], v1i);
1396 axpy_vec(pg, y2r, e[is], v2r);
1397 axpy_vec(pg, y2i, e[is], v2i);
1398 axpy_vec(pg, y3r, e[is], v3r);
1399 axpy_vec(pg, y3i, e[is], v3i);
1400 axpy_vec(pg, y4r, e[is], v4r);
1401 axpy_vec(pg, y4i, e[is], v4i);
1413 int offset = 2 *
ND * ic +
NVCD * is;
1414 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1415 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1416 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1417 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1418 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1419 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1420 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1421 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1423 add_aPp5_dirac_vec(pg,
1424 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1426 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1428 add_aPm5_dirac_vec(pg,
1429 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1431 y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
1432 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1433 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1434 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1435 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1436 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1437 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1438 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1439 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
1454 int Nx2v = Nsize[0];
1458 int Nst2v = Nx2v * Ny * Nz * Nt;
1459 int Nst2 = Nst2v *
VLEN;
1462 int Nin5 = Nin4 * Ns;
1465 int ith, nth, site0, site1;
1466 set_threadtask(ith, nth, site0, site1, Nst2v);
1468 for (
int site = site0; site < site1; ++site) {
1469 real_t *vp2 = &vp[Nin5 * site];
1470 real_t *wp2 = &wp[Nin5 * site];
1473 for (
int ic = 0; ic <
NC; ++ic) {
1474 svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1475 svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1476 svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1478 int offset0 = 2 *
ND * ic +
NVCD * (Ns - 1);
1483 load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
1484 load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
1485 load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
1486 load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
1487 load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
1488 load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
1489 load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
1490 load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
1492 real_t a = dpinv[Ns - 1];
1494 scal_vec(pg, v1r, a);
1495 scal_vec(pg, v1i, a);
1496 scal_vec(pg, v2r, a);
1497 scal_vec(pg, v2i, a);
1498 scal_vec(pg, v3r, a);
1499 scal_vec(pg, v3i, a);
1500 scal_vec(pg, v4r, a);
1501 scal_vec(pg, v4i, a);
1503 save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
1504 save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
1505 save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
1506 save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
1507 save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
1508 save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
1509 save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
1510 save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
1512 set_aPp5_dirac_vec(pg,
1513 y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
1515 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
1516 for (
int is = Ns - 2; is >= 0; --is) {
1526 int offset = 2 *
ND * ic +
NVCD * is;
1531 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1532 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1533 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1534 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1535 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1536 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1537 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1538 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1542 add_aPm5_dirac_vec(pg,
1543 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1545 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1547 axpy_vec(pg, v1r, -f[is], y1r);
1548 axpy_vec(pg, v1i, -f[is], y1i);
1549 axpy_vec(pg, v2r, -f[is], y2r);
1550 axpy_vec(pg, v2i, -f[is], y2i);
1551 axpy_vec(pg, v3r, -f[is], y3r);
1552 axpy_vec(pg, v3i, -f[is], y3i);
1553 axpy_vec(pg, v4r, -f[is], y4r);
1554 axpy_vec(pg, v4i, -f[is], y4i);
1557 scal_vec(pg, v1r, aa);
1558 scal_vec(pg, v1i, aa);
1559 scal_vec(pg, v2r, aa);
1560 scal_vec(pg, v2i, aa);
1561 scal_vec(pg, v3r, aa);
1562 scal_vec(pg, v3i, aa);
1563 scal_vec(pg, v4r, aa);
1564 scal_vec(pg, v4i, aa);
1566 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1567 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1568 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1569 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1570 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1571 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1572 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1573 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
1588 int Nx2v = Nsize[0];
1592 int Nst2v = Nx2v * Ny * Nz * Nt;
1593 int Nst2 = Nst2v *
VLEN;
1596 int Nin5 = Nin4 * Ns;
1599 int ith, nth, site0, site1;
1600 set_threadtask(ith, nth, site0, site1, Nst2v);
1602 for (
int site = site0; site < site1; ++site) {
1603 real_t *vp2 = &vp[Nin5 * site];
1604 real_t *wp2 = &wp[Nin5 * site];
1607 for (
int ic = 0; ic <
NC; ++ic) {
1608 svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1609 svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1610 svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1611 int offset0 = 2 *
ND * ic;
1616 load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
1617 load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
1618 load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
1619 load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
1620 load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
1621 load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
1622 load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
1623 load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
1626 scal_vec(pg, v1r, a);
1627 scal_vec(pg, v1i, a);
1628 scal_vec(pg, v2r, a);
1629 scal_vec(pg, v2i, a);
1630 scal_vec(pg, v3r, a);
1631 scal_vec(pg, v3i, a);
1632 scal_vec(pg, v4r, a);
1633 scal_vec(pg, v4i, a);
1635 save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
1636 save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
1637 save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
1638 save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
1639 save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
1640 save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
1641 save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
1642 save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
1644 scal_vec(pg, y1r, f[0]);
1646 scal_vec(pg, y1i, f[0]);
1648 scal_vec(pg, y2r, f[0]);
1650 scal_vec(pg, y2i, f[0]);
1652 scal_vec(pg, y3r, f[0]);
1654 scal_vec(pg, y3i, f[0]);
1656 scal_vec(pg, y4r, f[0]);
1658 scal_vec(pg, y4i, f[0]);
1660 for (
int is = 1; is < Ns - 1; ++is) {
1670 int offset = 2 *
ND * ic +
NVCD * is;
1675 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1676 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1677 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1678 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1679 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1680 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1681 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1682 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1686 add_aPm5_dirac_vec(pg,
1687 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1689 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1692 scal_vec(pg, v1r, aa);
1693 scal_vec(pg, v1i, aa);
1694 scal_vec(pg, v2r, aa);
1695 scal_vec(pg, v2i, aa);
1696 scal_vec(pg, v3r, aa);
1697 scal_vec(pg, v3i, aa);
1698 scal_vec(pg, v4r, aa);
1699 scal_vec(pg, v4i, aa);
1701 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1702 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1703 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1704 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1705 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1706 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1707 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1708 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
1709 axpy_vec(pg, y1r, f[is], v1r);
1710 axpy_vec(pg, y1i, f[is], v1i);
1711 axpy_vec(pg, y2r, f[is], v2r);
1712 axpy_vec(pg, y2i, f[is], v2i);
1713 axpy_vec(pg, y3r, f[is], v3r);
1714 axpy_vec(pg, y3i, f[is], v3i);
1715 axpy_vec(pg, y4r, f[is], v4r);
1716 axpy_vec(pg, y4i, f[is], v4i);
1728 int offset = 2 *
ND * ic +
NVCD * is;
1729 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1730 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1731 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1732 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1733 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1734 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1735 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1736 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1737 a =
real_t(0.5) * dm[is - 1];
1738 add_aPm5_dirac_vec(pg,
1739 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1741 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1742 add_aPp5_dirac_vec(pg,
1743 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1745 y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
1747 scal_vec(pg, v1r, aa);
1748 scal_vec(pg, v1i, aa);
1749 scal_vec(pg, v2r, aa);
1750 scal_vec(pg, v2i, aa);
1751 scal_vec(pg, v3r, aa);
1752 scal_vec(pg, v3i, aa);
1753 scal_vec(pg, v4r, aa);
1754 scal_vec(pg, v4i, aa);
1756 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1757 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1758 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1759 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1760 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1761 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1762 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1763 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
1777 int Nx2v = Nsize[0];
1781 int Nst2v = Nx2v * Ny * Nz * Nt;
1782 int Nst2 = Nst2v *
VLEN;
1785 int Nin5 = Nin4 * Ns;
1788 int ith, nth, site0, site1;
1789 set_threadtask(ith, nth, site0, site1, Nst2v);
1791 for (
int site = site0; site < site1; ++site) {
1792 real_t *vp2 = &vp[Nin5 * site];
1793 real_t *wp2 = &wp[Nin5 * site];
1796 for (
int ic = 0; ic <
NC; ++ic) {
1797 svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1798 svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1799 svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1801 int offset0 = 2 *
ND * ic +
NVCD * (Ns - 1);
1806 load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
1807 load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
1808 load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
1809 load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
1810 load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
1811 load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
1812 load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
1813 load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
1815 save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
1816 save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
1817 save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
1818 save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
1819 save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
1820 save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
1821 save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
1822 save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
1824 set_aPm5_dirac_vec(pg,
1825 y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
1827 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
1829 for (
int is = Ns - 2; is >= 0; --is) {
1839 int offset = 2 *
ND * ic +
NVCD * is;
1844 load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
1845 load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
1846 load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
1847 load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
1848 load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
1849 load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
1850 load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
1851 load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
1855 add_aPp5_dirac_vec(pg,
1856 v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1858 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1860 axpy_vec(pg, v1r, -e[is], y1r);
1861 axpy_vec(pg, v1i, -e[is], y1i);
1862 axpy_vec(pg, v2r, -e[is], y2r);
1863 axpy_vec(pg, v2i, -e[is], y2i);
1864 axpy_vec(pg, v3r, -e[is], y3r);
1865 axpy_vec(pg, v3i, -e[is], y3i);
1866 axpy_vec(pg, v4r, -e[is], y4r);
1867 axpy_vec(pg, v4i, -e[is], y4i);
1869 save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
1870 save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
1871 save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
1872 save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
1873 save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
1874 save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
1875 save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
1876 save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);