10 #ifndef MULT_DOMAINWALL_5DIN_QXS_INCLUDED
11 #define MULT_DOMAINWALL_5DIN_QXS_INCLUDED
20 int *Nsize,
int *do_comm)
26 int Nstv = Nxv * Nyv * Nz * Nt;
27 int Nst = Nstv *
VLEN;
32 int ith, nth, site0, site1;
33 set_threadtask(ith, nth, site0, site1, Nstv);
35 for (
int site = site0; site < site1; ++site) {
36 real_t *vp2 = &vp[Nin5 * site];
37 real_t *wp2 = &wp[Nin5 * site];
38 real_t *yp2 = &yp[Nin5 * site];
40 for (
int is = 0; is < Ns; ++is) {
43 for (
int ic = 0; ic <
NC; ++ic) {
44 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
46 int is_up = (is + 1) % Ns;
48 if (is == Ns - 1) Fup = -0.5 * mq;
49 set_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
50 vt3r, vt3i, vt4r, vt4i,
53 int is_dn = (is - 1 + Ns) % Ns;
55 if (is == 0) Fdn = -0.5 * mq;
56 add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
57 vt3r, vt3i, vt4r, vt4i,
61 real_t FF1 = b[is] * (4.0 - M0) + 1.0;
62 real_t FF2 = c[is] * (4.0 - M0) - 1.0;
63 int offset = 2 *
ND * ic +
NVCD * is;
64 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt1r, 0 + offset);
65 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt1i, 1 + offset);
66 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt2r, 2 + offset);
67 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt2i, 3 + offset);
68 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt3r, 4 + offset);
69 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt3i, 5 + offset);
70 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt4r, 6 + offset);
71 dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt4i, 7 + offset);
83 int *Nsize,
int *do_comm)
89 int Nstv = Nxv * Nyv * Nz * Nt;
90 int Nst = Nstv *
VLEN;
95 int ith, nth, site0, site1;
96 set_threadtask(ith, nth, site0, site1, Nstv);
98 for (
int site = site0; site < site1; ++site) {
99 real_t *vp2 = &vp[Nin5 * site];
100 real_t *wp2 = &wp[Nin5 * site];
101 real_t *yp2 = &yp[Nin5 * site];
107 for (
int is = 0; is < Ns; ++is) {
110 for (
int ic = 0; ic <
NC; ++ic) {
111 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
112 FF1 = b[is] * (4.0 - M0) + 1.0;
114 int index = 2 *
ND * ic +
NVCD * is;
118 dw_5dir_dag(pg, vt3r, vt3i, vt4r, vt4i, vt1r, vt1i, vt2r, vt2i,
119 wp2, yp2, -FF1, -a1, index);
121 int is_up = (is + 1) % Ns;
122 FF2 = c[is_up] * (4.0 - M0) - 1.0;
123 real_t aup = -0.5 * c[is_up];
124 int index_up = 2 *
ND * ic +
NVCD * is_up;
125 svreal_t xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i;
126 dw_5dir_dag(pg, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i,
127 wp2, yp2, FF2, aup, index_up);
130 if (is == Ns - 1) Fup = -0.5 * mq;
131 add_aPp5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i,
132 Fup, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i);
134 int is_dn = (is - 1 + Ns) % Ns;
135 FF2 = c[is_dn] * (4.0 - M0) - 1.0;
136 real_t adn = -0.5 * c[is_dn];
137 int index_dn = 2 *
ND * ic +
NVCD * is_dn;
138 dw_5dir_dag(pg, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i,
139 wp2, yp2, FF2, adn, index_dn);
142 if (is == 0) Fdn = -0.5 * mq;
143 add_aPm5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i,
144 -Fdn, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i);
146 int offset = 2 *
ND * ic +
NVCD * is;
147 save_vec(pg, &vp2[
VLEN * (0 + offset)], vt1r);
148 save_vec(pg, &vp2[
VLEN * (1 + offset)], vt1i);
149 save_vec(pg, &vp2[
VLEN * (2 + offset)], vt2r);
150 save_vec(pg, &vp2[
VLEN * (3 + offset)], vt2i);
151 save_vec(pg, &vp2[
VLEN * (4 + offset)], vt3r);
152 save_vec(pg, &vp2[
VLEN * (5 + offset)], vt3i);
153 save_vec(pg, &vp2[
VLEN * (6 + offset)], vt4r);
154 save_vec(pg, &vp2[
VLEN * (7 + offset)], vt4i);
169 int Nstv = Nxv * Nyv * Nz * Nt;
172 int Nin5 = Nin4 * Ns;
174 int ith, nth, site0, site1;
175 set_threadtask(ith, nth, site0, site1, Nstv);
177 for (
int site = site0; site < site1; ++site) {
178 real_t *vp2 = &vp[Nin5 * site];
179 real_t *wp2 = &wp[Nin5 * site];
182 for (
int is = 0; is < Ns; ++is) {
183 for (
int ic = 0; ic <
NC; ++ic) {
184 svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
185 int index = 2 *
ND * ic +
NVCD * is;
186 load_vec(pg, vt1r, &wp2[
VLEN * (4 + index)]);
187 load_vec(pg, vt1i, &wp2[
VLEN * (5 + index)]);
188 load_vec(pg, vt2r, &wp2[
VLEN * (6 + index)]);
189 load_vec(pg, vt2i, &wp2[
VLEN * (7 + index)]);
190 load_vec(pg, vt3r, &wp2[
VLEN * (0 + index)]);
191 load_vec(pg, vt3i, &wp2[
VLEN * (1 + index)]);
192 load_vec(pg, vt4r, &wp2[
VLEN * (2 + index)]);
193 load_vec(pg, vt4i, &wp2[
VLEN * (3 + index)]);
203 save_vec(pg, &vp2[
VLEN * (0 + index)], vt1r);
204 save_vec(pg, &vp2[
VLEN * (1 + index)], vt1i);
205 save_vec(pg, &vp2[
VLEN * (2 + index)], vt2r);
206 save_vec(pg, &vp2[
VLEN * (3 + index)], vt2i);
207 save_vec(pg, &vp2[
VLEN * (4 + index)], vt3r);
208 save_vec(pg, &vp2[
VLEN * (5 + index)], vt3i);
209 save_vec(pg, &vp2[
VLEN * (6 + index)], vt4r);
210 save_vec(pg, &vp2[
VLEN * (7 + index)], vt4i);
224 int Nstv = Nxv * Nyv * Nz * Nt;
227 int Nin5 = Nin4 * Ns;
229 int ith, nth, site0, site1;
230 set_threadtask(ith, nth, site0, site1, Nstv);
235 for (
int site = site0; site < site1; ++site) {
236 real_t *vp2 = &vp[Nin5 * site];
238 for (
int is = 0; is < Ns; ++is) {
239 save_vec(&vp2[Nin4 * is], yL,
NVCD);
250 int *Nsize,
int *do_comm)
256 int Nstv = Nxv * Nyv * Nz * Nt;
257 int Nst = Nstv *
VLEN;
260 int Nin5 = Nin4 * Ns;
265 int Nxyz = Nxv * Nyv * Nz;
267 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
268 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
269 set_predicate_xp(pg1_xp, pg2_xp);
270 set_predicate_xm(pg1_xm, pg2_xm);
271 set_predicate_yp(pg1_yp, pg2_yp);
272 set_predicate_ym(pg1_ym, pg2_ym);
274 int ith, nth, site0, site1;
275 set_threadtask(ith, nth, site0, site1, Nstv);
279 for (
int site = site0; site < site1; ++site) {
281 int iyzt = site / Nxv;
282 int ixy = site % Nxy;
284 int izt = site / Nxy;
287 int ixyz = site % Nxyz;
291 real_t *wp2 = &wp[Nin5 * site];
292 real_t *vp2 = &vp[Nin5 * site];
298 load_vec(vL, vp2,
NVCD * Ns);
302 if ((ix < Nxv - 1) || (do_comm[idir] == 0)) {
303 int ix2 = (ix + 1) % Nxv;
304 nei = ix2 + Nxv * iyzt;
305 real_t *wpn = &wp[Nin5 * nei];
307 for (
int is = 0; is < Ns; ++is) {
308 mult_wilson_xpb(pg1_xp, pg2_xp, &vL[
NVCD * is].v[0], u,
309 &wp2[Nin4 * is], &wpn[Nin4 * is]);
313 if ((ix > 0) || (do_comm[idir] == 0)) {
314 int ix2 = (ix - 1 + Nxv) % Nxv;
315 nei = ix2 + Nxv * iyzt;
316 real_t *wpn = &wp[Nin5 * nei];
319 for (
int is = 0; is < Ns; ++is) {
320 mult_wilson_xmb(pg1_xm, pg2_xm, &vL[
NVCD * is], ux, un,
321 &wp2[Nin4 * is], &wpn[Nin4 * is]);
327 if ((iy < Nyv - 1) || (do_comm[idir] == 0)) {
328 int iy2 = (iy + 1) % Nyv;
329 nei = ix + Nxv * (iy2 + Nyv * izt);
330 real_t *wpn = &wp[Nin5 * nei];
332 for (
int is = 0; is < Ns; ++is) {
333 mult_wilson_ypb(pg1_yp, pg2_yp, &vL[
NVCD * is].v[0], u,
334 &wp2[Nin4 * is], &wpn[Nin4 * is]);
338 if ((iy > 0) || (do_comm[idir] == 0)) {
339 int iy2 = (iy - 1 + Nyv) % Nyv;
340 nei = ix + Nxv * (iy2 + Nyv * izt);
341 real_t *wpn = &wp[Nin5 * nei];
344 for (
int is = 0; is < Ns; ++is) {
345 mult_wilson_ymb(pg1_ym, pg2_ym, &vL[
NVCD * is].v[0], ux, un,
346 &wp2[Nin4 * is], &wpn[Nin4 * is]);
352 if ((iz < Nz - 1) || (do_comm[idir] == 0)) {
353 int iz2 = (iz + 1) % Nz;
354 nei = ixy + Nxy * (iz2 + Nz * it);
355 real_t *wpn = &wp[Nin5 * nei];
357 for (
int is = 0; is < Ns; ++is) {
358 mult_wilson_zpb(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
362 if ((iz > 0) || (do_comm[idir] == 0)) {
363 int iz2 = (iz - 1 + Nz) % Nz;
364 nei = ixy + Nxy * (iz2 + Nz * it);
365 real_t *wpn = &wp[Nin5 * nei];
367 for (
int is = 0; is < Ns; ++is) {
368 mult_wilson_zmb(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
374 if ((it < Nt - 1) || (do_comm[idir] == 0)) {
375 int it2 = (it + 1) % Nt;
376 nei = ixyz + Nxyz * it2;
377 real_t *wpn = &wp[Nin5 * nei];
379 for (
int is = 0; is < Ns; ++is) {
380 mult_wilson_tpb_dirac(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
384 if ((it > 0) || (do_comm[idir] == 0)) {
385 int it2 = (it - 1 + Nt) % Nt;
386 nei = ixyz + Nxyz * it2;
387 real_t *wpn = &wp[Nin5 * nei];
389 for (
int is = 0; is < Ns; ++is) {
390 mult_wilson_tmb_dirac(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
394 save_vec(vp2, vL,
NVCD * Ns);
407 int *Nsize,
int *do_comm)
413 int Nstv = Nxv * Nyv * Nz * Nt;
414 int Nst = Nstv *
VLEN;
418 int Nin5 = Nin4 * Ns;
420 int Nin5H = Nin4H * Ns;
422 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
423 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
424 set_predicate_xp(pg1_xp, pg2_xp);
425 set_predicate_xm(pg1_xm, pg2_xm);
426 set_predicate_yp(pg1_yp, pg2_yp);
427 set_predicate_ym(pg1_ym, pg2_ym);
429 set_index_xp(svidx_xp);
430 set_index_xm(svidx_xm);
432 if (do_comm[0] == 1) {
435 int Nyzt = Nyv * Nz * Nt;
437 int ith, nth, site0, site1;
438 set_threadtask(ith, nth, site0, site1, Nyzt);
440 for (
int iyzt = site0; iyzt < site1; ++iyzt) {
443 int site = ix + Nxv * iyzt;
444 real_t *wp2 = &wp[Nin5 * site];
446 set_index_xm(svidx_xm);
447 for (
int is = 0; is < Ns; ++is) {
448 mult_wilson_xp1(pg2_xm, svidx_xm,
449 &buf1_xp[ibf +
VLENY *
NVC *
ND2 * is], &wp2[Nin4 * is]);
455 int site = ix + Nxv * iyzt;
456 real_t *wp2 = &wp[Nin5 * site];
459 set_index_xp(svidx_xp);
460 for (
int is = 0; is < Ns; ++is) {
461 mult_wilson_xm1(pg2_xp, svidx_xp,
462 &buf1_xm[ibf +
VLENY *
NVC *
ND2 * is], u2, &wp2[Nin4 * is]);
468 if (do_comm[1] == 1) {
470 int Nxzt = Nxv * Nz * Nt;
472 int ith, nth, site0, site1;
473 set_threadtask(ith, nth, site0, site1, Nxzt);
475 for (
int ixzt = site0; ixzt < site1; ++ixzt) {
477 int izt = ixzt / Nxv;
481 int site = ix + Nxv * (iy + Nyv * izt);
482 real_t *wp2 = &wp[Nin5 * site];
484 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nxv * izt);
485 for (
int is = 0; is < Ns; ++is) {
486 mult_wilson_yp1(pg2_ym,
487 &buf1_yp[ibf +
VLENX *
NVC *
ND2 * is], &wp2[Nin4 * is]);
493 int site = ix + Nxv * (iy + Nyv * izt);
494 real_t *wp2 = &wp[Nin5 * site];
496 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nxv * izt);
498 for (
int is = 0; is < Ns; ++is) {
499 mult_wilson_ym1(pg2_yp,
500 &buf1_ym[ibf +
VLENX *
NVC *
ND2 * is], u2, &wp2[Nin4 * is]);
507 if (do_comm[2] == 1) {
510 int Nxyt = Nxv * Nyv * Nt;
512 int ith, nth, site0, site1;
513 set_threadtask(ith, nth, site0, site1, Nxyt);
515 for (
int ixyt = site0; ixyt < site1; ++ixyt) {
516 int ixy = ixyt % Nxy;
521 int site = ixy + Nxy * (iz + Nz * it);
522 real_t *wp2 = &wp[Nin5 * site];
523 int ibf = Nin5H * (ixy + Nxy * it);
524 for (
int is = 0; is < Ns; ++is) {
525 mult_wilson_zp1(&buf1_zp[ibf + Nin4H * is], &wp2[Nin4 * is]);
530 int site = ixy + Nxy * (iz + Nz * it);
531 real_t *wp2 = &wp[Nin5 * site];
532 int ibf = Nin5H * (ixy + Nxy * it);
534 for (
int is = 0; is < Ns; ++is) {
535 mult_wilson_zm1(&buf1_zm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
541 if (do_comm[3] == 1) {
543 int Nxyz = Nxv * Nyv * Nz;
545 int ith, nth, site0, site1;
546 set_threadtask(ith, nth, site0, site1, Nxyz);
548 for (
int ixyz = site0; ixyz < site1; ++ixyz) {
551 int site = ixyz + Nxyz * it;
552 real_t *wp2 = &wp[Nin5 * site];
553 int ibf = Nin5H * ixyz;
554 for (
int is = 0; is < Ns; ++is) {
555 mult_wilson_tp1_dirac(&buf1_tp[ibf + Nin4H * is], &wp2[Nin4 * is]);
561 int site = ixyz + Nxyz * it;
562 real_t *wp2 = &wp[Nin5 * site];
563 int ibf = Nin5H * ixyz;
565 for (
int is = 0; is < Ns; ++is) {
566 mult_wilson_tm1_dirac(&buf1_tm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
582 int *Nsize,
int *do_comm)
590 int Nstv = Nxv * Nyv * Nz * Nt;
591 int Nst = Nstv *
VLEN;
594 int Nin5 = Nin4 * Ns;
596 int Nin5H = Nin4H * Ns;
600 int Nxyz = Nxv * Nyv * Nz;
602 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
603 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
604 set_predicate_xp(pg1_xp, pg2_xp);
605 set_predicate_xm(pg1_xm, pg2_xm);
606 set_predicate_yp(pg1_yp, pg2_yp);
607 set_predicate_ym(pg1_ym, pg2_ym);
609 set_index_xp(svidx_xp);
610 set_index_xm(svidx_xm);
612 int ith, nth, site0, site1;
613 set_threadtask(ith, nth, site0, site1, Nstv);
617 for (
int site = site0; site < site1; ++site) {
619 int iyzt = site / Nxv;
620 int ixy = site % Nxy;
622 int izt = site / Nxy;
625 int ixyz = site % Nxyz;
629 real_t *wp2 = &wp[Nin5 * site];
630 real_t *vp2 = &vp[Nin5 * site];
636 clear_vec(vL,
NVCD * Ns);
641 if (do_comm[idir] == 1) {
645 for (
int is = 0; is < Ns; ++is) {
646 set_index_xp(svidx_xp);
647 mult_wilson_xp2(pg1_xp, pg2_xp, svidx_xp,
648 &vL[
NVCD * is].v[0], u2,
649 &wp2[Nin4 * is], &buf2_xp[ibf +
VLENY *
NVC *
ND2 * is]);
657 set_index_xm(svidx_xm);
658 for (
int is = 0; is < Ns; ++is) {
659 mult_wilson_xm2(pg1_xm, pg2_xm, svidx_xm,
660 &vL[
NVCD * is].v[0], u2,
661 &wp2[Nin4 * is], &buf2_xm[ibf +
VLENY *
NVC *
ND2 * is]);
668 if (do_comm[idir] == 1) {
670 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nxv * izt);
672 for (
int is = 0; is < Ns; ++is) {
673 mult_wilson_yp2(pg1_yp, pg2_yp,
674 &vL[
NVCD * is].v[0], u2,
675 &wp2[Nin4 * is], &buf2_yp[ibf +
VLENX *
NVC *
ND2 * is]);
681 int ibf =
VLENX *
NVC *
ND2 * Ns * (ix + Nxv * izt);
683 for (
int is = 0; is < Ns; ++is) {
684 mult_wilson_ym2(pg1_ym, pg2_ym,
685 &vL[
NVCD * is].v[0], u2,
686 &wp2[Nin4 * is], &buf2_ym[ibf +
VLENX *
NVC *
ND2 * is]);
693 if (do_comm[idir] == 1) {
695 int ibf = Nin5H * (ixy + Nxy * it);
697 for (
int is = 0; is < Ns; ++is) {
698 mult_wilson_zp2(&vL[
NVCD * is].v[0], u2, &buf2_zp[ibf + Nin4H * is]);
704 int ibf = Nin5H * (ixy + Nxy * it);
705 for (
int is = 0; is < Ns; ++is) {
706 mult_wilson_zm2(&vL[
NVCD * is].v[0], &buf2_zm[ibf + Nin4H * is]);
713 if (do_comm[idir] == 1) {
715 int ibf = Nin5H * ixyz;
717 for (
int is = 0; is < Ns; ++is) {
718 mult_wilson_tp2_dirac(&vL[
NVCD * is].v[0], u2, &buf2_tp[ibf + Nin4H * is]);
724 int ibf = Nin5H * ixyz;
725 for (
int is = 0; is < Ns; ++is) {
726 mult_wilson_tm2_dirac(&vL[
NVCD * is].v[0], &buf2_tm[ibf + Nin4H * is]);
736 for (
int ivs = 0; ivs <
NVCD * Ns; ivs += 4) {
739 load_vec(pg, yt1, &vp2[
VLEN * (ivs)]);
740 load_vec(pg, yt2, &vp2[
VLEN * (ivs + 1)]);
741 load_vec(pg, yt3, &vp2[
VLEN * (ivs + 2)]);
742 load_vec(pg, yt4, &vp2[
VLEN * (ivs + 3)]);
743 load_vec(pg, vt1, &(vL[ivs].v[0]));
744 load_vec(pg, vt2, &(vL[ivs + 1].v[0]));
745 load_vec(pg, vt3, &(vL[ivs + 2].v[0]));
746 load_vec(pg, vt4, &(vL[ivs + 3].v[0]));
748 add_vec(pg, yt1, vt1);
749 add_vec(pg, yt2, vt2);
750 add_vec(pg, yt3, vt3);
751 add_vec(pg, yt4, vt4);
752 save_vec(pg, &vp2[
VLEN * (ivs)], yt1);
753 save_vec(pg, &vp2[
VLEN * (ivs + 1)], yt2);
754 save_vec(pg, &vp2[
VLEN * (ivs + 2)], yt3);
755 save_vec(pg, &vp2[
VLEN * (ivs + 3)], yt4);