10 #ifndef MULT_WILSON_EO_QXS_INCLUDED
11 #define MULT_WILSON_EO_QXS_INCLUDED
16 #define IMPLE 1 // 0: original, 1: Nitadori-san's
23 int *Nsize,
int *do_comm,
24 int *Leo,
const int ieo,
32 int Nst2v = Nx2v * Ny * Nz * Nt;
33 int Nst2 = Nst2v *
VLEN;
36 int Nxyz2 = Nx2v * Ny * Nz;
38 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
39 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
40 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
44 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
45 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
46 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
47 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
48 set_predicate_yp(pg1_yp, pg2_yp);
49 set_predicate_ym(pg1_ym, pg2_ym);
52 svuint_t idx1e_xp, idx1o_xp, idx1e_xm, idx1o_xm;
54 set_idx_predicate_xp_eo(pg1e_xp, idx1e_xp, 0);
55 set_idx_predicate_xp_eo(pg1o_xp, idx1o_xp, 1);
56 set_idx_predicate_xm_eo(pg1e_xm, idx1e_xm, 0);
57 set_idx_predicate_xm_eo(pg1o_xm, idx1o_xm, 1);
58 set_idx_predicate_yp(pg1_yp, idx1_yp);
59 set_idx_predicate_ym(pg1_ym, idx1_ym);
63 set_threadtask(ith, nth, is, ns, Nst2v);
65 for (
int site = is; site < ns; ++site) {
67 int iyzt = site / Nx2v;
69 int izt = site / Nxy2;
72 int ixy = ix + Nx2v * iy;
73 int ixyz = ixy + Nxy2 * iz;
74 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
81 if ((ix < Nx2v - 1) || (do_comm[0] == 0)) {
82 int nei = ix + 1 + Nx2v * iyzt;
83 if (ix == Nx2v - 1) nei = 0 + Nx2v * iyzt;
84 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 0)];
89 mult_wilson_eo_xpb(pg1e_xp, pg2e_xp, pg3e_xp,
95 set_idx_predicate_xp_eo(pg1e_xp, idx1e_xp, 0);
97 mult_wilson_eo_xpb(pg1e_xp, idx1e_xp,
104 mult_wilson_eo_xpb(pg1o_xp, pg2o_xp, pg3o_xp,
110 set_idx_predicate_xp_eo(pg1o_xp, idx1o_xp, 1);
112 mult_wilson_eo_xpb(pg1o_xp, idx1o_xp,
119 if ((ix > 0) || (do_comm[0] == 0)) {
120 int nei = ix - 1 + Nx2v * iyzt;
121 if (ix == 0) nei = Nx2v - 1 + Nx2v * iyzt;
122 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 0)];
126 mult_wilson_eo_xmb(pg1e_xm, pg2e_xm, pg3e_xm, v2v,
132 set_idx_predicate_xm_eo(pg1e_xm, idx1e_xm, 0);
134 mult_wilson_eo_xmb(pg1e_xm, idx1e_xm, v2v,
141 mult_wilson_eo_xmb(pg1o_xm, pg2o_xm, pg3o_xm, v2v,
147 set_idx_predicate_xm_eo(pg1o_xm, idx1o_xm, 1);
149 mult_wilson_eo_xmb(pg1o_xm, idx1o_xm, v2v,
156 if ((iy < Ny - 1) || (do_comm[1] == 0)) {
157 int iy2 = (iy + 1) % Ny;
158 int nei = ix + Nx2v * (iy2 + Ny * izt);
159 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 1)];
161 mult_wilson_ypb(pg1_yp, pg2_yp, v2v,
166 set_idx_predicate_yp(pg1_yp, idx1_yp);
168 mult_wilson_ypb(pg1_yp, idx1_yp, v2v,
174 if ((iy > 0) || (do_comm[1] == 0)) {
175 int iy2 = (iy - 1 + Ny) % Ny;
176 int nei = ix + Nx2v * (iy2 + Ny * izt);
177 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 1)];
179 mult_wilson_ymb(pg1_ym, pg2_ym, v2v,
184 set_idx_predicate_ym(pg1_ym, idx1_ym);
186 mult_wilson_ymb(pg1_ym, idx1_ym, v2v,
192 if ((iz < Nz - 1) || (do_comm[2] == 0)) {
193 int iz2 = (iz + 1) % Nz;
194 int nei = ixy + Nxy2 * (iz2 + Nz * it);
195 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 2)];
199 if ((iz > 0) || (do_comm[2] == 0)) {
200 int iz2 = (iz - 1 + Nz) % Nz;
201 int nei = ixy + Nxy2 * (iz2 + Nz * it);
202 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 2)];
206 if ((it < Nt - 1) || (do_comm[3] == 0)) {
207 int it2 = (it + 1) % Nt;
208 int nei = ixyz + Nxyz2 * it2;
209 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 3)];
210 mult_wilson_tpb_dirac(v2v, &u[
VLEN *
NDF * site], &v1[
VLEN *
NVCD * nei]);
213 if ((it > 0) || (do_comm[3] == 0)) {
214 int it2 = (it - 1 + Nt) % Nt;
215 int nei = ixyz + Nxyz2 * it2;
216 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 3)];
217 mult_wilson_tmb_dirac(v2v, &u[
VLEN *
NDF * nei], &v1[
VLEN *
NVCD * nei]);
223 for (
int i = 0; i <
NVCD; ++i) {
225 load_vec(pg, v2t, &v2v[i].v[0]);
226 scal_vec(pg, v2t, -kappa);
227 save_vec(pg, &vv2[
VLEN * i], v2t);
230 mult_wilson_aypx_save(&v2[
VLEN *
NVCD * site],
231 kappa, v2v, &xp[
VLEN *
NVCD * site]);
243 int *Nsize,
int *do_comm,
int *Leo,
244 const int ieo,
const int iflag)
251 int Nst2v = Nx2v * Ny * Nz * Nt;
252 int Nst2 = Nst2v *
VLEN;
254 int Nxy2 = Nx2v * Ny;
255 int Nxyz2 = Nx2v * Ny * Nz;
257 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
258 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
259 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
260 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
261 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
262 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
263 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
264 set_predicate_yp(pg1_yp, pg2_yp);
265 set_predicate_ym(pg1_ym, pg2_ym);
271 set_index_xp_eo(svidx_xp);
272 set_index_xm_eo(svidx_xm);
274 int Nskipx = (
VLENY + 1) / 2;
277 if (do_comm[0] == 1) {
280 int Nyzt = Ny * Nz * Nt;
282 int ith, nth, is, ns;
283 set_threadtask(ith, nth, is, ns, Nyzt);
285 for (
int iyzt = is; iyzt < ns; ++iyzt) {
286 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
289 int site = ix + Nx2v * iyzt;
290 int ibf_up = Nskipx *
NVC *
ND2 * iyzt;
294 set_index_xm_eo(svidx_xm);
295 mult_wilson_eo_xp1(pg2o_xm, svidx_xm,
296 &buf_xp[ibf_up], &v1[
VLEN *
NVCD * site]);
298 mult_wilson_eo_xp1(pg2o_xm,
299 &buf_xp[ibf_up], &v1[
VLEN *
NVCD * site]);
303 if (
VLENY == 1) ibf_up = Nskipx *
NVC *
ND2 * (iyzt / 2);
305 set_index_xm_eo(svidx_xm);
306 mult_wilson_eo_xp1(pg2e_xm, svidx_xm,
307 &buf_xp[ibf_up], &v1[
VLEN *
NVCD * site]);
309 mult_wilson_eo_xp1(pg2e_xm,
310 &buf_xp[ibf_up], &v1[
VLEN *
NVCD * site]);
317 int site = ix + Nx2v * iyzt;
318 int ibf_dn = Nskipx *
NVC *
ND2 * iyzt;
319 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * idir)];
321 if (
VLENY == 1) ibf_dn = Nskipx *
NVC *
ND2 * (iyzt / 2);
323 set_index_xp_eo(svidx_xp);
324 mult_wilson_eo_xm1(pg2o_xp, svidx_xp, &buf_xm[ibf_dn],
327 mult_wilson_eo_xm1(pg2o_xp, &buf_xm[ibf_dn],
333 set_index_xp_eo(svidx_xp);
334 mult_wilson_eo_xm1(pg2e_xp, svidx_xp, &buf_xm[ibf_dn],
337 mult_wilson_eo_xm1(pg2e_xp, &buf_xm[ibf_dn],
346 if (do_comm[1] > 0) {
349 int Nxzt = Nx2v * Nz * Nt;
351 int ith, nth, is, ns;
352 set_threadtask(ith, nth, is, ns, Nxzt);
354 for (
int ixzt = is; ixzt < ns; ++ixzt) {
355 int ix = ixzt % Nx2v;
356 int izt = ixzt / Nx2v;
359 int site = ix + Nx2v * (iy + Ny * izt);
361 mult_wilson_yp1(pg2_ym,
362 &buf_yp[ibf], &v1[
VLEN *
NVCD * site]);
366 int site = ix + Nx2v * (iy + Ny * izt);
367 real_t *u = &up[
NDF * Nst2 * ((1 - ieo) + 2 * idir)];
369 mult_wilson_ym1(pg2_yp,
370 &buf_ym[ibf], &u[
VLEN *
NDF * site],
376 if (do_comm[2] > 0) {
378 int Nxyt2 = Nxy2 * Nt;
380 int ith, nth, is, ns;
381 set_threadtask(ith, nth, is, ns, Nxyt2);
383 for (
int ixyt = is; ixyt < ns; ++ixyt) {
384 int ixy = ixyt % Nxy2;
385 int it = ixyt / Nxy2;
388 int site = ixy + Nxy2 * (iz + Nz * it);
389 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy2 * it);
390 mult_wilson_zp1(&buf_zp[ibf], &v1[
VLEN *
NVCD * site]);
394 int site = ixy + Nxy2 * (iz + Nz * it);
395 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy2 * it);
396 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * idir)];
397 mult_wilson_zm1(&buf_zm[ibf], &u[
VLEN *
NDF * site],
403 if (do_comm[3] > 0) {
406 int ith, nth, is, ns;
407 set_threadtask(ith, nth, is, ns, Nxyz2);
409 for (
int ixyz = is; ixyz < ns; ++ixyz) {
412 int site = ixyz + Nxyz2 * it;
413 mult_wilson_tp1_dirac(&buf_tp[
VLEN *
NVC *
ND2 * ixyz],
418 int site = ixyz + Nxyz2 * it;
419 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * idir)];
420 mult_wilson_tm1_dirac(&buf_tm[
VLEN *
NVC *
ND2 * ixyz],
436 int *Nsize,
int *do_comm,
int *Leo,
437 const int ieo,
const int iflag)
444 int Nst2v = Nx2v * Ny * Nz * Nt;
445 int Nst2 = Nst2v *
VLEN;
447 int Nxy2 = Nx2v * Ny;
448 int Nxyz2 = Nx2v * Ny * Nz;
450 svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
451 svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
452 set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
453 set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
454 set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
455 set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
456 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
457 set_predicate_yp(pg1_yp, pg2_yp);
458 set_predicate_ym(pg1_ym, pg2_ym);
464 set_index_xp_eo(svidx_xp);
465 set_index_xm_eo(svidx_xm);
473 int Nskipx = (
VLENY + 1) / 2;
475 int ith, nth, is, ns;
476 set_threadtask(ith, nth, is, ns, Nst2v);
478 for (
int site = is; site < ns; ++site) {
479 int ix = site % Nx2v;
480 int iyzt = site / Nx2v;
482 int izt = site / Nxy2;
485 int ixy = ix + Nx2v * iy;
486 int ixyz = ixy + Nxy2 * iz;
487 int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
490 clear_vec(v2v,
NVCD);
494 if ((ix == Nx2v - 1) && (do_comm[0] > 0)) {
495 int ibf_up = Nskipx *
NVC *
ND2 * iyzt;
496 if (
VLENY == 1) ibf_up = Nskipx *
NVC *
ND2 * (iyzt / 2);
497 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 0)];
499 set_index_xp_eo(svidx_xp);
500 mult_wilson_eo_xp2(pg1e_xp, pg2e_xp, pg3e_xp, svidx_xp,
502 &v1[
VLEN *
NVCD * site], &buf_xp[ibf_up]);
504 set_index_xp_eo(svidx_xp);
505 mult_wilson_eo_xp2(pg1o_xp, pg2o_xp, pg3o_xp, svidx_xp,
507 &v1[
VLEN *
NVCD * site], &buf_xp[ibf_up]);
511 if ((ix == 0) && (do_comm[0] > 0)) {
512 int ibf_dn = Nskipx *
NVC *
ND2 * iyzt;
513 if (
VLENY == 1) ibf_dn = Nskipx *
NVC *
ND2 * (iyzt / 2);
514 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 0)];
516 set_index_xm_eo(svidx_xm);
517 mult_wilson_eo_xm2(pg1e_xm, pg2e_xm, pg3e_xm, svidx_xm,
519 &v1[
VLEN *
NVCD * site], &buf_xm[ibf_dn]);
521 set_index_xm_eo(svidx_xm);
522 mult_wilson_eo_xm2(pg1o_xm, pg2o_xm, pg3o_xm, svidx_xm,
524 &v1[
VLEN *
NVCD * site], &buf_xm[ibf_dn]);
528 if ((iy == Ny - 1) && (do_comm[1] > 0)) {
530 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 1)];
531 mult_wilson_yp2(pg1_yp, pg2_yp,
533 &v1[
VLEN *
NVCD * site], &buf_yp[ibf]);
536 if ((iy == 0) && (do_comm[1] > 0)) {
538 real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 1)];
539 mult_wilson_ym2(pg1_ym, pg2_ym,
541 &v1[
VLEN *
NVCD * site], &buf_ym[ibf]);
544 if ((iz == Nz - 1) && (do_comm[2] > 0)) {
545 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy2 * it);
546 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 2)];
547 mult_wilson_zp2(v2v, &u[
VLEN *
NDF * site], &buf_zp[ibf]);
550 if ((iz == 0) && (do_comm[2] > 0)) {
551 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy2 * it);
552 mult_wilson_zm2(v2v, &buf_zm[ibf]);
555 if ((it == Nt - 1) && (do_comm[3] > 0)) {
556 real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 3)];
557 mult_wilson_tp2_dirac(v2v, &u[
VLEN *
NDF * site],
561 if ((it == 0) && (do_comm[3] > 0)) {
562 mult_wilson_tm2_dirac(v2v, &buf_tm[
VLEN *
NVC *
ND2 * ixyz]);
568 for (
int i = 0; i <
NVCD; i += 8) {
573 load_vec(pg, vt1, &vv2[
VLEN * (i)]);
574 load_vec(pg, vt2, &vv2[
VLEN * (i + 1)]);
575 load_vec(pg, vt3, &vv2[
VLEN * (i + 2)]);
576 load_vec(pg, vt4, &vv2[
VLEN * (i + 3)]);
578 load_vec(pg, wt1, &ww2[
VLEN * (i)]);
579 load_vec(pg, wt2, &ww2[
VLEN * (i + 1)]);
580 load_vec(pg, wt3, &ww2[
VLEN * (i + 2)]);
581 load_vec(pg, wt4, &ww2[
VLEN * (i + 3)]);
583 axpy_vec(pg, vt1, kappa_eo, wt1);
584 axpy_vec(pg, vt2, kappa_eo, wt2);
585 axpy_vec(pg, vt3, kappa_eo, wt3);
587 load_vec(pg, vt5, &vv2[
VLEN * (i + 4)]);
588 load_vec(pg, vt6, &vv2[
VLEN * (i + 5)]);
589 load_vec(pg, vt7, &vv2[
VLEN * (i + 6)]);
590 load_vec(pg, vt8, &vv2[
VLEN * (i + 7)]);
592 load_vec(pg, wt5, &ww2[
VLEN * (i + 4)]);
593 load_vec(pg, wt6, &ww2[
VLEN * (i + 5)]);
594 load_vec(pg, wt7, &ww2[
VLEN * (i + 6)]);
595 load_vec(pg, wt8, &ww2[
VLEN * (i + 7)]);
597 axpy_vec(pg, vt4, kappa_eo, wt4);
598 axpy_vec(pg, vt5, kappa_eo, wt5);
599 axpy_vec(pg, vt6, kappa_eo, wt6);
600 axpy_vec(pg, vt7, kappa_eo, wt7);
601 axpy_vec(pg, vt8, kappa_eo, wt8);
603 save_vec(pg, &vv2[
VLEN * (i)], vt1);
604 save_vec(pg, &vv2[
VLEN * (i + 1)], vt2);
605 save_vec(pg, &vv2[
VLEN * (i + 2)], vt3);
606 save_vec(pg, &vv2[
VLEN * (i + 3)], vt4);
607 save_vec(pg, &vv2[
VLEN * (i + 4)], vt5);
608 save_vec(pg, &vv2[
VLEN * (i + 5)], vt6);
609 save_vec(pg, &vv2[
VLEN * (i + 6)], vt7);
610 save_vec(pg, &vv2[
VLEN * (i + 7)], vt8);