10 #ifndef MULT_STAGGERED_QXS_INCLUDED
11 #define MULT_STAGGERED_QXS_INCLUDED
// NOTE(review): fragment only — the enclosing function's signature and closing
// braces are not in this chunk, and the stray leading integers on most lines
// are original-file line numbers fused in by extraction; code left byte-identical.
// Visible logic: per vectorized site, load a site factor from ph[] and scale all
// Nin components of v[] in place with it (v[in,site] *= ph[site]).
23 int Nstv = Nxv * Nyv * Nz * Nt;
26 set_threadtask(ith, nth, is, ns, Nstv);
// Loop over the site range [is, ns) assigned to this thread.
30 for (
int site = is; site < ns; ++site) {
32 load_vec(pg, vph, &ph[
VLEN * site]);
34 for (
int in = 0; in < Nin; ++in) {
// vt = v[in,site]; vt *= vph; store back.
36 load_vec(pg, vt, &v[
VLEN * (in + Nin * site)]);
37 scal_vec(pg, vt, vph);
38 save_vec(pg, &v[
VLEN * (in + Nin * site)], vt);
// NOTE(review): fragment — signature and closing braces not visible here.
// Visible logic: store vz (presumably a zero vector set up on the elided
// lines — TODO confirm) into every Nin component of v[] for this thread's
// site range, i.e. a field clear/fill.
51 int Nstv = Nxv * Nyv * Nz * Nt;
54 set_threadtask(ith, nth, is, ns, Nstv);
60 for (
int site = is; site < ns; ++site) {
61 for (
int in = 0; in < Nin; ++in) {
63 save_vec(pg, &v[
VLEN * (in + Nin * site)], vz);
// NOTE(review): fragment — signature not visible.  Visible logic is an axpy
// over the whole field: v[in,site] += a * w[in,site] for every component of
// every site in this thread's range.
78 int Nstv = Nxv * Nyv * Nz * Nt;
81 set_threadtask(ith, nth, is, ns, Nstv);
85 for (
int site = is; site < ns; ++site) {
86 for (
int in = 0; in < Nin; ++in) {
// vt = v + a*w for one component, then write back.
88 load_vec(pg, wt, &w[
VLEN * (in + Nin * site)]);
89 load_vec(pg, vt, &v[
VLEN * (in + Nin * site)]);
91 axpy_vec(pg, vt, a, wt);
93 save_vec(pg, &v[
VLEN * (in + Nin * site)], vt);
// NOTE(review): fragment — signature not visible.  Visible logic: per site,
// load the six real components of w[] (NVC components; presumably 2*NC with
// NC = 3 colors — TODO confirm against the missing header), then for each
// color row ic apply the gauge link (load_u + mult_uv, i.e. v = U * w) and
// store the real/imaginary results into v[].
107 int Nstv = Nxv * Nyv * Nz * Nt;
109 int ith, nth, is, ns;
110 set_threadtask(ith, nth, is, ns, Nstv);
114 for (
int site = is; site < ns; ++site) {
// Load all NVC components of the source vector at this site.
115 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
116 load_vec(pg, wt0, &w[
VLEN * (0 +
NVC * site)]);
117 load_vec(pg, wt1, &w[
VLEN * (1 +
NVC * site)]);
118 load_vec(pg, wt2, &w[
VLEN * (2 +
NVC * site)]);
119 load_vec(pg, wt3, &w[
VLEN * (3 +
NVC * site)]);
120 load_vec(pg, wt4, &w[
VLEN * (4 +
NVC * site)]);
121 load_vec(pg, wt5, &w[
VLEN * (5 +
NVC * site)]);
// One color row of U per iteration: (xtr, xti) = row(U, ic) . w.
123 for (
int ic = 0; ic <
NC; ++ic) {
124 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
125 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
126 &u[
VLEN * (2 * ic +
NDF * site)]);
129 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
130 wt0, wt1, wt2, wt3, wt4, wt5);
// Store real part (2*ic) and imaginary part (2*ic+1) of the result.
134 save_vec(pg, &v[
VLEN * (2 * ic +
NVC * site)], xtr);
135 save_vec(pg, &v[
VLEN * (2 * ic + 1 +
NVC * site)], xti);
// NOTE(review): fragment — signature not visible.  Same structure as the
// previous fragment but with load_udag + mult_udv, i.e. v = U^dagger * w
// per site.  The gauge-field address argument of load_udag sits on an
// elided line (original lines 168-170 missing from this chunk).
149 int Nstv = Nxv * Nyv * Nz * Nt;
151 int ith, nth, is, ns;
152 set_threadtask(ith, nth, is, ns, Nstv);
156 for (
int site = is; site < ns; ++site) {
// Load all NVC components of the source vector at this site.
157 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
158 load_vec(pg, wt0, &w[
VLEN * (0 +
NVC * site)]);
159 load_vec(pg, wt1, &w[
VLEN * (1 +
NVC * site)]);
160 load_vec(pg, wt2, &w[
VLEN * (2 +
NVC * site)]);
161 load_vec(pg, wt3, &w[
VLEN * (3 +
NVC * site)]);
162 load_vec(pg, wt4, &w[
VLEN * (4 +
NVC * site)]);
163 load_vec(pg, wt5, &w[
VLEN * (5 +
NVC * site)]);
// One color row of U^dagger per iteration.
165 for (
int ic = 0; ic <
NC; ++ic) {
166 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
167 load_udag(pg, ut0, ut1, ut2, ut3, ut4, ut5,
171 mult_udv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
172 wt0, wt1, wt2, wt3, wt4, wt5);
// Store real/imaginary result components.
176 save_vec(pg, &v[
VLEN * (2 * ic +
NVC * site)], xtr);
177 save_vec(pg, &v[
VLEN * (2 * ic + 1 +
NVC * site)], xti);
// NOTE(review): fragment — only the tail of the parameter list is visible;
// the function name and leading parameters are on elided lines.  Visible
// logic: the bulk (on-node) part of the staggered hopping term.  For each
// site, neighbor contributions in +x/-x/+y/-y/+z/-z/+t/-t are accumulated
// into vt0..vt5 via the mult_staggered_* helpers; boundary sites are skipped
// in a direction when do_comm[mu] != 0 (they are handled by the comm path),
// and wrap around locally when do_comm[mu] == 0.  Finally v2 = fac * vt
// + mq * v1 is stored.  Several index setups (ix, iy, iz, it, Nxy) and the
// trailing arguments of most helper calls sit on elided lines.
int *Nsize,
int *do_comm)
192 int Nstv = Nxv * Nyv * Nz * Nt;
193 int Nst = Nstv *
VLEN;
// Predicate pairs select the interior/edge lanes for x and y hops.
197 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
198 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
199 set_predicate_xp(pg1_xp, pg2_xp);
200 set_predicate_xm(pg1_xm, pg2_xm);
201 set_predicate_yp(pg1_yp, pg2_yp);
202 set_predicate_ym(pg1_ym, pg2_ym);
205 int Nxyz = Nxv * Nyv * Nz;
207 int ith, nth, is, ns;
208 set_threadtask(ith, nth, is, ns, Nstv);
210 for (
int site = is; site < ns; ++site) {
// Decompose the linear site index into lattice coordinates.
212 int iyzt = site / Nxv;
214 int izt = site / Nxy;
217 int ixy = ix + Nxv * iy;
218 int ixyz = ixy + Nxy * iz;
220 svreal_t vt0, vt1, vt2, vt3, vt4, vt5;
// +x hop (skipped on the forward x-boundary when x-comm is active).
228 if ((ix < Nxv - 1) || (do_comm[0] == 0)) {
230 int nei = ix + 1 + Nxv * iyzt;
231 if (ix == Nxv - 1) nei = 0 + Nxv * iyzt;
232 mult_staggered_xp(pg, pg1_xp, pg2_xp,
233 vt0, vt1, vt2, vt3, vt4, vt5,
// -x hop.
238 if ((ix > 0) || (do_comm[0] == 0)) {
240 int nei = ix - 1 + Nxv * iyzt;
241 if (ix == 0) nei = Nxv - 1 + Nxv * iyzt;
242 mult_staggered_xm(pg, pg1_xm, pg2_xm,
243 vt0, vt1, vt2, vt3, vt4, vt5,
// +y hop.
248 if ((iy < Nyv - 1) || (do_comm[1] == 0)) {
249 int iy2 = (iy + 1) % Nyv;
250 int nei = ix + Nxv * (iy2 + Nyv * izt);
252 mult_staggered_yp(pg, pg1_yp, pg2_yp,
253 vt0, vt1, vt2, vt3, vt4, vt5,
// -y hop.
258 if ((iy > 0) || (do_comm[1] == 0)) {
259 int iy2 = (iy - 1 + Nyv) % Nyv;
260 int nei = ix + Nxv * (iy2 + Nyv * izt);
262 mult_staggered_ym(pg, pg1_ym, pg2_ym,
263 vt0, vt1, vt2, vt3, vt4, vt5,
// +z hop: uses the z-direction link (direction index 2) at this site.
268 if ((iz < Nz - 1) || (do_comm[2] == 0)) {
269 int iz2 = (iz + 1) % Nz;
270 int nei = ixy + Nxy * (iz2 + Nz * it);
271 mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
272 &up[
VLEN *
NDF * (site + Nstv * 2)],
// -z hop: uses the z-link stored at the neighbor site.
276 if ((iz > 0) || (do_comm[2] == 0)) {
277 int iz2 = (iz - 1 + Nz) % Nz;
278 int nei = ixy + Nxy * (iz2 + Nz * it);
279 mult_staggered_dn(pg, vt0, vt1, vt2, vt3, vt4, vt5,
280 &up[
VLEN *
NDF * (nei + Nstv * 2)],
// +t hop (direction index 3).
284 if ((it < Nt - 1) || (do_comm[3] == 0)) {
285 int it2 = (it + 1) % Nt;
286 int nei = ixyz + Nxyz * it2;
287 mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
288 &up[
VLEN *
NDF * (site + Nstv * 3)],
// -t hop.
292 if ((it > 0) || (do_comm[3] == 0)) {
293 int it2 = (it - 1 + Nt) % Nt;
294 int nei = ixyz + Nxyz * it2;
295 mult_staggered_dn(pg, vt0, vt1, vt2, vt3, vt4, vt5,
296 &up[
VLEN *
NDF * (nei + Nstv * 3)],
// Combine: v2[site] = fac * (hopping sum) + mq * v1[site].
300 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
301 load_vec(pg, wt0, &v1[
VLEN * (0 +
NVC * site)]);
302 load_vec(pg, wt1, &v1[
VLEN * (1 +
NVC * site)]);
303 load_vec(pg, wt2, &v1[
VLEN * (2 +
NVC * site)]);
304 load_vec(pg, wt3, &v1[
VLEN * (3 +
NVC * site)]);
305 load_vec(pg, wt4, &v1[
VLEN * (4 +
NVC * site)]);
306 load_vec(pg, wt5, &v1[
VLEN * (5 +
NVC * site)]);
308 scal_vec(pg, vt0, fac);
309 scal_vec(pg, vt1, fac);
310 scal_vec(pg, vt2, fac);
311 scal_vec(pg, vt3, fac);
312 scal_vec(pg, vt4, fac);
313 scal_vec(pg, vt5, fac);
315 axpy_vec(pg, vt0, mq, wt0);
316 axpy_vec(pg, vt1, mq, wt1);
317 axpy_vec(pg, vt2, mq, wt2);
318 axpy_vec(pg, vt3, mq, wt3);
319 axpy_vec(pg, vt4, mq, wt4);
320 axpy_vec(pg, vt5, mq, wt5);
322 save_vec(pg, &v2[
VLEN * (0 +
NVC * site)], vt0);
323 save_vec(pg, &v2[
VLEN * (1 +
NVC * site)], vt1);
324 save_vec(pg, &v2[
VLEN * (2 +
NVC * site)], vt2);
325 save_vec(pg, &v2[
VLEN * (3 +
NVC * site)], vt3);
326 save_vec(pg, &v2[
VLEN * (4 +
NVC * site)], vt4);
327 save_vec(pg, &v2[
VLEN * (5 +
NVC * site)], vt5);
// NOTE(review): fragment — function name and leading parameters elided.
// Visible logic: pack the four boundary faces into communication buffers
// when the corresponding do_comm[mu] flag is set.  Downward faces (xm, ym,
// zm, tm) pack the raw field v1 (z/t via mult_staggered_dn1 which also
// applies the link — see calls below); upward faces (xp, yp) apply
// U^dagger (load_udag + mult_udv) before packing.  The x-face uses SVE
// index gather/scatter (svidx_xp/svidx_xm) to extract one lane column.
// Buffer pointer setup (buf = ...) for each face sits on elided lines.
int *Nsize,
int *do_comm)
344 int Nstv = Nxv * Nyv * Nz * Nt;
345 int Nst = Nstv *
VLEN;
348 int Nxyz = Nxv * Nyv * Nz;
351 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
352 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
353 set_predicate_xp(pg1_xp, pg2_xp);
354 set_predicate_xm(pg1_xm, pg2_xm);
355 set_predicate_yp(pg1_yp, pg2_yp);
356 set_predicate_ym(pg1_ym, pg2_ym);
358 set_index_xp(svidx_xp);
359 set_index_xm(svidx_xm);
// ---- x direction ----
362 if (do_comm[0] > 0) {
366 int Nyzt = Nyv * Nz * Nt;
367 int ith, nth, isx, nsx;
368 set_threadtask(ith, nth, isx, nsx, Nyzt);
370 for (
int iyzt = isx; iyzt < nsx; ++iyzt) {
// -x face: scatter the boundary lane of every component into buf.
373 int site = ix + Nxv * iyzt;
376 set_index_xm(svidx_xm);
377 for (
int ivc = 0; ivc <
NVC; ++ivc) {
379 load_vec(pg2_xm, wt, &v1[
VLEN * (ivc +
NVC * site)]);
381 save_vec_scatter(pg2_xm, &buf[
VLENY * ivc], wt, svidx_xm);
// +x face: apply U^dagger to the boundary column, then scatter.
386 int site = ix + Nxv * iyzt;
389 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
390 load_vec(pg2_xp, wt0, &v1[
VLEN * (0 +
NVC * site)]);
391 load_vec(pg2_xp, wt1, &v1[
VLEN * (1 +
NVC * site)]);
392 load_vec(pg2_xp, wt2, &v1[
VLEN * (2 +
NVC * site)]);
393 load_vec(pg2_xp, wt3, &v1[
VLEN * (3 +
NVC * site)]);
394 load_vec(pg2_xp, wt4, &v1[
VLEN * (4 +
NVC * site)]);
395 load_vec(pg2_xp, wt5, &v1[
VLEN * (5 +
NVC * site)]);
397 set_index_xp(svidx_xp);
398 for (
int ic = 0; ic <
NC; ++ic) {
399 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
401 load_udag(pg2_xp, ut0, ut1, ut2, ut3, ut4, ut5,
403 mult_udv(pg2_xp, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
404 wt0, wt1, wt2, wt3, wt4, wt5);
407 save_vec_scatter(pg2_xp, &buf[
VLENY * (2 * ic)], xtr, svidx_xp);
408 save_vec_scatter(pg2_xp, &buf[
VLENY * (2 * ic + 1)], xti, svidx_xp);
// ---- y direction ----
414 if (do_comm[1] > 0) {
418 int Nxzt = Nxv * Nz * Nt;
419 int ith, nth, isy, nsy;
420 set_threadtask(ith, nth, isy, nsy, Nxzt);
422 for (
int ixzt = isy; ixzt < nsy; ++ixzt) {
424 int izt = ixzt / Nxv;
// -y face: copy the boundary row of every component.
427 int site = ix + Nxv * (iy + Nyv * izt);
429 for (
int ivc = 0; ivc <
NVC; ++ivc) {
431 load_vec(pg2_ym, wt, &v1[
VLEN * (ivc +
NVC * site)]);
433 save_vec(pg2_ym, &buf[
VLENX * ivc], wt);
// +y face: apply U^dagger before packing.
438 int site = ix + Nxv * (iy + Nyv * izt);
441 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
442 load_vec(pg2_yp, wt0, &v1[
VLEN * (0 +
NVC * site)]);
443 load_vec(pg2_yp, wt1, &v1[
VLEN * (1 +
NVC * site)]);
444 load_vec(pg2_yp, wt2, &v1[
VLEN * (2 +
NVC * site)]);
445 load_vec(pg2_yp, wt3, &v1[
VLEN * (3 +
NVC * site)]);
446 load_vec(pg2_yp, wt4, &v1[
VLEN * (4 +
NVC * site)]);
447 load_vec(pg2_yp, wt5, &v1[
VLEN * (5 +
NVC * site)]);
449 for (
int ic = 0; ic <
NC; ++ic) {
450 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
452 load_udag(pg2_yp, ut0, ut1, ut2, ut3, ut4, ut5,
454 mult_udv(pg2_yp, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
455 wt0, wt1, wt2, wt3, wt4, wt5);
459 save_vec(pg2_yp, &buf[offset +
VLENX * (2 * ic)], xtr);
460 save_vec(pg2_yp, &buf[offset +
VLENX * (2 * ic + 1)], xti);
// ---- z direction ----
466 if (do_comm[2] > 0) {
469 int Nxyt = Nxv * Nyv * Nt;
470 int ith, nth, isz, nsz;
471 set_threadtask(ith, nth, isz, nsz, Nxyt);
473 for (
int ixyt = isz; ixyt < nsz; ++ixyt) {
474 int ixy = ixyt % Nxy;
// -z side: copy the full site (all lanes) into buf.
478 int site = ixy + Nxy * (iz + Nz * it);
480 for (
int ivc = 0; ivc <
NVC; ++ivc) {
482 load_vec(pg, wt, &v1[
VLEN * (ivc +
NVC * site)]);
484 save_vec(pg, &buf[
VLEN * ivc], wt);
// +z side: pack with the z-link applied (mult_staggered_dn1).
489 int site = ixy + Nxy * (iz + Nz * it);
490 mult_staggered_dn1(pg, &buf_zm[
VLEN *
NVC * ixyt],
491 &up[
VLEN *
NDF * (site + Nstv * 2)],
// ---- t direction ----
497 if (do_comm[3] > 0) {
500 int ith, nth, ist, nst;
501 set_threadtask(ith, nth, ist, nst, Nxyz);
503 for (
int ixyz = ist; ixyz < nst; ++ixyz) {
// -t side: copy the full site into buf.
506 int site = ixyz + Nxyz * it;
508 for (
int ivc = 0; ivc <
NVC; ++ivc) {
510 load_vec(pg, wt, &v1[
VLEN * (ivc +
NVC * site)]);
512 save_vec(pg, &buf[
VLEN * ivc], wt);
// +t side: pack with the t-link applied.
517 int site = ixyz + Nxyz * it;
518 mult_staggered_dn1(pg, &buf_tm[
VLEN *
NVC * ixyz],
519 &up[
VLEN *
NDF * (site + Nstv * 3)],
// NOTE(review): fragment — function name and leading parameters elided.
// Visible logic: the post-communication boundary update.  For each site on a
// face whose direction had active communication (do_comm[mu] > 0), combine
// the on-node shifted data (predicated interior lanes, pg1_*) with the
// received buffer data (boundary lanes, pg2_*, via load_add / load_add_gather),
// apply the gauge link (mult_uv for forward hops, mult_udv for backward
// hops), and accumulate into vt0..vt5 with add_vec/sub_vec.  Finally the
// accumulated boundary contribution is folded into the output:
// v2[site] += fac * vt (load v2, scale vt by fac, add, store).
// Buffer pointer setup and some helper-call tails sit on elided lines.
int *Nsize,
int *do_comm)
540 int Nstv = Nxv * Nyv * Nz * Nt;
541 int Nst = Nstv *
VLEN;
544 int Nxyz = Nxv * Nyv * Nz;
549 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
550 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
551 set_predicate_xp(pg1_xp, pg2_xp);
552 set_predicate_xm(pg1_xm, pg2_xm);
553 set_predicate_yp(pg1_yp, pg2_yp);
554 set_predicate_ym(pg1_ym, pg2_ym);
556 set_index_xp(svidx_xp);
557 set_index_xm(svidx_xm);
559 int ith, nth, is, ns;
560 set_threadtask(ith, nth, is, ns, Nstv);
562 for (
int site = is; site < ns; ++site) {
564 int iyzt = site / Nxv;
566 int izt = site / Nxy;
569 int ixy = ix + Nxv * iy;
570 int ixyz = ixy + Nxy * iz;
572 svreal_t vt0, vt1, vt2, vt3, vt4, vt5;
// +x boundary: shift interior lanes of v1 by one (the "+ 1" offsets),
// gather the received face into the last lane, then apply U and add.
582 if ((ix == Nxv - 1) && (do_comm[0] > 0)) {
586 set_index_xp(svidx_xp);
587 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
588 load_vec(pg1_xp, wt0, &v1[
VLEN * (0 +
NVC * site) + 1]);
589 load_vec(pg1_xp, wt1, &v1[
VLEN * (1 +
NVC * site) + 1]);
590 load_vec(pg1_xp, wt2, &v1[
VLEN * (2 +
NVC * site) + 1]);
591 load_vec(pg1_xp, wt3, &v1[
VLEN * (3 +
NVC * site) + 1]);
592 load_vec(pg1_xp, wt4, &v1[
VLEN * (4 +
NVC * site) + 1]);
593 load_vec(pg1_xp, wt5, &v1[
VLEN * (5 +
NVC * site) + 1]);
595 load_add_gather(pg2_xp, wt0, &buf[
VLENY * 0], svidx_xp);
596 load_add_gather(pg2_xp, wt1, &buf[
VLENY * 1], svidx_xp);
597 load_add_gather(pg2_xp, wt2, &buf[
VLENY * 2], svidx_xp);
598 load_add_gather(pg2_xp, wt3, &buf[
VLENY * 3], svidx_xp);
599 load_add_gather(pg2_xp, wt4, &buf[
VLENY * 4], svidx_xp);
600 load_add_gather(pg2_xp, wt5, &buf[
VLENY * 5], svidx_xp);
// Three U-rows: accumulate (vt0,vt1), (vt2,vt3), (vt4,vt5).
602 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
604 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
606 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
607 wt0, wt1, wt2, wt3, wt4, wt5);
608 add_vec(pg, vt0, xtr);
609 add_vec(pg, vt1, xti);
610 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
612 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
613 wt0, wt1, wt2, wt3, wt4, wt5);
614 add_vec(pg, vt2, xtr);
615 add_vec(pg, vt3, xti);
616 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
618 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
619 wt0, wt1, wt2, wt3, wt4, wt5);
620 add_vec(pg, vt4, xtr);
621 add_vec(pg, vt5, xti);
// -x boundary: shift the other way ("- 1"), apply U^dagger, add the
// gathered received face, and subtract from the accumulator.
626 if ((ix == 0) && (do_comm[0] > 0)) {
630 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
631 load_vec(pg1_xm, wt0, &v1[
VLEN * (0 +
NVC * site) - 1]);
632 load_vec(pg1_xm, wt1, &v1[
VLEN * (1 +
NVC * site) - 1]);
633 load_vec(pg1_xm, wt2, &v1[
VLEN * (2 +
NVC * site) - 1]);
634 load_vec(pg1_xm, wt3, &v1[
VLEN * (3 +
NVC * site) - 1]);
635 load_vec(pg1_xm, wt4, &v1[
VLEN * (4 +
NVC * site) - 1]);
636 load_vec(pg1_xm, wt5, &v1[
VLEN * (5 +
NVC * site) - 1]);
638 set_index_xm(svidx_xm);
639 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
641 load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
642 &u[
VLEN * (0 +
NDF * site) - 1]);
643 mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
644 wt0, wt1, wt2, wt3, wt4, wt5);
645 load_add_gather(pg2_xm, xtr, &buf[
VLENY * 0], svidx_xm);
646 load_add_gather(pg2_xm, xti, &buf[
VLENY * 1], svidx_xm);
647 sub_vec(pg, vt0, xtr);
648 sub_vec(pg, vt1, xti);
650 load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
652 mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
653 wt0, wt1, wt2, wt3, wt4, wt5);
654 load_add_gather(pg2_xm, xtr, &buf[
VLENY * 2], svidx_xm);
655 load_add_gather(pg2_xm, xti, &buf[
VLENY * 3], svidx_xm);
656 sub_vec(pg, vt2, xtr);
657 sub_vec(pg, vt3, xti);
659 load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
661 mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
662 wt0, wt1, wt2, wt3, wt4, wt5);
663 load_add_gather(pg2_xm, xtr, &buf[
VLENY * 4], svidx_xm);
664 load_add_gather(pg2_xm, xti, &buf[
VLENY * 5], svidx_xm);
665 sub_vec(pg, vt4, xtr);
666 sub_vec(pg, vt5, xti);
// +y boundary: in-register shift by VLENX, add received row, apply U, add.
671 if ((iy == Nyv - 1) && (do_comm[1] > 0)) {
673 int ixzt = ix + Nxv * izt;
676 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
677 load_vec(pg1_yp, wt0, &v1[
VLEN * (0 +
NVC * site) +
VLENX]);
678 load_vec(pg1_yp, wt1, &v1[
VLEN * (1 +
NVC * site) +
VLENX]);
679 load_vec(pg1_yp, wt2, &v1[
VLEN * (2 +
NVC * site) +
VLENX]);
680 load_vec(pg1_yp, wt3, &v1[
VLEN * (3 +
NVC * site) +
VLENX]);
681 load_vec(pg1_yp, wt4, &v1[
VLEN * (4 +
NVC * site) +
VLENX]);
682 load_vec(pg1_yp, wt5, &v1[
VLEN * (5 +
NVC * site) +
VLENX]);
685 load_add(pg2_yp, wt0, &buf[offset +
VLENX * 0]);
686 load_add(pg2_yp, wt1, &buf[offset +
VLENX * 1]);
687 load_add(pg2_yp, wt2, &buf[offset +
VLENX * 2]);
688 load_add(pg2_yp, wt3, &buf[offset +
VLENX * 3]);
689 load_add(pg2_yp, wt4, &buf[offset +
VLENX * 4]);
690 load_add(pg2_yp, wt5, &buf[offset +
VLENX * 5]);
691 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
693 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
695 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
696 wt0, wt1, wt2, wt3, wt4, wt5);
697 add_vec(pg, vt0, xtr);
698 add_vec(pg, vt1, xti);
699 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
701 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
702 wt0, wt1, wt2, wt3, wt4, wt5);
703 add_vec(pg, vt2, xtr);
704 add_vec(pg, vt3, xti);
705 load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
707 mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
708 wt0, wt1, wt2, wt3, wt4, wt5);
709 add_vec(pg, vt4, xtr);
710 add_vec(pg, vt5, xti);
// -y boundary: shift by -VLENX, apply U^dagger, add received row, subtract.
715 if ((iy == 0) && (do_comm[1] > 0)) {
717 int ixzt = ix + Nxv * izt;
720 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
721 load_vec(pg1_ym, wt0, &v1[
VLEN * (0 +
NVC * site) -
VLENX]);
722 load_vec(pg1_ym, wt1, &v1[
VLEN * (1 +
NVC * site) -
VLENX]);
723 load_vec(pg1_ym, wt2, &v1[
VLEN * (2 +
NVC * site) -
VLENX]);
724 load_vec(pg1_ym, wt3, &v1[
VLEN * (3 +
NVC * site) -
VLENX]);
725 load_vec(pg1_ym, wt4, &v1[
VLEN * (4 +
NVC * site) -
VLENX]);
726 load_vec(pg1_ym, wt5, &v1[
VLEN * (5 +
NVC * site) -
VLENX]);
728 svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
730 load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
732 mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
733 wt0, wt1, wt2, wt3, wt4, wt5);
734 load_add(pg2_ym, xtr, &buf[
VLENX * 0]);
735 load_add(pg2_ym, xti, &buf[
VLENX * 1]);
736 sub_vec(pg, vt0, xtr);
737 sub_vec(pg, vt1, xti);
739 load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
741 mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
742 wt0, wt1, wt2, wt3, wt4, wt5);
743 load_add(pg2_ym, xtr, &buf[
VLENX * 2]);
744 load_add(pg2_ym, xti, &buf[
VLENX * 3]);
745 sub_vec(pg, vt2, xtr);
746 sub_vec(pg, vt3, xti);
748 load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
750 mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
751 wt0, wt1, wt2, wt3, wt4, wt5);
752 load_add(pg2_ym, xtr, &buf[
VLENX * 4]);
753 load_add(pg2_ym, xti, &buf[
VLENX * 5]);
754 sub_vec(pg, vt4, xtr);
755 sub_vec(pg, vt5, xti);
// +z boundary: link applied to the received +z face via mult_staggered_up.
760 if ((iz == Nz - 1) && (do_comm[2] > 0)) {
761 int ixyt = ixy + Nxy * it;
762 mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
763 &up[
VLEN *
NDF * (site + Nstv * 2)],
// -z boundary: the received buffer was already link-multiplied when packed
// (mult_staggered_dn1 in the pack phase), so just subtract it.
768 if ((iz == 0) && (do_comm[2] > 0)) {
769 int ixyt = ixy + Nxy * it;
773 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
774 load_vec(pg, wt0, &buf[
VLEN * 0]);
775 load_vec(pg, wt1, &buf[
VLEN * 1]);
776 load_vec(pg, wt2, &buf[
VLEN * 2]);
777 load_vec(pg, wt3, &buf[
VLEN * 3]);
778 load_vec(pg, wt4, &buf[
VLEN * 4]);
779 load_vec(pg, wt5, &buf[
VLEN * 5]);
781 sub_vec(pg, vt0, wt0);
782 sub_vec(pg, vt1, wt1);
783 sub_vec(pg, vt2, wt2);
784 sub_vec(pg, vt3, wt3);
785 sub_vec(pg, vt4, wt4);
786 sub_vec(pg, vt5, wt5);
// +t boundary.
791 if ((it == Nt - 1) && (do_comm[3] > 0)) {
792 mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
793 &up[
VLEN *
NDF * (site + Nstv * 3)],
// -t boundary: buffer already link-multiplied; subtract.
798 if ((it == 0) && (do_comm[3] > 0)) {
801 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
802 load_vec(pg, wt0, &buf[
VLEN * 0]);
803 load_vec(pg, wt1, &buf[
VLEN * 1]);
804 load_vec(pg, wt2, &buf[
VLEN * 2]);
805 load_vec(pg, wt3, &buf[
VLEN * 3]);
806 load_vec(pg, wt4, &buf[
VLEN * 4]);
807 load_vec(pg, wt5, &buf[
VLEN * 5]);
809 sub_vec(pg, vt0, wt0);
810 sub_vec(pg, vt1, wt1);
811 sub_vec(pg, vt2, wt2);
812 sub_vec(pg, vt3, wt3);
813 sub_vec(pg, vt4, wt4);
814 sub_vec(pg, vt5, wt5);
// Fold the boundary contribution into the output: v2[site] += fac * vt.
820 svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
821 load_vec(pg, wt0, &v2[
VLEN * (0 +
NVC * site)]);
822 load_vec(pg, wt1, &v2[
VLEN * (1 +
NVC * site)]);
823 load_vec(pg, wt2, &v2[
VLEN * (2 +
NVC * site)]);
824 load_vec(pg, wt3, &v2[
VLEN * (3 +
NVC * site)]);
825 load_vec(pg, wt4, &v2[
VLEN * (4 +
NVC * site)]);
826 load_vec(pg, wt5, &v2[
VLEN * (5 +
NVC * site)]);
828 scal_vec(pg, vt0, fac);
829 scal_vec(pg, vt1, fac);
830 scal_vec(pg, vt2, fac);
831 scal_vec(pg, vt3, fac);
832 scal_vec(pg, vt4, fac);
833 scal_vec(pg, vt5, fac);
835 add_vec(pg, vt0, wt0);
836 add_vec(pg, vt1, wt1);
837 add_vec(pg, vt2, wt2);
838 add_vec(pg, vt3, wt3);
839 add_vec(pg, vt4, wt4);
840 add_vec(pg, vt5, wt5);
848 save_vec(pg, &v2[
VLEN * (0 +
NVC * site)], vt0);
849 save_vec(pg, &v2[
VLEN * (1 +
NVC * site)], vt1);
850 save_vec(pg, &v2[
VLEN * (2 +
NVC * site)], vt2);
851 save_vec(pg, &v2[
VLEN * (3 +
NVC * site)], vt3);
852 save_vec(pg, &v2[
VLEN * (4 +
NVC * site)], vt4);
853 save_vec(pg, &v2[
VLEN * (5 +
NVC * site)], vt5);