// Template-dependent string constant with the value "ShiftAField_lex<AFIELD>".
// NOTE(review): presumably this initialises the static `class_name` member
// that the log calls in this file print via class_name.c_str() -- confirm
// against the class declaration.
11 template<
typename AFIELD>
13 "ShiftAField_lex<AFIELD>";
// Builds a local vector `bc` with one int per direction (Ndim entries) and
// visits every direction mu in turn.
// NOTE(review): the value assigned per direction and the consumer of `bc`
// should be confirmed; presumably this supplies default boundary conditions.
15 template<
typename AFIELD>
19 std::vector<int> bc(Ndim);
20 for (
int mu = 0; mu < Ndim; ++mu) {
// Set-up routine that consumes a boundary-condition vector `bc`.
// Steps, in order:
//   - log the start of construction at verbosity m_vl,
//   - m_Nvol = m_Nx * m_Ny * m_Nz * m_Nt and m_Nstv = m_Nvol / VLEN,
//   - report (vout.crucial) a `bc` whose size differs from m_Ndim,
//   - copy `bc` into m_boundary, logging each entry,
//   - add up the per-direction do_comm flags into do_comm_any,
//   - compute the per-direction boundary sizes m_Nbdsize,
//   - log completion.
29 template<
typename AFIELD>
37 vout.
general(m_vl,
"%s: construction\n", class_name.c_str());
// Product of the four extents.
46 m_Nvol = m_Nx * m_Ny * m_Nz * m_Nt;
// NOTE(review): assumes m_Nvol is a multiple of VLEN (integer division
// would otherwise truncate) -- TODO confirm.
52 m_Nstv = m_Nvol /
VLEN;
// `bc` must hold exactly one entry per direction.
57 if (bc.size() != m_Ndim) {
58 vout.
crucial(m_vl,
"%s: incorrect size of boundary condition\n",
63 m_boundary.resize(m_Ndim);
65 for (
int mu = 0; mu < m_Ndim; ++mu) {
66 m_boundary[mu] = bc[mu];
67 vout.
general(m_vl,
" boundary[%d] = %2d\n", mu, m_boundary[mu]);
71 for (
int mu = 0; mu < m_Ndim; ++mu) {
// Running sum of the do_comm flags.
// NOTE(review): presumably do_comm_any starts at 0 so that it is non-zero
// exactly when some direction communicates -- confirm.
74 do_comm_any += do_comm[mu];
// NOTE(review): unlike the neighbouring log calls, this one does not pass
// m_vl as its first argument -- confirm that this is intentional.
75 vout.
general(
" do_comm[%d] = %d\n", mu, do_comm[mu]);
// m_Nbdsize[mu] = m_Nin times the product of the three extents other than
// direction mu. Entries 0..3 are written explicitly, so m_Ndim is expected
// to be at least 4.
78 m_Nbdsize.resize(m_Ndim);
79 m_Nbdsize[0] = m_Nin * m_Ny * m_Nz * m_Nt;
80 m_Nbdsize[1] = m_Nin * m_Nx * m_Nz * m_Nt;
81 m_Nbdsize[2] = m_Nin * m_Nx * m_Ny * m_Nt;
82 m_Nbdsize[3] = m_Nin * m_Nx * m_Ny * m_Nz;
86 vout.
general(m_vl,
"%s: construction finished.\n", class_name.c_str());
91 template<
typename AFIELD>
// Creates the per-direction send/receive channels.
// For every direction mu:
//   - Nvsize = m_Nbdsize[mu] * sizeof(real_t), i.e. the boundary size in bytes,
//   - the send channels are initialised for offsets -1 (chsend_dn) and +1
//     (chsend_up),
//   - the receive channels are initialised for offsets +1 (chrecv_up) and -1
//     (chrecv_dn),
//   - channels of directions with do_comm[mu] == 1 are appended to
//     chset_send / chset_recv.
98 template<
typename AFIELD>
101 chsend_up.resize(m_Ndim);
102 chrecv_up.resize(m_Ndim);
103 chsend_dn.resize(m_Ndim);
104 chrecv_dn.resize(m_Ndim);
106 for (
int mu = 0; mu < m_Ndim; ++mu) {
// NOTE(review): the size_t product is narrowed to int here -- confirm the
// boundary size in bytes always fits.
107 int Nvsize = m_Nbdsize[mu] *
sizeof(
real_t);
109 chsend_dn[mu].send_init(Nvsize, mu, -1);
110 chsend_up[mu].send_init(Nvsize, mu, 1);
// NOTE(review): recv_init is invoked on each receive channel in two forms,
// first without and then with an explicit buffer. Presumably the two forms
// are alternatives selected by a build-time conditional -- confirm.
112 chrecv_up[mu].recv_init(Nvsize, mu, 1);
113 chrecv_dn[mu].recv_init(Nvsize, mu, -1);
// Buffer-sharing form: the receive-from-up channel is handed the storage of
// the send-down channel, and the receive-from-down channel that of the
// send-up channel.
115 void *buf_up = (
void *)chsend_dn[mu].ptr();
116 chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
117 void *buf_dn = (
void *)chsend_up[mu].ptr();
118 chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
// Only communicating directions join the channel sets.
120 if (do_comm[mu] == 1) {
121 chset_send.append(chsend_up[mu]);
122 chset_send.append(chsend_dn[mu]);
123 chset_recv.append(chrecv_up[mu]);
124 chset_recv.append(chrecv_dn[mu]);
// Field-level shift entry point. For every ex in [0, Nex) it obtains raw
// pointers into v and into the const input w at index.idx(0, m_Nin, 0, ex)
// and selects a per-direction branch by mu (0..3). Any other mu is reported
// through vout.crucial ("wrong parameter").
// NOTE(review): presumably each branch calls the matching direction kernel
// with (vp, wp, ex) -- confirm.
131 template<
typename AFIELD>
141 for (
int ex = 0; ex < Nex; ++ex) {
142 real_t *vp = v.
ptr(index.idx(0, m_Nin, 0, ex));
// w is a const reference; constness is cast away to obtain a raw pointer.
// NOTE(review): presumably ptr() has no const overload -- confirm that w is
// never written through wp.
143 real_t *wp =
const_cast<AFIELD *
>(&w)->ptr(index.idx(0, m_Nin, 0, ex));
148 }
else if (mu == 1) {
151 }
else if (mu == 2) {
154 }
else if (mu == 3) {
// mu outside 0..3.
158 vout.
crucial(m_vl,
"Error at %s: wrong parameter\n",
// Field-level shift entry point for one pair of ex indices: the raw pointer
// into v is taken at ex1, the one into the const input w at ex2, and a
// per-direction branch is selected by mu (0..3). Any other mu is reported
// through vout.crucial ("wrong parameter").
// NOTE(review): presumably each branch calls the matching direction kernel
// -- confirm.
167 template<
typename AFIELD>
169 const AFIELD& w,
const int ex2,
181 real_t *vp = v.
ptr(index.idx(0, m_Nin, 0, ex1));
// w is a const reference; constness is cast away to obtain a raw pointer.
// NOTE(review): confirm that w is never written through wp.
182 real_t *wp =
const_cast<AFIELD *
>(&w)->ptr(index.idx(0, m_Nin, 0, ex2));
187 }
else if (mu == 1) {
190 }
else if (mu == 2) {
193 }
else if (mu == 3) {
// mu outside 0..3.
197 vout.
crucial(m_vl,
"Error at %s: wrong parameter\n",
// Field-level shift entry point. For every ex in [0, Nex) it obtains raw
// pointers into v and into the const input w at index.idx(0, m_Nin, 0, ex)
// and selects a per-direction branch by mu (0..3). Any other mu is reported
// through vout.crucial ("wrong parameter").
// NOTE(review): presumably each branch calls the matching direction kernel
// with (vp, wp, ex) -- confirm.
205 template<
typename AFIELD>
215 for (
int ex = 0; ex < Nex; ++ex) {
216 real_t *vp = v.
ptr(index.idx(0, m_Nin, 0, ex));
// w is a const reference; constness is cast away to obtain a raw pointer.
// NOTE(review): presumably ptr() has no const overload -- confirm that w is
// never written through wp.
217 real_t *wp =
const_cast<AFIELD *
>(&w)->ptr(index.idx(0, m_Nin, 0, ex));
222 }
else if (mu == 1) {
225 }
else if (mu == 2) {
228 }
else if (mu == 3) {
// mu outside 0..3.
232 vout.
crucial(m_vl,
"Error at %s: wrong parameter\n",
// Field-level shift entry point for one pair of ex indices: the raw pointer
// into v is taken at ex1, the one into the const input w at ex2, and a
// per-direction branch is selected by mu (0..3). Any other mu is reported
// through vout.crucial ("wrong parameter").
// NOTE(review): presumably each branch calls the matching direction kernel
// -- confirm.
241 template<
typename AFIELD>
243 const AFIELD& w,
const int ex2,
255 real_t *vp = v.
ptr(index.idx(0, m_Nin, 0, ex1));
// w is a const reference; constness is cast away to obtain a raw pointer.
// NOTE(review): confirm that w is never written through wp.
256 real_t *wp =
const_cast<AFIELD *
>(&w)->ptr(index.idx(0, m_Nin, 0, ex2));
261 }
else if (mu == 1) {
264 }
else if (mu == 2) {
267 }
else if (mu == 3) {
// mu outside 0..3.
271 vout.
crucial(m_vl,
"Error at %s: wrong parameter\n",
// Scalar shift kernel along direction 0: vp at a site receives wp from the
// ix+1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[0] == 1: copy bc2-scaled wp values into buf1, addressed by
//      (in, iyzt), then start chrecv_up[0] and chsend_dn[0];
//   2. for sites with ix < m_Nx-1, or for all sites when do_comm[0] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nx;
//   3. fill vp from buf2, addressed by (in, iyzt).
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
279 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
292 int ith, nth, is, ns;
293 set_threadtask(ith, nth, is, ns, m_Nvol);
297 if (do_comm[0] == 1) {
// NOTE(review): ix is computed but not tested in this packing loop as
// written, while buf1 is indexed by iyzt alone; presumably only one x-face
// is meant to be packed -- confirm.
311 for (
int site = is; site < ns; ++site) {
312 int ix = site % m_Nx;
313 int iyzt = site / m_Nx;
315 for (
int in = 0; in < m_Nin; ++in) {
316 int index = index_alt.idx(in, m_Nin, site, ex);
317 buf1[in + m_Nin * iyzt] = bc2 * wp[index];
326 chrecv_up[0].start();
327 chsend_dn[0].start();
336 for (
int site = is; site < ns; ++site) {
337 int ix = site % m_Nx;
338 int iyzt = site / m_Nx;
// Neighbour is available locally: interior sites, or every site when
// direction 0 does not communicate.
339 if ((ix < m_Nx - 1) || (do_comm[0] == 0)) {
340 int ix2 = (ix + 1) % m_Nx;
341 int nei = ix2 + m_Nx * iyzt;
// ix == m_Nx-1 is reachable here only when do_comm[0] == 0; the wrapped
// value then takes the factor bc2.
343 if (ix == m_Nx - 1) bc3 = bc2;
344 for (
int in = 0; in < m_Nin; ++in) {
345 int iv = index_alt.idx(in, m_Nin, site, ex);
346 int iw = index_alt.idx(in, m_Nin, nei, ex);
347 vp[iv] = bc3 * wp[iw];
// NOTE(review): presumably the alternative branch for sites whose neighbour
// is supplied through buf2 -- confirm.
350 for (
int in = 0; in < m_Nin; ++in) {
351 int iv = index_alt.idx(in, m_Nin, site, ex);
352 vp[iv] = buf2[in + m_Nin * iyzt];
// Scalar shift kernel along direction 0: vp at a site receives wp from the
// ix+1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[0] == 1: copy bc2-scaled wp values into buf1, addressed by
//      (in, iyzt), then start chrecv_up[0] and chsend_dn[0];
//   2. for sites with ix < m_Nx-1, or for all sites when do_comm[0] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nx;
//   3. fill vp from buf2, addressed by (in, iyzt).
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
382 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
395 int ith, nth, is, ns;
396 set_threadtask(ith, nth, is, ns, m_Nvol);
400 if (do_comm[0] == 1) {
// NOTE(review): ix is computed but not tested in this packing loop as
// written, while buf1 is indexed by iyzt alone; presumably only one x-face
// is meant to be packed -- confirm.
401 for (
int site = is; site < ns; ++site) {
402 int ix = site % m_Nx;
403 int iyzt = site / m_Nx;
405 for (
int in = 0; in < m_Nin; ++in) {
406 int index = index_alt.idx(in, m_Nin, site, ex);
407 buf1[in + m_Nin * iyzt] = bc2 * wp[index];
416 chrecv_up[0].start();
417 chsend_dn[0].start();
426 for (
int site = is; site < ns; ++site) {
427 int ix = site % m_Nx;
428 int iyzt = site / m_Nx;
// Neighbour is available locally: interior sites, or every site when
// direction 0 does not communicate.
429 if ((ix < m_Nx - 1) || (do_comm[0] == 0)) {
430 int ix2 = (ix + 1) % m_Nx;
431 int nei = ix2 + m_Nx * iyzt;
// ix == m_Nx-1 is reachable here only when do_comm[0] == 0; the wrapped
// value then takes the factor bc2.
433 if (ix == m_Nx - 1) bc3 = bc2;
434 for (
int in = 0; in < m_Nin; ++in) {
435 int iv = index_alt.idx(in, m_Nin, site, ex);
436 int iw = index_alt.idx(in, m_Nin, nei, ex);
437 vp[iv] = bc3 * wp[iw];
// NOTE(review): presumably the alternative branch for sites whose neighbour
// is supplied through buf2 -- confirm.
440 for (
int in = 0; in < m_Nin; ++in) {
441 int iv = index_alt.idx(in, m_Nin, site, ex);
442 vp[iv] = buf2[in + m_Nin * iyzt];
// Scalar shift kernel along direction 0: vp at a site receives wp from the
// ix-1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[0] == 1: copy the bc2-scaled ix == m_Nx-1 face of wp into
//      buf1, addressed by (in, iyzt), then start chrecv_dn[0] and
//      chsend_up[0];
//   2. for sites with ix > 0, or for all sites when do_comm[0] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nx;
//   3. fill vp from buf2, addressed by (in, iyzt).
// NOTE(review): confirm the initial value of bc3 and how buf1/buf2 relate to
// the channel buffers.
452 template<
typename AFIELD>
// Boundary factor of direction 0 converted to real_t.
457 bc2 =
real_t(m_boundary[0]);
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
467 int ith, nth, is, ns;
468 set_threadtask(ith, nth, is, ns, m_Nvol);
472 if (do_comm[0] == 1) {
486 for (
int site = is; site < ns; ++site) {
487 int ix = site % m_Nx;
488 int iyzt = site / m_Nx;
// Pack the last x-slice; the factor bc2 is applied while packing.
489 if (ix == m_Nx - 1) {
490 for (
int in = 0; in < m_Nin; ++in) {
491 int index = index_alt.idx(in, m_Nin, site, ex);
492 buf1[in + m_Nin * iyzt] = bc2 * wp[index];
501 chrecv_dn[0].start();
502 chsend_up[0].start();
511 for (
int site = is; site < ns; ++site) {
512 int ix = site % m_Nx;
513 int iyzt = site / m_Nx;
// Neighbour is available locally: interior sites, or every site when
// direction 0 does not communicate.
514 if ((ix > 0) || (do_comm[0] == 0)) {
515 int ix2 = (ix - 1 + m_Nx) % m_Nx;
516 int nei = ix2 + m_Nx * iyzt;
// ix == 0 is reachable here only when do_comm[0] == 0; the wrapped value
// then takes the factor bc2.
518 if (ix == 0) bc3 = bc2;
519 for (
int in = 0; in < m_Nin; ++in) {
520 int iv = index_alt.idx(in, m_Nin, site, ex);
521 int iw = index_alt.idx(in, m_Nin, nei, ex);
522 vp[iv] = bc3 * wp[iw];
// NOTE(review): presumably the alternative branch for sites whose neighbour
// is supplied through buf2 -- confirm.
525 for (
int in = 0; in < m_Nin; ++in) {
526 int index = index_alt.idx(in, m_Nin, site, ex);
527 vp[index] = buf2[in + m_Nin * iyzt];
// Scalar shift kernel along direction 0: vp at a site receives wp from the
// ix-1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[0] == 1: copy the bc2-scaled ix == m_Nx-1 face of wp into
//      buf1, addressed by (in, iyzt), then start chrecv_dn[0] and
//      chsend_up[0];
//   2. for sites with ix > 0, or for all sites when do_comm[0] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nx;
//   3. fill vp from buf2, addressed by (in, iyzt).
// NOTE(review): confirm the initial value of bc3 and how buf1/buf2 relate to
// the channel buffers.
557 template<
typename AFIELD>
// Boundary factor of direction 0 converted to real_t.
562 bc2 =
real_t(m_boundary[0]);
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
572 int ith, nth, is, ns;
573 set_threadtask(ith, nth, is, ns, m_Nvol);
577 if (do_comm[0] == 1) {
578 for (
int site = is; site < ns; ++site) {
579 int ix = site % m_Nx;
580 int iyzt = site / m_Nx;
// Pack the last x-slice; the factor bc2 is applied while packing.
581 if (ix == m_Nx - 1) {
582 for (
int in = 0; in < m_Nin; ++in) {
583 int index = index_alt.idx(in, m_Nin, site, ex);
584 buf1[in + m_Nin * iyzt] = bc2 * wp[index];
593 chrecv_dn[0].start();
594 chsend_up[0].start();
603 for (
int site = is; site < ns; ++site) {
604 int ix = site % m_Nx;
605 int iyzt = site / m_Nx;
// Neighbour is available locally: interior sites, or every site when
// direction 0 does not communicate.
606 if ((ix > 0) || (do_comm[0] == 0)) {
607 int ix2 = (ix - 1 + m_Nx) % m_Nx;
608 int nei = ix2 + m_Nx * iyzt;
// ix == 0 is reachable here only when do_comm[0] == 0; the wrapped value
// then takes the factor bc2.
610 if (ix == 0) bc3 = bc2;
611 for (
int in = 0; in < m_Nin; ++in) {
612 int iv = index_alt.idx(in, m_Nin, site, ex);
613 int iw = index_alt.idx(in, m_Nin, nei, ex);
614 vp[iv] = bc3 * wp[iw];
// NOTE(review): presumably the alternative branch for sites whose neighbour
// is supplied through buf2 -- confirm.
617 for (
int in = 0; in < m_Nin; ++in) {
618 int index = index_alt.idx(in, m_Nin, site, ex);
619 vp[index] = buf2[in + m_Nin * iyzt];
// Vector-form shift kernel along direction 1: each output chunk receives
// the chunk of its iy+1 neighbour. The loop index runs over m_Nstv entries
// with x-extent m_Nxv; entry `site` addresses the VLEN * Nin2 elements
// starting at iv = VLEN * Nin2 * site, moved with load_vec / save_vec and
// scaled with scal_vec.
// Phases:
//   1. if do_comm[1] == 1: start chrecv_up[1], copy bc2-scaled chunks of wp
//      into buf1 (addressed by ix and izt), start chsend_dn[1];
//   2. bulk: for iy < m_Ny-1, or every iy when do_comm[1] == 0, copy the
//      chunk at iy+1 (wrapping modulo m_Ny);
//   3. if do_comm[1] == 1: copy buf2 into the iy == m_Ny-1 chunks.
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
629 template<
typename AFIELD>
635 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
640 int ith, nth, is, ns;
641 set_threadtask(ith, nth, is, ns, m_Nstv);
645 if (do_comm[1] == 1) {
648 chrecv_up[1].start();
// NOTE(review): iy is computed but not tested in this packing loop as
// written, while buf1 is addressed by (ix, izt) only; presumably a single
// y-face is meant to be packed -- confirm.
650 for (
int site = is; site < ns; ++site) {
651 int ix = site % m_Nxv;
652 int iy = (site / m_Nxv) % m_Ny;
653 int izt = site / (m_Nxv * m_Ny);
655 int iv =
VLEN * Nin2 * site;
656 int ibf =
VLEN * Nin2 * (ix + m_Nxv * izt);
658 load_vec(vt, &wp[iv], Nin2);
659 scal_vec(vt, bc2, Nin2);
660 save_vec(&buf1[ibf], vt, Nin2);
667 chsend_dn[1].start();
671 for (
int site = is; site < ns; ++site) {
672 int ix = site % m_Nxv;
673 int iy = (site / m_Nxv) % m_Ny;
674 int izt = site / (m_Nxv * m_Ny);
675 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 1 does not communicate.
676 if ((iy < m_Ny - 1) || (do_comm[1] == 0)) {
677 int iyn = (iy + 1) % m_Ny;
678 int nei =
VLEN * Nin2 * (ix + m_Nxv * (iyn + m_Ny * izt));
680 load_vec(vt, &wp[nei], Nin2);
// iy == m_Ny-1 is reachable here only when do_comm[1] == 0; the wrapped
// chunk is scaled by bc2.
681 if (iy == m_Ny - 1) scal_vec(vt, bc2, Nin2);
682 save_vec(&vp[iv], vt, Nin2);
686 if (do_comm[1] == 1) {
693 for (
int site = is; site < ns; ++site) {
694 int ix = site % m_Nxv;
695 int iy = (site / m_Nxv) % m_Ny;
696 int izt = site / (m_Nxv * m_Ny);
// Only the last y-slice is filled from buf2.
697 if (iy == m_Ny - 1) {
698 int iv =
VLEN * Nin2 * site;
699 int ibf =
VLEN * Nin2 * (ix + m_Nxv * izt);
701 load_vec(vt, &buf2[ibf], Nin2);
702 save_vec(&vp[iv], vt, Nin2);
// Scalar shift kernel along direction 1: vp at a site receives wp from the
// iy+1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[1] == 1: start chrecv_up[1], copy bc2-scaled wp values
//      into buf1 (addressed by in and ixzt = ix + m_Nx * izt), start
//      chsend_dn[1];
//   2. bulk: for iy < m_Ny-1, or every iy when do_comm[1] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Ny;
//   3. if do_comm[1] == 1: copy buf2 into the iy == m_Ny-1 sites.
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
717 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
730 int ith, nth, is, ns;
731 set_threadtask(ith, nth, is, ns, m_Nvol);
735 if (do_comm[1] == 1) {
738 chrecv_up[1].start();
// NOTE(review): iy is computed but not tested in this packing loop as
// written, while buf1 is addressed by ixzt only; presumably a single y-face
// is meant to be packed -- confirm.
741 for (
int site = is; site < ns; ++site) {
742 int ix = site % m_Nx;
743 int iy = (site / m_Nx) % m_Ny;
744 int izt = site / (m_Nx * m_Ny);
746 for (
int in = 0; in < m_Nin; ++in) {
747 int iw = index_alt.idx(in, m_Nin, site, ex);
748 int ixzt = ix + m_Nx * izt;
749 buf1[in + m_Nin * ixzt] = bc2 * wp[iw];
758 chsend_dn[1].start();
762 for (
int site = is; site < ns; ++site) {
763 int ix = site % m_Nx;
764 int iy = (site / m_Nx) % m_Ny;
765 int izt = site / (m_Nx * m_Ny);
// Neighbour is available locally: interior sites, or every site when
// direction 1 does not communicate.
766 if ((iy < m_Ny - 1) || (do_comm[1] == 0)) {
767 int iy2 = (iy + 1) % m_Ny;
768 int nei = ix + m_Nx * (iy2 + m_Ny * izt);
// iy == m_Ny-1 is reachable here only when do_comm[1] == 0; the wrapped
// value then takes the factor bc2.
770 if (iy == m_Ny - 1) bc3 = bc2;
771 for (
int in = 0; in < m_Nin; ++in) {
772 int iv = index_alt.idx(in, m_Nin, site, ex);
773 int iw = index_alt.idx(in, m_Nin, nei, ex);
774 vp[iv] = bc3 * wp[iw];
779 if (do_comm[1] == 1) {
787 for (
int site = is; site < ns; ++site) {
788 int ix = site % m_Nx;
789 int iy = (site / m_Nx) % m_Ny;
790 int izt = site / (m_Nx * m_Ny);
791 int ixzt = ix + m_Nx * izt;
// Only the last y-slice is filled from buf2.
792 if (iy == m_Ny - 1) {
793 for (
int in = 0; in < m_Nin; ++in) {
794 int iv = index_alt.idx(in, m_Nin, site, ex);
795 vp[iv] = buf2[in + m_Nin * ixzt];
// Vector-form shift kernel along direction 1: each output chunk receives
// the chunk of its iy-1 neighbour. The loop index runs over m_Nstv entries
// with x-extent m_Nxv; entry `site` addresses the VLEN * Nin2 elements
// starting at iv = VLEN * Nin2 * site.
// Phases:
//   1. if do_comm[1] == 1: start chrecv_dn[1], copy the iy == m_Ny-1 chunks
//      of wp unscaled into buf1, start chsend_up[1];
//   2. bulk: for iy > 0, or every iy when do_comm[1] == 0, copy the chunk at
//      iy-1 (wrapping modulo m_Ny);
//   3. if do_comm[1] == 1: copy bc2-scaled buf2 chunks into vp.
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
810 template<
typename AFIELD>
816 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
821 int ith, nth, is, ns;
822 set_threadtask(ith, nth, is, ns, m_Nstv);
826 if (do_comm[1] == 1) {
829 chrecv_dn[1].start();
831 for (
int site = is; site < ns; ++site) {
832 int ix = site % m_Nxv;
833 int iy = (site / m_Nxv) % m_Ny;
834 int izt = site / (m_Nxv * m_Ny);
// Pack the last y-slice; no scaling at this stage.
835 if (iy == m_Ny - 1) {
836 int iv =
VLEN * Nin2 * site;
837 int ibf =
VLEN * Nin2 * (ix + m_Nxv * izt);
839 load_vec(vt, &wp[iv], Nin2);
840 save_vec(&buf1[ibf], vt, Nin2);
848 chsend_up[1].start();
852 for (
int site = is; site < ns; ++site) {
853 int ix = site % m_Nxv;
854 int iy = (site / m_Nxv) % m_Ny;
855 int izt = site / (m_Nxv * m_Ny);
856 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 1 does not communicate.
857 if ((iy > 0) || (do_comm[1] == 0)) {
858 int iyn = (iy - 1 + m_Ny) % m_Ny;
859 int nei =
VLEN * Nin2 * (ix + m_Nxv * (iyn + m_Ny * izt));
861 load_vec(vt, &wp[nei], Nin2);
// iy == 0 is reachable here only when do_comm[1] == 0; the wrapped chunk is
// scaled by bc2.
862 if (iy == 0) scal_vec(vt, bc2, Nin2);
863 save_vec(&vp[iv], vt, Nin2);
867 if (do_comm[1] == 1) {
// NOTE(review): iy is computed but not tested in this unpacking loop as
// written; presumably only the iy == 0 slice should be filled from buf2 --
// confirm.
875 for (
int site = is; site < ns; ++site) {
876 int ix = site % m_Nxv;
877 int iy = (site / m_Nxv) % m_Ny;
878 int izt = site / (m_Nxv * m_Ny);
880 int iv =
VLEN * Nin2 * site;
881 int ibf =
VLEN * Nin2 * (ix + m_Nxv * izt);
883 load_vec(vt, &buf2[ibf], Nin2);
884 scal_vec(vt, bc2, Nin2);
885 save_vec(&vp[iv], vt, Nin2);
// Scalar shift kernel along direction 1: vp at a site receives wp from the
// iy-1 neighbour, component by component through index_alt.idx.
// Phases:
//   1. if do_comm[1] == 1: start chrecv_dn[1], copy the iy == m_Ny-1 face of
//      wp unscaled into buf1 (addressed by in and ixzt), start chsend_up[1];
//   2. bulk: for iy > 0, or every iy when do_comm[1] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Ny;
//   3. if do_comm[1] == 1: vp = bc2 * buf2.
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
900 template<
typename AFIELD>
// NOTE(review): Nin2 is not referenced by the scalar loops below -- confirm
// whether it is still needed.
906 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
915 int ith, nth, is, ns;
916 set_threadtask(ith, nth, is, ns, m_Nvol);
920 if (do_comm[1] == 1) {
923 chrecv_dn[1].start();
926 for (
int site = is; site < ns; ++site) {
927 int ix = site % m_Nx;
928 int iy = (site / m_Nx) % m_Ny;
929 int izt = site / (m_Nx * m_Ny);
// Pack the last y-slice; no scaling at this stage.
930 if (iy == m_Ny - 1) {
931 for (
int in = 0; in < m_Nin; ++in) {
932 int iw = index_alt.idx(in, m_Nin, site, ex);
933 int ixzt = ix + m_Nx * izt;
934 buf1[in + m_Nin * ixzt] = wp[iw];
943 chsend_up[1].start();
947 for (
int site = is; site < ns; ++site) {
948 int ix = site % m_Nx;
949 int iy = (site / m_Nx) % m_Ny;
950 int izt = site / (m_Nx * m_Ny);
// Neighbour is available locally: interior sites, or every site when
// direction 1 does not communicate.
951 if ((iy > 0) || (do_comm[1] == 0)) {
952 int iy2 = (iy - 1 + m_Ny) % m_Ny;
953 int nei = ix + m_Nx * (iy2 + m_Ny * izt);
// iy == 0 is reachable here only when do_comm[1] == 0; the wrapped value
// then takes the factor bc2.
955 if (iy == 0) bc3 = bc2;
956 for (
int in = 0; in < m_Nin; ++in) {
957 int iv = index_alt.idx(in, m_Nin, site, ex);
958 int iw = index_alt.idx(in, m_Nin, nei, ex);
959 vp[iv] = bc3 * wp[iw];
964 if (do_comm[1] == 1) {
// NOTE(review): iy is computed but not tested in this unpacking loop as
// written; presumably only the iy == 0 slice should be filled from buf2 --
// confirm.
973 for (
int site = is; site < ns; ++site) {
974 int ix = site % m_Nx;
975 int iy = (site / m_Nx) % m_Ny;
976 int izt = site / (m_Nx * m_Ny);
977 int ixzt = ix + m_Nx * izt;
979 for (
int in = 0; in < m_Nin; ++in) {
980 int iv = index_alt.idx(in, m_Nin, site, ex);
981 vp[iv] = bc2 * buf2[in + m_Nin * ixzt];
// Vector-form shift kernel along direction 2: each output chunk receives
// the chunk of its iz+1 neighbour. The loop index runs over m_Nstv entries;
// Nxy = m_Nxv * m_Ny entries form one z-slice, and entry `site` addresses
// the VLEN * Nin2 elements starting at iv = VLEN * Nin2 * site.
// Phases:
//   1. if do_comm[2] == 1: start chrecv_up[2], copy bc2-scaled chunks of wp
//      into buf1 (addressed by ixy and it), start chsend_dn[2];
//   2. bulk: for iz < m_Nz-1, or every iz when do_comm[2] == 0, copy the
//      chunk at iz+1 (wrapping modulo m_Nz);
//   3. if do_comm[2] == 1: wait on chrecv_up[2], copy buf2 into the
//      iz == m_Nz-1 chunks, then wait on chsend_dn[2].
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
997 template<
typename AFIELD>
1003 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
1008 int ith, nth, is, ns;
1009 set_threadtask(ith, nth, is, ns, m_Nstv);
1011 int Nxy = m_Nxv * m_Ny;
1014 if (do_comm[2] == 1) {
1017 chrecv_up[2].start();
// NOTE(review): iz is computed but not tested in this packing loop as
// written, while buf1 is addressed by (ixy, it) only; presumably a single
// z-face is meant to be packed -- confirm.
1020 for (
int site = is; site < ns; ++site) {
1021 int ixy = site % Nxy;
1022 int iz = (site / Nxy) % m_Nz;
1023 int it = site / (Nxy * m_Nz);
1025 int iv =
VLEN * Nin2 * site;
1026 int ibf =
VLEN * Nin2 * (ixy + Nxy * it);
1028 load_vec(vt, &wp[iv], Nin2);
1029 scal_vec(vt, bc2, Nin2);
1030 save_vec(&buf1[ibf], vt, Nin2);
1038 chsend_dn[2].start();
1042 for (
int site = is; site < ns; ++site) {
1043 int ixy = site % Nxy;
1044 int iz = (site / Nxy) % m_Nz;
1045 int it = site / (Nxy * m_Nz);
1046 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 2 does not communicate.
1047 if ((iz < m_Nz - 1) || (do_comm[2] == 0)) {
1048 int izn = (iz + 1) % m_Nz;
1049 int nei =
VLEN * Nin2 * (ixy + Nxy * (izn + m_Nz * it));
1051 load_vec(vt, &wp[nei], Nin2);
// iz == m_Nz-1 is reachable here only when do_comm[2] == 0; the wrapped
// chunk is scaled by bc2.
1052 if (iz == m_Nz - 1) scal_vec(vt, bc2, Nin2);
1053 save_vec(&vp[iv], vt, Nin2);
1057 if (do_comm[2] == 1) {
// The receive is waited on before buf2 is read.
1060 chrecv_up[2].wait();
1065 for (
int site = is; site < ns; ++site) {
1066 int ixy = site % Nxy;
1067 int iz = (site / Nxy) % m_Nz;
1068 int it = site / (Nxy * m_Nz);
// Only the last z-slice is filled from buf2.
1069 if (iz == m_Nz - 1) {
1070 int iv =
VLEN * Nin2 * site;
1071 int ibf =
VLEN * Nin2 * (ixy + Nxy * it);
1073 load_vec(vt, &buf2[ibf], Nin2);
1074 save_vec(&vp[iv], vt, Nin2);
1079 chsend_dn[2].wait();
// Scalar shift kernel along direction 2: vp at a site receives wp from the
// iz+1 neighbour, component by component through index_alt.idx.
// Nxy = m_Nx * m_Ny sites form one z-slice.
// Phases:
//   1. if do_comm[2] == 1: start chrecv_up[2], copy bc2-scaled wp values
//      into buf1 (addressed by in and ixyt = ixy + Nxy * it), start
//      chsend_dn[2];
//   2. bulk: for iz < m_Nz-1, or every iz when do_comm[2] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nz;
//   3. if do_comm[2] == 1: wait on chrecv_up[2], copy buf2 into the
//      iz == m_Nz-1 sites, then wait on chsend_dn[2].
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
1088 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
1101 int ith, nth, is, ns;
1102 set_threadtask(ith, nth, is, ns, m_Nvol);
1104 int Nxy = m_Nx * m_Ny;
1108 if (do_comm[2] == 1) {
1111 chrecv_up[2].start();
// NOTE(review): iz is computed but not tested in this packing loop as
// written, while buf1 is addressed by ixyt only; presumably a single z-face
// is meant to be packed -- confirm.
1114 for (
int site = is; site < ns; ++site) {
1115 int ixy = site % Nxy;
1116 int iz = (site / Nxy) % m_Nz;
1117 int it = site / (Nxy * m_Nz);
1119 for (
int in = 0; in < m_Nin; ++in) {
1120 int iw = index_alt.idx(in, m_Nin, site, ex);
1121 int ixyt = ixy + Nxy * it;
1122 buf1[in + m_Nin * ixyt] = bc2 * wp[iw];
1131 chsend_dn[2].start();
1135 for (
int site = is; site < ns; ++site) {
1136 int ixy = site % Nxy;
1137 int iz = (site / Nxy) % m_Nz;
1138 int it = site / (Nxy * m_Nz);
// Neighbour is available locally: interior sites, or every site when
// direction 2 does not communicate.
1139 if ((iz < m_Nz - 1) || (do_comm[2] == 0)) {
1140 int iz2 = (iz + 1) % m_Nz;
1141 int nei = ixy + Nxy * (iz2 + m_Nz * it);
// iz == m_Nz-1 is reachable here only when do_comm[2] == 0; the wrapped
// value then takes the factor bc2.
1143 if (iz == m_Nz - 1) bc3 = bc2;
1144 for (
int in = 0; in < m_Nin; ++in) {
1145 int iv = index_alt.idx(in, m_Nin, site, ex);
1146 int iw = index_alt.idx(in, m_Nin, nei, ex);
1147 vp[iv] = bc3 * wp[iw];
1152 if (do_comm[2] == 1) {
// The receive is waited on before buf2 is read.
1155 chrecv_up[2].wait();
1160 for (
int site = is; site < ns; ++site) {
1161 int ixy = site % Nxy;
1162 int iz = (site / Nxy) % m_Nz;
1163 int it = site / (Nxy * m_Nz);
// Only the last z-slice is filled from buf2.
1164 if (iz == m_Nz - 1) {
1165 for (
int in = 0; in < m_Nin; ++in) {
1166 int iv = index_alt.idx(in, m_Nin, site, ex);
1167 int ixyt = ixy + Nxy * it;
1168 vp[iv] = buf2[in + m_Nin * ixyt];
1174 chsend_dn[2].wait();
// Vector-form shift kernel along direction 2: each output chunk receives
// the chunk of its iz-1 neighbour. The loop index runs over m_Nstv entries;
// Nxy = m_Nxv * m_Ny entries form one z-slice, and entry `site` addresses
// the VLEN * Nin2 elements starting at iv = VLEN * Nin2 * site.
// Phases:
//   1. if do_comm[2] == 1: start chrecv_dn[2], copy the iz == m_Nz-1 chunks
//      of wp unscaled into buf1, start chsend_up[2];
//   2. bulk: for iz > 0, or every iz when do_comm[2] == 0, copy the chunk at
//      iz-1 (wrapping modulo m_Nz);
//   3. if do_comm[2] == 1: wait on chrecv_dn[2], copy bc2-scaled buf2 chunks
//      into vp, then wait on chsend_up[2].
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
1183 template<
typename AFIELD>
1189 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
1194 int ith, nth, is, ns;
1195 set_threadtask(ith, nth, is, ns, m_Nstv);
1197 int Nxy = m_Nxv * m_Ny;
1200 if (do_comm[2] == 1) {
1203 chrecv_dn[2].start();
1205 for (
int site = is; site < ns; ++site) {
1206 int ixy = site % Nxy;
1207 int iz = (site / Nxy) % m_Nz;
1208 int it = site / (Nxy * m_Nz);
// Pack the last z-slice; no scaling at this stage.
1209 if (iz == m_Nz - 1) {
1210 int iv =
VLEN * Nin2 * site;
1211 int ibf =
VLEN * Nin2 * (ixy + Nxy * it);
1213 load_vec(vt, &wp[iv], Nin2);
1214 save_vec(&buf1[ibf], vt, Nin2);
1222 chsend_up[2].start();
1226 for (
int site = is; site < ns; ++site) {
1227 int ixy = site % Nxy;
1228 int iz = (site / Nxy) % m_Nz;
1229 int it = site / (Nxy * m_Nz);
1230 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 2 does not communicate.
1231 if ((iz > 0) || (do_comm[2] == 0)) {
1232 int izn = (iz - 1 + m_Nz) % m_Nz;
1233 int nei =
VLEN * Nin2 * (ixy + Nxy * (izn + m_Nz * it));
1235 load_vec(vt, &wp[nei], Nin2);
// iz == 0 is reachable here only when do_comm[2] == 0; the wrapped chunk is
// scaled by bc2.
1236 if (iz == 0) scal_vec(vt, bc2, Nin2);
1237 save_vec(&vp[iv], vt, Nin2);
1241 if (do_comm[2] == 1) {
// The receive is waited on before buf2 is read.
1244 chrecv_dn[2].wait();
// NOTE(review): iz is computed but not tested in this unpacking loop as
// written; presumably only the iz == 0 slice should be filled from buf2 --
// confirm.
1249 for (
int site = is; site < ns; ++site) {
1250 int ixy = site % Nxy;
1251 int iz = (site / Nxy) % m_Nz;
1252 int it = site / (Nxy * m_Nz);
1254 int iv =
VLEN * Nin2 * site;
1255 int ibf =
VLEN * Nin2 * (ixy + Nxy * it);
1257 load_vec(vt, &buf2[ibf], Nin2);
1258 scal_vec(vt, bc2, Nin2);
1259 save_vec(&vp[iv], vt, Nin2);
1264 chsend_up[2].wait();
// Scalar shift kernel along direction 2: vp at a site receives wp from the
// iz-1 neighbour, component by component through index_alt.idx.
// Nxy = m_Nx * m_Ny sites form one z-slice.
// Phases:
//   1. if do_comm[2] == 1: start chrecv_dn[2], copy the iz == m_Nz-1 face of
//      wp unscaled into buf1 (addressed by in and ixyt), start chsend_up[2];
//   2. bulk: for iz > 0, or every iz when do_comm[2] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nz;
//   3. if do_comm[2] == 1: wait on chrecv_dn[2], vp = bc2 * buf2, then wait
//      on chsend_up[2].
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
1273 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
1288 int ith, nth, is, ns;
1289 set_threadtask(ith, nth, is, ns, m_Nvol);
1291 int Nxy = m_Nx * m_Ny;
1295 if (do_comm[2] == 1) {
1298 chrecv_dn[2].start();
1301 for (
int site = is; site < ns; ++site) {
1302 int ixy = site % Nxy;
1303 int iz = (site / Nxy) % m_Nz;
1304 int it = site / (Nxy * m_Nz);
// Pack the last z-slice; no scaling at this stage.
1305 if (iz == m_Nz - 1) {
1306 for (
int in = 0; in < m_Nin; ++in) {
1307 int iw = index_alt.idx(in, m_Nin, site, ex);
1308 int ixyt = ixy + Nxy * it;
1309 buf1[in + m_Nin * ixyt] = wp[iw];
1318 chsend_up[2].start();
1322 for (
int site = is; site < ns; ++site) {
1323 int ixy = site % Nxy;
1324 int iz = (site / Nxy) % m_Nz;
1325 int it = site / (Nxy * m_Nz);
// Neighbour is available locally: interior sites, or every site when
// direction 2 does not communicate.
1326 if ((iz > 0) || (do_comm[2] == 0)) {
1327 int iz2 = (iz - 1 + m_Nz) % m_Nz;
1328 int nei = ixy + Nxy * (iz2 + m_Nz * it);
// iz == 0 is reachable here only when do_comm[2] == 0; the wrapped value
// then takes the factor bc2.
1330 if (iz == 0) bc3 = bc2;
1331 for (
int in = 0; in < m_Nin; ++in) {
1332 int iv = index_alt.idx(in, m_Nin, site, ex);
1333 int iw = index_alt.idx(in, m_Nin, nei, ex);
1334 vp[iv] = bc3 * wp[iw];
1339 if (do_comm[2] == 1) {
// The receive is waited on before buf2 is read.
1342 chrecv_dn[2].wait();
// NOTE(review): iz is computed but not tested in this unpacking loop as
// written; presumably only the iz == 0 slice should be filled from buf2 --
// confirm.
1347 for (
int site = is; site < ns; ++site) {
1348 int ixy = site % Nxy;
1349 int iz = (site / Nxy) % m_Nz;
1350 int it = site / (Nxy * m_Nz);
1352 for (
int in = 0; in < m_Nin; ++in) {
1353 int iv = index_alt.idx(in, m_Nin, site, ex);
1354 int ixyt = ixy + Nxy * it;
1355 vp[iv] = bc2 * buf2[in + m_Nin * ixyt];
1361 chsend_up[2].wait();
// Vector-form shift kernel along direction 3: each output chunk receives
// the chunk of its it+1 neighbour. The loop index runs over m_Nstv entries;
// Nxyz = m_Nxv * m_Ny * m_Nz entries form one t-slice, and entry `site`
// addresses the VLEN * Nin2 elements starting at iv = VLEN * Nin2 * site.
// Phases:
//   1. if do_comm[3] == 1: start chrecv_up[3], copy bc2-scaled chunks of wp
//      into buf1 (addressed by ixyz), start chsend_dn[3];
//   2. bulk: for it < m_Nt-1, or every it when do_comm[3] == 0, copy the
//      chunk at it+1 (wrapping modulo m_Nt);
//   3. if do_comm[3] == 1: wait on chrecv_up[3], copy buf2 into the
//      it == m_Nt-1 chunks, then wait on chsend_dn[3].
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
1370 template<
typename AFIELD>
1376 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
1381 int ith, nth, is, ns;
1382 set_threadtask(ith, nth, is, ns, m_Nstv);
1384 int Nxyz = m_Nxv * m_Ny * m_Nz;
1388 if (do_comm[3] == 1) {
1391 chrecv_up[3].start();
// NOTE(review): `it` is computed but not tested in this packing loop as
// written, while buf1 is addressed by ixyz only; presumably a single t-face
// is meant to be packed -- confirm.
1394 for (
int site = is; site < ns; ++site) {
1395 int ixyz = site % Nxyz;
1396 int it = site / Nxyz;
1398 int iv =
VLEN * Nin2 * site;
1399 int ibf =
VLEN * Nin2 * ixyz;
1401 load_vec(vt, &wp[iv], Nin2);
1402 scal_vec(vt, bc2, Nin2);
1403 save_vec(&buf1[ibf], vt, Nin2);
1411 chsend_dn[3].start();
1415 for (
int site = is; site < ns; ++site) {
1416 int ixyz = site % Nxyz;
1417 int it = site / Nxyz;
1418 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 3 does not communicate.
1419 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
1420 int itn = (it + 1) % m_Nt;
1421 int nei =
VLEN * Nin2 * (ixyz + Nxyz * itn);
1423 load_vec(vt, &wp[nei], Nin2);
// it == m_Nt-1 is reachable here only when do_comm[3] == 0; the wrapped
// chunk is scaled by bc2.
1424 if (it == m_Nt - 1) scal_vec(vt, bc2, Nin2);
1425 save_vec(&vp[iv], vt, Nin2);
1429 if (do_comm[3] == 1) {
// The receive is waited on before buf2 is read.
1432 chrecv_up[3].wait();
1437 for (
int site = is; site < ns; ++site) {
1438 int ixyz = site % Nxyz;
1439 int it = site / Nxyz;
// Only the last t-slice is filled from buf2.
1440 if (it == m_Nt - 1) {
1441 int iv =
VLEN * Nin2 * site;
1442 int ibf =
VLEN * Nin2 * ixyz;
1444 load_vec(vt, &buf2[ibf], Nin2);
1445 save_vec(&vp[iv], vt, Nin2);
1450 chsend_dn[3].wait();
// Scalar shift kernel along direction 3: vp at a site receives wp from the
// it+1 neighbour, component by component through index_alt.idx.
// Nxyz = m_Nx * m_Ny * m_Nz sites form one t-slice.
// Phases:
//   1. if do_comm[3] == 1: start chrecv_up[3], copy bc2-scaled wp values
//      into buf1 (addressed by in and ixyz), start chsend_dn[3];
//   2. bulk: for it < m_Nt-1, or every it when do_comm[3] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nt;
//   3. if do_comm[3] == 1: wait on chrecv_up[3], copy buf2 into the
//      it == m_Nt-1 sites, then wait on chsend_dn[3].
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
1459 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
1472 int ith, nth, is, ns;
1473 set_threadtask(ith, nth, is, ns, m_Nvol);
1475 int Nxyz = m_Nx * m_Ny * m_Nz;
1479 if (do_comm[3] == 1) {
1482 chrecv_up[3].start();
// NOTE(review): `it` is computed but not tested in this packing loop as
// written, while buf1 is addressed by ixyz only; presumably a single t-face
// is meant to be packed -- confirm.
1484 for (
int site = is; site < ns; ++site) {
1485 int ixyz = site % Nxyz;
1486 int it = site / Nxyz;
1488 for (
int in = 0; in < m_Nin; ++in) {
1489 int iw = index_alt.idx(in, m_Nin, site, ex);
1490 buf1[in + m_Nin * ixyz] = bc2 * wp[iw];
1499 chsend_dn[3].start();
1504 for (
int site = is; site < ns; ++site) {
1505 int ixyz = site % Nxyz;
1506 int it = site / Nxyz;
// Neighbour is available locally: interior sites, or every site when
// direction 3 does not communicate.
1507 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
1508 int it2 = (it + 1) % m_Nt;
1509 int nei = ixyz + Nxyz * it2;
// it == m_Nt-1 is reachable here only when do_comm[3] == 0; the wrapped
// value then takes the factor bc2.
1511 if (it == m_Nt - 1) bc3 = bc2;
1512 for (
int in = 0; in < m_Nin; ++in) {
1513 int iv = index_alt.idx(in, m_Nin, site, ex);
1514 int iw = index_alt.idx(in, m_Nin, nei, ex);
1515 vp[iv] = bc3 * wp[iw];
1520 if (do_comm[3] == 1) {
// The receive is waited on before buf2 is read.
1523 chrecv_up[3].wait();
1528 for (
int site = is; site < ns; ++site) {
1529 int ixyz = site % Nxyz;
1530 int it = site / Nxyz;
// Only the last t-slice is filled from buf2.
1531 if (it == m_Nt - 1) {
1532 for (
int in = 0; in < m_Nin; ++in) {
1533 int iv = index_alt.idx(in, m_Nin, site, ex);
1534 vp[iv] = buf2[in + m_Nin * ixyz];
1541 chsend_dn[3].wait();
// Vector-form shift kernel along direction 3: each output chunk receives
// the chunk of its it-1 neighbour. The loop index runs over m_Nstv entries;
// Nxyz = m_Nxv * m_Ny * m_Nz entries form one t-slice, and entry `site`
// addresses the VLEN * Nin2 elements starting at iv = VLEN * Nin2 * site.
// Phases:
//   1. if do_comm[3] == 1: start chrecv_dn[3], copy the it == m_Nt-1 chunks
//      of wp unscaled into buf1, start chsend_up[3];
//   2. bulk: for it > 0, or every it when do_comm[3] == 0, copy the chunk at
//      it-1 (wrapping modulo m_Nt);
//   3. if do_comm[3] == 1: wait on chrecv_dn[3], copy bc2-scaled buf2 chunks
//      into vp, then wait on chsend_up[3].
// NOTE(review): the stride uses Nin2 = m_Nin / 2 -- confirm it matches the
// element layout load_vec/save_vec expect, and where bc2 gets its value.
1550 template<
typename AFIELD>
1556 int Nin2 = m_Nin / 2;
// NOTE(review): presumably yields this thread's range [is, ns) out of m_Nstv
// entries -- confirm.
1561 int ith, nth, is, ns;
1562 set_threadtask(ith, nth, is, ns, m_Nstv);
1564 int Nxyz = m_Nxv * m_Ny * m_Nz;
1567 if (do_comm[3] == 1) {
1570 chrecv_dn[3].start();
1572 for (
int site = is; site < ns; ++site) {
1573 int ixyz = site % Nxyz;
1574 int it = site / Nxyz;
// Pack the last t-slice; no scaling at this stage.
1575 if (it == m_Nt - 1) {
1576 int iv =
VLEN * Nin2 * site;
1577 int ibf =
VLEN * Nin2 * ixyz;
1579 load_vec(vt, &wp[iv], Nin2);
1580 save_vec(&buf1[ibf], vt, Nin2);
1588 chsend_up[3].start();
1592 for (
int site = is; site < ns; ++site) {
1593 int ixyz = site % Nxyz;
1594 int it = site / Nxyz;
1595 int iv =
VLEN * Nin2 * site;
// Neighbour is available locally: interior entries, or every entry when
// direction 3 does not communicate.
1596 if ((it > 0) || (do_comm[3] == 0)) {
1597 int itn = (it - 1 + m_Nt) % m_Nt;
1598 int nei =
VLEN * Nin2 * (ixyz + Nxyz * itn);
1600 load_vec(vt, &wp[nei], Nin2);
// it == 0 is reachable here only when do_comm[3] == 0; the wrapped chunk is
// scaled by bc2.
1601 if (it == 0) scal_vec(vt, bc2, Nin2);
1602 save_vec(&vp[iv], vt, Nin2);
1606 if (do_comm[3] == 1) {
// The receive is waited on before buf2 is read.
1609 chrecv_dn[3].wait();
// NOTE(review): `it` is computed but not tested in this unpacking loop as
// written; presumably only the it == 0 slice should be filled from buf2 --
// confirm.
1614 for (
int site = is; site < ns; ++site) {
1615 int ixyz = site % Nxyz;
1616 int it = site / Nxyz;
1618 int iv =
VLEN * Nin2 * site;
1619 int ibf =
VLEN * Nin2 * ixyz;
1621 load_vec(vt, &buf2[ibf], Nin2);
1622 scal_vec(vt, bc2, Nin2);
1623 save_vec(&vp[iv], vt, Nin2);
1628 chsend_up[3].wait();
// Scalar shift kernel along direction 3: vp at a site receives wp from the
// it-1 neighbour, component by component through index_alt.idx.
// Nxyz = m_Nx * m_Ny * m_Nz sites form one t-slice.
// Phases:
//   1. if do_comm[3] == 1: start chrecv_dn[3], copy the it == m_Nt-1 face of
//      wp unscaled into buf1 (addressed by in and ixyz), start chsend_up[3];
//   2. bulk: for it > 0, or every it when do_comm[3] == 0,
//      vp = bc3 * wp[neighbour], the neighbour wrapping modulo m_Nt;
//   3. if do_comm[3] == 1: wait on chrecv_dn[3], vp = bc2 * buf2, then wait
//      on chsend_up[3].
// NOTE(review): confirm where bc2/bc3 get their values and how buf1/buf2
// relate to the channel buffers.
1637 template<
typename AFIELD>
// NOTE(review): presumably yields this thread's site range [is, ns) out of
// m_Nvol sites -- confirm.
1650 int ith, nth, is, ns;
1651 set_threadtask(ith, nth, is, ns, m_Nvol);
1653 int Nxyz = m_Nx * m_Ny * m_Nz;
1657 if (do_comm[3] == 1) {
1660 chrecv_dn[3].start();
1663 for (
int site = is; site < ns; ++site) {
1664 int ixyz = site % Nxyz;
1665 int it = site / Nxyz;
// Pack the last t-slice; no scaling at this stage.
1666 if (it == m_Nt - 1) {
1667 for (
int in = 0; in < m_Nin; ++in) {
1668 int iw = index_alt.idx(in, m_Nin, site, ex);
1669 buf1[in + m_Nin * ixyz] = wp[iw];
1678 chsend_up[3].start();
1682 for (
int site = is; site < ns; ++site) {
1683 int ixyz = site % Nxyz;
1684 int it = site / Nxyz;
// Neighbour is available locally: interior sites, or every site when
// direction 3 does not communicate.
1685 if ((it > 0) || (do_comm[3] == 0)) {
1686 int it2 = (it - 1 + m_Nt) % m_Nt;
1687 int nei = ixyz + Nxyz * it2;
// it == 0 is reachable here only when do_comm[3] == 0; the wrapped value
// then takes the factor bc2.
1689 if (it == 0) bc3 = bc2;
1690 for (
int in = 0; in < m_Nin; ++in) {
1691 int iv = index_alt.idx(in, m_Nin, site, ex);
1692 int iw = index_alt.idx(in, m_Nin, nei, ex);
1693 vp[iv] = bc3 * wp[iw];
1698 if (do_comm[3] == 1) {
// The receive is waited on before buf2 is read.
1701 chrecv_dn[3].wait();
// NOTE(review): `it` is computed but not tested in this unpacking loop as
// written; presumably only the it == 0 slice should be filled from buf2 --
// confirm.
1706 for (
int site = is; site < ns; ++site) {
1707 int ixyz = site % Nxyz;
1708 int it = site / Nxyz;
1710 for (
int in = 0; in < m_Nin; ++in) {
1711 int iv = index_alt.idx(in, m_Nin, site, ex);
1712 vp[iv] = bc2 * buf2[in + m_Nin * ixyz];
1718 chsend_up[3].wait();