12 template<
typename AFIELD>
16 template<
typename AFIELD>
32 vout.
general(m_vl,
"%s: construction\n", class_name.c_str());
38 if (repr !=
"Dirac") {
39 vout.
crucial(
" Error at %s: unsupported gamma-matrix type: %s\n",
40 class_name.c_str(), repr.c_str());
54 m_Ndf = 2 * m_Nc * m_Nc;
65 m_Nstv = m_Nst /
VLEN;
67 if (
VLENX * m_Nxv != m_Nx) {
68 vout.
crucial(m_vl,
"%s: Nx must be multiple of VLENX.\n",
72 if (
VLENY * m_Nyv != m_Ny) {
73 vout.
crucial(m_vl,
"%s: Ny must be multiple of VLENY.\n",
88 for (
int mu = 0; mu < m_Ndim; ++mu) {
91 do_comm_any += do_comm[mu];
92 vout.
general(
" do_comm[%d] = %d\n", mu, do_comm[mu]);
95 m_bdsize.resize(m_Ndim);
97 m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
98 m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
99 m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
100 m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
105 m_U.reset(
NDF, m_Nst, m_Ndim);
108 int NinF = 2 * m_Nc * m_Nd;
109 m_v2.reset(NinF, m_Nst, 1);
113 set_parameters(params);
123 template<
typename AFIELD>
126 chsend_up.resize(m_Ndim);
127 chrecv_up.resize(m_Ndim);
128 chsend_dn.resize(m_Ndim);
129 chrecv_dn.resize(m_Ndim);
131 for (
int mu = 0; mu < m_Ndim; ++mu) {
133 size_t Nvsize = m_bdsize[mu] *
sizeof(
real_t);
135 chsend_dn[mu].send_init(Nvsize, mu, -1);
136 chsend_up[mu].send_init(Nvsize, mu, 1);
138 chrecv_up[mu].recv_init(Nvsize, mu, 1);
139 chrecv_dn[mu].recv_init(Nvsize, mu, -1);
141 void *buf_up = (
void *)chsend_dn[mu].ptr();
142 chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
143 void *buf_dn = (
void *)chsend_up[mu].ptr();
144 chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
147 if (do_comm[mu] == 1) {
148 chset_send.append(chsend_up[mu]);
149 chset_send.append(chsend_dn[mu]);
150 chset_recv.append(chrecv_up[mu]);
151 chset_recv.append(chrecv_dn[mu]);
158 template<
typename AFIELD>
168 template<
typename AFIELD>
178 vout.
crucial(m_vl,
"Error at %s: input parameter not found.\n",
183 set_parameters(
real_t(kappa), bc);
188 template<
typename AFIELD>
190 const std::vector<int> bc)
192 assert(bc.size() == m_Ndim);
200 m_boundary.resize(m_Ndim);
201 for (
int mu = 0; mu < m_Ndim; ++mu) {
202 m_boundary[mu] = bc[mu];
206 vout.
general(m_vl,
"%s: set parameters\n", class_name.c_str());
207 vout.
general(m_vl,
" gamma-matrix type = %s\n", m_repr.c_str());
209 for (
int mu = 0; mu < m_Ndim; ++mu) {
210 vout.
general(m_vl,
" boundary[%d] = %2d\n", mu, m_boundary[mu]);
218 template<
typename AFIELD>
221 params.
set_double(
"hopping_parameter",
double(m_CKs));
223 params.
set_string(
"gamma_matrix_type", m_repr);
230 template<
typename AFIELD>
235 vout.
detailed(m_vl,
"%s: set_config is called: num_threads = %d\n",
236 class_name.c_str(), nth);
244 vout.
detailed(m_vl,
"%s: set_config finished\n", class_name.c_str());
249 template<
typename AFIELD>
262 template<
typename AFIELD>
269 if (ith == 0) m_conf = u;
280 template<
typename AFIELD>
291 template<
typename AFIELD>
302 template<
typename AFIELD>
310 }
else if (mu == 1) {
312 }
else if (mu == 2) {
314 }
else if (mu == 3) {
317 vout.
crucial(m_vl,
"%s: mult_up for %d direction is undefined.",
318 class_name.c_str(), mu);
325 template<
typename AFIELD>
333 }
else if (mu == 1) {
335 }
else if (mu == 2) {
337 }
else if (mu == 3) {
340 vout.
crucial(m_vl,
"%s: mult_dn for %d direction is undefined.",
341 class_name.c_str(), mu);
348 template<
typename AFIELD>
354 if (ith == 0) m_mode = mode;
361 template<
typename AFIELD>
369 template<
typename AFIELD>
374 }
else if (m_mode ==
"DdagD") {
376 }
else if (m_mode ==
"Ddag") {
378 }
else if (m_mode ==
"H") {
381 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
388 template<
typename AFIELD>
393 }
else if (m_mode ==
"DdagD") {
395 }
else if (m_mode ==
"Ddag") {
397 }
else if (m_mode ==
"H") {
400 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
407 template<
typename AFIELD>
415 template<
typename AFIELD>
426 template<
typename AFIELD>
436 template<
typename AFIELD>
453 template<
typename AFIELD>
459 int ith, nth, is, ns;
460 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
464 for (
int site = is; site < ns; ++site) {
465 for (
int ic = 0; ic <
NC; ++ic) {
466 for (
int id = 0;
id <
ND2; ++id) {
467 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
468 load_vec(wt, &wp[
VLEN * idx1], 2);
469 save_vec(&vp[
VLEN * idx1], wt, 2);
472 for (
int id =
ND2;
id <
ND; ++id) {
473 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
474 load_vec(wt, &wp[
VLEN * idx1], 2);
475 scal_vec(wt,
real_t(-1.0), 2);
476 save_vec(&vp[
VLEN * idx1], wt, 2);
486 template<
typename AFIELD>
497 if (do_comm_any > 0) {
498 if (ith == 0) chset_recv.start();
510 buf1_zp, buf1_zm, buf1_tp, buf1_tm,
511 up, v1, &m_boundary[0], m_Nsize, do_comm);
515 if (ith == 0) chset_send.start();
519 m_CKs, &m_boundary[0], m_Nsize, do_comm);
521 if (do_comm_any > 0) {
522 if (ith == 0) chset_recv.wait();
536 buf2_xp, buf2_xm, buf2_yp, buf2_ym,
537 buf2_zp, buf2_zm, buf2_tp, buf2_tm,
538 m_CKs, &m_boundary[0], m_Nsize, do_comm);
540 if (ith == 0) chset_send.wait();
548 template<
typename AFIELD>
565 aypx(-m_CKs, vp, wp);
572 template<
typename AFIELD>
581 template<
typename AFIELD>
584 int ith, nth, is, ns;
585 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
589 for (
int site = is; site < ns; ++site) {
592 aypx_vec(a, vt, wt,
NVCD);
599 template<
typename AFIELD>
602 int ith, nth, is, ns;
603 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
608 for (
int site = is; site < ns; ++site) {
615 template<
typename AFIELD>
620 int ith, nth, is, ns;
621 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
628 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
632 if (do_comm[0] > 0) {
633 for (
int site = is; site < ns; ++site) {
634 int ix = site % m_Nxv;
635 int iyzt = site / m_Nxv;
638 mult_wilson_xp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
646 chrecv_up[0].start();
647 chsend_dn[0].start();
654 for (
int site = is; site < ns; ++site) {
655 int ix = site % m_Nxv;
656 int iyzt = site / m_Nxv;
659 clear_vec(v2v,
NVCD);
663 if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
664 int nei = ix + 1 + m_Nxv * iyzt;
665 if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
667 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
671 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
672 mult_wilson_xp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
683 template<
typename AFIELD>
688 int ith, nth, is, ns;
689 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
696 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
700 if (do_comm[0] > 0) {
701 for (
int site = is; site < ns; ++site) {
702 int ix = site % m_Nxv;
703 int iyzt = site / m_Nxv;
704 if (ix == m_Nxv - 1) {
706 mult_wilson_xm1(&buf1[ibf], &u[
VLEN *
NDF * site],
714 chrecv_dn[0].start();
715 chsend_up[0].start();
722 for (
int site = is; site < ns; ++site) {
723 int ix = site % m_Nxv;
724 int iyzt = site / m_Nxv;
729 clear_vec(v2v,
NVCD);
731 if ((ix > 0) || (do_comm[0] == 0)) {
732 int nei = ix - 1 + m_Nxv * iyzt;
733 if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;
736 mult_wilson_xmb(v2v, uL, zL);
740 shift_vec0_xfw(uL, &u[
VLEN *
NDF * site],
NDF);
741 mult_wilson_xmb(v2v, uL, zL);
742 mult_wilson_xm2(v2v, &buf2[ibf]);
753 template<
typename AFIELD>
757 int Nxy = m_Nxv * m_Nyv;
759 int ith, nth, is, ns;
760 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
765 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
769 if (do_comm[1] > 0) {
770 for (
int site = is; site < ns; ++site) {
771 int ix = site % m_Nxv;
772 int iy = (site / m_Nxv) % m_Nyv;
773 int izt = site / Nxy;
776 mult_wilson_yp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
784 chrecv_up[1].start();
785 chsend_dn[1].start();
793 for (
int site = is; site < ns; ++site) {
794 int ix = site % m_Nxv;
795 int iy = (site / m_Nxv) % m_Nyv;
796 int izt = site / Nxy;
799 clear_vec(v2v,
NVCD);
803 if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
804 int iy2 = (iy + 1) % m_Nyv;
805 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
807 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
811 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
812 mult_wilson_yp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
823 template<
typename AFIELD>
827 int Nxy = m_Nxv * m_Nyv;
829 int ith, nth, is, ns;
830 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
835 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
839 if (do_comm[1] > 0) {
840 for (
int site = is; site < ns; ++site) {
841 int ix = site % m_Nxv;
842 int iy = (site / m_Nxv) % m_Nyv;
843 int izt = site / Nxy;
844 if (iy == m_Nyv - 1) {
846 mult_wilson_ym1(&buf1[ibf], &u[
VLEN *
NDF * site],
855 chrecv_dn[1].start();
856 chsend_up[1].start();
864 for (
int site = is; site < ns; ++site) {
865 int ix = site % m_Nxv;
866 int iy = (site / m_Nxv) % m_Nyv;
867 int izt = site / Nxy;
870 clear_vec(v2v,
NVCD);
875 if ((iy != 0) || (do_comm[idir] == 0)) {
876 int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
877 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
880 mult_wilson_ymb(v2v, uL, zL);
884 shift_vec0_yfw(uL, &u[
VLEN *
NDF * site],
NDF);
885 mult_wilson_ymb(v2v, uL, zL);
886 mult_wilson_ym2(v2v, &buf2[ibf]);
897 template<
typename AFIELD>
901 int Nxy = m_Nxv * m_Nyv;
903 int ith, nth, is, ns;
904 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
909 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
913 if (do_comm[2] > 0) {
914 for (
int site = is; site < ns; ++site) {
915 int ixy = site % Nxy;
916 int iz = (site / Nxy) % m_Nz;
917 int it = site / (Nxy * m_Nz);
919 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
920 mult_wilson_zp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
928 chrecv_up[2].start();
929 chsend_dn[2].start();
937 for (
int site = is; site < ns; ++site) {
938 int ixy = site % Nxy;
939 int iz = (site / Nxy) % m_Nz;
940 int it = site / (Nxy * m_Nz);
943 clear_vec(v2v,
NVCD);
945 if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
946 int iz2 = (iz + 1) % m_Nz;
947 int nei = ixy + Nxy * (iz2 + m_Nz * it);
950 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
951 mult_wilson_zp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
962 template<
typename AFIELD>
966 int Nxy = m_Nxv * m_Nyv;
968 int ith, nth, is, ns;
969 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
974 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
978 if (do_comm[2] > 0) {
979 for (
int site = is; site < ns; ++site) {
980 int ixy = site % Nxy;
981 int iz = (site / Nxy) % m_Nz;
982 int it = site / (Nxy * m_Nz);
983 if (iz == m_Nz - 1) {
984 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
985 mult_wilson_zm1(&buf1[ibf], &u[
VLEN *
NDF * site],
994 chrecv_dn[2].start();
995 chsend_up[2].start();
1003 for (
int site = is; site < ns; ++site) {
1004 int ixy = site % Nxy;
1005 int iz = (site / Nxy) % m_Nz;
1006 int it = site / (Nxy * m_Nz);
1009 clear_vec(v2v,
NVCD);
1011 if ((iz > 0) || (do_comm[2] == 0)) {
1012 int iz2 = (iz - 1 + m_Nz) % m_Nz;
1013 int nei = ixy + Nxy * (iz2 + m_Nz * it);
1016 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1017 mult_wilson_zm2(v2v, &buf2[ibf]);
1028 template<
typename AFIELD>
1032 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1034 int ith, nth, is, ns;
1035 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1040 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1044 if (do_comm[3] > 0) {
1045 for (
int site = is; site < ns; ++site) {
1046 int ixyz = site % Nxyz;
1047 int it = site / Nxyz;
1049 mult_wilson_tp1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1058 chrecv_up[3].start();
1059 chsend_dn[3].start();
1060 chrecv_up[3].wait();
1061 chsend_dn[3].wait();
1067 for (
int site = is; site < ns; ++site) {
1068 int ixyz = site % Nxyz;
1069 int it = site / Nxyz;
1072 clear_vec(v2v,
NVCD);
1074 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
1075 int it2 = (it + 1) % m_Nt;
1076 int nei = ixyz + Nxyz * it2;
1077 mult_wilson_tpb_dirac(v2v, &u[
VLEN *
NDF * site],
1080 mult_wilson_tp2_dirac(v2v, &u[
VLEN *
NDF * site],
1092 template<
typename AFIELD>
1096 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1098 int ith, nth, is, ns;
1099 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1104 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1108 if (do_comm[3] > 0) {
1109 for (
int site = is; site < ns; ++site) {
1110 int ixyz = site % Nxyz;
1111 int it = site / Nxyz;
1112 if (it == m_Nt - 1) {
1113 mult_wilson_tm1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1122 chrecv_dn[3].start();
1123 chsend_up[3].start();
1124 chrecv_dn[3].wait();
1125 chsend_up[3].wait();
1130 for (
int site = is; site < ns; ++site) {
1131 int ixyz = site % Nxyz;
1132 int it = site / Nxyz;
1135 clear_vec(v2v,
NVCD);
1137 if ((it > 0) || (do_comm[3] == 0)) {
1138 int it2 = (it - 1 + m_Nt) % m_Nt;
1139 int nei = ixyz + Nxyz * it2;
1140 mult_wilson_tmb_dirac(v2v, &u[
VLEN *
NDF * nei],
1143 mult_wilson_tm2_dirac(v2v, &buf2[
VLEN *
NVC *
ND2 * ixyz]);
1154 template<
typename AFIELD>
1161 double flop_site, flop;
1163 if (m_repr ==
"Dirac") {
1164 flop_site =
static_cast<double>(
1165 m_Nc * m_Nd * (4 + 6 * (4 * m_Nc + 2) + 2 * (4 * m_Nc + 1)));
1166 }
else if (m_repr ==
"Chiral") {
1167 flop_site =
static_cast<double>(
1168 m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
1170 vout.
crucial(m_vl,
"%s: input repr is undefined.\n");
1174 flop = flop_site *
static_cast<double>(Lvol);
1175 if ((mode ==
"DdagD") || (mode ==
"DDdag")) flop *= 2.0;