12 template<
typename AFIELD>
16 template<
typename AFIELD>
32 vout.
general(m_vl,
"%s: construction\n", class_name.c_str());
38 if (repr !=
"Dirac") {
39 vout.
crucial(
" Error at %s: unsupported gamma-matrix type: %s\n",
40 class_name.c_str(), repr.c_str());
54 m_Ndf = 2 * m_Nc * m_Nc;
65 m_Nstv = m_Nst /
VLEN;
67 if (
VLENX * m_Nxv != m_Nx) {
68 vout.
crucial(m_vl,
"%s: Nx must be multiple of VLENX.\n",
72 if (
VLENY * m_Nyv != m_Ny) {
73 vout.
crucial(m_vl,
"%s: Ny must be multiple of VLENY.\n",
88 for (
int mu = 0; mu < m_Ndim; ++mu) {
91 do_comm_any += do_comm[mu];
92 vout.
general(
" do_comm[%d] = %d\n", mu, do_comm[mu]);
95 m_bdsize.resize(m_Ndim);
97 m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
98 m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
99 m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
100 m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
108 #ifdef CHIRAL_ROTATION
109 params_ct.
set_string(
"gamma_matrix_type",
"Chiral");
115 set_parameters(params);
120 m_U.reset(
NDF, m_Nst, m_Ndim);
123 int NinF = 2 * m_Nc * m_Nd;
124 m_v2.reset(NinF, m_Nst, 1);
126 int Ndm2 = m_Nd * m_Nd / 2;
128 m_T.reset(
NDF * Ndm2, m_Nst, 1);
136 template<
typename AFIELD>
139 chsend_up.resize(m_Ndim);
140 chrecv_up.resize(m_Ndim);
141 chsend_dn.resize(m_Ndim);
142 chrecv_dn.resize(m_Ndim);
144 for (
int mu = 0; mu < m_Ndim; ++mu) {
145 size_t Nvsize = m_bdsize[mu] *
sizeof(
real_t);
147 chsend_dn[mu].send_init(Nvsize, mu, -1);
148 chsend_up[mu].send_init(Nvsize, mu, 1);
150 chrecv_up[mu].recv_init(Nvsize, mu, 1);
151 chrecv_dn[mu].recv_init(Nvsize, mu, -1);
153 void *buf_up = (
void *)chsend_dn[mu].ptr();
154 chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
155 void *buf_dn = (
void *)chsend_up[mu].ptr();
156 chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
159 if (do_comm[mu] == 1) {
160 chset_send.append(chsend_up[mu]);
161 chset_send.append(chsend_dn[mu]);
162 chset_recv.append(chrecv_up[mu]);
163 chset_recv.append(chrecv_dn[mu]);
170 template<
typename AFIELD>
180 template<
typename AFIELD>
191 vout.
crucial(m_vl,
"Error at %s: input parameter not found.\n",
199 #ifdef CHIRAL_ROTATION
200 params_csw.
set_string(
"gamma_matrix_type",
"Chiral");
202 m_fopr_csw->set_parameters(params_csw);
207 template<
typename AFIELD>
210 const std::vector<int> bc)
212 assert(bc.size() == m_Ndim);
221 m_boundary.resize(m_Ndim);
222 for (
int mu = 0; mu < m_Ndim; ++mu) {
223 m_boundary[mu] = bc[mu];
227 vout.
general(m_vl,
"%s: set parameters\n", class_name.c_str());
228 vout.
general(m_vl,
" gamma-matrix type = %s\n", m_repr.c_str());
231 for (
int mu = 0; mu < m_Ndim; ++mu) {
232 vout.
general(m_vl,
" boundary[%d] = %2d\n", mu, m_boundary[mu]);
240 template<
typename AFIELD>
243 params.
set_double(
"hopping_parameter",
double(m_CKs));
244 params.
set_double(
"clover_coefficient",
double(m_csw));
246 params.
set_string(
"gamma_matrix_type", m_repr);
253 template<
typename AFIELD>
258 vout.
detailed(m_vl,
"%s: set_config is called: num_threads = %d\n",
259 class_name.c_str(), nth);
267 vout.
detailed(m_vl,
"%s: set_config finished\n", class_name.c_str());
272 template<
typename AFIELD>
285 template<
typename AFIELD>
292 if (ith == 0) m_conf = u;
300 m_fopr_csw->set_config(u);
302 #ifdef CHIRAL_ROTATION
311 template<
typename AFIELD>
326 const int Nin =
NDF *
ND * 2;
328 m_fopr_csw->set_mode(
"D");
330 int ith, nth, is, ns;
331 set_threadtask_mult(ith, nth, is, ns, m_Nst);
333 for (
int id = 0;
id < m_Nd / 2; ++id) {
334 for (
int ic = 0; ic < m_Nc; ++ic) {
338 for (
int site = is; site < ns; ++site) {
339 m_w1.set_r(ic,
id, site, 0, 1.0);
343 m_fopr_csw->mult(m_w2, m_w1);
346 for (
int site = is; site < ns; ++site) {
347 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
348 real_t vt_r = m_w2.cmp_r(ic2, 0, site, 0);
349 real_t vt_i = m_w2.cmp_i(ic2, 0, site, 0);
350 int in = ic2 +
NC * (ic +
NC * (
id + 0));
351 int idx_r = index.idx(2 * in, Nin, site, 0);
352 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
353 m_T.set(idx_r, vt_r);
354 m_T.set(idx_i, vt_i);
357 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
358 real_t vt_r = m_w2.cmp_r(ic2, 1, site, 0);
359 real_t vt_i = m_w2.cmp_i(ic2, 1, site, 0);
360 int in = ic2 +
NC * (ic +
NC * (
id + 4));
361 int idx_r = index.idx(2 * in, Nin, site, 0);
362 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
363 m_T.set(idx_r, vt_r);
364 m_T.set(idx_i, vt_i);
367 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
368 real_t vt_r = -m_w2.cmp_r(ic2, 2, site, 0);
369 real_t vt_i = -m_w2.cmp_i(ic2, 2, site, 0);
370 int in = ic2 +
NC * (ic +
NC * (
id + 2));
371 int idx_r = index.idx(2 * in, Nin, site, 0);
372 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
373 m_T.set(idx_r, vt_r);
374 m_T.set(idx_i, vt_i);
377 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
378 real_t vt_r = -m_w2.cmp_r(ic2, 3, site, 0);
379 real_t vt_i = -m_w2.cmp_i(ic2, 3, site, 0);
380 int in = ic2 +
NC * (ic +
NC * (
id + 6));
381 int idx_r = index.idx(2 * in, Nin, site, 0);
382 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
383 m_T.set(idx_r, vt_r);
384 m_T.set(idx_i, vt_i);
392 real_t kappaR = 1.0 / m_CKs;
400 template<
typename AFIELD>
412 const int Nin =
NDF *
ND * 2;
414 m_fopr_csw->set_mode(
"D");
416 int ith, nth, is, ns;
417 set_threadtask_mult(ith, nth, is, ns, m_Nst);
420 constexpr
int idx[72] = {
421 0, -1, 6, 7, 16, 17, 28, 29, 24, 25, 34, 35,
422 -1, -1, 1, -1, 12, 13, 18, 19, 32, 33, 26, 27,
423 -1, -1, -1, -5, 2, -1, 8, 9, 20, 21, 30, 31,
424 -1, -1, -1, -1, -1, -1, 3, -1, 14, 15, 22, 23,
425 -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, 10, 11,
426 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1,
429 for (
int id = 0;
id < m_Nd / 2; ++id) {
430 for (
int ic = 0; ic < m_Nc; ++ic) {
434 for (
int site = is; site < ns; ++site) {
435 m_w1.set_r(ic,
id, site, 0, 1.0);
436 m_w1.set_r(ic,
id + 2, site, 0, 1.0);
440 m_fopr_csw->mult(m_w2, m_w1);
443 for (
int site = is; site < ns; ++site) {
444 for (
int id2 = 0; id2 < m_Nd; ++id2) {
445 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
446 real_t vt_r = 0.5 * m_w2.cmp_r(ic2, id2, site, 0);
447 real_t vt_i = 0.5 * m_w2.cmp_i(ic2, id2, site, 0);
448 int i = ic2 + m_Nc * (id2 % 2);
449 int j = ic + m_Nc * id;
450 int ij = m_Nc * 2 * i + j;
451 int in_r =
idx[2 * ij];
452 int in_i =
idx[2 * ij + 1];
454 in_r += 36 * (id2 / 2);
455 int idx_r = index.idx(in_r, Nin, site, 0);
456 m_T.set(idx_r, vt_r);
459 in_i += 36 * (id2 / 2);
460 int idx_i = index.idx(in_i, Nin, site, 0);
461 m_T.set(idx_i, vt_i);
471 real_t kappaR = 1.0 / m_CKs;
479 template<
typename AFIELD>
490 template<
typename AFIELD>
501 template<
typename AFIELD>
509 }
else if (mu == 1) {
511 }
else if (mu == 2) {
513 }
else if (mu == 3) {
516 vout.
crucial(m_vl,
"%s: mult_up for %d direction is undefined.",
517 class_name.c_str(), mu);
524 template<
typename AFIELD>
532 }
else if (mu == 1) {
534 }
else if (mu == 2) {
536 }
else if (mu == 3) {
539 vout.
crucial(m_vl,
"%s: mult_dn for %d direction is undefined.",
540 class_name.c_str(), mu);
547 template<
typename AFIELD>
553 if (ith == 0) m_mode = mode;
560 template<
typename AFIELD>
568 template<
typename AFIELD>
573 }
else if (m_mode ==
"DdagD") {
575 }
else if (m_mode ==
"Ddag") {
577 }
else if (m_mode ==
"H") {
580 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
587 template<
typename AFIELD>
592 }
else if (m_mode ==
"DdagD") {
594 }
else if (m_mode ==
"Ddag") {
596 }
else if (m_mode ==
"H") {
599 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
606 template<
typename AFIELD>
615 template<
typename AFIELD>
626 template<
typename AFIELD>
636 template<
typename AFIELD>
669 template<
typename AFIELD>
675 int ith, nth, is, ns;
676 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
680 for (
int site = is; site < ns; ++site) {
681 for (
int ic = 0; ic <
NC; ++ic) {
682 for (
int id = 0;
id <
ND2; ++id) {
683 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
684 load_vec(wt, &wp[
VLEN * idx1], 2);
685 save_vec(&vp[
VLEN * idx1], wt, 2);
688 for (
int id =
ND2;
id <
ND; ++id) {
689 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
690 load_vec(wt, &wp[
VLEN * idx1], 2);
691 scal_vec(wt,
real_t(-1.0), 2);
692 save_vec(&vp[
VLEN * idx1], wt, 2);
702 template<
typename AFIELD>
707 int ith, nth, is, ns;
708 set_threadtask(ith, nth, is, ns, m_Nstv);
712 for (
int site = is; site < ns; ++site) {
722 for (
int jd = 0; jd <
ND2; ++jd) {
723 for (
int id = 0;
id <
ND; ++id) {
724 int ig =
VLEN *
NDF * (site + m_Nstv * (
id +
ND * jd));
725 load_vec(ut, &u[ig],
NDF);
726 for (
int ic = 0; ic <
NC; ++ic) {
728 int id2 = (
id +
ND2) %
ND;
729 mult_ctv(wt1, &ut[ic2], &v1v[2 *
id],
NC);
730 mult_ctv(wt2, &ut[ic2], &v1v[2 * id2],
NC);
731 int icd1 = 2 * (jd +
ND * ic);
732 int icd2 = 2 * (jd +
ND2 +
ND * ic);
733 axpy_vec(&v2v[icd1],
real_t(1.0), wt1, 2);
734 axpy_vec(&v2v[icd2],
real_t(1.0), wt2, 2);
747 template<
typename AFIELD>
759 if (do_comm_any > 0) {
760 if (ith == 0) chset_recv.start();
772 buf1_zp, buf1_zm, buf1_tp, buf1_tm,
773 up, v1, &m_boundary[0], m_Nsize, do_comm);
777 if (ith == 0) chset_send.start();
780 #ifdef CHIRAL_ROTATION
782 m_CKs, &m_boundary[0], m_Nsize, do_comm);
785 m_CKs, &m_boundary[0], m_Nsize, do_comm);
788 if (do_comm_any > 0) {
789 if (ith == 0) chset_recv.wait();
803 buf2_xp, buf2_xm, buf2_yp, buf2_ym,
804 buf2_zp, buf2_zm, buf2_tp, buf2_tm,
805 m_CKs, &m_boundary[0], m_Nsize, do_comm);
807 if (ith == 0) chset_send.wait();
815 template<
typename AFIELD>
834 aypx(-m_CKs, vp, wp);
841 template<
typename AFIELD>
850 template<
typename AFIELD>
853 int ith, nth, is, ns;
854 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
858 for (
int site = is; site < ns; ++site) {
861 aypx_vec(a, vt, wt,
NVCD);
868 template<
typename AFIELD>
871 int ith, nth, is, ns;
872 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
877 for (
int site = is; site < ns; ++site) {
884 template<
typename AFIELD>
889 int ith, nth, is, ns;
890 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
897 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
901 if (do_comm[0] > 0) {
902 for (
int site = is; site < ns; ++site) {
903 int ix = site % m_Nxv;
904 int iyzt = site / m_Nxv;
907 mult_wilson_xp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
915 chrecv_up[0].start();
916 chsend_dn[0].start();
923 for (
int site = is; site < ns; ++site) {
924 int ix = site % m_Nxv;
925 int iyzt = site / m_Nxv;
928 clear_vec(v2v,
NVCD);
932 if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
933 int nei = ix + 1 + m_Nxv * iyzt;
934 if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
936 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
940 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
941 mult_wilson_xp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
952 template<
typename AFIELD>
957 int ith, nth, is, ns;
958 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
965 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
969 if (do_comm[0] > 0) {
970 for (
int site = is; site < ns; ++site) {
971 int ix = site % m_Nxv;
972 int iyzt = site / m_Nxv;
973 if (ix == m_Nxv - 1) {
975 mult_wilson_xm1(&buf1[ibf], &u[
VLEN *
NDF * site],
983 chrecv_dn[0].start();
984 chsend_up[0].start();
991 for (
int site = is; site < ns; ++site) {
992 int ix = site % m_Nxv;
993 int iyzt = site / m_Nxv;
998 clear_vec(v2v,
NVCD);
1000 if ((ix > 0) || (do_comm[0] == 0)) {
1001 int nei = ix - 1 + m_Nxv * iyzt;
1002 if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;
1005 mult_wilson_xmb(v2v, uL, zL);
1009 shift_vec0_xfw(uL, &u[
VLEN *
NDF * site],
NDF);
1010 mult_wilson_xmb(v2v, uL, zL);
1011 mult_wilson_xm2(v2v, &buf2[ibf]);
1022 template<
typename AFIELD>
1026 int Nxy = m_Nxv * m_Nyv;
1028 int ith, nth, is, ns;
1029 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1034 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1038 if (do_comm[1] > 0) {
1039 for (
int site = is; site < ns; ++site) {
1040 int ix = site % m_Nxv;
1041 int iy = (site / m_Nxv) % m_Nyv;
1042 int izt = site / Nxy;
1045 mult_wilson_yp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
1053 chrecv_up[1].start();
1054 chsend_dn[1].start();
1055 chrecv_up[1].wait();
1056 chsend_dn[1].wait();
1062 for (
int site = is; site < ns; ++site) {
1063 int ix = site % m_Nxv;
1064 int iy = (site / m_Nxv) % m_Nyv;
1065 int izt = site / Nxy;
1068 clear_vec(v2v,
NVCD);
1072 if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
1073 int iy2 = (iy + 1) % m_Nyv;
1074 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1076 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
1080 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
1081 mult_wilson_yp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
1092 template<
typename AFIELD>
1096 int Nxy = m_Nxv * m_Nyv;
1098 int ith, nth, is, ns;
1099 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1104 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1108 if (do_comm[1] > 0) {
1109 for (
int site = is; site < ns; ++site) {
1110 int ix = site % m_Nxv;
1111 int iy = (site / m_Nxv) % m_Nyv;
1112 int izt = site / Nxy;
1113 if (iy == m_Nyv - 1) {
1115 mult_wilson_ym1(&buf1[ibf], &u[
VLEN *
NDF * site],
1124 chrecv_dn[1].start();
1125 chsend_up[1].start();
1126 chrecv_dn[1].wait();
1127 chsend_up[1].wait();
1133 for (
int site = is; site < ns; ++site) {
1134 int ix = site % m_Nxv;
1135 int iy = (site / m_Nxv) % m_Nyv;
1136 int izt = site / Nxy;
1139 clear_vec(v2v,
NVCD);
1144 if ((iy != 0) || (do_comm[idir] == 0)) {
1145 int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
1146 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1149 mult_wilson_ymb(v2v, uL, zL);
1153 shift_vec0_yfw(uL, &u[
VLEN *
NDF * site],
NDF);
1154 mult_wilson_ymb(v2v, uL, zL);
1155 mult_wilson_ym2(v2v, &buf2[ibf]);
1166 template<
typename AFIELD>
1170 int Nxy = m_Nxv * m_Nyv;
1172 int ith, nth, is, ns;
1173 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1178 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1182 if (do_comm[2] > 0) {
1183 for (
int site = is; site < ns; ++site) {
1184 int ixy = site % Nxy;
1185 int iz = (site / Nxy) % m_Nz;
1186 int it = site / (Nxy * m_Nz);
1188 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1189 mult_wilson_zp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
1197 chrecv_up[2].start();
1198 chsend_dn[2].start();
1199 chrecv_up[2].wait();
1200 chsend_dn[2].wait();
1206 for (
int site = is; site < ns; ++site) {
1207 int ixy = site % Nxy;
1208 int iz = (site / Nxy) % m_Nz;
1209 int it = site / (Nxy * m_Nz);
1212 clear_vec(v2v,
NVCD);
1214 if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
1215 int iz2 = (iz + 1) % m_Nz;
1216 int nei = ixy + Nxy * (iz2 + m_Nz * it);
1219 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1220 mult_wilson_zp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
1231 template<
typename AFIELD>
1235 int Nxy = m_Nxv * m_Nyv;
1237 int ith, nth, is, ns;
1238 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1243 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1247 if (do_comm[2] > 0) {
1248 for (
int site = is; site < ns; ++site) {
1249 int ixy = site % Nxy;
1250 int iz = (site / Nxy) % m_Nz;
1251 int it = site / (Nxy * m_Nz);
1252 if (iz == m_Nz - 1) {
1253 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1254 mult_wilson_zm1(&buf1[ibf], &u[
VLEN *
NDF * site],
1263 chrecv_dn[2].start();
1264 chsend_up[2].start();
1265 chrecv_dn[2].wait();
1266 chsend_up[2].wait();
1272 for (
int site = is; site < ns; ++site) {
1273 int ixy = site % Nxy;
1274 int iz = (site / Nxy) % m_Nz;
1275 int it = site / (Nxy * m_Nz);
1278 clear_vec(v2v,
NVCD);
1280 if ((iz > 0) || (do_comm[2] == 0)) {
1281 int iz2 = (iz - 1 + m_Nz) % m_Nz;
1282 int nei = ixy + Nxy * (iz2 + m_Nz * it);
1285 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1286 mult_wilson_zm2(v2v, &buf2[ibf]);
1297 template<
typename AFIELD>
1301 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1303 int ith, nth, is, ns;
1304 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1309 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1313 if (do_comm[3] > 0) {
1314 for (
int site = is; site < ns; ++site) {
1315 int ixyz = site % Nxyz;
1316 int it = site / Nxyz;
1318 mult_wilson_tp1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1327 chrecv_up[3].start();
1328 chsend_dn[3].start();
1329 chrecv_up[3].wait();
1330 chsend_dn[3].wait();
1336 for (
int site = is; site < ns; ++site) {
1337 int ixyz = site % Nxyz;
1338 int it = site / Nxyz;
1341 clear_vec(v2v,
NVCD);
1343 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
1344 int it2 = (it + 1) % m_Nt;
1345 int nei = ixyz + Nxyz * it2;
1346 mult_wilson_tpb_dirac(v2v, &u[
VLEN *
NDF * site],
1349 mult_wilson_tp2_dirac(v2v, &u[
VLEN *
NDF * site],
1361 template<
typename AFIELD>
1365 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1367 int ith, nth, is, ns;
1368 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1373 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1377 if (do_comm[3] > 0) {
1378 for (
int site = is; site < ns; ++site) {
1379 int ixyz = site % Nxyz;
1380 int it = site / Nxyz;
1381 if (it == m_Nt - 1) {
1382 mult_wilson_tm1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1391 chrecv_dn[3].start();
1392 chsend_up[3].start();
1393 chrecv_dn[3].wait();
1394 chsend_up[3].wait();
1399 for (
int site = is; site < ns; ++site) {
1400 int ixyz = site % Nxyz;
1401 int it = site / Nxyz;
1404 clear_vec(v2v,
NVCD);
1406 if ((it > 0) || (do_comm[3] == 0)) {
1407 int it2 = (it - 1 + m_Nt) % m_Nt;
1408 int nei = ixyz + Nxyz * it2;
1409 mult_wilson_tmb_dirac(v2v, &u[
VLEN *
NDF * nei],
1412 mult_wilson_tm2_dirac(v2v, &buf2[
VLEN *
NVC *
ND2 * ixyz]);
1423 template<
typename AFIELD>
1430 double flop_wilson, flop_clover, flop_site, flop;
1432 if (m_repr ==
"Dirac") {
1433 flop_wilson =
static_cast<double>(
1435 + 6 * (4 * m_Nc + 2)
1436 + 2 * (4 * m_Nc + 1)));
1439 flop_clover =
static_cast<double>(
1441 + 2 * (2 * (m_Nc * m_Nd - 1) + 1)
1444 }
else if (m_repr ==
"Chiral") {
1445 flop_wilson =
static_cast<double>(
1446 m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
1448 flop_clover =
static_cast<double>(
1449 m_Nc * m_Nd * (2 * (2 * (m_Nc * m_Nd - 1) + 1)
1452 vout.
crucial(m_vl,
"%s: input repr is undefined.\n");
1456 flop_site = flop_wilson + flop_clover;
1458 flop = flop_site *
static_cast<double>(Lvol);
1459 if ((mode ==
"DdagD") || (mode ==
"DDdag")) flop *= 2.0;