12 template<
typename AFIELD>
16 template<
typename AFIELD>
31 vout.
general(m_vl,
"%s: construction\n", class_name.c_str());
37 if (repr !=
"Dirac") {
38 vout.
crucial(
" Error at %s: unsupported gamma-matrix type: %s\n",
39 class_name.c_str(), repr.c_str());
53 m_Ndf = 2 * m_Nc * m_Nc;
64 m_Nstv = m_Nst /
VLEN;
76 for (
int mu = 0; mu < m_Ndim; ++mu) {
79 do_comm_any += do_comm[mu];
80 vout.
general(
" do_comm[%d] = %d\n", mu, do_comm[mu]);
83 m_bdsize.resize(m_Ndim);
85 m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
86 m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
87 m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
88 m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
96 #ifdef CHIRAL_ROTATION
97 params_ct.
set_string(
"gamma_matrix_type",
"Chiral");
105 set_parameters(params);
110 m_U.reset(
NDF, m_Nst, m_Ndim);
113 m_Ublock.reset(
NDF, m_Nst, m_Ndim);
116 int NinF = 2 * m_Nc * m_Nd;
117 m_v2.reset(NinF, m_Nst, 1);
119 int Ndm2 = m_Nd * m_Nd / 2;
121 #ifdef CHIRAL_ROTATION
122 m_T.reset(
NDF * Ndm2, m_Nst, 1);
124 m_T.reset(
NDF * Ndm2, m_Nst, 1);
133 template<
typename AFIELD>
136 chsend_up.resize(m_Ndim);
137 chrecv_up.resize(m_Ndim);
138 chsend_dn.resize(m_Ndim);
139 chrecv_dn.resize(m_Ndim);
141 for (
int mu = 0; mu < m_Ndim; ++mu) {
142 size_t Nvsize = m_bdsize[mu] *
sizeof(
real_t);
144 chsend_dn[mu].send_init(Nvsize, mu, -1);
145 chsend_up[mu].send_init(Nvsize, mu, 1);
147 chrecv_up[mu].recv_init(Nvsize, mu, 1);
148 chrecv_dn[mu].recv_init(Nvsize, mu, -1);
150 void *buf_up = (
void *)chsend_dn[mu].ptr();
151 chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
152 void *buf_dn = (
void *)chsend_up[mu].ptr();
153 chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
156 if (do_comm[mu] == 1) {
157 chset_send.append(chsend_up[mu]);
158 chset_send.append(chsend_dn[mu]);
159 chset_recv.append(chrecv_up[mu]);
160 chset_recv.append(chrecv_dn[mu]);
167 template<
typename AFIELD>
177 template<
typename AFIELD>
180 const string str_vlevel = params.
get_string(
"verbose_level");
186 std::vector<int> block_size;
194 vout.
crucial(m_vl,
"Error at %s: input parameter not found.\n",
199 set_parameters(
real_t(kappa),
real_t(csw), bc, block_size);
202 #ifdef CHIRAL_ROTATION
203 params_csw.
set_string(
"gamma_matrix_type",
"Chiral");
205 m_fopr_csw->set_parameters(params_csw);
210 template<
typename AFIELD>
214 const std::vector<int> bc,
215 const std::vector<int> block_size)
217 assert(bc.size() == m_Ndim);
226 m_boundary.resize(m_Ndim);
227 m_block_size.resize(m_Ndim);
228 for (
int mu = 0; mu < m_Ndim; ++mu) {
229 m_boundary[mu] = bc[mu];
230 m_block_size[mu] = block_size[mu];
235 vout.
general(m_vl,
"%s: set parameters\n", class_name.c_str());
236 vout.
general(m_vl,
" gamma-matrix type = %s\n", m_repr.c_str());
239 for (
int mu = 0; mu < m_Ndim; ++mu) {
240 vout.
general(m_vl,
" boundary[%d] = %2d\n", mu, m_boundary[mu]);
242 for (
int mu = 0; mu < m_Ndim; ++mu) {
243 vout.
general(m_vl,
" block_size[%d] = %2d\n", mu, m_block_size[mu]);
246 int rem = m_Nx % m_block_size[0]
247 + m_Ny % m_block_size[1]
248 + m_Nz % m_block_size[2]
249 + m_Nt % m_block_size[3];
251 vout.
crucial(m_vl,
"%s: block_size is irelevant.\n",
258 m_block_sizev[0] = m_block_size[0] /
VLENX;
259 m_block_sizev[1] = m_block_size[1] /
VLENY;
260 m_block_sizev[2] = m_block_size[2];
261 m_block_sizev[3] = m_block_size[3];
265 if ((m_block_sizev[0] *
VLENX != m_block_size[0]) ||
266 (m_block_sizev[1] *
VLENY != m_block_size[1])) {
267 vout.
crucial(m_vl,
"%s: bad blocksize in XY: must be divided by"
268 " VLENX=%d, VLENY=%d but are %d, %d\n",
270 m_block_size[0], m_block_size[1]);
275 int NBx = m_Nx / m_block_size[0];
276 int NBy = m_Ny / m_block_size[1];
277 int NBz = m_Nz / m_block_size[2];
278 int NBt = m_Nt / m_block_size[3];
283 m_Ieo = (NBx * ipex + NBy * ipey + NBz * ipez + NBt * ipet) % 2;
290 template<
typename AFIELD>
293 params.
set_double(
"hopping_parameter",
double(m_CKs));
294 params.
set_double(
"clover_coefficient",
double(m_csw));
297 params.
set_string(
"gamma_matrix_type", m_repr);
304 template<
typename AFIELD>
309 vout.
detailed(m_vl,
"%s: set_config is called: num_threads = %d\n",
310 class_name.c_str(), nth);
318 vout.
detailed(m_vl,
"%s: set_config finished\n", class_name.c_str());
323 template<
typename AFIELD>
336 template<
typename AFIELD>
343 if (ith == 0) m_conf = u;
345 m_fopr_csw->set_config(u);
356 set_block_config(m_Ublock);
358 m_fopr_csw->set_config(u);
360 #ifdef CHIRAL_ROTATION
369 template<
typename AFIELD>
376 int ith, nth, is, ns;
377 set_threadtask_mult(ith, nth, is, ns, m_Nst);
379 for (
int site = is; site < ns; ++site) {
380 int ix = site % m_Nx;
381 int iy = (site / m_Nx) % m_Ny;
382 int iz = (site / (m_Nx * m_Ny)) % m_Nz;
383 int it = site / (m_Nx * m_Ny * m_Nz);
386 if ((ix + 1) % m_block_size[mu] == 0) {
387 for (
int in = 0; in <
NDF; ++in) {
388 int i = index.idx_G(in, site, mu);
394 if ((iy + 1) % m_block_size[mu] == 0) {
395 for (
int in = 0; in <
NDF; ++in) {
396 int i = index.idx_G(in, site, mu);
402 if ((iz + 1) % m_block_size[mu] == 0) {
403 for (
int in = 0; in <
NDF; ++in) {
404 int i = index.idx_G(in, site, mu);
410 if ((it + 1) % m_block_size[mu] == 0) {
411 for (
int in = 0; in <
NDF; ++in) {
412 int i = index.idx_G(in, site, mu);
423 template<
typename AFIELD>
438 const int Nin =
NDF *
ND * 2;
440 m_fopr_csw->set_mode(
"D");
442 int ith, nth, is, ns;
443 set_threadtask_mult(ith, nth, is, ns, m_Nst);
445 for (
int id = 0;
id < m_Nd / 2; ++id) {
446 for (
int ic = 0; ic < m_Nc; ++ic) {
450 for (
int site = is; site < ns; ++site) {
451 m_w1.set_r(ic,
id, site, 0, 1.0);
455 m_fopr_csw->mult(m_w2, m_w1);
458 for (
int site = is; site < ns; ++site) {
459 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
460 real_t vt_r = m_w2.cmp_r(ic2, 0, site, 0);
461 real_t vt_i = m_w2.cmp_i(ic2, 0, site, 0);
462 int in = ic2 +
NC * (ic +
NC * (
id + 0));
463 int idx_r = index.idx(2 * in, Nin, site, 0);
464 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
465 m_T.set(idx_r, vt_r);
466 m_T.set(idx_i, vt_i);
469 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
470 real_t vt_r = m_w2.cmp_r(ic2, 1, site, 0);
471 real_t vt_i = m_w2.cmp_i(ic2, 1, site, 0);
472 int in = ic2 +
NC * (ic +
NC * (
id + 4));
473 int idx_r = index.idx(2 * in, Nin, site, 0);
474 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
475 m_T.set(idx_r, vt_r);
476 m_T.set(idx_i, vt_i);
479 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
480 real_t vt_r = -m_w2.cmp_r(ic2, 2, site, 0);
481 real_t vt_i = -m_w2.cmp_i(ic2, 2, site, 0);
482 int in = ic2 +
NC * (ic +
NC * (
id + 2));
483 int idx_r = index.idx(2 * in, Nin, site, 0);
484 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
485 m_T.set(idx_r, vt_r);
486 m_T.set(idx_i, vt_i);
489 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
490 real_t vt_r = -m_w2.cmp_r(ic2, 3, site, 0);
491 real_t vt_i = -m_w2.cmp_i(ic2, 3, site, 0);
492 int in = ic2 +
NC * (ic +
NC * (
id + 6));
493 int idx_r = index.idx(2 * in, Nin, site, 0);
494 int idx_i = index.idx(2 * in + 1, Nin, site, 0);
495 m_T.set(idx_r, vt_r);
496 m_T.set(idx_i, vt_i);
504 real_t kappaR = 1.0 / m_CKs;
512 template<
typename AFIELD>
524 const int Nin =
NDF *
ND * 2;
526 m_fopr_csw->set_mode(
"D");
528 int ith, nth, is, ns;
529 set_threadtask_mult(ith, nth, is, ns, m_Nst);
532 constexpr
int idx[72] = {
533 0, -1, 6, 7, 16, 17, 28, 29, 24, 25, 34, 35,
534 -1, -1, 1, -1, 12, 13, 18, 19, 32, 33, 26, 27,
535 -1, -1, -1, -5, 2, -1, 8, 9, 20, 21, 30, 31,
536 -1, -1, -1, -1, -1, -1, 3, -1, 14, 15, 22, 23,
537 -1, -1, -1, -1, -1, -1, -1, -1, 4, -1, 10, 11,
538 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, -1,
541 for (
int id = 0;
id < m_Nd / 2; ++id) {
542 for (
int ic = 0; ic < m_Nc; ++ic) {
546 for (
int site = is; site < ns; ++site) {
547 m_w1.set_r(ic,
id, site, 0, 1.0);
548 m_w1.set_r(ic,
id + 2, site, 0, 1.0);
552 m_fopr_csw->mult(m_w2, m_w1);
555 for (
int site = is; site < ns; ++site) {
556 for (
int id2 = 0; id2 < m_Nd; ++id2) {
557 for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
558 real_t vt_r = 0.5 * m_w2.cmp_r(ic2, id2, site, 0);
559 real_t vt_i = 0.5 * m_w2.cmp_i(ic2, id2, site, 0);
560 int i = ic2 + m_Nc * (id2 % 2);
561 int j = ic + m_Nc * id;
562 int ij = m_Nc * 2 * i + j;
563 int in_r =
idx[2 * ij];
564 int in_i =
idx[2 * ij + 1];
566 in_r += 36 * (id2 / 2);
567 int idx_r = index.idx(in_r, Nin, site, 0);
568 m_T.set(idx_r, vt_r);
571 in_i += 36 * (id2 / 2);
572 int idx_i = index.idx(in_i, Nin, site, 0);
573 m_T.set(idx_i, vt_i);
583 real_t kappaR = 1.0 / m_CKs;
591 template<
typename AFIELD>
602 template<
typename AFIELD>
613 template<
typename AFIELD>
621 }
else if (mu == 1) {
623 }
else if (mu == 2) {
625 }
else if (mu == 3) {
628 vout.
crucial(m_vl,
"%s: mult_up for %d direction is undefined.",
629 class_name.c_str(), mu);
636 template<
typename AFIELD>
644 }
else if (mu == 1) {
646 }
else if (mu == 2) {
648 }
else if (mu == 3) {
651 vout.
crucial(m_vl,
"%s: mult_dn for %d direction is undefined.",
652 class_name.c_str(), mu);
659 template<
typename AFIELD>
665 if (ith == 0) m_mode = mode;
672 template<
typename AFIELD>
680 template<
typename AFIELD>
685 }
else if (m_mode ==
"DdagD") {
687 }
else if (m_mode ==
"Ddag") {
689 }
else if (m_mode ==
"H") {
692 printf(
"mode=%s\n", m_mode.c_str());
694 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
702 template<
typename AFIELD>
707 }
else if (m_mode ==
"DdagD") {
709 }
else if (m_mode ==
"Ddag") {
711 }
else if (m_mode ==
"H") {
714 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
722 template<
typename AFIELD>
731 template<
typename AFIELD>
742 template<
typename AFIELD>
752 template<
typename AFIELD>
769 template<
typename AFIELD>
782 template<
typename AFIELD>
788 int ith, nth, is, ns;
789 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
792 for (
int site = is; site < ns; ++site) {
793 for (
int ic = 0; ic <
NC; ++ic) {
794 for (
int id = 0;
id <
ND; ++id) {
795 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
796 load_vec(wt, &w[
VLEN * idx1], 2);
798 scal_vec(wt, (
real_t)(-1.0), 2);
799 int id2 = (
id + 2) %
ND;
800 int idx2 = 2 * (id2 +
ND * ic) +
NVCD * site;
801 save_vec(&v[
VLEN * idx2], wt, 2);
812 template<
typename AFIELD>
815 int ith, nth, is, ns;
816 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
818 for (
int site = is; site < ns; ++site) {
821 scal_vec(vt, a,
NVCD);
828 template<
typename AFIELD>
834 int ith, nth, is, ns;
835 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
839 for (
int site = is; site < ns; ++site) {
840 for (
int ic = 0; ic <
NC; ++ic) {
841 for (
int id = 0;
id <
ND2; ++id) {
842 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
843 load_vec(wt, &wp[
VLEN * idx1], 2);
844 save_vec(&vp[
VLEN * idx1], wt, 2);
847 for (
int id =
ND2;
id <
ND; ++id) {
848 int idx1 = 2 * (
id +
ND * ic) +
NVCD * site;
849 load_vec(wt, &wp[
VLEN * idx1], 2);
850 scal_vec(wt,
real_t(-1.0), 2);
851 save_vec(&vp[
VLEN * idx1], wt, 2);
861 template<
typename AFIELD>
866 int ith, nth, is, ns;
867 set_threadtask(ith, nth, is, ns, m_Nstv);
871 for (
int site = is; site < ns; ++site) {
880 for (
int jd = 0; jd <
ND2; ++jd) {
881 for (
int id = 0;
id <
ND; ++id) {
882 int ig =
VLEN *
NDF * (site + m_Nstv * (
id +
ND * jd));
883 load_vec(ut, &u[ig],
NDF);
884 for (
int ic = 0; ic <
NC; ++ic) {
886 int id2 = (
id +
ND2) %
ND;
887 mult_ctv(wt1, &ut[ic2], &v1v[2 *
id],
NC);
888 mult_ctv(wt2, &ut[ic2], &v1v[2 * id2],
NC);
889 int icd1 = 2 * (jd +
ND * ic);
890 int icd2 = 2 * (jd +
ND2 +
ND * ic);
891 axpy_vec(&v2v[icd1],
real_t(1.0), wt1, 2);
892 axpy_vec(&v2v[icd2],
real_t(1.0), wt2, 2);
905 template<
typename AFIELD>
917 if (do_comm_any > 0) {
918 if (ith == 0) chset_recv.start();
930 buf1_zp, buf1_zm, buf1_tp, buf1_tm,
931 up, v1, &m_boundary[0], m_Nsize, do_comm);
935 if (ith == 0) chset_send.start();
941 #ifdef CHIRAL_ROTATION
943 m_CKs, &m_boundary[0], m_Nsize, do_comm);
946 m_CKs, &m_boundary[0], m_Nsize, do_comm);
949 if (do_comm_any > 0) {
950 if (ith == 0) chset_recv.wait();
964 buf2_xp, buf2_xm, buf2_yp, buf2_ym,
965 buf2_zp, buf2_zm, buf2_tp, buf2_tm,
966 m_CKs, &m_boundary[0], m_Nsize, do_comm);
968 if (ith == 0) chset_send.wait();
976 template<
typename AFIELD>
985 int jeo = (ieo + m_Ieo) % 2;
990 #ifdef CHIRAL_ROTATION
992 m_CKs, &m_boundary[0],
993 m_Nsize, m_block_sizev, jeo);
996 m_CKs, &m_boundary[0],
997 m_Nsize, m_block_sizev, jeo);
1007 template<
typename AFIELD>
1017 #ifdef CHIRAL_ROTATION
1020 m_CKs, &m_boundary[0],
1021 m_Nsize, m_block_sizev, -1);
1024 m_CKs, &m_boundary[0],
1025 m_Nsize, m_block_sizev, -1);
1037 template<
typename AFIELD>
1056 aypx(-m_CKs, vp, wp);
1063 template<
typename AFIELD>
1076 scal_local(vp,
real_t(-1.0));
1078 }
else if (mu == 1) {
1080 scal_local(vp,
real_t(-1.0));
1082 }
else if (mu == 2) {
1084 scal_local(vp,
real_t(-1.0));
1086 }
else if (mu == 3) {
1088 scal_local(vp,
real_t(-1.0));
1092 scal_local(vp, -m_CKs);
1099 template<
typename AFIELD>
1112 scal_local(vp,
real_t(-1.0));
1114 }
else if (mu == 1) {
1116 scal_local(vp,
real_t(-1.0));
1118 }
else if (mu == 2) {
1120 scal_local(vp,
real_t(-1.0));
1122 }
else if (mu == 3) {
1124 scal_local(vp,
real_t(-1.0));
1128 scal_local(vp, -m_CKs);
1174 template<
typename AFIELD>
1183 template<
typename AFIELD>
1186 int ith, nth, is, ns;
1187 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1191 for (
int site = is; site < ns; ++site) {
1194 aypx_vec(a, vt, wt,
NVCD);
1201 template<
typename AFIELD>
1204 int ith, nth, is, ns;
1205 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1208 clear_vec(vt,
NVCD);
1210 for (
int site = is; site < ns; ++site) {
1217 template<
typename AFIELD>
1222 int ith, nth, is, ns;
1223 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1230 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1231 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1235 if (do_comm[0] > 0) {
1236 for (
int site = is; site < ns; ++site) {
1237 int ix = site % m_Nxv;
1238 int iyzt = site / m_Nxv;
1241 mult_wilson_xp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
1249 chrecv_up[0].start();
1250 chsend_dn[0].start();
1251 chrecv_up[0].wait();
1252 chsend_dn[0].wait();
1257 for (
int site = is; site < ns; ++site) {
1258 int ix = site % m_Nxv;
1259 int iyzt = site / m_Nxv;
1262 clear_vec(v2v,
NVCD);
1266 if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
1267 int nei = ix + 1 + m_Nxv * iyzt;
1268 if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
1270 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
1274 mult_wilson_xpb(v2v, &u[
VLEN *
NDF * site], zL);
1275 mult_wilson_xp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
1286 template<
typename AFIELD>
1291 int ith, nth, is, ns;
1292 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1299 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1300 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1304 if (do_comm[0] > 0) {
1305 for (
int site = is; site < ns; ++site) {
1306 int ix = site % m_Nxv;
1307 int iyzt = site / m_Nxv;
1308 if (ix == m_Nxv - 1) {
1310 mult_wilson_xm1(&buf1[ibf], &u[
VLEN *
NDF * site],
1318 chrecv_dn[0].start();
1319 chsend_up[0].start();
1320 chrecv_dn[0].wait();
1321 chsend_up[0].wait();
1326 for (
int site = is; site < ns; ++site) {
1327 int ix = site % m_Nxv;
1328 int iyzt = site / m_Nxv;
1333 clear_vec(v2v,
NVCD);
1335 if ((ix > 0) || (do_comm[0] == 0)) {
1336 int nei = ix - 1 + m_Nxv * iyzt;
1337 if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;
1340 mult_wilson_xmb(v2v, uL, zL);
1344 shift_vec0_xfw(uL, &u[
VLEN *
NDF * site],
NDF);
1345 mult_wilson_xmb(v2v, uL, zL);
1346 mult_wilson_xm2(v2v, &buf2[ibf]);
1357 template<
typename AFIELD>
1361 int Nxy = m_Nxv * m_Nyv;
1363 int ith, nth, is, ns;
1364 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1369 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1370 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1374 if (do_comm[1] > 0) {
1375 for (
int site = is; site < ns; ++site) {
1376 int ix = site % m_Nxv;
1377 int iy = (site / m_Nxv) % m_Nyv;
1378 int izt = site / Nxy;
1381 mult_wilson_yp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
1389 chrecv_up[1].start();
1390 chsend_dn[1].start();
1391 chrecv_up[1].wait();
1392 chsend_dn[1].wait();
1398 for (
int site = is; site < ns; ++site) {
1399 int ix = site % m_Nxv;
1400 int iy = (site / m_Nxv) % m_Nyv;
1401 int izt = site / Nxy;
1404 clear_vec(v2v,
NVCD);
1408 if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
1409 int iy2 = (iy + 1) % m_Nyv;
1410 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1412 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
1416 mult_wilson_ypb(v2v, &u[
VLEN *
NDF * site], zL);
1417 mult_wilson_yp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
1428 template<
typename AFIELD>
1432 int Nxy = m_Nxv * m_Nyv;
1434 int ith, nth, is, ns;
1435 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1440 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1441 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1445 if (do_comm[1] > 0) {
1446 for (
int site = is; site < ns; ++site) {
1447 int ix = site % m_Nxv;
1448 int iy = (site / m_Nxv) % m_Nyv;
1449 int izt = site / Nxy;
1450 if (iy == m_Nyv - 1) {
1452 mult_wilson_ym1(&buf1[ibf], &u[
VLEN *
NDF * site],
1461 chrecv_dn[1].start();
1462 chsend_up[1].start();
1463 chrecv_dn[1].wait();
1464 chsend_up[1].wait();
1470 for (
int site = is; site < ns; ++site) {
1471 int ix = site % m_Nxv;
1472 int iy = (site / m_Nxv) % m_Nyv;
1473 int izt = site / Nxy;
1476 clear_vec(v2v,
NVCD);
1481 if ((iy != 0) || (do_comm[idir] == 0)) {
1482 int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
1483 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1486 mult_wilson_ymb(v2v, uL, zL);
1490 shift_vec0_yfw(uL, &u[
VLEN *
NDF * site],
NDF);
1491 mult_wilson_ymb(v2v, uL, zL);
1492 mult_wilson_ym2(v2v, &buf2[ibf]);
1503 template<
typename AFIELD>
1507 int Nxy = m_Nxv * m_Nyv;
1509 int ith, nth, is, ns;
1510 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1515 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1516 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1520 if (do_comm[2] > 0) {
1521 for (
int site = is; site < ns; ++site) {
1522 int ixy = site % Nxy;
1523 int iz = (site / Nxy) % m_Nz;
1524 int it = site / (Nxy * m_Nz);
1526 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1527 mult_wilson_zp1(&buf1[ibf], &v1[
VLEN *
NVCD * site]);
1535 chrecv_up[2].start();
1536 chsend_dn[2].start();
1537 chrecv_up[2].wait();
1538 chsend_dn[2].wait();
1544 for (
int site = is; site < ns; ++site) {
1545 int ixy = site % Nxy;
1546 int iz = (site / Nxy) % m_Nz;
1547 int it = site / (Nxy * m_Nz);
1550 clear_vec(v2v,
NVCD);
1552 if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
1553 int iz2 = (iz + 1) % m_Nz;
1554 int nei = ixy + Nxy * (iz2 + m_Nz * it);
1557 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1558 mult_wilson_zp2(v2v, &u[
VLEN *
NDF * site], &buf2[ibf]);
1569 template<
typename AFIELD>
1573 int Nxy = m_Nxv * m_Nyv;
1575 int ith, nth, is, ns;
1576 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1581 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1582 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1586 if (do_comm[2] > 0) {
1587 for (
int site = is; site < ns; ++site) {
1588 int ixy = site % Nxy;
1589 int iz = (site / Nxy) % m_Nz;
1590 int it = site / (Nxy * m_Nz);
1591 if (iz == m_Nz - 1) {
1592 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1593 mult_wilson_zm1(&buf1[ibf], &u[
VLEN *
NDF * site],
1602 chrecv_dn[2].start();
1603 chsend_up[2].start();
1604 chrecv_dn[2].wait();
1605 chsend_up[2].wait();
1611 for (
int site = is; site < ns; ++site) {
1612 int ixy = site % Nxy;
1613 int iz = (site / Nxy) % m_Nz;
1614 int it = site / (Nxy * m_Nz);
1617 clear_vec(v2v,
NVCD);
1619 if ((iz > 0) || (do_comm[2] == 0)) {
1620 int iz2 = (iz - 1 + m_Nz) % m_Nz;
1621 int nei = ixy + Nxy * (iz2 + m_Nz * it);
1624 int ibf =
VLEN *
NVC *
ND2 * (ixy + Nxy * it);
1625 mult_wilson_zm2(v2v, &buf2[ibf]);
1636 template<
typename AFIELD>
1640 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1642 int ith, nth, is, ns;
1643 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1648 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1649 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1653 if (do_comm[3] > 0) {
1654 for (
int site = is; site < ns; ++site) {
1655 int ixyz = site % Nxyz;
1656 int it = site / Nxyz;
1658 mult_wilson_tp1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1667 chrecv_up[3].start();
1668 chsend_dn[3].start();
1669 chrecv_up[3].wait();
1670 chsend_dn[3].wait();
1676 for (
int site = is; site < ns; ++site) {
1677 int ixyz = site % Nxyz;
1678 int it = site / Nxyz;
1681 clear_vec(v2v,
NVCD);
1683 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
1684 int it2 = (it + 1) % m_Nt;
1685 int nei = ixyz + Nxyz * it2;
1686 mult_wilson_tpb_dirac(v2v, &u[
VLEN *
NDF * site],
1689 mult_wilson_tp2_dirac(v2v, &u[
VLEN *
NDF * site],
1701 template<
typename AFIELD>
1705 int Nxyz = m_Nxv * m_Nyv * m_Nz;
1707 int ith, nth, is, ns;
1708 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1713 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1714 if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
1718 if (do_comm[3] > 0) {
1719 for (
int site = is; site < ns; ++site) {
1720 int ixyz = site % Nxyz;
1721 int it = site / Nxyz;
1722 if (it == m_Nt - 1) {
1723 mult_wilson_tm1_dirac(&buf1[
VLEN *
NVC *
ND2 * ixyz],
1732 chrecv_dn[3].start();
1733 chsend_up[3].start();
1734 chrecv_dn[3].wait();
1735 chsend_up[3].wait();
1740 for (
int site = is; site < ns; ++site) {
1741 int ixyz = site % Nxyz;
1742 int it = site / Nxyz;
1745 clear_vec(v2v,
NVCD);
1747 if ((it > 0) || (do_comm[3] == 0)) {
1748 int it2 = (it - 1 + m_Nt) % m_Nt;
1749 int nei = ixyz + Nxyz * it2;
1750 mult_wilson_tmb_dirac(v2v, &u[
VLEN *
NDF * nei],
1753 mult_wilson_tm2_dirac(v2v, &buf2[
VLEN *
NVC *
ND2 * ixyz]);
1764 template<
typename AFIELD>
1772 double flop_wilson, flop_clover, flop_site, flop;
1774 if (m_repr ==
"Dirac") {
1775 flop_wilson =
static_cast<double>(
1777 + 6 * (4 * m_Nc + 2)
1778 + 2 * (4 * m_Nc + 1)));
1780 flop_clover =
static_cast<double>(
1782 + 2 * (2 * (m_Nc * m_Nd - 1) + 1)
1785 }
else if (m_repr ==
"Chiral") {
1786 flop_wilson =
static_cast<double>(
1787 m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
1789 flop_clover =
static_cast<double>(
1790 m_Nc * m_Nd * (2 * (2 * (m_Nc * m_Nd - 1) + 1)
1793 vout.
crucial(m_vl,
"%s: input repr is undefined.\n");
1797 flop_site = flop_wilson + flop_clover;
1799 flop = flop_site *
static_cast<double>(Lvol);
1800 if ((mode ==
"DdagD") || (mode ==
"DDdag")) flop *= 2.0;
1807 template<
typename AFIELD>
1817 int NBx = m_Nx / m_block_size[0];
1818 int NBy = m_Ny / m_block_size[1];
1819 int NBz = m_Nz / m_block_size[2];
1820 int NBt = m_Nt / m_block_size[3];
1821 size_t nvol2 = NBx * NBy * NBz * NBt / 2;
1823 int block_x = m_block_size[0];
1824 int block_y = m_block_size[1];
1825 int block_z = m_block_size[2];
1826 int block_t = m_block_size[3];
1829 int hop_x_site = Nc * Nd * (4 * Nc + 2);
1830 int hop_y_site = Nc * Nd * (4 * Nc + 2);
1831 int hop_z_site = Nc * Nd * (4 * Nc + 2);
1832 int hop_t_site = Nc * Nd * (4 * Nc + 1);
1833 int accum_site = 4 * Nc * Nd;
1835 double flop_sap_mult;
1837 if (m_repr ==
"Dirac") {
1838 double flop_x = 2.0 *
static_cast<double>(hop_x_site
1839 * (block_x - 1) * block_y * block_z * block_t);
1840 double flop_y = 2.0 *
static_cast<double>(hop_y_site
1841 * block_x * (block_y - 1) * block_z * block_t);
1842 double flop_z = 2.0 *
static_cast<double>(hop_z_site
1843 * block_x * block_y * (block_z - 1) * block_t);
1844 double flop_t = 2.0 *
static_cast<double>(hop_t_site
1845 * block_x * block_y * block_z * (block_t - 1));
1848 double flop_clover =
static_cast<double>(
1850 + 2 * (2 * (m_Nc * m_Nd - 1) + 1)
1854 double flop_aypx =
static_cast<double>(accum_site);
1856 flop_sap_mult = flop_x + flop_y + flop_z + flop_t
1857 + (flop_clover + flop_aypx)
1858 *
static_cast<double>(block_x * block_y * block_z * block_t);
1860 vout.
crucial(m_vl,
"%s: input repr is undefined.\n");
1864 return flop_sap_mult *
static_cast<double>(nvol2)
1865 *
static_cast<double>(NPE);