template<typename AFIELD>

template<typename AFIELD>
  m_Ndf = 2 * m_Nc * m_Nc;

  m_Nstv = m_Nst / VLEN;
  for (int mu = 0; mu < m_Ndim; ++mu) {
    do_comm_any += do_comm[mu];
    vout.general(" do_comm[%d] = %d\n", mu, do_comm[mu]);
  m_bdsize.resize(m_Ndim);

  m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
  m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
  m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
  m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
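  // Halo buffer sizes: m_bdsize[mu] counts the real_t elements exchanged across
  // the face normal to direction mu, i.e. m_Nvc color components times Nd2
  // (presumably the spin-projected half spinor, m_Nd/2) on the corresponding
  // 3D slice of the local lattice.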
  m_U.reset(NDF, m_Nst, m_Ndim);

  m_Ublock.reset(NDF, m_Nst, m_Ndim);

  int NinF = 2 * m_Nc * m_Nd;
  m_v1.reset(NinF, m_Nst, 1);
  m_v2.reset(NinF, m_Nst, 1);

  int Ndm2 = m_Nd * m_Nd / 2;
  m_T.reset(NDF, m_Nst, Ndm2);

  m_T_qws.reset(72, m_Nst, 1);
template<typename AFIELD>
  chsend_up.resize(m_Ndim);
  chrecv_up.resize(m_Ndim);
  chsend_dn.resize(m_Ndim);
  chrecv_dn.resize(m_Ndim);

  for (int mu = 0; mu < m_Ndim; ++mu) {
    size_t Nvsize = m_bdsize[mu] * sizeof(real_t);

    chsend_dn[mu].send_init(Nvsize, mu, -1);
    chsend_up[mu].send_init(Nvsize, mu, 1);

    chrecv_up[mu].recv_init(Nvsize, mu, 1);
    chrecv_dn[mu].recv_init(Nvsize, mu, -1);

    void *buf_up = (void *)chsend_dn[mu].ptr();
    chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
    void *buf_dn = (void *)chsend_up[mu].ptr();
    chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);

    if (do_comm[mu] == 1) {
      chset_send.append(chsend_up[mu]);
      chset_send.append(chsend_dn[mu]);
      chset_recv.append(chrecv_up[mu]);
      chset_recv.append(chrecv_dn[mu]);
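  // Each direction gets send/receive channels for both the up and the down
  // neighbor; the second pair of recv_init() calls appears to rebind the
  // receive buffers to the send buffers of the opposite channel so the same
  // memory can be reused. Only directions that actually communicate
  // (do_comm[mu] == 1) are appended to chset_send/chset_recv, which presumably
  // start and wait on all transfers collectively.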
template<typename AFIELD>

  delete m_fopr_csw_chiral;
template<typename AFIELD>

  const string str_vlevel = params.get_string("verbose_level");

  std::vector<int> block_size;

    vout.crucial(m_vl, "Error at %s: input parameter not found.\n",

  set_parameters(real_t(kappa), bc, block_size);

  params_csw_chiral.set_string("gamma_matrix_type", "Chiral");
  m_fopr_csw_chiral->set_parameters(params_csw_chiral);

#ifdef CHIRAL_ROTATION

  m_fopr_csw->set_parameters(params);
template<typename AFIELD>
                                      const std::vector<int> bc,
                                      const std::vector<int> block_size)

  assert(bc.size() == m_Ndim);
  m_boundary.resize(m_Ndim);
  m_block_size.resize(m_Ndim);
  for (int mu = 0; mu < m_Ndim; ++mu) {
    m_boundary[mu]   = bc[mu];
    m_block_size[mu] = block_size[mu];

  vout.general(m_vl, "Parameters of %s:\n", class_name.c_str());

  for (int mu = 0; mu < m_Ndim; ++mu) {
    vout.general(m_vl, " boundary[%d] = %2d\n", mu, m_boundary[mu]);

  for (int mu = 0; mu < m_Ndim; ++mu) {
    vout.general(m_vl, " block_size[%d] = %2d\n", mu, m_block_size[mu]);
  int rem = m_Nx % m_block_size[0]
            + m_Ny % m_block_size[1]
            + m_Nz % m_block_size[2]
            + m_Nt % m_block_size[3];

    vout.crucial(m_vl, "%s: block_size does not divide the local lattice size.\n",
  m_block_sizev[0] = m_block_size[0] / VLENX;
  m_block_sizev[1] = m_block_size[1] / VLENY;
  m_block_sizev[2] = m_block_size[2];
  m_block_sizev[3] = m_block_size[3];
  if ((m_block_sizev[0] * VLENX != m_block_size[0]) ||
      (m_block_sizev[1] * VLENY != m_block_size[1])) {
    vout.crucial(m_vl,
                 "%s: bad block_size in XY: must be divisible by VLENX=%d, VLENY=%d but are %d, %d\n",
                 class_name.c_str(), VLENX, VLENY, m_block_size[0], m_block_size[1]);
  int NBx = m_Nx / m_block_size[0];
  int NBy = m_Ny / m_block_size[1];
  int NBz = m_Nz / m_block_size[2];
  int NBt = m_Nt / m_block_size[3];

  m_Ieo = (NBx * ipex + NBy * ipey + NBz * ipez + NBt * ipet) % 2;

  QWS_lib::init_qws(&m_boundary[0]);
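  // m_Ieo is the even/odd parity of this rank measured in SAP blocks: NB* count
  // the blocks per direction on one rank and ipe* are presumably the rank
  // coordinates, so neighboring ranks get alternating block parities. The
  // boundary conditions are then passed on to the QWS library.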
template<typename AFIELD>

  m_fopr_csw->set_config(u);
  m_fopr_csw_chiral->set_config(u);
  std::string str_reconst;

  if (sizeof(real_t) == 4) {
    num_reconst = SU3_RECONSTRUCT_S;
    str_reconst = "SU3_RECONSTRUCT_S";
  } else if (sizeof(real_t) == 8) {
    num_reconst = SU3_RECONSTRUCT_D;
    str_reconst = "SU3_RECONSTRUCT_D";

  for (int mu = 0; mu < m_Ndim - 1; ++mu) {
    if (m_boundary[mu] != 1) {
      if (num_reconst == 18) {
        set_boundary_config(m_U, mu);

        vout.crucial("%s: QWS is not available for\n", class_name.c_str());
        vout.crucial(" boundary[%d] = %d\n", mu, m_boundary[mu]);
        vout.crucial(" %s = %d\n", str_reconst.c_str(), num_reconst);

  double kappa = (double)m_CKs;
  if (sizeof(real_t) == 8) {
    qws_loadg_dd_bridge_((double *)m_U.ptr(0), &kappa);
  } else if (sizeof(real_t) == 4) {
    qws_loadgs_dd_bridge_((float *)m_U.ptr(0), &kappa);

    if (m_boundary[mu] != 1) {
      set_boundary_config(m_U, mu);

  set_block_config(m_Ublock);

#ifdef CHIRAL_ROTATION
template<typename AFIELD>

  int ipe[4], Nsize[4], Lsize[4], j[4];

  for (int i = 0; i < m_Ndim; ++i) {

  for (int t = 0; t < m_Nt; ++t) {
    j[3] = ipe[3] * Nsize[3] + t;
    for (int z = 0; z < m_Nz; ++z) {
      j[2] = ipe[2] * Nsize[2] + z;
      for (int y = 0; y < m_Ny; ++y) {
        j[1] = ipe[1] * Nsize[1] + y;
        for (int x = 0; x < m_Nx; ++x) {
          j[0] = ipe[0] * Nsize[0] + x;
          int site = x + m_Nx * (y + m_Ny * (z + m_Nz * t));

          if (j[mu] == Lsize[mu] - 1) {
            double bc = (double)(m_boundary[mu]);
            for (int in = 0; in < m_Ndf; ++in) {
              int i = index_lex.idx_G(in, site, mu);
              double uv = bc * U.cmp(i);
template<typename AFIELD>

  if ((m_boundary[mu] != 1) && (ipex == npex - 1)) {
    int Nyzt = m_Ny * m_Nz * m_Nt;

    int ith, nth, is, ns;
    set_threadtask_mult(ith, nth, is, ns, Nyzt);

    for (int iyzt = is; iyzt < ns; ++iyzt) {
      int site = m_Nx - 1 + m_Nx * iyzt;
      for (int in = 0; in < NDF; ++in) {
        int i = index.idx_G(in, site, mu);

  if ((m_boundary[mu] != 1) && (ipey == npey - 1)) {
    int Nxzt = m_Nx * m_Nz * m_Nt;

    int ith, nth, is, ns;
    set_threadtask_mult(ith, nth, is, ns, Nxzt);

    for (int ixzt = is; ixzt < ns; ++ixzt) {
      int ix   = ixzt % m_Nx;
      int izt  = ixzt / m_Nx;
      int site = ix + m_Nx * (m_Ny - 1 + m_Ny * izt);
      for (int in = 0; in < NDF; ++in) {
        int i = index.idx_G(in, site, mu);

  if ((m_boundary[mu] != 1) && (ipez == npez - 1)) {
    int Nxy  = m_Nx * m_Ny;
    int Nxyt = Nxy * m_Nt;

    int ith, nth, is, ns;
    set_threadtask_mult(ith, nth, is, ns, Nxyt);

    for (int ixyt = is; ixyt < ns; ++ixyt) {
      int ixy  = ixyt % Nxy;
      int site = ixy + Nxy * (m_Nz - 1 + m_Nz * it);
      for (int in = 0; in < NDF; ++in) {
        int i = index.idx_G(in, site, mu);

  if ((m_boundary[mu] != 1) && (ipet == npet - 1)) {
    int Nxyz = m_Nx * m_Ny * m_Nz;

    int ith, nth, is, ns;
    set_threadtask_mult(ith, nth, is, ns, Nxyz);

    for (int ixyz = is; ixyz < ns; ++ixyz) {
      int site = ixyz + Nxyz * (m_Nt - 1);
      for (int in = 0; in < NDF; ++in) {
        int i = index.idx_G(in, site, mu);
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nst);

  for (int site = is; site < ns; ++site) {
    int ix = site % m_Nx;
    int iy = (site / m_Nx) % m_Ny;
    int iz = (site / (m_Nx * m_Ny)) % m_Nz;
    int it = site / (m_Nx * m_Ny * m_Nz);

      if ((ix + 1) % m_block_size[mu] == 0) {
        for (int in = 0; in < NDF; ++in) {
          int i = index.idx_G(in, site, mu);

      if ((iy + 1) % m_block_size[mu] == 0) {
        for (int in = 0; in < NDF; ++in) {
          int i = index.idx_G(in, site, mu);

      if ((iz + 1) % m_block_size[mu] == 0) {
        for (int in = 0; in < NDF; ++in) {
          int i = index.idx_G(in, site, mu);

      if ((it + 1) % m_block_size[mu] == 0) {
        for (int in = 0; in < NDF; ++in) {
          int i = index.idx_G(in, site, mu);
template<typename AFIELD>

  Field_F v(m_Nst, 1), w(m_Nst, 1);

  m_fopr_csw->set_mode("D");

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nst);

  for (int id = 0; id < m_Nd / 2; ++id) {
    for (int ic = 0; ic < m_Nc; ++ic) {

      for (int site = is; site < ns; ++site) {
        w.set_r(ic, id, site, 0, 1.0);

      m_fopr_csw->mult(v, w);

      for (int site = is; site < ns; ++site) {
        for (int ic2 = 0; ic2 < m_Nc; ++ic2) {
          real_t vt_r = v.cmp_r(ic2, 0, site, 0);
          real_t vt_i = v.cmp_i(ic2, 0, site, 0);
          int idx_r = index.idx_Gr(ic, ic2, site, id + 0);
          int idx_i = index.idx_Gi(ic, ic2, site, id + 0);
          m_T.set(idx_r, vt_r);
          m_T.set(idx_i, vt_i);

        for (int ic2 = 0; ic2 < m_Nc; ++ic2) {
          real_t vt_r = v.cmp_r(ic2, 1, site, 0);
          real_t vt_i = v.cmp_i(ic2, 1, site, 0);
          int idx_r = index.idx_Gr(ic, ic2, site, id + 4);
          int idx_i = index.idx_Gi(ic, ic2, site, id + 4);
          m_T.set(idx_r, vt_r);
          m_T.set(idx_i, vt_i);

        for (int ic2 = 0; ic2 < m_Nc; ++ic2) {
          real_t vt_r = -v.cmp_r(ic2, 2, site, 0);
          real_t vt_i = -v.cmp_i(ic2, 2, site, 0);
          int idx_r = index.idx_Gr(ic, ic2, site, id + 2);
          int idx_i = index.idx_Gi(ic, ic2, site, id + 2);
          m_T.set(idx_r, vt_r);
          m_T.set(idx_i, vt_i);

        for (int ic2 = 0; ic2 < m_Nc; ++ic2) {
          real_t vt_r = -v.cmp_r(ic2, 3, site, 0);
          real_t vt_i = -v.cmp_i(ic2, 3, site, 0);
          int idx_r = index.idx_Gr(ic, ic2, site, id + 6);
          int idx_i = index.idx_Gi(ic, ic2, site, id + 6);
          m_T.set(idx_r, vt_r);
          m_T.set(idx_i, vt_i);

  real_t kappaR = 1.0 / m_CKs;
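  // The loops above build the clover term m_T column by column: w is a point
  // source with a 1 at color ic and spin id, the csw operator is applied, and
  // the resulting spin rows (0,1,2,3) are scattered into the gamma slots
  // (id + 0, id + 4, id + 2, id + 6) of m_T, with a sign flip on the lower two
  // spin components.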
template<typename AFIELD>

  Field_F v(m_Nst, 1), w(m_Nst, 1);

  const int Nin = NDF * ND * 2;

  m_fopr_csw_chiral->set_mode("D");

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nst);

  constexpr int idx[72] = {
     0, -1,  6,  7, 16, 17, 28, 29, 24, 25, 34, 35,
    -1, -1,  1, -1, 12, 13, 18, 19, 32, 33, 26, 27,
    -1, -1, -1, -5,  2, -1,  8,  9, 20, 21, 30, 31,
    -1, -1, -1, -1, -1, -1,  3, -1, 14, 15, 22, 23,
    -1, -1, -1, -1, -1, -1, -1, -1,  4, -1, 10, 11,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5, -1,
  vout.general("chiral rep for Clover term\n");
  for (int id = 0; id < m_Nd / 2; ++id) {
    for (int ic = 0; ic < m_Nc; ++ic) {

      for (int site = is; site < ns; ++site) {
        w.set_r(ic, id, site, 0, 1.0);
        w.set_r(ic, id + 2, site, 0, 1.0);

      m_fopr_csw->mult(v, w);

      for (int site = is; site < ns; ++site) {
        for (int id2 = 0; id2 < m_Nd; ++id2) {
          for (int ic2 = 0; ic2 < m_Nc; ++ic2) {
            real_t vt_r = 0.5 * v.cmp_r(ic2, id2, site, 0);
            real_t vt_i = 0.5 * v.cmp_i(ic2, id2, site, 0);

            int i  = ic2 + m_Nc * (id2 % 2);
            int j  = ic + m_Nc * id;
            int ij = m_Nc * 2 * i + j;
            int in_r = idx[2 * ij];
            int in_i = idx[2 * ij + 1];

              in_r += 36 * (id2 / 2);
              int idx_r = index.idx(in_r, Nin, site, 0);
              m_T.set(idx_r, vt_r);

              in_i += 36 * (id2 / 2);
              int idx_i = index.idx(in_i, Nin, site, 0);
              m_T.set(idx_i, vt_i);

  real_t kappaR = 1.0 / m_CKs;
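  // Chiral-representation packing: idx[] maps each element (i,j) of the 6x6
  // clover block to its position in the packed per-site layout; negative
  // entries mark components that are not stored separately (such as the
  // imaginary parts of the diagonal). The offset 36 * (id2 / 2) presumably
  // selects between the two chirality blocks.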
template<typename AFIELD>

  Field_F v(m_Nst, 1), w(m_Nst, 1);
  Field   T(2 * 6 * 6, m_Nst, 1);
  Field   Tinv(2 * 6 * 6, m_Nst, 1);

  m_fopr_csw_chiral->set_mode("D");

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nst);

    = {  0,  6,  8, 10, 12, 14,
        -1,  1, 16, 18, 20, 22,
        -1, -1,  2, 24, 26, 28,
        -1, -1, -1,  3, 30, 32,
        -1, -1, -1, -1,  4, 34,
        -1, -1, -1, -1, -1,  5 };
  constexpr double factor = 0.5;

  for (int ud = 0; ud < 2; ++ud) {
    for (int id = 0; id < m_Nd / 2; ++id) {
      for (int ic = 0; ic < m_Nc; ++ic) {

        for (int site = is; site < ns; ++site) {
          w.set_r(ic, 2 * ud + id, site, 0, 1.0);

        m_fopr_csw_chiral->mult(v, w);

        for (int site = is; site < ns; ++site) {

          int offset = ud2 * 36;

          double vt_r = v.cmp_r(ic, 2 * ud + id, site, 0);
          T.set(offset + j, site, 0, factor * (1.0 - vt_r));

          for (int i = 0; i < 6; ++i) {
            int ij = idx[6 * i + j];

            int id2 = i / 3 + 2 * ud;

            double vt_r = v.cmp_r(ic2, id2, site, 0);
            double vt_i = v.cmp_i(ic2, id2, site, 0);
            T.set(offset + ij, site, 0, -factor * vt_r);
            T.set(offset + ij + 1, site, 0, -factor * vt_i);

  solve_csw_qws_inv(Tinv, T);

  if (sizeof(real_t) == 8) {
    qws_loadc_dd_bridge_((double *)m_T_qws.ptr(0));
  } else if (sizeof(real_t) == 4) {
    qws_loadcs_dd_bridge_((float *)m_T_qws.ptr(0));
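  // set_csw_qws probes the chiral clover term with point sources for each
  // chirality block (ud), packs it into 36 real components per block of T using
  // the triangular index table above, inverts it site by site with
  // solve_csw_qws_inv(), and finally loads the result into QWS from m_T_qws in
  // the precision matching real_t.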
template<class AFIELD>

  constexpr double factor = 0.25;

  constexpr int NS = 4;

  assert(N == NS * NCOL / 2);

  const size_t vol = T.nvol();

  const double *pT    = T.ptr(0);
  double       *pTinv = Tinv.ptr(0);

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, T.nvol());

  for (int s = is; s < ns; ++s) {
    size_t offset_clov = 2 * s * N * N;

    const double *clov     = pT + offset_clov;
    double       *clov_inv = pTinv + offset_clov;

    for (int block = 0; block < 2; block++) {

      for (int i = 0; i < N; i++) {
        LU[N * i + i] = clov[i];

      for (int i = 0; i < N; i++) {
        for (int j = i + 1; j < N; j++) {
          LU[N * i + j] = complex_t(clov[ii], clov[ii + 1]);

      for (int i = 0; i < N; i++) {
        double diag_re  = LU[N * i + i].real();
        double diag_inv = 1.0 / diag_re;
        LU[N * i + i] = diag_inv;
        for (int k = i + 1; k < N; k++) {

          for (int j = i + 1; j < N; j++) {
            LU[N * k + j] -= conj(c) * LU[N * i + j];

      for (int i = 0; i < N; i++) {

        for (int j = 0; j < N; j++) {

        for (int j = 1; j < N; j++) {

          for (int k = i; k < j; k++) {
            x_j -= conj(LU[N * j + k]) * x[k];

        for (int j = N - 1; j >= i; j--) {

          for (int k = j + 1; k < N; k++) {
            x_j -= LU[N * j + k] * x[k];

          x_j *= LU[N * j + j].real();

      for (int i = 0; i < N; i++) {
        clov_inv[ii] = workT[N * i + i].real();

      for (int i = 0; i < N; i++) {
        for (int j = i + 1; j < N; j++) {

          clov_inv[ii] = val.real();

          clov_inv[ii] = val.imag();
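  // solve_csw_qws_inv inverts the two 6x6 Hermitian clover blocks of each site.
  // The packed layout stores the real diagonal first and the complex upper
  // triangle afterwards; the factorization is an LDL^H-style in-place
  // elimination that keeps the reciprocal of each (real) pivot on the diagonal,
  // followed by a forward and a backward substitution per unit vector to build
  // the inverse, which is written back in the same triangular packing.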
template<typename AFIELD>

  if (sizeof(real_t) == 4) {
    qws_loadfs_dd_bridge_((scs_t *)v.ptr(0), (const float *)w.ptr(0));
  } else if (sizeof(real_t) == 8) {
    qws_loadfd_dd_bridge_((scd_t *)v.ptr(0), (const double *)w.ptr(0));
    vout.crucial(" %s: cannot happen, unknown sizeof(real_t)=%zu in convert()\n",
                 class_name.c_str(), sizeof(real_t));
template<typename AFIELD>
template<typename AFIELD>

  if (sizeof(real_t) == 4) {
    qws_storefs_dd_bridge_((float *)v.ptr(0), (const scs_t *)w.ptr(0));
  } else if (sizeof(real_t) == 8) {
    qws_storefd_dd_bridge_((double *)v.ptr(0), (const scd_t *)w.ptr(0));
    vout.crucial(" %s: cannot happen, unknown sizeof(real_t)=%zu in reverse()\n",
                 class_name.c_str(), sizeof(real_t));
template<typename AFIELD>

  mult_gm4(m_v2, m_v1);
template<typename AFIELD>

  } else if (mu == 1) {

  } else if (mu == 2) {

  } else if (mu == 3) {

    vout.crucial(m_vl, "%s: mult_up for direction %d is undefined.\n",
                 class_name.c_str(), mu);
template<typename AFIELD>

  } else if (mu == 1) {

  } else if (mu == 2) {

  } else if (mu == 3) {

    vout.crucial(m_vl, "%s: mult_dn for direction %d is undefined.\n",
                 class_name.c_str(), mu);
template<typename AFIELD>

template<typename AFIELD>

  if (ith == 0) m_mode = mode;
template<typename AFIELD>

  if (m_mode == "D") {

  } else if (m_mode == "DdagD") {

  } else if (m_mode == "Ddag") {

  } else if (m_mode == "H") {

    vout.crucial(m_vl, "%s: mode undefined.\n", class_name.c_str());
template<typename AFIELD>

  if (m_mode == "D") {

  } else if (m_mode == "DdagD") {

  } else if (m_mode == "Ddag") {

  } else if (m_mode == "H") {

    vout.crucial(m_vl, "%s: mode undefined.\n", class_name.c_str());
template<typename AFIELD>

template<typename AFIELD>

template<typename AFIELD>

template<typename AFIELD>

template<typename AFIELD>
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  for (int site = is; site < ns; ++site) {
    for (int ic = 0; ic < NC; ++ic) {
      for (int id = 0; id < ND; ++id) {
        int idx1 = 2 * (id + ND * ic) + NVCD * site;
        load_vec(wt, &w[VLEN * idx1], 2);
        scal_vec(wt, (real_t)(-1.0), 2);

        int id2  = (id + 2) % ND;
        int idx2 = 2 * (id2 + ND * ic) + NVCD * site;
        save_vec(&v[VLEN * idx2], wt, 2);
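  // The kernel above negates every color/spin component of w and stores it at
  // the spin index shifted by two ((id + 2) % ND), i.e. it swaps the upper and
  // lower spin doublets with an overall sign flip, processing one complex
  // number (two real_t) per VLEN-wide vector operation.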
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  for (int site = is; site < ns; ++site) {

    scal_vec(vt, a, NVCD);
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  for (int site = is; site < ns; ++site) {
    for (int ic = 0; ic < NC; ++ic) {
      for (int id = 0; id < ND2; ++id) {
        int idx1 = 2 * (id + ND * ic) + NVCD * site;
        load_vec(wt, &wp[VLEN * idx1], 2);
        save_vec(&vp[VLEN * idx1], wt, 2);

      for (int id = ND2; id < ND; ++id) {
        int idx1 = 2 * (id + ND * ic) + NVCD * site;
        load_vec(wt, &wp[VLEN * idx1], 2);
        scal_vec(wt, real_t(-1.0), 2);
        save_vec(&vp[VLEN * idx1], wt, 2);
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, m_Nstv);

  for (int site = is; site < ns; ++site) {

    for (int jd = 0; jd < ND2; ++jd) {
      for (int id = 0; id < ND; ++id) {
        int ig = VLEN * NDF * (site + m_Nstv * (id + ND * jd));
        load_vec(ut, &u[ig], NDF);

        for (int ic = 0; ic < NC; ++ic) {

          int id2 = (id + ND2) % ND;
          mult_ctv(wt1, &ut[ic2], &v1v[2 * id], NC);
          mult_ctv(wt2, &ut[ic2], &v1v[2 * id2], NC);

          int icd1 = 2 * (jd + ND * ic);
          int icd2 = 2 * (jd + ND2 + ND * ic);
          axpy_vec(&v2v[icd1], real_t(1.0), wt1, 2);
          axpy_vec(&v2v[icd2], real_t(1.0), wt2, 2);
template<typename AFIELD>

  if (sizeof(real_t) == 4) {

    clv_s_dd_(&deo, (scs_t *)v.ptr(0));

    clv_s_dd_(&deo, (scs_t *)v.ptr(0));

  } else if (sizeof(real_t) == 8) {

    clv_d_dd_(&deo, (scd_t *)v.ptr(0));

    clv_d_dd_(&deo, (scd_t *)v.ptr(0));
template<typename AFIELD>

  if (sizeof(real_t) == 4) {

    clv_matvec_s_dd_(&deo, (scs_t *)v.ptr(0), (clvs_t *)m_T_qws.ptr(0), (scs_t *)v.ptr(0));

    clv_matvec_s_dd_(&deo, (scs_t *)v.ptr(0), (clvs_t *)m_T_qws.ptr(0), (scs_t *)v.ptr(0));

  } else if (sizeof(real_t) == 8) {

    clv_matvec_d_dd_(&deo, (scd_t *)v.ptr(0), (clvd_t *)m_T_qws.ptr(0), (scd_t *)v.ptr(0));

    clv_matvec_d_dd_(&deo, (scd_t *)v.ptr(0), (clvd_t *)m_T_qws.ptr(0), (scd_t *)v.ptr(0));
template<typename AFIELD>

  if (do_comm_any > 0) {

                  buf1_zp, buf1_zm, buf1_tp, buf1_tm,
                  up, v1, &m_boundary[0], m_Nsize, do_comm);

                m_CKs, &m_boundary[0], m_Nsize, do_comm);

  if (do_comm_any > 0) {

                 buf2_xp, buf2_xm, buf2_yp, buf2_ym,
                 buf2_zp, buf2_zm, buf2_tp, buf2_tm,
                 m_CKs, &m_boundary[0], m_Nsize, do_comm);
template<typename AFIELD>

  int jeo = (ieo + m_Ieo) % 2;

               m_CKs, &m_boundary[0],
               m_Nsize, m_block_sizev, jeo);
template<typename AFIELD>

               m_CKs, &m_boundary[0],
               m_Nsize, m_block_sizev, -1);
template<typename AFIELD>

  if (sizeof(real_t) == 4) {

    ddd_s_noprl_((scs_t *)v.ptr(0), (scs_t *)const_cast<AFIELD *>(&w)->ptr(0));

  } else if (sizeof(real_t) == 8) {
    vout.crucial(m_vl, "%s: mult_D_qws cannot be called in a thread-parallel region\n",
                 class_name.c_str());
    ddd_d_((scd_t *)v.ptr(0), (scd_t *)const_cast<AFIELD *>(&w)->ptr(0));

  double w2 = w.norm2();
  double v2 = v.norm2();
template<typename AFIELD>

  aypx(-m_CKs, vp, wp);
template<typename AFIELD>

    scal_local(vp, real_t(-1.0));

  } else if (mu == 1) {

    scal_local(vp, real_t(-1.0));

  } else if (mu == 2) {

    scal_local(vp, real_t(-1.0));

  } else if (mu == 3) {

    scal_local(vp, real_t(-1.0));

  scal_local(vp, -m_CKs);
template<typename AFIELD>

    scal_local(vp, real_t(-1.0));

  } else if (mu == 1) {

    scal_local(vp, real_t(-1.0));

  } else if (mu == 2) {

    scal_local(vp, real_t(-1.0));

  } else if (mu == 3) {

    scal_local(vp, real_t(-1.0));

  scal_local(vp, -m_CKs);
template<typename AFIELD>

template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  for (int site = is; site < ns; ++site) {

    aypx_vec(a, vt, wt, NVCD);
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  clear_vec(vt, NVCD);

  for (int site = is; site < ns; ++site) {
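  // The directional hopping kernels below (x, y, z, t; forward and backward)
  // share one pattern: when the direction is split across ranks
  // (do_comm[] > 0), boundary sites are first spin-projected and packed into
  // buf1, the matching receive/send channels are started and waited on, and a
  // second site loop accumulates the bulk contribution plus, on boundary
  // sites, the part reconstructed from the received buffer buf2. When
  // isap != 0 the block-restricted links m_Ublock are used instead of m_U,
  // presumably for the SAP preconditioner.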
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[0] > 0) {
    for (int site = is; site < ns; ++site) {
      int ix   = site % m_Nxv;
      int iyzt = site / m_Nxv;

        mult_wilson_xp1(&buf1[ibf], &v1[VLEN * NVCD * site]);

    chrecv_up[0].start();
    chsend_dn[0].start();
    chrecv_up[0].wait();
    chsend_dn[0].wait();

  for (int site = is; site < ns; ++site) {
    int ix   = site % m_Nxv;
    int iyzt = site / m_Nxv;

    clear_vec(v2v, NVCD);

    if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
      int nei = ix + 1 + m_Nxv * iyzt;
      if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;

      mult_wilson_xpb(v2v, &u[VLEN * NDF * site], zL);

      mult_wilson_xpb(v2v, &u[VLEN * NDF * site], zL);
      mult_wilson_xp2(v2v, &u[VLEN * NDF * site], &buf2[ibf]);
template<typename AFIELD>

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[0] > 0) {
    for (int site = is; site < ns; ++site) {
      int ix   = site % m_Nxv;
      int iyzt = site / m_Nxv;
      if (ix == m_Nxv - 1) {

        mult_wilson_xm1(&buf1[ibf], &u[VLEN * NDF * site],

    chrecv_dn[0].start();
    chsend_up[0].start();
    chrecv_dn[0].wait();
    chsend_up[0].wait();

  for (int site = is; site < ns; ++site) {
    int ix   = site % m_Nxv;
    int iyzt = site / m_Nxv;

    clear_vec(v2v, NVCD);

    if ((ix > 0) || (do_comm[0] == 0)) {
      int nei = ix - 1 + m_Nxv * iyzt;
      if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;

      mult_wilson_xmb(v2v, uL, zL);

      shift_vec0_xfw(uL, &u[VLEN * NDF * site], NDF);
      mult_wilson_xmb(v2v, uL, zL);
      mult_wilson_xm2(v2v, &buf2[ibf]);
template<typename AFIELD>

  int Nxy = m_Nxv * m_Nyv;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[1] > 0) {
    for (int site = is; site < ns; ++site) {
      int ix  = site % m_Nxv;
      int iy  = (site / m_Nxv) % m_Nyv;
      int izt = site / Nxy;

        mult_wilson_yp1(&buf1[ibf], &v1[VLEN * NVCD * site]);

    chrecv_up[1].start();
    chsend_dn[1].start();
    chrecv_up[1].wait();
    chsend_dn[1].wait();

  for (int site = is; site < ns; ++site) {
    int ix  = site % m_Nxv;
    int iy  = (site / m_Nxv) % m_Nyv;
    int izt = site / Nxy;

    clear_vec(v2v, NVCD);

    if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
      int iy2 = (iy + 1) % m_Nyv;
      int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);

      mult_wilson_ypb(v2v, &u[VLEN * NDF * site], zL);

      mult_wilson_ypb(v2v, &u[VLEN * NDF * site], zL);
      mult_wilson_yp2(v2v, &u[VLEN * NDF * site], &buf2[ibf]);
template<typename AFIELD>

  int Nxy = m_Nxv * m_Nyv;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[1] > 0) {
    for (int site = is; site < ns; ++site) {
      int ix  = site % m_Nxv;
      int iy  = (site / m_Nxv) % m_Nyv;
      int izt = site / Nxy;
      if (iy == m_Nyv - 1) {

        mult_wilson_ym1(&buf1[ibf], &u[VLEN * NDF * site],

    chrecv_dn[1].start();
    chsend_up[1].start();
    chrecv_dn[1].wait();
    chsend_up[1].wait();

  for (int site = is; site < ns; ++site) {
    int ix  = site % m_Nxv;
    int iy  = (site / m_Nxv) % m_Nyv;
    int izt = site / Nxy;

    clear_vec(v2v, NVCD);

    if ((iy != 0) || (do_comm[idir] == 0)) {
      int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
      int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);

      mult_wilson_ymb(v2v, uL, zL);

      shift_vec0_yfw(uL, &u[VLEN * NDF * site], NDF);
      mult_wilson_ymb(v2v, uL, zL);
      mult_wilson_ym2(v2v, &buf2[ibf]);
template<typename AFIELD>

  int Nxy = m_Nxv * m_Nyv;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[2] > 0) {
    for (int site = is; site < ns; ++site) {
      int ixy = site % Nxy;
      int iz  = (site / Nxy) % m_Nz;
      int it  = site / (Nxy * m_Nz);

        int ibf = VLEN * NVC * ND2 * (ixy + Nxy * it);
        mult_wilson_zp1(&buf1[ibf], &v1[VLEN * NVCD * site]);

    chrecv_up[2].start();
    chsend_dn[2].start();
    chrecv_up[2].wait();
    chsend_dn[2].wait();

  for (int site = is; site < ns; ++site) {
    int ixy = site % Nxy;
    int iz  = (site / Nxy) % m_Nz;
    int it  = site / (Nxy * m_Nz);

    clear_vec(v2v, NVCD);

    if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
      int iz2 = (iz + 1) % m_Nz;
      int nei = ixy + Nxy * (iz2 + m_Nz * it);

      int ibf = VLEN * NVC * ND2 * (ixy + Nxy * it);
      mult_wilson_zp2(v2v, &u[VLEN * NDF * site], &buf2[ibf]);
template<typename AFIELD>

  int Nxy = m_Nxv * m_Nyv;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[2] > 0) {
    for (int site = is; site < ns; ++site) {
      int ixy = site % Nxy;
      int iz  = (site / Nxy) % m_Nz;
      int it  = site / (Nxy * m_Nz);
      if (iz == m_Nz - 1) {
        int ibf = VLEN * NVC * ND2 * (ixy + Nxy * it);
        mult_wilson_zm1(&buf1[ibf], &u[VLEN * NDF * site],

    chrecv_dn[2].start();
    chsend_up[2].start();
    chrecv_dn[2].wait();
    chsend_up[2].wait();

  for (int site = is; site < ns; ++site) {
    int ixy = site % Nxy;
    int iz  = (site / Nxy) % m_Nz;
    int it  = site / (Nxy * m_Nz);

    clear_vec(v2v, NVCD);

    if ((iz > 0) || (do_comm[2] == 0)) {
      int iz2 = (iz - 1 + m_Nz) % m_Nz;
      int nei = ixy + Nxy * (iz2 + m_Nz * it);

      int ibf = VLEN * NVC * ND2 * (ixy + Nxy * it);
      mult_wilson_zm2(v2v, &buf2[ibf]);
template<typename AFIELD>

  int Nxyz = m_Nxv * m_Nyv * m_Nz;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[3] > 0) {
    for (int site = is; site < ns; ++site) {
      int ixyz = site % Nxyz;
      int it   = site / Nxyz;

        mult_wilson_tp1_dirac(&buf1[VLEN * NVC * ND2 * ixyz],

    chrecv_up[3].start();
    chsend_dn[3].start();
    chrecv_up[3].wait();
    chsend_dn[3].wait();

  for (int site = is; site < ns; ++site) {
    int ixyz = site % Nxyz;
    int it   = site / Nxyz;

    clear_vec(v2v, NVCD);

    if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
      int it2 = (it + 1) % m_Nt;
      int nei = ixyz + Nxyz * it2;
      mult_wilson_tpb_dirac(v2v, &u[VLEN * NDF * site],

      mult_wilson_tp2_dirac(v2v, &u[VLEN * NDF * site],
template<typename AFIELD>

  int Nxyz = m_Nxv * m_Nyv * m_Nz;

  int ith, nth, is, ns;
  set_threadtask_mult(ith, nth, is, ns, m_Nstv);

  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
  if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);

  if (do_comm[3] > 0) {
    for (int site = is; site < ns; ++site) {
      int ixyz = site % Nxyz;
      int it   = site / Nxyz;
      if (it == m_Nt - 1) {
        mult_wilson_tm1_dirac(&buf1[VLEN * NVC * ND2 * ixyz],

    chrecv_dn[3].start();
    chsend_up[3].start();
    chrecv_dn[3].wait();
    chsend_up[3].wait();

  for (int site = is; site < ns; ++site) {
    int ixyz = site % Nxyz;
    int it   = site / Nxyz;

    clear_vec(v2v, NVCD);

    if ((it > 0) || (do_comm[3] == 0)) {
      int it2 = (it - 1 + m_Nt) % m_Nt;
      int nei = ixyz + Nxyz * it2;
      mult_wilson_tmb_dirac(v2v, &u[VLEN * NDF * nei],

      mult_wilson_tm2_dirac(v2v, &buf2[VLEN * NVC * ND2 * ixyz]);
template<typename AFIELD>

  double flop_site, flop;

  if (m_repr == "Dirac") {
    flop_site = static_cast<double>(
      m_Nc * m_Nd * (4 + 6 * (4 * m_Nc + 2) + 2 * (4 * m_Nc + 1)));
  } else if (m_repr == "Chiral") {
    flop_site = static_cast<double>(
      m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
    vout.crucial(m_vl, "%s: input repr is undefined.\n", class_name.c_str());
  flop = flop_site * static_cast<double>(Lvol);
  if ((mode == "DdagD") || (mode == "DDdag")) flop *= 2.0;
template<typename AFIELD>

  int NBx = m_Nx / m_block_size[0];
  int NBy = m_Ny / m_block_size[1];
  int NBz = m_Nz / m_block_size[2];
  int NBt = m_Nt / m_block_size[3];
  size_t nvol2 = NBx * NBy * NBz * NBt / 2;

  int block_x = m_block_size[0];
  int block_y = m_block_size[1];
  int block_z = m_block_size[2];
  int block_t = m_block_size[3];

  int clover_site = 4 * Nc * Nc * Nd * Nd;
  int hop_x_site  = Nc * Nd * (4 * Nc + 2);
  int hop_y_site  = Nc * Nd * (4 * Nc + 2);
  int hop_z_site  = Nc * Nd * (4 * Nc + 2);
  int hop_t_site  = Nc * Nd * (4 * Nc + 1);
  int accum_site  = 4 * Nc * Nd;

  double flop_x = 2.0 * static_cast<double>(hop_x_site
                  * (block_x - 1) * block_y * block_z * block_t);
  double flop_y = 2.0 * static_cast<double>(hop_y_site
                  * block_x * (block_y - 1) * block_z * block_t);
  double flop_z = 2.0 * static_cast<double>(hop_z_site
                  * block_x * block_y * (block_z - 1) * block_t);
  double flop_t = 2.0 * static_cast<double>(hop_t_site
                  * block_x * block_y * block_z * (block_t - 1));
  double flop_sap_mult = flop_x + flop_y + flop_z + flop_t
                         + static_cast<double>(clover_site + accum_site);

  return flop_sap_mult * static_cast<double>(nvol2)
         * static_cast<double>(NPE);