12 template<
typename AFIELD>
 
   16 template<
typename AFIELD>
 
   32   vout.
general(m_vl, 
"%s: construction\n", class_name.c_str());
 
   38     if (repr != 
"Dirac") {
 
   39       vout.
crucial(
"  Error at %s: unsupported gamma-matrix type: %s\n",
 
   40                    class_name.c_str(), repr.c_str());
 
   54   m_Ndf = 2 * m_Nc * m_Nc;
 
   65   m_Nstv = m_Nst / 
VLEN;
 
   67   if (
VLENX * m_Nxv != m_Nx) {
 
   68     vout.
crucial(m_vl, 
"%s: Nx must be multiple of VLENX.\n",
 
   72   if (
VLENY * m_Nyv != m_Ny) {
 
   73     vout.
crucial(m_vl, 
"%s: Ny must be multiple of VLENY.\n",
 
   88   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
   91     do_comm_any += do_comm[mu];
 
   92     vout.
general(
"  do_comm[%d] = %d\n", mu, do_comm[mu]);
 
   95   m_bdsize.resize(m_Ndim);
 
   97   m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
 
   98   m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
 
   99   m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
 
  100   m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
 
  105   m_U.reset(
NDF, m_Nst, m_Ndim);
 
  108   int NinF = 2 * m_Nc * m_Nd;
 
  109   m_v2.reset(NinF, m_Nst, 1);
 
  113   set_parameters(params);
 
  123 template<
typename AFIELD>
 
  126   chsend_up.resize(m_Ndim);
 
  127   chrecv_up.resize(m_Ndim);
 
  128   chsend_dn.resize(m_Ndim);
 
  129   chrecv_dn.resize(m_Ndim);
 
  131   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  133     size_t Nvsize = m_bdsize[mu] * 
sizeof(
real_t);
 
  135     chsend_dn[mu].send_init(Nvsize, mu, -1);
 
  136     chsend_up[mu].send_init(Nvsize, mu, 1);
 
  138     chrecv_up[mu].recv_init(Nvsize, mu, 1);
 
  139     chrecv_dn[mu].recv_init(Nvsize, mu, -1);
 
  141     void *buf_up = (
void *)chsend_dn[mu].ptr();
 
  142     chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
 
  143     void *buf_dn = (
void *)chsend_up[mu].ptr();
 
  144     chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
 
  147     if (do_comm[mu] == 1) {
 
  148       chset_send.append(chsend_up[mu]);
 
  149       chset_send.append(chsend_dn[mu]);
 
  150       chset_recv.append(chrecv_up[mu]);
 
  151       chset_recv.append(chrecv_dn[mu]);
 
  158 template<
typename AFIELD>
 
  168 template<
typename AFIELD>
 
  178     vout.
crucial(m_vl, 
"Error at %s: input parameter not found.\n",
 
  183   set_parameters(
real_t(kappa), bc);
 
  188 template<
typename AFIELD>
 
  190                                           const std::vector<int> bc)
 
  192   assert(bc.size() == m_Ndim);
 
  200     m_boundary.resize(m_Ndim);
 
  201     for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  202       m_boundary[mu] = bc[mu];
 
  206   vout.
general(m_vl, 
"%s: set parameters\n", class_name.c_str());
 
  207   vout.
general(m_vl, 
"  gamma-matrix type = %s\n", m_repr.c_str());
 
  209   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  210     vout.
general(m_vl, 
"  boundary[%d] = %2d\n", mu, m_boundary[mu]);
 
  218 template<
typename AFIELD>
 
  221   params.
set_double(
"hopping_parameter", 
double(m_CKs));
 
  223   params.
set_string(
"gamma_matrix_type", m_repr);
 
  230 template<
typename AFIELD>
 
  235   vout.
detailed(m_vl, 
"%s: set_config is called: num_threads = %d\n",
 
  236                 class_name.c_str(), nth);
 
  244   vout.
detailed(m_vl, 
"%s: set_config finished\n", class_name.c_str());
 
  249 template<
typename AFIELD>
 
  262 template<
typename AFIELD>
 
  269   if (ith == 0) m_conf = u;
 
  280 template<
typename AFIELD>
 
  291 template<
typename AFIELD>
 
  302 template<
typename AFIELD>
 
  310   } 
else if (mu == 1) {
 
  312   } 
else if (mu == 2) {
 
  314   } 
else if (mu == 3) {
 
  317     vout.
crucial(m_vl, 
"%s: mult_up for %d direction is undefined.",
 
  318                  class_name.c_str(), mu);
 
  325 template<
typename AFIELD>
 
  333   } 
else if (mu == 1) {
 
  335   } 
else if (mu == 2) {
 
  337   } 
else if (mu == 3) {
 
  340     vout.
crucial(m_vl, 
"%s: mult_dn for %d direction is undefined.",
 
  341                  class_name.c_str(), mu);
 
  348 template<
typename AFIELD>
 
  354   if (ith == 0) m_mode = mode;
 
  361 template<
typename AFIELD>
 
  369 template<
typename AFIELD>
 
  374   } 
else if (m_mode == 
"DdagD") {
 
  376   } 
else if (m_mode == 
"Ddag") {
 
  378   } 
else if (m_mode == 
"H") {
 
  381     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
  388 template<
typename AFIELD>
 
  393   } 
else if (m_mode == 
"DdagD") {
 
  395   } 
else if (m_mode == 
"Ddag") {
 
  397   } 
else if (m_mode == 
"H") {
 
  400     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
  407 template<
typename AFIELD>
 
  415 template<
typename AFIELD>
 
  426 template<
typename AFIELD>
 
  436 template<
typename AFIELD>
 
  453 template<
typename AFIELD>
 
  459   int ith, nth, is, ns;
 
  460   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  464   for (
int site = is; site < ns; ++site) {
 
  465     for (
int ic = 0; ic < 
NC; ++ic) {
 
  466       for (
int id = 0; 
id < 
ND2; ++id) {
 
  467         int idx1 = 2 * (
id + 
ND * ic) + 
NVCD * site;
 
  468         load_vec(wt, &wp[
VLEN * idx1], 2);
 
  469         save_vec(&vp[
VLEN * idx1], wt, 2);
 
  472       for (
int id = 
ND2; 
id < 
ND; ++id) {
 
  473         int idx1 = 2 * (
id + 
ND * ic) + 
NVCD * site;
 
  474         load_vec(wt, &wp[
VLEN * idx1], 2);
 
  475         scal_vec(wt, 
real_t(-1.0), 2);
 
  476         save_vec(&vp[
VLEN * idx1], wt, 2);
 
  486 template<
typename AFIELD>
 
  497   if (do_comm_any > 0) {
 
  498     if (ith == 0) chset_recv.start();
 
  510                                    buf1_zp, buf1_zm, buf1_tp, buf1_tm,
 
  511                                    up, v1, &m_boundary[0], m_Nsize, do_comm);
 
  515     if (ith == 0) chset_send.start();
 
  519                                     m_CKs, &m_boundary[0], m_Nsize, do_comm);
 
  521   if (do_comm_any > 0) {
 
  522     if (ith == 0) chset_recv.wait();
 
  536                                    buf2_xp, buf2_xm, buf2_yp, buf2_ym,
 
  537                                    buf2_zp, buf2_zm, buf2_tp, buf2_tm,
 
  538                                    m_CKs, &m_boundary[0], m_Nsize, do_comm);
 
  540     if (ith == 0) chset_send.wait();
 
  548 template<
typename AFIELD>
 
  565   aypx(-m_CKs, vp, wp);
 
  572 template<
typename AFIELD>
 
  581 template<
typename AFIELD>
 
  584   int ith, nth, is, ns;
 
  585   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  589   for (
int site = is; site < ns; ++site) {
 
  592     aypx_vec(a, vt, wt, 
NVCD);
 
  599 template<
typename AFIELD>
 
  602   int ith, nth, is, ns;
 
  603   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  608   for (
int site = is; site < ns; ++site) {
 
  615 template<
typename AFIELD>
 
  620   int ith, nth, is, ns;
 
  621   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  628   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  632   if (do_comm[0] > 0) {
 
  633     for (
int site = is; site < ns; ++site) {
 
  634       int ix   = site % m_Nxv;
 
  635       int iyzt = site / m_Nxv;
 
  638         mult_wilson_xp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
  646       chrecv_up[0].start();
 
  647       chsend_dn[0].start();
 
  654   for (
int site = is; site < ns; ++site) {
 
  655     int ix   = site % m_Nxv;
 
  656     int iyzt = site / m_Nxv;
 
  659     clear_vec(v2v, 
NVCD);
 
  663     if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
 
  664       int nei = ix + 1 + m_Nxv * iyzt;
 
  665       if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
 
  667       mult_wilson_xpb(v2v, &u[
VLEN * 
NDF * site], zL);
 
  671       mult_wilson_xpb(v2v, &u[
VLEN * 
NDF * site], zL);
 
  672       mult_wilson_xp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
  683 template<
typename AFIELD>
 
  688   int ith, nth, is, ns;
 
  689   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  696   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  700   if (do_comm[0] > 0) {
 
  701     for (
int site = is; site < ns; ++site) {
 
  702       int ix   = site % m_Nxv;
 
  703       int iyzt = site / m_Nxv;
 
  704       if (ix == m_Nxv - 1) {
 
  706         mult_wilson_xm1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
  714       chrecv_dn[0].start();
 
  715       chsend_up[0].start();
 
  722   for (
int site = is; site < ns; ++site) {
 
  723     int ix   = site % m_Nxv;
 
  724     int iyzt = site / m_Nxv;
 
  729     clear_vec(v2v, 
NVCD);
 
  731     if ((ix > 0) || (do_comm[0] == 0)) {
 
  732       int nei = ix - 1 + m_Nxv * iyzt;
 
  733       if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;
 
  736       mult_wilson_xmb(v2v, uL, zL);
 
  740       shift_vec0_xfw(uL, &u[
VLEN * 
NDF * site], 
NDF);
 
  741       mult_wilson_xmb(v2v, uL, zL);
 
  742       mult_wilson_xm2(v2v, &buf2[ibf]);
 
  753 template<
typename AFIELD>
 
  757   int Nxy  = m_Nxv * m_Nyv;
 
  759   int ith, nth, is, ns;
 
  760   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  765   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  769   if (do_comm[1] > 0) {
 
  770     for (
int site = is; site < ns; ++site) {
 
  771       int ix  = site % m_Nxv;
 
  772       int iy  = (site / m_Nxv) % m_Nyv;
 
  773       int izt = site / Nxy;
 
  776         mult_wilson_yp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
  784       chrecv_up[1].start();
 
  785       chsend_dn[1].start();
 
  793   for (
int site = is; site < ns; ++site) {
 
  794     int ix  = site % m_Nxv;
 
  795     int iy  = (site / m_Nxv) % m_Nyv;
 
  796     int izt = site / Nxy;
 
  799     clear_vec(v2v, 
NVCD);
 
  803     if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
 
  804       int iy2 = (iy + 1) % m_Nyv;
 
  805       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
  807       mult_wilson_ypb(v2v, &u[
VLEN * 
NDF * site], zL);
 
  811       mult_wilson_ypb(v2v, &u[
VLEN * 
NDF * site], zL);
 
  812       mult_wilson_yp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
  823 template<
typename AFIELD>
 
  827   int Nxy  = m_Nxv * m_Nyv;
 
  829   int ith, nth, is, ns;
 
  830   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  835   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  839   if (do_comm[1] > 0) {
 
  840     for (
int site = is; site < ns; ++site) {
 
  841       int ix  = site % m_Nxv;
 
  842       int iy  = (site / m_Nxv) % m_Nyv;
 
  843       int izt = site / Nxy;
 
  844       if (iy == m_Nyv - 1) {
 
  846         mult_wilson_ym1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
  855       chrecv_dn[1].start();
 
  856       chsend_up[1].start();
 
  864   for (
int site = is; site < ns; ++site) {
 
  865     int ix  = site % m_Nxv;
 
  866     int iy  = (site / m_Nxv) % m_Nyv;
 
  867     int izt = site / Nxy;
 
  870     clear_vec(v2v, 
NVCD);
 
  875     if ((iy != 0) || (do_comm[idir] == 0)) {
 
  876       int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
 
  877       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
  880       mult_wilson_ymb(v2v, uL, zL);
 
  884       shift_vec0_yfw(uL, &u[
VLEN * 
NDF * site], 
NDF);
 
  885       mult_wilson_ymb(v2v, uL, zL);
 
  886       mult_wilson_ym2(v2v, &buf2[ibf]);
 
  897 template<
typename AFIELD>
 
  901   int Nxy  = m_Nxv * m_Nyv;
 
  903   int ith, nth, is, ns;
 
  904   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  909   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  913   if (do_comm[2] > 0) {
 
  914     for (
int site = is; site < ns; ++site) {
 
  915       int ixy = site % Nxy;
 
  916       int iz  = (site / Nxy) % m_Nz;
 
  917       int it  = site / (Nxy * m_Nz);
 
  919         int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
  920         mult_wilson_zp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
  928       chrecv_up[2].start();
 
  929       chsend_dn[2].start();
 
  937   for (
int site = is; site < ns; ++site) {
 
  938     int ixy = site % Nxy;
 
  939     int iz  = (site / Nxy) % m_Nz;
 
  940     int it  = site / (Nxy * m_Nz);
 
  943     clear_vec(v2v, 
NVCD);
 
  945     if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
 
  946       int iz2 = (iz + 1) % m_Nz;
 
  947       int nei = ixy + Nxy * (iz2 + m_Nz * it);
 
  950       int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
  951       mult_wilson_zp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
  962 template<
typename AFIELD>
 
  966   int Nxy  = m_Nxv * m_Nyv;
 
  968   int ith, nth, is, ns;
 
  969   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
  974   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
  978   if (do_comm[2] > 0) {
 
  979     for (
int site = is; site < ns; ++site) {
 
  980       int ixy = site % Nxy;
 
  981       int iz  = (site / Nxy) % m_Nz;
 
  982       int it  = site / (Nxy * m_Nz);
 
  983       if (iz == m_Nz - 1) {
 
  984         int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
  985         mult_wilson_zm1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
  994       chrecv_dn[2].start();
 
  995       chsend_up[2].start();
 
 1003   for (
int site = is; site < ns; ++site) {
 
 1004     int ixy = site % Nxy;
 
 1005     int iz  = (site / Nxy) % m_Nz;
 
 1006     int it  = site / (Nxy * m_Nz);
 
 1009     clear_vec(v2v, 
NVCD);
 
 1011     if ((iz > 0) || (do_comm[2] == 0)) {
 
 1012       int iz2 = (iz - 1 + m_Nz) % m_Nz;
 
 1013       int nei = ixy + Nxy * (iz2 + m_Nz * it);
 
 1016       int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
 1017       mult_wilson_zm2(v2v, &buf2[ibf]);
 
 1028 template<
typename AFIELD>
 
 1032   int Nxyz = m_Nxv * m_Nyv * m_Nz;
 
 1034   int ith, nth, is, ns;
 
 1035   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1040   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1044   if (do_comm[3] > 0) {
 
 1045     for (
int site = is; site < ns; ++site) {
 
 1046       int ixyz = site % Nxyz;
 
 1047       int it   = site / Nxyz;
 
 1049         mult_wilson_tp1_dirac(&buf1[
VLEN * 
NVC * 
ND2 * ixyz],
 
 1058       chrecv_up[3].start();
 
 1059       chsend_dn[3].start();
 
 1060       chrecv_up[3].wait();
 
 1061       chsend_dn[3].wait();
 
 1067   for (
int site = is; site < ns; ++site) {
 
 1068     int ixyz = site % Nxyz;
 
 1069     int it   = site / Nxyz;
 
 1072     clear_vec(v2v, 
NVCD);
 
 1074     if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
 
 1075       int it2 = (it + 1) % m_Nt;
 
 1076       int nei = ixyz + Nxyz * it2;
 
 1077       mult_wilson_tpb_dirac(v2v, &u[
VLEN * 
NDF * site],
 
 1080       mult_wilson_tp2_dirac(v2v, &u[
VLEN * 
NDF * site],
 
 1092 template<
typename AFIELD>
 
 1096   int Nxyz = m_Nxv * m_Nyv * m_Nz;
 
 1098   int ith, nth, is, ns;
 
 1099   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1104   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1108   if (do_comm[3] > 0) {
 
 1109     for (
int site = is; site < ns; ++site) {
 
 1110       int ixyz = site % Nxyz;
 
 1111       int it   = site / Nxyz;
 
 1112       if (it == m_Nt - 1) {
 
 1113         mult_wilson_tm1_dirac(&buf1[
VLEN * 
NVC * 
ND2 * ixyz],
 
 1122       chrecv_dn[3].start();
 
 1123       chsend_up[3].start();
 
 1124       chrecv_dn[3].wait();
 
 1125       chsend_up[3].wait();
 
 1130   for (
int site = is; site < ns; ++site) {
 
 1131     int ixyz = site % Nxyz;
 
 1132     int it   = site / Nxyz;
 
 1135     clear_vec(v2v, 
NVCD);
 
 1137     if ((it > 0) || (do_comm[3] == 0)) {
 
 1138       int it2 = (it - 1 + m_Nt) % m_Nt;
 
 1139       int nei = ixyz + Nxyz * it2;
 
 1140       mult_wilson_tmb_dirac(v2v, &u[
VLEN * 
NDF * nei],
 
 1143       mult_wilson_tm2_dirac(v2v, &buf2[
VLEN * 
NVC * 
ND2 * ixyz]);
 
 1154 template<
typename AFIELD>
 
 1161   double flop_site, flop;
 
 1163   if (m_repr == 
"Dirac") {
 
 1164     flop_site = 
static_cast<double>(
 
 1165       m_Nc * m_Nd * (4 + 6 * (4 * m_Nc + 2) + 2 * (4 * m_Nc + 1)));
 
 1166   } 
else if (m_repr == 
"Chiral") {
 
 1167     flop_site = 
static_cast<double>(
 
 1168       m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
 
 1170     vout.
crucial(m_vl, 
"%s: input repr is undefined.\n");
 
 1174   flop = flop_site * 
static_cast<double>(Lvol);
 
 1175   if ((mode == 
"DdagD") || (mode == 
"DDdag")) flop *= 2.0;