16 template<
typename AFIELD>
 
   21 template<
typename AFIELD>
 
   34   m_Ndf = 2 * m_Nc * m_Nc;
 
   45   m_Nstv = m_Nst / 
VLEN;
 
   53   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
   56     do_comm_any += do_comm[mu];
 
   57     vout.
general(
"  do_comm[%d] = %d\n", mu, do_comm[mu]);
 
   60   m_bdsize.resize(m_Ndim);
 
   62   m_bdsize[0] = m_Nvc * Nd2 * m_Ny * m_Nz * m_Nt;
 
   63   m_bdsize[1] = m_Nvc * Nd2 * m_Nx * m_Nz * m_Nt;
 
   64   m_bdsize[2] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nt;
 
   65   m_bdsize[3] = m_Nvc * Nd2 * m_Nx * m_Ny * m_Nz;
 
   73   m_U.reset(
NDF, m_Nst, m_Ndim);
 
   76   m_Ublock.reset(
NDF, m_Nst, m_Ndim);
 
   79   int NinF = 2 * m_Nc * m_Nd;
 
   80   m_v1.reset(NinF, m_Nst, 1);
 
   81   m_v2.reset(NinF, m_Nst, 1);
 
   83   int Ndm2 = m_Nd * m_Nd / 2;
 
   84   m_T.reset(
NDF, m_Nst, Ndm2);
 
   86   m_T_qws.reset(72, m_Nst, 1);  
 
   91 template<
typename AFIELD>
 
   94   chsend_up.resize(m_Ndim);
 
   95   chrecv_up.resize(m_Ndim);
 
   96   chsend_dn.resize(m_Ndim);
 
   97   chrecv_dn.resize(m_Ndim);
 
   99   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  100     size_t Nvsize = m_bdsize[mu] * 
sizeof(
real_t);
 
  102     chsend_dn[mu].send_init(Nvsize, mu, -1);
 
  103     chsend_up[mu].send_init(Nvsize, mu, 1);
 
  105     chrecv_up[mu].recv_init(Nvsize, mu, 1);
 
  106     chrecv_dn[mu].recv_init(Nvsize, mu, -1);
 
  108     void *buf_up = (
void *)chsend_dn[mu].ptr();
 
  109     chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
 
  110     void *buf_dn = (
void *)chsend_up[mu].ptr();
 
  111     chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
 
  114     if (do_comm[mu] == 1) {
 
  115       chset_send.append(chsend_up[mu]);
 
  116       chset_send.append(chsend_dn[mu]);
 
  117       chset_recv.append(chrecv_up[mu]);
 
  118       chset_recv.append(chrecv_dn[mu]);
 
  125 template<
typename AFIELD>
 
  133   delete m_fopr_csw_chiral;
 
  138 template<
typename AFIELD>
 
  141   const string str_vlevel = params.
get_string(
"verbose_level");
 
  147   std::vector<int> block_size;
 
  154     vout.
crucial(m_vl, 
"Error at %s: input parameter not found.\n",
 
  159   set_parameters(
real_t(kappa), bc, block_size);
 
  162   params_csw_chiral.
set_string(
"gamma_matrix_type", 
"Chiral");
 
  163   m_fopr_csw_chiral->set_parameters(params_csw_chiral);
 
  165 #ifdef CHIRAL_ROTATION 
  168   m_fopr_csw->set_parameters(params);
 
  174 template<
typename AFIELD>
 
  177   const std::vector<int> bc,
 
  178   const std::vector<int> block_size)
 
  187     assert(bc.size() == m_Ndim);
 
  188     m_boundary.resize(m_Ndim);
 
  189     m_block_size.resize(m_Ndim);
 
  190     for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  191       m_boundary[mu]   = bc[mu];
 
  192       m_block_size[mu] = block_size[mu];
 
  198   vout.
general(m_vl, 
"Parameters of %s:\n", class_name.c_str());
 
  200   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  201     vout.
general(m_vl, 
"  boundary[%d] = %2d\n", mu, m_boundary[mu]);
 
  203   for (
int mu = 0; mu < m_Ndim; ++mu) {
 
  204     vout.
general(m_vl, 
"  block_size[%d] = %2d\n", mu, m_block_size[mu]);
 
  207   int rem = m_Nx % m_block_size[0]
 
  208             + m_Ny % m_block_size[1]
 
  209             + m_Nz % m_block_size[2]
 
  210             + m_Nt % m_block_size[3];
 
  212     vout.
crucial(m_vl, 
"%s: block_size is irelevant.\n",
 
  219     m_block_sizev[0] = m_block_size[0] / 
VLENX;
 
  220     m_block_sizev[1] = m_block_size[1] / 
VLENY;
 
  221     m_block_sizev[2] = m_block_size[2];
 
  222     m_block_sizev[3] = m_block_size[3];
 
  223     if ((m_block_sizev[0] * 
VLENX != m_block_size[0]) ||
 
  224         (m_block_sizev[1] * 
VLENY != m_block_size[1])) {
 
  225       vout.
crucial(m_vl, 
"%s: bad blocksize in XY: must be divided by VLENX=%d, VLENY=%d but are %d, %d\n", class_name.c_str(), 
VLENX, 
VLENY, m_block_size[0], m_block_size[1]);
 
  229     int NBx  = m_Nx / m_block_size[0];
 
  230     int NBy  = m_Ny / m_block_size[1];
 
  231     int NBz  = m_Nz / m_block_size[2];
 
  232     int NBt  = m_Nt / m_block_size[3];
 
  237     m_Ieo = (NBx * ipex + NBy * ipey + NBz * ipez + NBt * ipet) % 2;
 
  241   QWS_lib::init_qws(&m_boundary[0]);
 
  249 template<
typename AFIELD>
 
  256   m_fopr_csw->set_config(u);
 
  257   m_fopr_csw_chiral->set_config(u);
 
  273       std::string str_reconst;
 
  275       if (
sizeof(
real_t) == 4) {
 
  276         num_reconst = SU3_RECONSTRUCT_S;
 
  277         str_reconst = 
"SU3_RECONSTRUCT_S";
 
  278       } 
else if (
sizeof(
real_t) == 8) {
 
  279         num_reconst = SU3_RECONSTRUCT_D;
 
  280         str_reconst = 
"SU3_RECONSTRUCT_D";
 
  282       for (
int mu = 0; mu < m_Ndim - 1; ++mu) {
 
  283         if (m_boundary[mu] != 1) {
 
  284           if (num_reconst == 18) {
 
  285             set_boundary_config(m_U, mu);
 
  287             vout.
crucial(
"%s: QWS is not available for\n", class_name.c_str());
 
  288             vout.
crucial(
"  boundary[%d] = %d\n", mu, m_boundary[mu]);
 
  289             vout.
crucial(
"  %s = %d\n", str_reconst.c_str(), num_reconst);
 
  297     double kappa = (double)m_CKs;
 
  298     if (
sizeof(
real_t) == 8) {
 
  299       qws_loadg_dd_bridge_((
double *)m_U.ptr(0), &kappa);
 
  300     } 
else if (
sizeof(
real_t) == 4) {
 
  301       qws_loadgs_dd_bridge_((
float *)m_U.ptr(0), &kappa);
 
  308       if (m_boundary[mu] != 1) {
 
  309         set_boundary_config(m_U, mu);
 
  320     set_block_config(m_Ublock);
 
  323 #ifdef CHIRAL_ROTATION 
  332 template<
typename AFIELD>
 
  336   int ipe[4], Nsize[4], Lsize[4], j[4];
 
  339   for (
int i = 0; i < m_Ndim; ++i) {
 
  345   for (
int t = 0; t < m_Nt; ++t) {
 
  346     j[3] = ipe[3] * Nsize[3] + t;
 
  347     for (
int z = 0; z < m_Nz; ++z) {
 
  348       j[2] = ipe[2] * Nsize[2] + z;
 
  349       for (
int y = 0; y < m_Ny; ++y) {
 
  350         j[1] = ipe[1] * Nsize[1] + y;
 
  351         for (
int x = 0; x < m_Nx; ++x) {
 
  352           j[0] = ipe[0] * Nsize[0] + x;
 
  353           int site = x + m_Nx * (y + m_Ny * (z + m_Nz * t));
 
  355           if (j[mu] == Lsize[mu] - 1) {
 
  356             double bc = (double)(m_boundary[mu]);
 
  357             for (
int in = 0; in < m_Ndf; ++in) {
 
  358               int    i  = index_lex.idx_G(in, site, mu);
 
  359               double uv = bc * U.
cmp(i);
 
  371 template<
typename AFIELD>
 
  380   if ((m_boundary[mu] != 1) && (ipex == npex - 1)) {
 
  382     int    Nyzt = m_Ny * m_Nz * m_Nt;
 
  384     int ith, nth, is, ns;
 
  385     set_threadtask_mult(ith, nth, is, ns, Nyzt);
 
  387     for (
int iyzt = is; iyzt < ns; ++iyzt) {
 
  388       int site = m_Nx - 1 + m_Nx * iyzt;
 
  389       for (
int in = 0; in < 
NDF; ++in) {
 
  390         int    i  = index.idx_G(in, site, mu);
 
  402   if ((m_boundary[mu] != 1) && (ipey == npey - 1)) {
 
  404     int    Nxzt = m_Nx * m_Nz * m_Nt;
 
  406     int ith, nth, is, ns;
 
  407     set_threadtask_mult(ith, nth, is, ns, Nxzt);
 
  409     for (
int ixzt = is; ixzt < ns; ++ixzt) {
 
  410       int ix   = ixzt % m_Nx;
 
  411       int izt  = ixzt / m_Nx;
 
  412       int site = ix + m_Nx * (m_Ny - 1 + m_Ny * izt);
 
  413       for (
int in = 0; in < 
NDF; ++in) {
 
  414         int    i  = index.idx_G(in, site, mu);
 
  426   if ((m_boundary[mu] != 1) && (ipez == npez - 1)) {
 
  428     int    Nxy  = m_Nx * m_Ny;
 
  429     int    Nxyt = Nxy * m_Nt;
 
  431     int ith, nth, is, ns;
 
  432     set_threadtask_mult(ith, nth, is, ns, Nxyt);
 
  434     for (
int ixyt = is; ixyt < ns; ++ixyt) {
 
  435       int ixy  = ixyt % Nxy;
 
  437       int site = ixy + Nxy * (m_Nz - 1 + m_Nz * it);
 
  438       for (
int in = 0; in < 
NDF; ++in) {
 
  439         int    i  = index.idx_G(in, site, mu);
 
  451   if ((m_boundary[mu] != 1) && (ipet == npet - 1)) {
 
  453     int    Nxyz = m_Nx * m_Ny * m_Nz;
 
  455     int ith, nth, is, ns;
 
  456     set_threadtask_mult(ith, nth, is, ns, Nxyz);
 
  458     for (
int ixyz = is; ixyz < ns; ++ixyz) {
 
  459       int site = ixyz + Nxyz * (m_Nt - 1);
 
  460       for (
int in = 0; in < 
NDF; ++in) {
 
  461         int    i  = index.idx_G(in, site, mu);
 
  472 template<
typename AFIELD>
 
  477   int ith, nth, is, ns;
 
  478   set_threadtask_mult(ith, nth, is, ns, m_Nst);
 
  480   for (
int site = is; site < ns; ++site) {
 
  481     int ix = site % m_Nx;
 
  482     int iy = (site / m_Nx) % m_Ny;
 
  483     int iz = (site / (m_Nx * m_Ny)) % m_Nz;
 
  484     int it = site / (m_Nx * m_Ny * m_Nz);
 
  487     if ((ix + 1) % m_block_size[mu] == 0) {
 
  488       for (
int in = 0; in < 
NDF; ++in) {
 
  489         int i = index.idx_G(in, site, mu);
 
  495     if ((iy + 1) % m_block_size[mu] == 0) {
 
  496       for (
int in = 0; in < 
NDF; ++in) {
 
  497         int i = index.idx_G(in, site, mu);
 
  503     if ((iz + 1) % m_block_size[mu] == 0) {
 
  504       for (
int in = 0; in < 
NDF; ++in) {
 
  505         int i = index.idx_G(in, site, mu);
 
  511     if ((it + 1) % m_block_size[mu] == 0) {
 
  512       for (
int in = 0; in < 
NDF; ++in) {
 
  513         int i = index.idx_G(in, site, mu);
 
  522 template<
typename AFIELD>
 
  533   Field_F v(m_Nst, 1), w(m_Nst, 1);
 
  537   m_fopr_csw->set_mode(
"D");
 
  539   int ith, nth, is, ns;
 
  540   set_threadtask_mult(ith, nth, is, ns, m_Nst);
 
  544     for (
int id = 0; 
id < m_Nd / 2; ++id) {
 
  545       for (
int ic = 0; ic < m_Nc; ++ic) {
 
  547         for (
int site = is; site < ns; ++site) {
 
  548           w.
set_r(ic, 
id, site, 0, 1.0);
 
  552         m_fopr_csw->mult(v, w);
 
  555         for (
int site = is; site < ns; ++site) {
 
  556           for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
 
  557             real_t vt_r  = v.cmp_r(ic2, 0, site, 0);
 
  558             real_t vt_i  = v.cmp_i(ic2, 0, site, 0);
 
  559             int    idx_r = index.idx_Gr(ic, ic2, site, 
id + 0);
 
  560             int    idx_i = index.idx_Gi(ic, ic2, site, 
id + 0);
 
  561             m_T.set(idx_r, vt_r);
 
  562             m_T.set(idx_i, vt_i);
 
  565           for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
 
  566             real_t vt_r  = v.cmp_r(ic2, 1, site, 0);
 
  567             real_t vt_i  = v.cmp_i(ic2, 1, site, 0);
 
  568             int    idx_r = index.idx_Gr(ic, ic2, site, 
id + 4);
 
  569             int    idx_i = index.idx_Gi(ic, ic2, site, 
id + 4);
 
  570             m_T.set(idx_r, vt_r);
 
  571             m_T.set(idx_i, vt_i);
 
  574           for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
 
  575             real_t vt_r  = -v.cmp_r(ic2, 2, site, 0);
 
  576             real_t vt_i  = -v.cmp_i(ic2, 2, site, 0);
 
  577             int    idx_r = index.idx_Gr(ic, ic2, site, 
id + 2);
 
  578             int    idx_i = index.idx_Gi(ic, ic2, site, 
id + 2);
 
  579             m_T.set(idx_r, vt_r);
 
  580             m_T.set(idx_i, vt_i);
 
  583           for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
 
  584             real_t vt_r  = -v.cmp_r(ic2, 3, site, 0);
 
  585             real_t vt_i  = -v.cmp_i(ic2, 3, site, 0);
 
  586             int    idx_r = index.idx_Gr(ic, ic2, site, 
id + 6);
 
  587             int    idx_i = index.idx_Gi(ic, ic2, site, 
id + 6);
 
  588             m_T.set(idx_r, vt_r);
 
  589             m_T.set(idx_i, vt_i);
 
  596     real_t kappaR = 1.0 / m_CKs;
 
  605 template<
typename AFIELD>
 
  615   Field_F v(m_Nst, 1), w(m_Nst, 1);
 
  619   const int Nin = 
NDF * 
ND * 2;
 
  621   m_fopr_csw_chiral->set_mode(
"D");
 
  623   int ith, nth, is, ns;
 
  624   set_threadtask_mult(ith, nth, is, ns, m_Nst);
 
  627   constexpr 
int idx[72] = {
 
  628     0,  -1,  6,  7, 16, 17, 28, 29, 24, 25, 34, 35,
 
  629     -1, -1,  1, -1, 12, 13, 18, 19, 32, 33, 26, 27,
 
  630     -1, -1, -1, -5,  2, -1,  8,  9, 20, 21, 30, 31,
 
  631     -1, -1, -1, -1, -1, -1,  3, -1, 14, 15, 22, 23,
 
  632     -1, -1, -1, -1, -1, -1, -1, -1,  4, -1, 10, 11,
 
  633     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  5, -1,
 
  635   vout.
general(
"hoge: chiral rep for Clover term\n");
 
  638     for (
int id = 0; 
id < m_Nd / 2; ++id) {
 
  639       for (
int ic = 0; ic < m_Nc; ++ic) {
 
  643         for (
int site = is; site < ns; ++site) {
 
  644           w.
set_r(ic, 
id, site, 0, 1.0);
 
  645           w.
set_r(ic, 
id + 2, site, 0, 1.0);
 
  649         m_fopr_csw->mult(v, w);
 
  652         for (
int site = is; site < ns; ++site) {
 
  653           for (
int id2 = 0; id2 < m_Nd; ++id2) {
 
  654             for (
int ic2 = 0; ic2 < m_Nc; ++ic2) {
 
  655               real_t vt_r = 0.5 * v.cmp_r(ic2, id2, site, 0);
 
  656               real_t vt_i = 0.5 * v.cmp_i(ic2, id2, site, 0);
 
  662               int i    = ic2 + m_Nc * (id2 % 2);
 
  663               int j    = ic + m_Nc * id;
 
  664               int ij   = m_Nc * 2 * i + j;
 
  665               int in_r = 
idx[2 * ij];
 
  666               int in_i = 
idx[2 * ij + 1];
 
  668                 in_r += 36 * (id2 / 2);
 
  669                 int idx_r = index.idx(in_r, Nin, site, 0);
 
  670                 m_T.set(idx_r, vt_r);
 
  673                 in_i += 36 * (id2 / 2);
 
  674                 int idx_i = index.idx(in_i, Nin, site, 0);
 
  675                 m_T.set(idx_i, vt_i);
 
  685     real_t kappaR = 1.0 / m_CKs;
 
  692 template<
typename AFIELD>
 
  756   Field_F v(m_Nst, 1), w(m_Nst, 1);
 
  757   Field   T(2 * 6 * 6, m_Nst, 1);
 
  758   Field   Tinv(2 * 6 * 6, m_Nst, 1);
 
  763   m_fopr_csw_chiral->set_mode(
"D");
 
  765   int ith, nth, is, ns;
 
  766   set_threadtask_mult(ith, nth, is, ns, m_Nst);
 
  771       = { 0,   6,  8, 10, 12, 14,
 
  772           -1,  1, 16, 18, 20, 22,
 
  773           -1, -1,  2, 24, 26, 28,
 
  774           -1, -1, -1,  3, 30, 32,
 
  775           -1, -1, -1, -1,  4, 34,
 
  776           -1, -1, -1, -1, -1, 5 };
 
  777     constexpr 
double factor = 0.5;
 
  780     for (
int ud = 0; ud < 2; ++ud) {
 
  781       for (
int id = 0; 
id < m_Nd / 2; ++id) {
 
  782         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  784           for (
int site = is; site < ns; ++site) {
 
  785             w.
set_r(ic, 2 * ud + 
id, site, 0, 1.0);
 
  788           m_fopr_csw_chiral->mult(v, w);
 
  791           for (
int site = is; site < ns; ++site) {
 
  793             int offset = ud2 * 36;
 
  795             double vt_r = v.cmp_r(ic, 2 * ud + 
id, site, 0);
 
  796             T.
set(offset + j, site, 0, factor * (1.0 - vt_r));
 
  797             for (
int i = 0; i < 6; ++i) {
 
  798               int ij = 
idx[6 * i + j];
 
  800                 int    id2  = i / 3 + 2 * ud;
 
  802                 double vt_r = v.cmp_r(ic2, id2, site, 0);
 
  803                 double vt_i = v.cmp_i(ic2, id2, site, 0);
 
  804                 T.
set(offset + ij, site, 0, -factor * vt_r);
 
  805                 T.
set(offset + ij + 1, site, 0, -factor * vt_i);
 
  813     solve_csw_qws_inv(Tinv, T);
 
  816     if (
sizeof(
real_t) == 8) {
 
  817       qws_loadc_dd_bridge_((
double *)m_T_qws.ptr(0));
 
  818     } 
else if (
sizeof(
real_t) == 4) {
 
  819       qws_loadcs_dd_bridge_((
float *)m_T_qws.ptr(0));
 
  829 template<
class AFIELD>
 
  838   constexpr 
double factor = 0.25;
 
  842   constexpr 
int NS = 4; 
 
  849   assert(N == NS * 
NCOL / 2);
 
  852   const size_t vol = T.
nvol();
 
  854   const double *pT    = T.
ptr(0);
 
  855   double       *pTinv = Tinv.
ptr(0);
 
  857   int ith, nth, is, ns;
 
  858   set_threadtask(ith, nth, is, ns, T.
nvol());
 
  866   for (
int s = is; s < ns; ++s) {
 
  867     size_t offset_clov = 2 * s * N * N;
 
  869     const double *clov     = pT + offset_clov;
 
  870     double       *clov_inv = pTinv + offset_clov;
 
  872     for (
int block = 0; block < 2; block++) {
 
  875       for (
int i = 0; i < N; i++) {
 
  876         LU[N * i + i] = clov[i];
 
  880       for (
int i = 0; i < N; i++) {
 
  881         for (
int j = i + 1; j < N; j++) {
 
  882           LU[N * i + j] = 
complex_t(clov[ii], clov[ii + 1]);
 
  888       for (
int i = 0; i < N; i++) {
 
  889         double diag_re  = LU[N * i + i].real(); 
 
  890         double diag_inv = 1.0 / diag_re;
 
  891         LU[N * i + i] = diag_inv;               
 
  892         for (
int k = i + 1; k < N; k++) {
 
  895           for (
int j = i + 1; j < N; j++) { 
 
  896             LU[N * k + j] -= conj(c) * LU[N * i + j];
 
  901       for (
int i = 0; i < N; i++) { 
 
  903         for (
int j = 0; j < N; j++) {
 
  909         for (
int j = 1; j < N; j++) {
 
  911           for (
int k = i; k < j; k++) {
 
  912             x_j -= conj(LU[N * j + k]) * x[k];
 
  917         for (
int j = N - 1; j >= i; j--) {
 
  919           for (
int k = j + 1; k < N; k++) {
 
  920             x_j -= LU[N * j + k] * x[k];
 
  922           x_j *= LU[N * j + j].real();
 
  929       for (
int i = 0; i < N; i++) {
 
  930         clov_inv[ii] = workT[N * i + i].real();
 
  933       for (
int i = 0; i < N; i++) {
 
  934         for (
int j = i + 1; j < N; j++) {
 
  936           clov_inv[ii] = val.real();
 
  938           clov_inv[ii] = val.imag();
 
  951 template<
typename AFIELD>
 
  955   if (
sizeof(
real_t) == 4) {
 
  956     qws_loadfs_dd_bridge_((scs_t *)v.
ptr(0), (
const float *)w.
ptr(0));
 
  957   } 
else if (
sizeof(
real_t) == 8) {
 
  958     qws_loadfd_dd_bridge_((scd_t *)v.
ptr(0), (
const double *)w.
ptr(0));
 
  960     vout.
crucial(
" %s: cannot happen, unknown sizeof(real_t)=%ul in convert()\n", class_name.c_str(), 
sizeof(
real_t));
 
  970 template<
typename AFIELD>
 
  981 template<
typename AFIELD>
 
  985   if (
sizeof(
real_t) == 4) {
 
  986     qws_storefs_dd_bridge_((
float *)v.
ptr(0), (
const scs_t *)w.
ptr(0));
 
  987   } 
else if (
sizeof(
real_t) == 8) {
 
  988     qws_storefd_dd_bridge_((
double *)v.
ptr(0), (
const scd_t *)w.
ptr(0));
 
  990     vout.
crucial(
" %s: cannot happen, unknown sizeof(real_t)=%ul in reverse()\n", class_name.c_str(), 
sizeof(
real_t));
 
 1000 template<
typename AFIELD>
 
 1006   mult_gm4(m_v2, m_v1);
 
 1012 template<
typename AFIELD>
 
 1020   } 
else if (mu == 1) {
 
 1022   } 
else if (mu == 2) {
 
 1024   } 
else if (mu == 3) {
 
 1027     vout.
crucial(m_vl, 
"%s: mult_up for %d direction is undefined.",
 
 1028                  class_name.c_str(), mu);
 
 1035 template<
typename AFIELD>
 
 1043   } 
else if (mu == 1) {
 
 1045   } 
else if (mu == 2) {
 
 1047   } 
else if (mu == 3) {
 
 1050     vout.
crucial(m_vl, 
"%s: mult_dn for %d direction is undefined.",
 
 1051                  class_name.c_str(), mu);
 
 1058 template<
typename AFIELD>
 
 1066 template<
typename AFIELD>
 
 1072   if (ith == 0) m_mode = mode;
 
 1079 template<
typename AFIELD>
 
 1082   if (m_mode == 
"D") {
 
 1084   } 
else if (m_mode == 
"DdagD") {
 
 1086   } 
else if (m_mode == 
"Ddag") {
 
 1088   } 
else if (m_mode == 
"H") {
 
 1091     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
 1098 template<
typename AFIELD>
 
 1101   if (m_mode == 
"D") {
 
 1103   } 
else if (m_mode == 
"DdagD") {
 
 1105   } 
else if (m_mode == 
"Ddag") {
 
 1107   } 
else if (m_mode == 
"H") {
 
 1110     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
 1117 template<
typename AFIELD>
 
 1130 template<
typename AFIELD>
 
 1141 template<
typename AFIELD>
 
 1151 template<
typename AFIELD>
 
 1168 template<
typename AFIELD>
 
 1181 template<
typename AFIELD>
 
 1187   int ith, nth, is, ns;
 
 1188   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1191   for (
int site = is; site < ns; ++site) {
 
 1192     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1193       for (
int id = 0; 
id < 
ND; ++id) {
 
 1194         int idx1 = 2 * (
id + 
ND * ic) + 
NVCD * site;
 
 1195         load_vec(wt, &w[
VLEN * idx1], 2);
 
 1196         scal_vec(wt, (
real_t)(-1.0), 2);
 
 1197         int id2  = (
id + 2) % 
ND;
 
 1198         int idx2 = 2 * (id2 + 
ND * ic) + 
NVCD * site;
 
 1199         save_vec(&v[
VLEN * idx2], wt, 2);
 
 1210 template<
typename AFIELD>
 
 1213   int ith, nth, is, ns;
 
 1214   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1216   for (
int site = is; site < ns; ++site) {
 
 1219     scal_vec(vt, a, 
NVCD);
 
 1226 template<
typename AFIELD>
 
 1232   int ith, nth, is, ns;
 
 1233   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1237   for (
int site = is; site < ns; ++site) {
 
 1238     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1239       for (
int id = 0; 
id < 
ND2; ++id) {
 
 1240         int idx1 = 2 * (
id + 
ND * ic) + 
NVCD * site;
 
 1241         load_vec(wt, &wp[
VLEN * idx1], 2);
 
 1242         save_vec(&vp[
VLEN * idx1], wt, 2);
 
 1245       for (
int id = 
ND2; 
id < 
ND; ++id) {
 
 1246         int idx1 = 2 * (
id + 
ND * ic) + 
NVCD * site;
 
 1247         load_vec(wt, &wp[
VLEN * idx1], 2);
 
 1248         scal_vec(wt, 
real_t(-1.0), 2);
 
 1249         save_vec(&vp[
VLEN * idx1], wt, 2);
 
 1259 template<
typename AFIELD>
 
 1264   int ith, nth, is, ns;
 
 1265   set_threadtask(ith, nth, is, ns, m_Nstv);
 
 1269   for (
int site = is; site < ns; ++site) {
 
 1278     for (
int jd = 0; jd < 
ND2; ++jd) {
 
 1279       for (
int id = 0; 
id < 
ND; ++id) {
 
 1280         int ig = 
VLEN * 
NDF * (site + m_Nstv * (
id + 
ND * jd));
 
 1281         load_vec(ut, &u[ig], 
NDF);
 
 1282         for (
int ic = 0; ic < 
NC; ++ic) {
 
 1284           int id2 = (
id + 
ND2) % 
ND;
 
 1285           mult_ctv(wt1, &ut[ic2], &v1v[2 * 
id], 
NC);
 
 1286           mult_ctv(wt2, &ut[ic2], &v1v[2 * id2], 
NC);
 
 1287           int icd1 = 2 * (jd + 
ND * ic);
 
 1288           int icd2 = 2 * (jd + 
ND2 + 
ND * ic);
 
 1289           axpy_vec(&v2v[icd1], 
real_t(1.0), wt1, 2);
 
 1290           axpy_vec(&v2v[icd2], 
real_t(1.0), wt2, 2);
 
 1303 template<
typename AFIELD>
 
 1308   if (
sizeof(
real_t) == 4) {
 
 1310     clv_s_dd_(&deo, (scs_t *)v.
ptr(0));
 
 1312     clv_s_dd_(&deo, (scs_t *)v.
ptr(0));
 
 1313   } 
else if (
sizeof(
real_t) == 8) {
 
 1315     clv_d_dd_(&deo, (scd_t *)v.
ptr(0));
 
 1317     clv_d_dd_(&deo, (scd_t *)v.
ptr(0));
 
 1324 template<
typename AFIELD>
 
 1329   if (
sizeof(
real_t) == 4) {
 
 1331     clv_matvec_s_dd_(&deo, (scs_t *)v.
ptr(0), (clvs_t *)m_T_qws.ptr(0), (scs_t *)v.
ptr(0));
 
 1333     clv_matvec_s_dd_(&deo, (scs_t *)v.
ptr(0), (clvs_t *)m_T_qws.ptr(0), (scs_t *)v.
ptr(0));
 
 1334   } 
else if (
sizeof(
real_t) == 8) {
 
 1336     clv_matvec_d_dd_(&deo, (scd_t *)v.
ptr(0), (clvd_t *)m_T_qws.ptr(0), (scd_t *)v.
ptr(0));
 
 1338     clv_matvec_d_dd_(&deo, (scd_t *)v.
ptr(0), (clvd_t *)m_T_qws.ptr(0), (scd_t *)v.
ptr(0));
 
 1345 template<
typename AFIELD>
 
 1355   if (do_comm_any > 0) {
 
 1366                                    buf1_zp, buf1_zm, buf1_tp, buf1_tm,
 
 1367                                    up, v1, &m_boundary[0], m_Nsize, do_comm);
 
 1379                                     m_CKs, &m_boundary[0], m_Nsize, do_comm);
 
 1381   if (do_comm_any > 0) {
 
 1400                                    buf2_xp, buf2_xm, buf2_yp, buf2_ym,
 
 1401                                    buf2_zp, buf2_zm, buf2_tp, buf2_tm,
 
 1402                                    m_CKs, &m_boundary[0], m_Nsize, do_comm);
 
 1410 template<
typename AFIELD>
 
 1419   int jeo = (ieo + m_Ieo) % 2;
 
 1424                                   m_CKs, &m_boundary[0],
 
 1425                                   m_Nsize, m_block_sizev, jeo);
 
 1432 template<
typename AFIELD>
 
 1446                                   m_CKs, &m_boundary[0],
 
 1447                                   m_Nsize, m_block_sizev, -1);
 
 1459 template<
typename AFIELD>
 
 1463   if (
sizeof(
real_t) == 4) {
 
 1465     ddd_s_noprl_((scs_t *)v.
ptr(0), (scs_t *)
const_cast<AFIELD *
>(&w)->
ptr(0));
 
 1466   } 
else if (
sizeof(
real_t) == 8) {
 
 1470       vout.
crucial(m_vl, 
"%s: mult_D_qws cannot be called in thread parallel region\n", class_name.c_str());
 
 1473     ddd_d_((scd_t *)v.
ptr(0), (scd_t *)
const_cast<AFIELD *
>(&w)->
ptr(0));
 
 1481     double w2 = w.
norm2();
 
 1482     double v2 = v.
norm2();
 
 1492 template<
typename AFIELD>
 
 1511   aypx(-m_CKs, vp, wp);
 
 1518 template<
typename AFIELD>
 
 1531     scal_local(vp, 
real_t(-1.0));
 
 1533   } 
else if (mu == 1) {
 
 1535     scal_local(vp, 
real_t(-1.0));
 
 1537   } 
else if (mu == 2) {
 
 1539     scal_local(vp, 
real_t(-1.0));
 
 1541   } 
else if (mu == 3) {
 
 1543     scal_local(vp, 
real_t(-1.0));
 
 1547   scal_local(vp, -m_CKs);
 
 1554 template<
typename AFIELD>
 
 1567     scal_local(vp, 
real_t(-1.0));
 
 1569   } 
else if (mu == 1) {
 
 1571     scal_local(vp, 
real_t(-1.0));
 
 1573   } 
else if (mu == 2) {
 
 1575     scal_local(vp, 
real_t(-1.0));
 
 1577   } 
else if (mu == 3) {
 
 1579     scal_local(vp, 
real_t(-1.0));
 
 1583   scal_local(vp, -m_CKs);
 
 1629 template<
typename AFIELD>
 
 1638 template<
typename AFIELD>
 
 1641   int ith, nth, is, ns;
 
 1642   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1646   for (
int site = is; site < ns; ++site) {
 
 1649     aypx_vec(a, vt, wt, 
NVCD);
 
 1656 template<
typename AFIELD>
 
 1659   int ith, nth, is, ns;
 
 1660   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1663   clear_vec(vt, 
NVCD);
 
 1665   for (
int site = is; site < ns; ++site) {
 
 1672 template<
typename AFIELD>
 
 1677   int ith, nth, is, ns;
 
 1678   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1685   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1686   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 1690   if (do_comm[0] > 0) {
 
 1691     for (
int site = is; site < ns; ++site) {
 
 1692       int ix   = site % m_Nxv;
 
 1693       int iyzt = site / m_Nxv;
 
 1696         mult_wilson_xp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
 1704       chrecv_up[0].start();
 
 1705       chsend_dn[0].start();
 
 1706       chrecv_up[0].wait();
 
 1707       chsend_dn[0].wait();
 
 1712   for (
int site = is; site < ns; ++site) {
 
 1713     int ix   = site % m_Nxv;
 
 1714     int iyzt = site / m_Nxv;
 
 1717     clear_vec(v2v, 
NVCD);
 
 1721     if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
 
 1722       int nei = ix + 1 + m_Nxv * iyzt;
 
 1723       if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
 
 1725       mult_wilson_xpb(v2v, &u[
VLEN * 
NDF * site], zL);
 
 1729       mult_wilson_xpb(v2v, &u[
VLEN * 
NDF * site], zL);
 
 1730       mult_wilson_xp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
 1741 template<
typename AFIELD>
 
 1746   int ith, nth, is, ns;
 
 1747   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1754   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1755   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 1759   if (do_comm[0] > 0) {
 
 1760     for (
int site = is; site < ns; ++site) {
 
 1761       int ix   = site % m_Nxv;
 
 1762       int iyzt = site / m_Nxv;
 
 1763       if (ix == m_Nxv - 1) {
 
 1765         mult_wilson_xm1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
 1773       chrecv_dn[0].start();
 
 1774       chsend_up[0].start();
 
 1775       chrecv_dn[0].wait();
 
 1776       chsend_up[0].wait();
 
 1781   for (
int site = is; site < ns; ++site) {
 
 1782     int ix   = site % m_Nxv;
 
 1783     int iyzt = site / m_Nxv;
 
 1788     clear_vec(v2v, 
NVCD);
 
 1790     if ((ix > 0) || (do_comm[0] == 0)) {
 
 1791       int nei = ix - 1 + m_Nxv * iyzt;
 
 1792       if (ix == 0) nei = m_Nxv - 1 + m_Nxv * iyzt;
 
 1795       mult_wilson_xmb(v2v, uL, zL);
 
 1799       shift_vec0_xfw(uL, &u[
VLEN * 
NDF * site], 
NDF);
 
 1800       mult_wilson_xmb(v2v, uL, zL);
 
 1801       mult_wilson_xm2(v2v, &buf2[ibf]);
 
 1812 template<
typename AFIELD>
 
 1816   int Nxy  = m_Nxv * m_Nyv;
 
 1818   int ith, nth, is, ns;
 
 1819   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1824   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1825   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 1829   if (do_comm[1] > 0) {
 
 1830     for (
int site = is; site < ns; ++site) {
 
 1831       int ix  = site % m_Nxv;
 
 1832       int iy  = (site / m_Nxv) % m_Nyv;
 
 1833       int izt = site / Nxy;
 
 1836         mult_wilson_yp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
 1844       chrecv_up[1].start();
 
 1845       chsend_dn[1].start();
 
 1846       chrecv_up[1].wait();
 
 1847       chsend_dn[1].wait();
 
 1853   for (
int site = is; site < ns; ++site) {
 
 1854     int ix  = site % m_Nxv;
 
 1855     int iy  = (site / m_Nxv) % m_Nyv;
 
 1856     int izt = site / Nxy;
 
 1859     clear_vec(v2v, 
NVCD);
 
 1863     if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
 
 1864       int iy2 = (iy + 1) % m_Nyv;
 
 1865       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
 1867       mult_wilson_ypb(v2v, &u[
VLEN * 
NDF * site], zL);
 
 1871       mult_wilson_ypb(v2v, &u[
VLEN * 
NDF * site], zL);
 
 1872       mult_wilson_yp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
 1883 template<
typename AFIELD>
 
 1887   int Nxy  = m_Nxv * m_Nyv;
 
 1889   int ith, nth, is, ns;
 
 1890   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1895   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1896   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 1900   if (do_comm[1] > 0) {
 
 1901     for (
int site = is; site < ns; ++site) {
 
 1902       int ix  = site % m_Nxv;
 
 1903       int iy  = (site / m_Nxv) % m_Nyv;
 
 1904       int izt = site / Nxy;
 
 1905       if (iy == m_Nyv - 1) {
 
 1907         mult_wilson_ym1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
 1916       chrecv_dn[1].start();
 
 1917       chsend_up[1].start();
 
 1918       chrecv_dn[1].wait();
 
 1919       chsend_up[1].wait();
 
 1925   for (
int site = is; site < ns; ++site) {
 
 1926     int ix  = site % m_Nxv;
 
 1927     int iy  = (site / m_Nxv) % m_Nyv;
 
 1928     int izt = site / Nxy;
 
 1931     clear_vec(v2v, 
NVCD);
 
 1936     if ((iy != 0) || (do_comm[idir] == 0)) {
 
 1937       int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
 
 1938       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
 1941       mult_wilson_ymb(v2v, uL, zL);
 
 1945       shift_vec0_yfw(uL, &u[
VLEN * 
NDF * site], 
NDF);
 
 1946       mult_wilson_ymb(v2v, uL, zL);
 
 1947       mult_wilson_ym2(v2v, &buf2[ibf]);
 
 1958 template<
typename AFIELD>
 
 1962   int Nxy  = m_Nxv * m_Nyv;
 
 1964   int ith, nth, is, ns;
 
 1965   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1970   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1971   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 1975   if (do_comm[2] > 0) {
 
 1976     for (
int site = is; site < ns; ++site) {
 
 1977       int ixy = site % Nxy;
 
 1978       int iz  = (site / Nxy) % m_Nz;
 
 1979       int it  = site / (Nxy * m_Nz);
 
 1981         int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
 1982         mult_wilson_zp1(&buf1[ibf], &v1[
VLEN * 
NVCD * site]);
 
 1990       chrecv_up[2].start();
 
 1991       chsend_dn[2].start();
 
 1992       chrecv_up[2].wait();
 
 1993       chsend_dn[2].wait();
 
 1999   for (
int site = is; site < ns; ++site) {
 
 2000     int ixy = site % Nxy;
 
 2001     int iz  = (site / Nxy) % m_Nz;
 
 2002     int it  = site / (Nxy * m_Nz);
 
 2005     clear_vec(v2v, 
NVCD);
 
 2007     if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
 
 2008       int iz2 = (iz + 1) % m_Nz;
 
 2009       int nei = ixy + Nxy * (iz2 + m_Nz * it);
 
 2012       int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
 2013       mult_wilson_zp2(v2v, &u[
VLEN * 
NDF * site], &buf2[ibf]);
 
 2024 template<
typename AFIELD>
 
 2028   int Nxy  = m_Nxv * m_Nyv;
 
 2030   int ith, nth, is, ns;
 
 2031   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2036   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2037   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 2041   if (do_comm[2] > 0) {
 
 2042     for (
int site = is; site < ns; ++site) {
 
 2043       int ixy = site % Nxy;
 
 2044       int iz  = (site / Nxy) % m_Nz;
 
 2045       int it  = site / (Nxy * m_Nz);
 
 2046       if (iz == m_Nz - 1) {
 
 2047         int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
 2048         mult_wilson_zm1(&buf1[ibf], &u[
VLEN * 
NDF * site],
 
 2057       chrecv_dn[2].start();
 
 2058       chsend_up[2].start();
 
 2059       chrecv_dn[2].wait();
 
 2060       chsend_up[2].wait();
 
 2066   for (
int site = is; site < ns; ++site) {
 
 2067     int ixy = site % Nxy;
 
 2068     int iz  = (site / Nxy) % m_Nz;
 
 2069     int it  = site / (Nxy * m_Nz);
 
 2072     clear_vec(v2v, 
NVCD);
 
 2074     if ((iz > 0) || (do_comm[2] == 0)) {
 
 2075       int iz2 = (iz - 1 + m_Nz) % m_Nz;
 
 2076       int nei = ixy + Nxy * (iz2 + m_Nz * it);
 
 2079       int ibf = 
VLEN * 
NVC * 
ND2 * (ixy + Nxy * it);
 
 2080       mult_wilson_zm2(v2v, &buf2[ibf]);
 
 2091 template<
typename AFIELD>
 
 2095   int Nxyz = m_Nxv * m_Nyv * m_Nz;
 
 2097   int ith, nth, is, ns;
 
 2098   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2103   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2104   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 2108   if (do_comm[3] > 0) {
 
 2109     for (
int site = is; site < ns; ++site) {
 
 2110       int ixyz = site % Nxyz;
 
 2111       int it   = site / Nxyz;
 
 2113         mult_wilson_tp1_dirac(&buf1[
VLEN * 
NVC * 
ND2 * ixyz],
 
 2122       chrecv_up[3].start();
 
 2123       chsend_dn[3].start();
 
 2124       chrecv_up[3].wait();
 
 2125       chsend_dn[3].wait();
 
 2131   for (
int site = is; site < ns; ++site) {
 
 2132     int ixyz = site % Nxyz;
 
 2133     int it   = site / Nxyz;
 
 2136     clear_vec(v2v, 
NVCD);
 
 2138     if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
 
 2139       int it2 = (it + 1) % m_Nt;
 
 2140       int nei = ixyz + Nxyz * it2;
 
 2141       mult_wilson_tpb_dirac(v2v, &u[
VLEN * 
NDF * site],
 
 2144       mult_wilson_tp2_dirac(v2v, &u[
VLEN * 
NDF * site],
 
 2156 template<
typename AFIELD>
 
 2160   int Nxyz = m_Nxv * m_Nyv * m_Nz;
 
 2162   int ith, nth, is, ns;
 
 2163   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2168   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2169   if (isap != 0) u = m_Ublock.ptr(m_Ndf * m_Nst * idir);
 
 2173   if (do_comm[3] > 0) {
 
 2174     for (
int site = is; site < ns; ++site) {
 
 2175       int ixyz = site % Nxyz;
 
 2176       int it   = site / Nxyz;
 
 2177       if (it == m_Nt - 1) {
 
 2178         mult_wilson_tm1_dirac(&buf1[
VLEN * 
NVC * 
ND2 * ixyz],
 
 2187       chrecv_dn[3].start();
 
 2188       chsend_up[3].start();
 
 2189       chrecv_dn[3].wait();
 
 2190       chsend_up[3].wait();
 
 2195   for (
int site = is; site < ns; ++site) {
 
 2196     int ixyz = site % Nxyz;
 
 2197     int it   = site / Nxyz;
 
 2200     clear_vec(v2v, 
NVCD);
 
 2202     if ((it > 0) || (do_comm[3] == 0)) {
 
 2203       int it2 = (it - 1 + m_Nt) % m_Nt;
 
 2204       int nei = ixyz + Nxyz * it2;
 
 2205       mult_wilson_tmb_dirac(v2v, &u[
VLEN * 
NDF * nei],
 
 2208       mult_wilson_tm2_dirac(v2v, &buf2[
VLEN * 
NVC * 
ND2 * ixyz]);
 
 2219 template<
typename AFIELD>
 
 2227   double flop_site, flop;
 
 2229   if (m_repr == 
"Dirac") {
 
 2230     flop_site = 
static_cast<double>(
 
 2231       m_Nc * m_Nd * (4 + 6 * (4 * m_Nc + 2) + 2 * (4 * m_Nc + 1)));
 
 2232   } 
else if (m_repr == 
"Chiral") {
 
 2233     flop_site = 
static_cast<double>(
 
 2234       m_Nc * m_Nd * (4 + 8 * (4 * m_Nc + 2)));
 
 2236     vout.
crucial(m_vl, 
"%s: input repr is undefined.\n");
 
 2240   flop = flop_site * 
static_cast<double>(Lvol);
 
 2241   if ((mode == 
"DdagD") || (mode == 
"DDdag")) flop *= 2.0;
 
 2248 template<
typename AFIELD>
 
 2258   int    NBx   = m_Nx / m_block_size[0];
 
 2259   int    NBy   = m_Ny / m_block_size[1];
 
 2260   int    NBz   = m_Nz / m_block_size[2];
 
 2261   int    NBt   = m_Nt / m_block_size[3];
 
 2262   size_t nvol2 = NBx * NBy * NBz * NBt / 2;
 
 2264   int block_x = m_block_size[0];
 
 2265   int block_y = m_block_size[1];
 
 2266   int block_z = m_block_size[2];
 
 2267   int block_t = m_block_size[3];
 
 2269   int clover_site = 4 * Nc * Nc * Nd * Nd;
 
 2270   int hop_x_site  = Nc * Nd * (4 * Nc + 2);
 
 2271   int hop_y_site  = Nc * Nd * (4 * Nc + 2);
 
 2272   int hop_z_site  = Nc * Nd * (4 * Nc + 2);
 
 2273   int hop_t_site  = Nc * Nd * (4 * Nc + 1);
 
 2274   int accum_site  = 4 * Nc * Nd;
 
 2277   double flop_x = 2.0 * 
static_cast<double>(hop_x_site
 
 2278                                             * (block_x - 1) * block_y * block_z * block_t);
 
 2279   double flop_y = 2.0 * 
static_cast<double>(hop_y_site
 
 2280                                             * block_x * (block_y - 1) * block_z * block_t);
 
 2281   double flop_z = 2.0 * 
static_cast<double>(hop_z_site
 
 2282                                             * block_x * block_y * (block_z - 1) * block_t);
 
 2283   double flop_t = 2.0 * 
static_cast<double>(hop_t_site
 
 2284                                             * block_x * block_y * block_z * (block_t - 1));
 
 2285   double flop_sap_mult = flop_x + flop_y + flop_z + flop_t
 
 2286                          + 
static_cast<double>(clover_site + accum_site);
 
 2288   return flop_sap_mult * 
static_cast<double>(nvol2)
 
 2289          * 
static_cast<double>(NPE);