28 #if defined USE_GROUP_SU3 
   29 #include "fopr_Wilson_impl_SU3.inc" 
   30 #elif defined USE_GROUP_SU2 
   31 #include "fopr_Wilson_impl_SU2.inc" 
   32 #elif defined USE_GROUP_SU_N 
   33 #include "fopr_Wilson_impl_SU_N.inc" 
   85   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   86     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   87       int itask = ith_z + m_Ntask_z * ith_t;
 
   95       if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   96       if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   97       if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   98       if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
  102       m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy;
 
  103       m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy;
 
  110   int Nvcd2 = 2 * Nc * Nd / 2;
 
  112   std::vector<int> destid(
m_Ntask);
 
  113   std::vector<int> offset(
m_Ntask);
 
  114   std::vector<int> datasize(
m_Ntask);
 
  115   std::vector<int> offset_up(
m_Ntask);
 
  116   std::vector<int> offset_lw(
m_Ntask);
 
  117   std::vector<int> datasize_up(
m_Ntask);
 
  118   std::vector<int> datasize_lw(
m_Ntask);
 
  121   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  122     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  125       destid[itask]   = itask;
 
  126       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  127       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Ny;
 
  136   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  137     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  140       destid[itask]   = itask;
 
  141       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  142       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx;
 
  151   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  152     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  153       int itask = ith_z + m_Ntask_z * ith_t;
 
  155       offset_up[itask]   = 0;
 
  156       offset_lw[itask]   = 0;
 
  157       datasize_up[itask] = 0;
 
  158       datasize_lw[itask] = 0;
 
  160         destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  161         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  162         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  164       if (ith_z == m_Ntask_z - 1) {
 
  166         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  167         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  177   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  178     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  179       int itask = ith_z + m_Ntask_z * ith_t;
 
  181       offset_up[itask]   = 0;
 
  182       offset_lw[itask]   = 0;
 
  183       datasize_up[itask] = 0;
 
  184       datasize_lw[itask] = 0;
 
  186         destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  187         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  188         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  190       if (ith_t == m_Ntask_t - 1) {
 
  191         destid[itask]      = ith_z;
 
  192         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  193         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  206   int itask, 
double *v2, 
double fac, 
const double *v1)
 
  208   int Nvcd = m_Nvc * m_Nd;
 
  209   int Nvxy = Nvcd * m_Nx * m_Ny;
 
  211   int isite = m_arg[itask].isite;
 
  213   double       *w2 = &v2[Nvcd * isite];
 
  214   const double *w1 = &v1[Nvcd * isite];
 
  216   for (
int it = 0; it < m_Mt; ++it) {
 
  217     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  218       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  219         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  220         w2[iv] = fac * w2[iv] + w1[iv];
 
  231   int Nvcd = m_Nvc * m_Nd;
 
  232   int Nvxy = Nvcd * m_Nx * m_Ny;
 
  234   int    isite = m_arg[itask].isite;
 
  235   double *w2   = &v2[Nvcd * isite];
 
  237   for (
int it = 0; it < m_Mt; ++it) {
 
  238     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  239       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  240         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  250   int itask, 
double *vcp1, 
const double *v1)
 
  252   int Nvc2  = 2 * m_Nvc;
 
  253   int Nvcd  = m_Nvc * m_Nd;
 
  254   int Nvcd2 = Nvcd / 2;
 
  262   double bc2  = m_boundary2[idir];
 
  264   int isite    = m_arg[itask].isite;
 
  265   int isite_cp = m_arg[itask].isite_cpx;
 
  269     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  270   const double *w1 = &v1[Nvcd * isite];
 
  274   for (
int it = 0; it < m_Mt; ++it) {
 
  275     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  276       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  277         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  278         int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  280         int ix1 = Nvc2 * is2;
 
  281         int ix2 = ix1 + m_Nvc;
 
  283         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  284           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  285           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  286           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  287           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  293   m_bw_send[idir]->start_thread(itask);
 
  299   int itask, 
double *v2, 
const double *vcp2)
 
  301   int Nvc2  = 2 * m_Nvc;
 
  302   int Nvcd  = m_Nvc * m_Nd;
 
  303   int Nvcd2 = Nvcd / 2;
 
  312   double wt1r, wt1i, wt2r, wt2i;
 
  314   int isite    = m_arg[itask].isite;
 
  315   int isite_cp = m_arg[itask].isite_cpx;
 
  317   double *w2 = &v2[Nvcd * isite];
 
  320     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  321   const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  323   m_bw_recv[idir]->wait_thread(itask);
 
  326   for (
int it = 0; it < m_Mt; ++it) {
 
  327     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  328       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  329         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  330         int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  333         int ix1 = Nvc2 * is2;
 
  334         int ix2 = ix1 + m_Nvc;
 
  336         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  337           int ic2 = ic * m_Nvc;
 
  339           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  340           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  341           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  342           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  344           w2[2 * ic + id1 + iv]     += wt1r;
 
  345           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  346           w2[2 * ic + id2 + iv]     += wt2r;
 
  347           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  348           w2[2 * ic + id3 + iv]     += wt2i;
 
  349           w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  350           w2[2 * ic + id4 + iv]     += wt1i;
 
  351           w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  361   int itask, 
double *v2, 
const double *v1)
 
  363   int Nvcd = m_Nvc * m_Nd;
 
  372   double vt1[m_Nvc], vt2[m_Nvc];
 
  373   double wt1r, wt1i, wt2r, wt2i;
 
  375   int isite = m_arg[itask].isite;
 
  377   double       *w2 = &v2[Nvcd * isite];
 
  378   const double *w1 = &v1[Nvcd * isite];
 
  379   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  381   for (
int it = 0; it < m_Mt; ++it) {
 
  382     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  383       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  384         for (
int ix = 0; ix < m_Nx - 1; ++ix) {
 
  385           int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  387           int in = Nvcd * (is + 1);
 
  390           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  391             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  392             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  393             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  394             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  397           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  398             int ic2 = ic * m_Nvc;
 
  400             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  401             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  402             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  403             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  405             w2[2 * ic + id1 + iv]     += wt1r;
 
  406             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  407             w2[2 * ic + id2 + iv]     += wt2r;
 
  408             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  409             w2[2 * ic + id3 + iv]     += wt2i;
 
  410             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  411             w2[2 * ic + id4 + iv]     += wt1i;
 
  412             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  423   int itask, 
double *vcp1, 
const double *v1)
 
  425   int Nvc2  = 2 * m_Nvc;
 
  426   int Nvcd  = m_Nvc * m_Nd;
 
  427   int Nvcd2 = Nvcd / 2;
 
  436   int isite    = m_arg[itask].isite;
 
  437   int isite_cp = m_arg[itask].isite_cpx;
 
  441     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  442   const double *w1 = &v1[Nvcd * isite];
 
  443   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  445   double vt1[m_Nvc], vt2[m_Nvc];
 
  449   for (
int it = 0; it < m_Mt; ++it) {
 
  450     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  451       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  452         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  453         int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  456         int ix1 = Nvc2 * is2;
 
  457         int ix2 = ix1 + m_Nvc;
 
  459         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  460           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  461           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  462           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  463           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  466         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  468           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  469           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  470           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  471           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  477   m_fw_send[idir]->start_thread(itask);
 
  483   int itask, 
double *v2, 
const double *vcp2)
 
  485   int Nvc2  = 2 * m_Nvc;
 
  486   int Nvcd  = m_Nvc * m_Nd;
 
  487   int Nvcd2 = Nvcd / 2;
 
  495   double bc2  = m_boundary2[idir];
 
  497   double wt1r, wt1i, wt2r, wt2i;
 
  499   int isite    = m_arg[itask].isite;
 
  500   int isite_cp = m_arg[itask].isite_cpx;
 
  502   double *w2 = &v2[Nvcd * isite];
 
  505     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  507   m_fw_recv[idir]->wait_thread(itask);
 
  510   for (
int it = 0; it < m_Mt; ++it) {
 
  511     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  512       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  513         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  514         int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  516         int ix1 = Nvc2 * is2;
 
  517         int ix2 = ix1 + m_Nvc;
 
  519         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  521           int ici = 2 * ic + 1;
 
  522           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  523           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  524           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  525           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  526           w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  527           w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  528           w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  529           w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  539   int itask, 
double *v2, 
const double *v1)
 
  541   int Nvcd = m_Nvc * m_Nd;
 
  550   double vt1[m_Nvc], vt2[m_Nvc];
 
  551   double wt1r, wt1i, wt2r, wt2i;
 
  553   int isite = m_arg[itask].isite;
 
  555   double       *w2 = &v2[Nvcd * isite];
 
  556   const double *w1 = &v1[Nvcd * isite];
 
  557   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  559   for (
int it = 0; it < m_Mt; ++it) {
 
  560     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  561       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  562         for (
int ix = 1; ix < m_Nx; ++ix) {
 
  563           int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  565           int in = Nvcd * (is - 1);
 
  566           int ig = m_Ndf * (is - 1);
 
  568           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  569             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  570             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  571             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  572             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  575           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  578             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  579             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  580             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  581             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  583             w2[2 * ic + id1 + iv]     += wt1r;
 
  584             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  585             w2[2 * ic + id2 + iv]     += wt2r;
 
  586             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  587             w2[2 * ic + id3 + iv]     += -wt2i;
 
  588             w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  589             w2[2 * ic + id4 + iv]     += -wt1i;
 
  590             w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  601   int itask, 
double *vcp1, 
const double *v1)
 
  603   int Nvc2  = 2 * m_Nvc;
 
  604   int Nvcd  = m_Nvc * m_Nd;
 
  605   int Nvcd2 = Nvcd / 2;
 
  612   int isite    = m_arg[itask].isite;
 
  613   int isite_cp = m_arg[itask].isite_cpy;
 
  616   double bc2  = m_boundary2[idir];
 
  620     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  621   const double *w1 = &v1[Nvcd * isite];
 
  625   for (
int it = 0; it < m_Mt; ++it) {
 
  626     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  627       for (
int ix = 0; ix < m_Nx; ++ix) {
 
  628         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  629         int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  631         int ix1 = Nvc2 * is2;
 
  632         int ix2 = ix1 + m_Nvc;
 
  634         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  635           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  636           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  637           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  638           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  644   m_bw_send[idir]->start_thread(itask);
 
  650   int itask, 
double *v2, 
const double *vcp2)
 
  652   int Nvc2  = 2 * m_Nvc;
 
  653   int Nvcd  = m_Nvc * m_Nd;
 
  654   int Nvcd2 = Nvcd / 2;
 
  663   double wt1r, wt1i, wt2r, wt2i;
 
  665   int isite    = m_arg[itask].isite;
 
  666   int isite_cp = m_arg[itask].isite_cpy;
 
  668   double *w2 = &v2[Nvcd * isite];
 
  671     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  672   const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  674   m_bw_recv[idir]->wait_thread(itask);
 
  677   for (
int it = 0; it < m_Mt; ++it) {
 
  678     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  679       for (
int ix = 0; ix < m_Nx; ++ix) {
 
  680         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  681         int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  684         int ix1 = Nvc2 * is2;
 
  685         int ix2 = ix1 + m_Nvc;
 
  687         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  688           int ic2 = ic * m_Nvc;
 
  690           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  691           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  692           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  693           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  695           w2[2 * ic + id1 + iv]     += wt1r;
 
  696           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  697           w2[2 * ic + id2 + iv]     += wt2r;
 
  698           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  699           w2[2 * ic + id3 + iv]     += -wt2r;
 
  700           w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  701           w2[2 * ic + id4 + iv]     += wt1r;
 
  702           w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  712   int itask, 
double *v2, 
const double *v1)
 
  714   int Nvcd = m_Nvc * m_Nd;
 
  723   double vt1[m_Nvc], vt2[m_Nvc];
 
  724   double wt1r, wt1i, wt2r, wt2i;
 
  726   int isite = m_arg[itask].isite;
 
  728   double       *w2 = &v2[Nvcd * isite];
 
  729   const double *w1 = &v1[Nvcd * isite];
 
  730   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  732   for (
int it = 0; it < m_Mt; ++it) {
 
  733     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  734       for (
int iy = 0; iy < m_Ny - 1; ++iy) {
 
  735         for (
int ix = 0; ix < m_Nx; ++ix) {
 
  736           int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  738           int in = Nvcd * (is + m_Nx);
 
  741           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  742             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  743             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  744             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  745             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  748           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  749             int ic2 = ic * m_Nvc;
 
  751             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  752             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  753             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  754             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  756             w2[2 * ic + id1 + iv]     += wt1r;
 
  757             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  758             w2[2 * ic + id2 + iv]     += wt2r;
 
  759             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  760             w2[2 * ic + id3 + iv]     += -wt2r;
 
  761             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  762             w2[2 * ic + id4 + iv]     += wt1r;
 
  763             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  774   int itask, 
double *vcp1, 
const double *v1)
 
  776   int Nvc2  = 2 * m_Nvc;
 
  777   int Nvcd  = m_Nvc * m_Nd;
 
  778   int Nvcd2 = Nvcd / 2;
 
  787   int isite    = m_arg[itask].isite;
 
  788   int isite_cp = m_arg[itask].isite_cpy;
 
  792     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  794   const double *w1 = &v1[Nvcd * isite];
 
  795   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  797   double vt1[m_Nvc], vt2[m_Nvc];
 
  801   for (
int it = 0; it < m_Mt; ++it) {
 
  802     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  803       for (
int ix = 0; ix < m_Nx; ++ix) {
 
  804         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  805         int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  808         int ix1 = Nvc2 * is2;
 
  809         int ix2 = ix1 + m_Nvc;
 
  811         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  812           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  813           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  814           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  815           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  818         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  820           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  821           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  822           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  823           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  829   m_fw_send[idir]->start_thread(itask);
 
  835   int itask, 
double *v2, 
const double *vcp2)
 
  837   int Nvc2  = 2 * m_Nvc;
 
  838   int Nvcd  = m_Nvc * m_Nd;
 
  839   int Nvcd2 = Nvcd / 2;
 
  847   double bc2  = m_boundary2[idir];
 
  849   double wt1r, wt1i, wt2r, wt2i;
 
  851   int isite    = m_arg[itask].isite;
 
  852   int isite_cp = m_arg[itask].isite_cpy;
 
  854   double *w2 = &v2[Nvcd * isite];
 
  857     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  859   m_fw_recv[idir]->wait_thread(itask);
 
  862   for (
int it = 0; it < m_Mt; ++it) {
 
  863     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  864       for (
int ix = 0; ix < m_Nx; ++ix) {
 
  865         int is  = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  866         int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  868         int ix1 = Nvc2 * is2;
 
  869         int ix2 = ix1 + m_Nvc;
 
  871         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  873           int ici = 2 * ic + 1;
 
  874           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  875           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  876           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  877           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  878           w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  879           w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  880           w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  881           w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  891   int itask, 
double *v2, 
const double *v1)
 
  893   int Nvcd = m_Nvc * m_Nd;
 
  902   double vt1[m_Nvc], vt2[m_Nvc];
 
  903   double wt1r, wt1i, wt2r, wt2i;
 
  905   int isite = m_arg[itask].isite;
 
  907   double       *w2 = &v2[Nvcd * isite];
 
  908   const double *w1 = &v1[Nvcd * isite];
 
  909   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
  911   for (
int it = 0; it < m_Mt; ++it) {
 
  912     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  913       for (
int iy = 1; iy < m_Ny; ++iy) {
 
  914         for (
int ix = 0; ix < m_Nx; ++ix) {
 
  915           int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
 
  917           int in = Nvcd * (is - m_Nx);
 
  918           int ig = m_Ndf * (is - m_Nx);
 
  920           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  921             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  922             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  923             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  924             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  927           for (
int ic = 0; ic < m_Nc; ++ic) {
 
  929             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  930             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  931             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  932             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  934             w2[ic2 + id1 + iv]     += wt1r;
 
  935             w2[ic2 + 1 + id1 + iv] += wt1i;
 
  936             w2[ic2 + id2 + iv]     += wt2r;
 
  937             w2[ic2 + 1 + id2 + iv] += wt2i;
 
  938             w2[ic2 + id3 + iv]     += wt2r;
 
  939             w2[ic2 + 1 + id3 + iv] += wt2i;
 
  940             w2[ic2 + id4 + iv]     += -wt1r;
 
  941             w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  952   int itask, 
double *vcp1, 
const double *v1)
 
  954   int Nvc2  = 2 * m_Nvc;
 
  955   int Nvcd  = m_Nvc * m_Nd;
 
  956   int Nvcd2 = Nvcd / 2;
 
  963   int isite    = m_arg[itask].isite;
 
  964   int isite_cp = m_arg[itask].isite_cpz;
 
  967   double bc2  = m_boundary2[idir];
 
  971     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  972   const double *w1 = &v1[Nvcd * isite];
 
  974   if (m_arg[itask].kz0 == 1) {
 
  975     int Nxy = m_Nx * m_Ny;
 
  977     for (
int it = 0; it < m_Mt; ++it) {
 
  978       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  979         int is  = ixy + Nxy * (iz + m_Nz * it);
 
  980         int is2 = ixy + Nxy * it;
 
  983         int ix1 = Nvc2 * is2;
 
  984         int ix2 = ix1 + m_Nvc;
 
  986         for (
int ic = 0; ic < m_Nc; ++ic) {
 
  987           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  988           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
  989           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
  990           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
  996   m_bw_send[idir]->start_thread(itask);
 
 1002   int itask, 
double *v2, 
const double *vcp2)
 
 1004   int Nvc2  = 2 * m_Nvc;
 
 1005   int Nvcd  = m_Nvc * m_Nd;
 
 1006   int Nvcd2 = Nvcd / 2;
 
 1010   int id3 = m_Nvc * 2;
 
 1011   int id4 = m_Nvc * 3;
 
 1015   double wt1r, wt1i, wt2r, wt2i;
 
 1017   int isite    = m_arg[itask].isite;
 
 1018   int isite_cp = m_arg[itask].isite_cpz;
 
 1020   double *w2 = &v2[Nvcd * isite];
 
 1023     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1024   const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 1026   m_bw_recv[idir]->wait_thread(itask);
 
 1028   if (m_arg[itask].kz1 == 1) {
 
 1029     int Nxy = m_Nx * m_Ny;
 
 1031     for (
int it = 0; it < m_Mt; ++it) {
 
 1032       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1033         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1034         int is2 = ixy + Nxy * it;
 
 1036         int ig  = m_Ndf * is;
 
 1037         int ix1 = Nvc2 * is2;
 
 1038         int ix2 = ix1 + m_Nvc;
 
 1040         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1041           int ic2 = ic * m_Nvc;
 
 1043           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1044           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1045           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1046           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1048           w2[2 * ic + id1 + iv]     += wt1r;
 
 1049           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1050           w2[2 * ic + id2 + iv]     += wt2r;
 
 1051           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1052           w2[2 * ic + id3 + iv]     += wt1i;
 
 1053           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1054           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1055           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1065   int itask, 
double *v2, 
const double *v1)
 
 1067   int Nvcd = m_Nvc * m_Nd;
 
 1071   int id3 = m_Nvc * 2;
 
 1072   int id4 = m_Nvc * 3;
 
 1076   double vt1[m_Nvc], vt2[m_Nvc];
 
 1077   double wt1r, wt1i, wt2r, wt2i;
 
 1079   int isite = m_arg[itask].isite;
 
 1081   double       *w2 = &v2[Nvcd * isite];
 
 1082   const double *w1 = &v1[Nvcd * isite];
 
 1083   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 1085   int kz1 = m_arg[itask].kz1;
 
 1086   int Nxy = m_Nx * m_Ny;
 
 1088   for (
int it = 0; it < m_Mt; ++it) {
 
 1089     for (
int iz = 0; iz < m_Mz - kz1; ++iz) {
 
 1090       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1091         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1093         int in = Nvcd * (is + Nxy);
 
 1094         int ig = m_Ndf * is;
 
 1096         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1097           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1098           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1099           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1100           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1103         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1104           int ic2 = ic * m_Nvc;
 
 1106           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1107           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1108           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1109           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1111           w2[2 * ic + id1 + iv]     += wt1r;
 
 1112           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1113           w2[2 * ic + id2 + iv]     += wt2r;
 
 1114           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1115           w2[2 * ic + id3 + iv]     += wt1i;
 
 1116           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1117           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1118           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1128   int itask, 
double *vcp1, 
const double *v1)
 
 1130   int Nvc2  = 2 * m_Nvc;
 
 1131   int Nvcd  = m_Nvc * m_Nd;
 
 1132   int Nvcd2 = Nvcd / 2;
 
 1136   int id3 = m_Nvc * 2;
 
 1137   int id4 = m_Nvc * 3;
 
 1141   int isite    = m_arg[itask].isite;
 
 1142   int isite_cp = m_arg[itask].isite_cpz;
 
 1146     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1147   const double *w1 = &v1[Nvcd * isite];
 
 1148   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 1150   double vt1[m_Nvc], vt2[m_Nvc];
 
 1152   if (m_arg[itask].kz1 == 1) {
 
 1153     int Nxy = m_Nx * m_Ny;
 
 1155     for (
int it = 0; it < m_Mt; ++it) {
 
 1156       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1157         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1158         int is2 = ixy + Nxy * it;
 
 1160         int ig  = m_Ndf * is;
 
 1161         int ix1 = Nvc2 * is2;
 
 1162         int ix2 = ix1 + m_Nvc;
 
 1164         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1165           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1166           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1167           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1168           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1171         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1173           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1174           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1175           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1176           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1182   m_fw_send[idir]->start_thread(itask);
 
 1188   int itask, 
double *v2, 
const double *vcp2)
 
 1190   int Nvc2  = 2 * m_Nvc;
 
 1191   int Nvcd  = m_Nvc * m_Nd;
 
 1192   int Nvcd2 = Nvcd / 2;
 
 1196   int id3 = m_Nvc * 2;
 
 1197   int id4 = m_Nvc * 3;
 
 1200   double bc2  = m_boundary2[idir];
 
 1202   double wt1r, wt1i, wt2r, wt2i;
 
 1204   int isite    = m_arg[itask].isite;
 
 1205   int isite_cp = m_arg[itask].isite_cpz;
 
 1207   double *w2 = &v2[Nvcd * isite];
 
 1210     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1212   m_fw_recv[idir]->wait_thread(itask);
 
 1214   if (m_arg[itask].kz0 == 1) {
 
 1215     int Nxy = m_Nx * m_Ny;
 
 1218     for (
int it = 0; it < m_Mt; ++it) {
 
 1219       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1220         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1221         int is2 = ixy + Nxy * it;
 
 1223         int ix1 = Nvc2 * is2;
 
 1224         int ix2 = ix1 + m_Nvc;
 
 1226         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1228           int ici = 2 * ic + 1;
 
 1229           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1230           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1231           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1232           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1233           w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1234           w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1235           w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1236           w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 // Fragment of a per-task kernel (presumably mult_zm2_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, iv and ic2 are not visible here).
 // For it in [0, m_Mt), iz in [kz0, m_Mz) and every ixy it reads the site at index (is - Nxy),
 // builds vt1/vt2 from that site's four sub-blocks, applies mult_udagv_r/_i with the u data
 // stored at the same (is - Nxy) site, and accumulates the result into v2 (via w2).
 1246   int itask, 
double *v2, 
const double *v1)
 
 1248   int Nvcd = m_Nvc * m_Nd;
 
 1252   int id3 = m_Nvc * 2;
 
 1253   int id4 = m_Nvc * 3;
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1257   double vt1[m_Nvc], vt2[m_Nvc];
 
 1258   double wt1r, wt1i, wt2r, wt2i;
 
 1260   int isite = m_arg[itask].isite;
 
 1262   double       *w2 = &v2[Nvcd * isite];
 
 1263   const double *w1 = &v1[Nvcd * isite];
 
 1264   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // kz0 is used as the starting iz, so tasks with kz0 == 1 skip iz == 0 in this loop.
 1266   int kz0 = m_arg[itask].kz0;
 
 1267   int Nxy = m_Nx * m_Ny;
 
 1269   for (
int it = 0; it < m_Mt; ++it) {
 
 1270     for (
int iz = kz0; iz < m_Mz; ++iz) {
 
 1271       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1272         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the u data are taken from the site Nxy entries earlier.
 1274         int in = Nvcd * (is - Nxy);
 
 1275         int ig = m_Ndf * (is - Nxy);
 
 1277         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1278           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1279           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1280           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1281           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1284         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1286           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1287           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1288           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1289           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 // id1/id2 accumulate (wt1, wt2) directly; id3/id4 accumulate them with the two slots crossed.
 1291           w2[ic2 + id1 + iv]     += wt1r;
 
 1292           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1293           w2[ic2 + id2 + iv]     += wt2r;
 
 1294           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1295           w2[ic2 + id3 + iv]     += -wt1i;
 
 1296           w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1297           w2[ic2 + id4 + iv]     += wt2i;
 
 1298           w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 // Fragment of a per-task kernel (presumably mult_tp1_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, in and w2 are not visible here).
 // When m_arg[itask].kt0 == 1 it writes 2.0 * bc2 times the id3/id4 sub-blocks of v1 into w2 at
 // offsets ix1/ix2; the fragment ends with m_bw_send[idir]->start_thread(itask).
 // NOTE(review): w2 is presumably the pointer obtained from m_bw_send[idir]->ptr(...) below — confirm.
 1308   int itask, 
double *vcp1, 
const double *v1)
 
 1310   int Nvc2  = 2 * m_Nvc;
 
 1311   int Nvcd  = m_Nvc * m_Nd;
 
 1312   int Nvcd2 = Nvcd / 2;
 
 1316   int id3 = m_Nvc * 2;
 
 1317   int id4 = m_Nvc * 3;
 
 1319   int isite    = m_arg[itask].isite;
 
 1320   int isite_cp = m_arg[itask].isite_cpt;
 
 1323   double bc2  = m_boundary2[idir];
 
 // Byte offset into the send buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1327     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1328   const double *w1 = &v1[Nvcd * isite];
 
 1330   if (m_arg[itask].kt0 == 1) {
 
 1331     int Nxy = m_Nx * m_Ny;
 
 1333     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1334       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is indexes the full local volume; is2 indexes the (ixy, iz) slab, with Nvc2 doubles per entry.
 1335         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1336         int is2 = ixy + Nxy * iz;
 
 1339         int ix1 = Nvc2 * is2;
 
 1340         int ix2 = ix1 + m_Nvc;
 
 1342         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1343           w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1344           w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1345           w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1346           w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 1352   m_bw_send[idir]->start_thread(itask);
 
 // Fragment of a per-task kernel (presumably mult_tpb_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, iv and w1 are not visible here).
 // It waits on m_bw_recv[idir] for task `itask` and then, only when m_arg[itask].kt1 == 1,
 // applies mult_uv_r/_i to the data at w1[ix1] and w1[ix2] and accumulates the results into
 // the id3 and id4 sub-blocks of v2 (via w2).
 // NOTE(review): w1 is presumably the pointer obtained from m_bw_recv[idir]->ptr(...) below — confirm.
 1358   int itask, 
double *v2, 
const double *vcp2)
 
 1360   int Nvc2  = 2 * m_Nvc;
 
 1361   int Nvcd  = m_Nvc * m_Nd;
 
 1362   int Nvcd2 = Nvcd / 2;
 
 1366   int id3 = m_Nvc * 2;
 
 1367   int id4 = m_Nvc * 3;
 
 1371   double wt1r, wt1i, wt2r, wt2i;
 
 1373   int isite    = m_arg[itask].isite;
 
 1374   int isite_cp = m_arg[itask].isite_cpt;
 
 1376   double *w2 = &v2[Nvcd * isite];
 
 // Byte offset into the receive buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1379     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1380   const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 1382   m_bw_recv[idir]->wait_thread(itask);
 
 1384   if (m_arg[itask].kt1 == 1) {
 
 1385     int Nxy = m_Nx * m_Ny;
 
 1387     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1388       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1389         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1390         int is2 = ixy + Nxy * iz;
 
 // The u data is taken at the site itself (is); the input comes from the slab index is2.
 1392         int ig  = m_Ndf * is;
 
 1393         int ix1 = Nvc2 * is2;
 
 1394         int ix2 = ix1 + m_Nvc;
 
 1396         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1397           int ic2 = ic * m_Nvc;
 
 1399           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1400           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1401           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1402           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1404           w2[2 * ic + id3 + iv]     += wt1r;
 
 1405           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1406           w2[2 * ic + id4 + iv]     += wt2r;
 
 1407           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a per-task kernel (presumably mult_tp2_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir and iv are not visible here).
 // For it in [0, m_Mt - kt1), every iz and ixy, it doubles the id3/id4 sub-blocks of the site at
 // index (is + Nxyz), applies mult_uv_r/_i with the u data at site is, and accumulates the
 // result into the id3/id4 sub-blocks of v2 (via w2).
 1417   int itask, 
double *v2, 
const double *v1)
 
 1419   int Nvcd = m_Nvc * m_Nd;
 
 1423   int id3 = m_Nvc * 2;
 
 1424   int id4 = m_Nvc * 3;
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1428   double vt1[m_Nvc], vt2[m_Nvc];
 
 1429   double wt1r, wt1i, wt2r, wt2i;
 
 1431   int isite = m_arg[itask].isite;
 
 1433   double       *w2 = &v2[Nvcd * isite];
 
 1434   const double *w1 = &v1[Nvcd * isite];
 
 1435   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // kt1 shortens the it range by one, so tasks with kt1 == 1 skip it == m_Mt - 1 in this loop.
 1437   int kt1  = m_arg[itask].kt1;
 
 1438   int Nxy  = m_Nx * m_Ny;
 
 1439   int Nxyz = Nxy * m_Nz;
 
 1441   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1442     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1443       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1444         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Input is read Nxyz sites ahead of is; the u data is read at is itself.
 1446         int in = Nvcd * (is + Nxyz);
 
 1447         int ig = m_Ndf * is;
 
 1449         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1450           vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1451           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1452           vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1453           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1456         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1457           int ic2 = ic * m_Nvc;
 
 1459           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1460           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1461           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1462           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1464           w2[2 * ic + id3 + iv]     += wt1r;
 
 1465           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1466           w2[2 * ic + id4 + iv]     += wt2r;
 
 1467           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a per-task kernel (presumably mult_tm1_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, in, icr and w2 are not visible here).
 // When m_arg[itask].kt1 == 1 it doubles the id1/id2 sub-blocks of v1, applies mult_udagv_r/_i
 // with the u data at site is, and stores the result in w2 at offsets ix1/ix2; the fragment
 // ends with m_fw_send[idir]->start_thread(itask).
 // NOTE(review): w2 is presumably the pointer obtained from m_fw_send[idir]->ptr(...) below — confirm.
 1477   int itask, 
double *vcp1, 
const double *v1)
 
 1479   int Nvc2  = 2 * m_Nvc;
 
 1480   int Nvcd  = m_Nvc * m_Nd;
 
 1481   int Nvcd2 = Nvcd / 2;
 
 1485   int id3 = m_Nvc * 2;
 
 1486   int id4 = m_Nvc * 3;
 
 1490   int isite    = m_arg[itask].isite;
 
 1491   int isite_cp = m_arg[itask].isite_cpt;
 
 // Byte offset into the send buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1495     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1496   const double *w1 = &v1[Nvcd * isite];
 
 1497   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1499   double vt1[m_Nvc], vt2[m_Nvc];
 
 1501   if (m_arg[itask].kt1 == 1) {
 
 1502     int Nxy = m_Nx * m_Ny;
 
 1504     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1505       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1506         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1507         int is2 = ixy + Nxy * iz;
 
 1509         int ig  = m_Ndf * is;
 
 1510         int ix1 = Nvc2 * is2;
 
 1511         int ix2 = ix1 + m_Nvc;
 
 1513         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1514           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1515           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1516           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1517           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1520         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 // Results are assigned (not accumulated) into the buffer slots.
 1522           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1523           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1524           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1525           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1531   m_fw_send[idir]->start_thread(itask);
 
 // Fragment of a per-task kernel (presumably mult_tmb_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, iv, icr and w1 are not visible here).
 // It waits on m_fw_recv[idir] for task `itask` and then, only when m_arg[itask].kt0 == 1,
 // adds bc2 times the values at w1[ix1...] / w1[ix2...] to the id1 / id2 sub-blocks of v2 (via w2).
 // NOTE(review): w1 is presumably the pointer obtained from m_fw_recv[idir]->ptr(...) below — confirm.
 1537   int itask, 
double *v2, 
const double *vcp2)
 
 1539   int Nvc2  = 2 * m_Nvc;
 
 1540   int Nvcd  = m_Nvc * m_Nd;
 
 1541   int Nvcd2 = Nvcd / 2;
 
 1545   int id3 = m_Nvc * 2;
 
 1546   int id4 = m_Nvc * 3;
 
 1549   double bc2  = m_boundary2[idir];
 
 1551   double wt1r, wt1i, wt2r, wt2i;
 
 1553   int isite    = m_arg[itask].isite;
 
 1554   int isite_cp = m_arg[itask].isite_cpt;
 
 1556   double *w2 = &v2[Nvcd * isite];
 
 // Byte offset into the receive buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1559     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1561   m_fw_recv[idir]->wait_thread(itask);
 
 1563   if (m_arg[itask].kt0 == 1) {
 
 1564     int Nxy = m_Nx * m_Ny;
 
 1566     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1567       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1568         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1569         int is2 = ixy + Nxy * iz;
 
 1571         int ix1 = Nvc2 * is2;
 
 1572         int ix2 = ix1 + m_Nvc;
 
 1574         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1576           int ici = 2 * ic + 1;
 
 // Only the id1 and id2 sub-blocks are updated in the visible lines.
 1577           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1578           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1579           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1580           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 // Fragment of a per-task kernel (presumably mult_tm2_dirac_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, iv and ic2 are not visible here).
 // For it in [kt0, m_Mt), every iz and ixy, it doubles the id1/id2 sub-blocks of the site at
 // index (is - Nxyz), applies mult_udagv_r/_i with the u data at that same (is - Nxyz) site,
 // and accumulates the result into the id1/id2 sub-blocks of v2 (via w2).
 1590   int itask, 
double *v2, 
const double *v1)
 
 1592   int Nvcd = m_Nvc * m_Nd;
 
 1596   int id3 = m_Nvc * 2;
 
 1597   int id4 = m_Nvc * 3;
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1601   double vt1[m_Nvc], vt2[m_Nvc];
 
 1602   double wt1r, wt1i, wt2r, wt2i;
 
 1604   int isite = m_arg[itask].isite;
 
 1606   double       *w2 = &v2[Nvcd * isite];
 
 1607   const double *w1 = &v1[Nvcd * isite];
 
 1608   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // kt0 is used as the starting it, so tasks with kt0 == 1 skip it == 0 in this loop.
 1610   int kt0  = m_arg[itask].kt0;
 
 1611   int Nxy  = m_Nx * m_Ny;
 
 1612   int Nxyz = Nxy * m_Nz;
 
 1614   for (
int it = kt0; it < m_Mt; ++it) {
 
 1615     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1616       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1617         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the u data are taken from the site Nxyz entries earlier.
 1619         int in = Nvcd * (is - Nxyz);
 
 1620         int ig = m_Ndf * (is - Nxyz);
 
 1622         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1623           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1624           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1625           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1626           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1629         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1631           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1632           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1633           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1634           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1636           w2[ic2 + id1 + iv]     += wt1r;
 
 1637           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1638           w2[ic2 + id2 + iv]     += wt2r;
 
 1639           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 // Fragment of a per-task kernel (presumably mult_tp1_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, in and w2 are not visible here).
 // When m_arg[itask].kt0 == 1 it writes bc2 * (id1 + id3) and bc2 * (id2 + id4) of v1 into w2 at
 // offsets ix1/ix2; the fragment ends with m_bw_send[idir]->start_thread(itask).
 // NOTE(review): w2 is presumably the pointer obtained from m_bw_send[idir]->ptr(...) below — confirm.
 1649   int itask, 
double *vcp1, 
const double *v1)
 
 1651   int Nvc2  = 2 * m_Nvc;
 
 1652   int Nvcd  = m_Nvc * m_Nd;
 
 1653   int Nvcd2 = Nvcd / 2;
 
 1657   int id3 = m_Nvc * 2;
 
 1658   int id4 = m_Nvc * 3;
 
 1660   int isite    = m_arg[itask].isite;
 
 1661   int isite_cp = m_arg[itask].isite_cpt;
 
 1664   double bc2  = m_boundary2[idir];
 
 // Byte offset into the send buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1668     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1669   const double *w1 = &v1[Nvcd * isite];
 
 1671   if (m_arg[itask].kt0 == 1) {
 
 1672     int Nxy = m_Nx * m_Ny;
 
 1674     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1675       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1676         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1677         int is2 = ixy + Nxy * iz;
 
 1680         int ix1 = Nvc2 * is2;
 
 1681         int ix2 = ix1 + m_Nvc;
 
 1683         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1684           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1685           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1686           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1687           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1693   m_bw_send[idir]->start_thread(itask);
 
 // Fragment of a per-task kernel (presumably mult_tpb_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, iv and w1 are not visible here).
 // It waits on m_bw_recv[idir] for task `itask` and then, only when m_arg[itask].kt1 == 1,
 // applies mult_uv_r/_i to the data at w1[ix1] and w1[ix2]; the first result is added to both
 // the id1 and id3 sub-blocks of v2 and the second to both id2 and id4 (via w2).
 // NOTE(review): w1 is presumably the pointer obtained from m_bw_recv[idir]->ptr(...) below — confirm.
 1699   int itask, 
double *v2, 
const double *vcp2)
 
 1701   int Nvc2  = 2 * m_Nvc;
 
 1702   int Nvcd  = m_Nvc * m_Nd;
 
 1703   int Nvcd2 = Nvcd / 2;
 
 1707   int id3 = m_Nvc * 2;
 
 1708   int id4 = m_Nvc * 3;
 
 1712   double wt1r, wt1i, wt2r, wt2i;
 
 1714   int isite    = m_arg[itask].isite;
 
 1715   int isite_cp = m_arg[itask].isite_cpt;
 
 1717   double *w2 = &v2[Nvcd * isite];
 
 // Byte offset into the receive buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1720     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1721   const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 1723   m_bw_recv[idir]->wait_thread(itask);
 
 1725   if (m_arg[itask].kt1 == 1) {
 
 1726     int Nxy = m_Nx * m_Ny;
 
 1728     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1729       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1730         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1731         int is2 = ixy + Nxy * iz;
 
 1733         int ig  = m_Ndf * is;
 
 1734         int ix1 = Nvc2 * is2;
 
 1735         int ix2 = ix1 + m_Nvc;
 
 1737         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1738           int ic2 = ic * m_Nvc;
 
 1740           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1741           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1742           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1743           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 // The same (wt1, wt2) pair is added to the id1/id2 pair and to the id3/id4 pair.
 1745           w2[2 * ic + id1 + iv]     += wt1r;
 
 1746           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1747           w2[2 * ic + id2 + iv]     += wt2r;
 
 1748           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1749           w2[2 * ic + id3 + iv]     += wt1r;
 
 1750           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1751           w2[2 * ic + id4 + iv]     += wt2r;
 
 1752           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a per-task kernel (presumably mult_tp2_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir and iv are not visible here).
 // For it in [0, m_Mt - kt1), every iz and ixy, it forms vt1 = id1 + id3 and vt2 = id2 + id4 from
 // the site at index (is + Nxyz), applies mult_uv_r/_i with the u data at site is, and adds the
 // first result to the id1 and id3 sub-blocks of v2 and the second to id2 and id4 (via w2).
 1762   int itask, 
double *v2, 
const double *v1)
 
 1764   int Nvcd = m_Nvc * m_Nd;
 
 1768   int id3 = m_Nvc * 2;
 
 1769   int id4 = m_Nvc * 3;
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1773   double vt1[m_Nvc], vt2[m_Nvc];
 
 1774   double wt1r, wt1i, wt2r, wt2i;
 
 1776   int isite = m_arg[itask].isite;
 
 1778   double       *w2 = &v2[Nvcd * isite];
 
 1779   const double *w1 = &v1[Nvcd * isite];
 
 1780   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // kt1 shortens the it range by one, so tasks with kt1 == 1 skip it == m_Mt - 1 in this loop.
 1782   int kt1  = m_arg[itask].kt1;
 
 1783   int Nxy  = m_Nx * m_Ny;
 
 1784   int Nxyz = Nxy * m_Nz;
 
 1786   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1787     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1788       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1789         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Input is read Nxyz sites ahead of is; the u data is read at is itself.
 1791         int in = Nvcd * (is + Nxyz);
 
 1792         int ig = m_Ndf * is;
 
 1794         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1795           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1796           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1797           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1798           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1801         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1802           int ic2 = ic * m_Nvc;
 
 1804           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1805           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1806           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1807           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1809           w2[2 * ic + id1 + iv]     += wt1r;
 
 1810           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1811           w2[2 * ic + id2 + iv]     += wt2r;
 
 1812           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1813           w2[2 * ic + id3 + iv]     += wt1r;
 
 1814           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1815           w2[2 * ic + id4 + iv]     += wt2r;
 
 1816           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a per-task kernel (presumably mult_tm1_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, in, icr and w2 are not visible here).
 // When m_arg[itask].kt1 == 1 it forms vt1 = id1 - id3 and vt2 = id2 - id4 from v1, applies
 // mult_udagv_r/_i with the u data at site is, and stores the result in w2 at offsets ix1/ix2;
 // the fragment ends with m_fw_send[idir]->start_thread(itask).
 // NOTE(review): w2 is presumably the pointer obtained from m_fw_send[idir]->ptr(...) below — confirm.
 1826   int itask, 
double *vcp1, 
const double *v1)
 
 1828   int Nvc2  = 2 * m_Nvc;
 
 1829   int Nvcd  = m_Nvc * m_Nd;
 
 1830   int Nvcd2 = Nvcd / 2;
 
 1834   int id3 = m_Nvc * 2;
 
 1835   int id4 = m_Nvc * 3;
 
 1839   int isite    = m_arg[itask].isite;
 
 1840   int isite_cp = m_arg[itask].isite_cpt;
 
 // Byte offset into the send buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1844     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1845   const double *w1 = &v1[Nvcd * isite];
 
 1846   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1848   double vt1[m_Nvc], vt2[m_Nvc];
 
 1850   if (m_arg[itask].kt1 == 1) {
 
 1851     int Nxy = m_Nx * m_Ny;
 
 1853     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1854       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1855         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1856         int is2 = ixy + Nxy * iz;
 
 1858         int ig  = m_Ndf * is;
 
 1859         int ix1 = Nvc2 * is2;
 
 1860         int ix2 = ix1 + m_Nvc;
 
 1862         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1863           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1864           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1865           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1866           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1869         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 // Results are assigned (not accumulated) into the buffer slots.
 1871           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1872           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1873           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1874           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1880   m_fw_send[idir]->start_thread(itask);
 
 // Fragment of a per-task kernel (presumably mult_tmb_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, it, iv, icr and w1 are not visible here).
 // It waits on m_fw_recv[idir] for task `itask` and then, only when m_arg[itask].kt0 == 1,
 // adds bc2 * w1 to the id1/id2 sub-blocks of v2 and subtracts the same bc2 * w1 values from
 // the id3/id4 sub-blocks (via w2).
 // NOTE(review): w1 is presumably the pointer obtained from m_fw_recv[idir]->ptr(...) below — confirm.
 1886   int itask, 
double *v2, 
const double *vcp2)
 
 1888   int Nvc2  = 2 * m_Nvc;
 
 1889   int Nvcd  = m_Nvc * m_Nd;
 
 1890   int Nvcd2 = Nvcd / 2;
 
 1894   int id3 = m_Nvc * 2;
 
 1895   int id4 = m_Nvc * 3;
 
 1898   double bc2  = m_boundary2[idir];
 
 1900   double wt1r, wt1i, wt2r, wt2i;
 
 1902   int isite    = m_arg[itask].isite;
 
 1903   int isite_cp = m_arg[itask].isite_cpt;
 
 1905   double *w2 = &v2[Nvcd * isite];
 
 // Byte offset into the receive buffer: Nvcd2 doubles per site, starting at site isite_cp.
 1908     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1910   m_fw_recv[idir]->wait_thread(itask);
 
 1912   if (m_arg[itask].kt0 == 1) {
 
 1913     int Nxy = m_Nx * m_Ny;
 
 1915     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1916       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1917         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1918         int is2 = ixy + Nxy * iz;
 
 1920         int ix1 = Nvc2 * is2;
 
 1921         int ix2 = ix1 + m_Nvc;
 
 1923         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1925           int ici = 2 * ic + 1;
 
 1926           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1927           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1928           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1929           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1930           w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1931           w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1932           w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1933           w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 // Fragment of a per-task kernel (presumably mult_tm2_chiral_thread; the line naming the function,
 // the braces and the declarations of id1, id2, idir, iv and ic2 are not visible here).
 // For it in [kt0, m_Mt), every iz and ixy, it forms vt1 = id1 - id3 and vt2 = id2 - id4 from the
 // site at index (is - Nxyz), applies mult_udagv_r/_i with the u data at that same site, adds the
 // results to the id1/id2 sub-blocks of v2 and subtracts them from id3/id4 (via w2).
 1943   int itask, 
double *v2, 
const double *v1)
 
 1945   int Nvcd = m_Nvc * m_Nd;
 
 1949   int id3 = m_Nvc * 2;
 
 1950   int id4 = m_Nvc * 3;
 
 // NOTE(review): if m_Nvc is not a compile-time constant these are variable-length arrays,
 // which are a compiler extension in C++ — confirm.
 1954   double vt1[m_Nvc], vt2[m_Nvc];
 
 1955   double wt1r, wt1i, wt2r, wt2i;
 
 1957   int isite = m_arg[itask].isite;
 
 1959   double       *w2 = &v2[Nvcd * isite];
 
 1960   const double *w1 = &v1[Nvcd * isite];
 
 1961   const double *u  = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
 
 // kt0 is used as the starting it, so tasks with kt0 == 1 skip it == 0 in this loop.
 1963   int kt0  = m_arg[itask].kt0;
 
 1964   int Nxy  = m_Nx * m_Ny;
 
 1965   int Nxyz = Nxy * m_Nz;
 
 1967   for (
int it = kt0; it < m_Mt; ++it) {
 
 1968     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1969       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1970         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the u data are taken from the site Nxyz entries earlier.
 1972         int in = Nvcd * (is - Nxyz);
 
 1973         int ig = m_Ndf * (is - Nxyz);
 
 1975         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1976           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1977           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1978           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1979           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1982         for (
int ic = 0; ic < m_Nc; ++ic) {
 
 1984           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1985           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1986           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1987           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1989           w2[ic2 + id1 + iv]     += wt1r;
 
 1990           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1991           w2[ic2 + id2 + iv]     += wt2r;
 
 1992           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1993           w2[ic2 + id3 + iv]     -= wt1r;
 
 1994           w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 1995           w2[ic2 + id4 + iv]     -= wt2r;
 
 1996           w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 // Fragment of a per-task kernel (presumably gm5_dirac_thread; the line naming the function,
 // the braces and the declarations of id1 and id2 are not visible here).
 // For every site of the task's (m_Mt x m_Mz x Nxy) range it copies v1 into v2 with the
 // sub-blocks exchanged pairwise: id1 <-> id3 and id2 <-> id4.
 2006   int itask, 
double *v2, 
const double *v1)
 
 2008   int Nvcd = m_Nvc * m_Nd;
 
 2009   int Nxy  = m_Nx * m_Ny;
 
 2013   int id3 = m_Nvc * 2;
 
 2014   int id4 = m_Nvc * 3;
 
 // Both pointers are shifted to the task's first site (Nvcd doubles per site).
 2016   int          isite = m_arg[itask].isite;
 
 2017   double       *w2   = &v2[Nvcd * isite];
 
 2018   const double *w1   = &v1[Nvcd * isite];
 
 2020   for (
int it = 0; it < m_Mt; ++it) {
 
 2021     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2022       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2023         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2024         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2025           w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2026           w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2027           w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2028           w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 // Fragment of a per-task kernel (presumably gm5_chiral_thread; the line naming the function,
 // the braces and the declarations of id1 and id2 are not visible here).
 // For every site of the task's (m_Mt x m_Mz x Nxy) range it copies the id1 and id2 sub-blocks
 // of v1 into v2 unchanged and the id3 and id4 sub-blocks with their sign flipped.
 2038   int itask, 
double *v2, 
const double *v1)
 
 2040   int Nvcd = m_Nvc * m_Nd;
 
 2041   int Nxy  = m_Nx * m_Ny;
 
 2045   int id3 = m_Nvc * 2;
 
 2046   int id4 = m_Nvc * 3;
 
 // Both pointers are shifted to the task's first site (Nvcd doubles per site).
 2048   int          isite = m_arg[itask].isite;
 
 2049   double       *w2   = &v2[Nvcd * isite];
 
 2050   const double *w1   = &v1[Nvcd * isite];
 
 2052   for (
int it = 0; it < m_Mt; ++it) {
 
 2053     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2054       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2055         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2056         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2057           w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2058           w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2059           w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2060           w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_zp2_thread(int, double *, const double *)
 
void mult_yp2_thread(int, double *, const double *)
 
void mult_tm1_dirac_thread(int, double *, const double *)
 
void gm5_dirac_thread(int, double *, const double *)
 
void general(const char *format,...)
 
std::vector< Channel * > m_bw_recv
 
void mult_ym2_thread(int, double *, const double *)
 
void mult_xp2_thread(int, double *, const double *)
 
void mult_yp1_thread(int, double *, const double *)
 
void mult_tm1_chiral_thread(int, double *, const double *)
 
void mult_ym1_thread(int, double *, const double *)
 
void mult_xp1_thread(int, double *, const double *)
 
std::vector< Channel * > m_fw_recv
 
std::vector< Channel * > m_fw_send
 
void mult_xm2_thread(int, double *, const double *)
 
void mult_tp1_dirac_thread(int, double *, const double *)
 
std::vector< mult_arg > m_arg
 
void mult_ymb_thread(int, double *, const double *)
 
void mult_zpb_thread(int, double *, const double *)
 
void mult_xm1_thread(int, double *, const double *)
 
void daypx_thread(int, double *, double, const double *)
 
Bridge::VerboseLevel m_vl
 
void mult_tpb_chiral_thread(int, double *, const double *)
 
void mult_tp1_chiral_thread(int, double *, const double *)
 
std::vector< Channel * > m_bw_send
 
static int get_num_threads_available()
Returns the number of threads (works outside of a parallel region).
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_tp2_chiral_thread(int, double *, const double *)
 
void mult_zp1_thread(int, double *, const double *)
 
void clear_thread(int, double *)
 
void mult_ypb_thread(int, double *, const double *)
 
void crucial(const char *format,...)
 
void mult_zmb_thread(int, double *, const double *)
 
void mult_tmb_dirac_thread(int, double *, const double *)
 
void mult_xmb_thread(int, double *, const double *)
 
void mult_xpb_thread(int, double *, const double *)
 
void mult_zm2_thread(int, double *, const double *)
 
void mult_tm2_dirac_thread(int, double *, const double *)
 
void mult_tpb_dirac_thread(int, double *, const double *)
 
void mult_tp2_dirac_thread(int, double *, const double *)
 
void mult_zm1_thread(int, double *, const double *)
 
void mult_tm2_chiral_thread(int, double *, const double *)
 
void mult_tmb_chiral_thread(int, double *, const double *)