22 #if defined USE_GROUP_SU3 
   23 #include "fopr_Wilson_impl_SU3.inc" 
   24 #elif defined USE_GROUP_SU2 
   25 #include "fopr_Wilson_impl_SU2.inc" 
   26 #elif defined USE_GROUP_SU_N 
   27 #include "fopr_Wilson_impl_SU_N.inc" 
// Fragment (enclosing function header is not part of this listing):
// visit every (t, z) task slot and record its boundary flags and
// communication-buffer offsets in m_arg.
   62   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   63     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
// Linear task id: the z slot varies fastest, the t slot slowest.
   64       int itask = ith_z + m_Ntask_z * ith_t;
 
// kt0/kz0 mark the first slot in t/z, kt1/kz1 the last slot.
// With a single slot in a direction both flags of that direction are set.
   72       if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   73       if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   74       if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   75       if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
// Site offsets read back as isite_cp by the z-direction kernels (isite_cpz).
// Note the cross pairing: the z offset scales with the t slot (ith_t * m_Mt)
// and the t offset with the z slot (ith_z * m_Mz).
   79       m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy2;
 
   80       m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy2;
 
// Per-site factor used in every byte offset and byte size computed below
// (sizeof(double) * Nvcd2 * ...).
   87   int Nvcd2 = 2 * Nc * Nd / 2;
 
// Per-task tables with one entry per task (m_Ntask). The loops that follow
// fill destid/offset/datasize, and for the z and t directions the
// separate _up/_lw offset and size tables.
// NOTE(review): the values stored here are products that start with
// sizeof(double), i.e. computed as size_t, and are narrowed to int on
// assignment -- confirm they cannot exceed INT_MAX for the lattice sizes used.
   89   std::vector<int> destid(
m_Ntask);
 
   90   std::vector<int> offset(
m_Ntask);
 
   91   std::vector<int> datasize(
m_Ntask);
 
   92   std::vector<int> offset_up(
m_Ntask);
 
   93   std::vector<int> offset_lw(
m_Ntask);
 
   94   std::vector<int> datasize_up(
m_Ntask);
 
   95   std::vector<int> datasize_lw(
m_Ntask);
 
   98   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   99     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  101       int isite_cp = itask * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  102       destid[itask]   = itask;
 
  103       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  104       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  113   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  114     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  117       destid[itask]   = itask;
 
  118       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  119       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx2;
 
  128   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  129     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  130       int itask = ith_z + m_Ntask_z * ith_t;
 
  132       offset_up[itask]   = 0;
 
  133       offset_lw[itask]   = 0;
 
  134       datasize_up[itask] = 0;
 
  135       datasize_lw[itask] = 0;
 
  137         destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  138         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  139         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  141       if (ith_z == m_Ntask_z - 1) {
 
  143         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  144         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  154   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  155     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  156       int itask = ith_z + m_Ntask_z * ith_t;
 
  158       offset_up[itask]   = 0;
 
  159       offset_lw[itask]   = 0;
 
  160       datasize_up[itask] = 0;
 
  161       datasize_lw[itask] = 0;
 
  163         destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  164         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  165         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
  167       if (ith_t == m_Ntask_t - 1) {
 
  168         destid[itask]      = ith_z;
 
  169         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  170         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
// Fragment of a per-task routine (the line carrying the function name is not
// part of this listing): multiplies this task's portion of w by fac, in place.
  183                                                       double *w, 
double fac)
 
// Nvcd: doubles per site; Nvxy: doubles per xy-plane (m_Nx2 * m_Ny sites).
  185   int Nvcd = m_Nvc * 
m_Nd;
 
  186   int Nvxy = Nvcd * m_Nx2 * m_Ny;
 
// wp points at the first double of the task's starting site.
  188   int    isite = m_arg[itask].isite;
 
  189   double *wp   = &w[Nvcd * isite];
 
// The task covers m_Mt x m_Mz xy-planes; successive t steps are strided by
// the full m_Nz, not by the task-local m_Mz.
  191   for (
int it = 0; it < m_Mt; ++it) {
 
  192     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  193       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  194         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  195         wp[iv] = fac * wp[iv];
 
  206   int Nvcd = m_Nvc * 
m_Nd;
 
  207   int Nvxy = Nvcd * m_Nx2 * m_Ny;
 
  209   int    isite = m_arg[itask].isite;
 
  210   double *wp   = &v[Nvcd * isite];
 
  212   for (
int it = 0; it < m_Mt; ++it) {
 
  213     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  214       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  215         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  225   int itask, 
double *vcp1, 
const double *
v1, 
int ieo)
 
  227   int Nvc2  = 2 * m_Nvc;
 
  228   int Nvcd  = m_Nvc * 
m_Nd;
 
  229   int Nvcd2 = Nvcd / 2;
 
  238   int isite    = m_arg[itask].isite;
 
  239   int isite_cp = m_arg[itask].isite_cpx;
 
  240   int iyzt0    = isite / m_Nx2;
 
  244     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  245   const double *w1 = &v1[Nvcd * isite];
 
  247   double bc2 = m_boundary2[idir];
 
  252   for (
int it = 0; it < m_Mt; ++it) {
 
  253     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  254       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  255         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  256         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  258           int is = ix + m_Nx2 * iyzt;
 
  261           int ix1 = Nvc2 * ibf;
 
  262           int ix2 = ix1 + m_Nvc;
 
  264           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  265             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  266             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  267             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  268             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  276   m_bw_send[idir]->start_thread(itask);
 
  282   int itask, 
double *
v2, 
const double *vcp2, 
int ieo)
 
  284   int Nvc2  = 2 * m_Nvc;
 
  285   int Nvcd  = m_Nvc * 
m_Nd;
 
  286   int Nvcd2 = Nvcd / 2;
 
  295   double wt1r, wt1i, wt2r, wt2i;
 
  297   int isite    = m_arg[itask].isite;
 
  298   int isite_cp = m_arg[itask].isite_cpx;
 
  299   int iyzt0    = isite / m_Nx2;
 
  301   double *w2 = &v2[Nvcd * isite];
 
  304     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  305   const double *u = m_U->ptr(m_Ndf * (isite + ieo * 
m_Nvol / 2 + idir * 
m_Nvol));
 
  307   m_bw_recv[idir]->wait_thread(itask);
 
  311   for (
int it = 0; it < m_Mt; ++it) {
 
  312     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  313       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  314         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  315         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  318           int is  = ix + m_Nx2 * iyzt;
 
  321           int ix1 = Nvc2 * ibf;
 
  322           int ix2 = ix1 + m_Nvc;
 
  324           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  325             int ic2 = ic * m_Nvc;
 
  326             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  327             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  328             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  329             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  330             w2[2 * ic + id1 + iv]     += wt1r;
 
  331             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  332             w2[2 * ic + id2 + iv]     += wt2r;
 
  333             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  334             w2[2 * ic + id3 + iv]     += wt2i;
 
  335             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  336             w2[2 * ic + id4 + iv]     += wt1i;
 
  337             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  349   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  351   int Nvcd = m_Nvc * 
m_Nd;
 
  360   double vt1[m_Nvc], vt2[m_Nvc];
 
  361   double wt1r, wt1i, wt2r, wt2i;
 
  363   int isite = m_arg[itask].isite;
 
  364   int iyzt0 = isite / m_Nx2;
 
  366   double       *w2 = &v2[Nvcd * isite];
 
  367   const double *w1 = &v1[Nvcd * isite];
 
  368   const double *u  = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
  370   for (
int it = 0; it < m_Mt; ++it) {
 
  371     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  372       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  373         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  374         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  375         for (
int ix = 0; ix < m_Nx2 - Leo; ++ix) {
 
  376           int is = ix + m_Nx2 * iyzt;
 
  378           int in = Nvcd * (is + Leo);
 
  381           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  382             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  383             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  384             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  385             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  388           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  389             int ic2 = ic * m_Nvc;
 
  391             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  392             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  393             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  394             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  396             w2[2 * ic + id1 + iv]     += wt1r;
 
  397             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  398             w2[2 * ic + id2 + iv]     += wt2r;
 
  399             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  400             w2[2 * ic + id3 + iv]     += wt2i;
 
  401             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  402             w2[2 * ic + id4 + iv]     += wt1i;
 
  403             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  414   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  416   int Nvc2  = 2 * m_Nvc;
 
  417   int Nvcd  = m_Nvc * 
m_Nd;
 
  418   int Nvcd2 = Nvcd / 2;
 
  427   int isite    = m_arg[itask].isite;
 
  428   int isite_cp = m_arg[itask].isite_cpx;
 
  429   int iyzt0    = isite / m_Nx2;
 
  433     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  434   const double *w1 = &v1[Nvcd * isite];
 
  435   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
  437   double vt1[m_Nvc], vt2[m_Nvc];
 
  442   for (
int it = 0; it < m_Mt; ++it) {
 
  443     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  444       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  445         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  446         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  448           int is = ix + m_Nx2 * iyzt;
 
  452           int ix1 = Nvc2 * ibf;
 
  453           int ix2 = ix1 + m_Nvc;
 
  455           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  456             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  457             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  458             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  459             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  462           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  464             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  465             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  466             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  467             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  475   m_fw_send[idir]->start_thread(itask);
 
  481   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  483   int Nvc2  = 2 * m_Nvc;
 
  484   int Nvcd  = m_Nvc * 
m_Nd;
 
  485   int Nvcd2 = Nvcd / 2;
 
  493   double bc2  = m_boundary2[idir];
 
  495   double wt1r, wt1i, wt2r, wt2i;
 
  497   int isite    = m_arg[itask].isite;
 
  498   int isite_cp = m_arg[itask].isite_cpx;
 
  499   int iyzt0    = isite / m_Nx2;
 
  501   double *w2 = &v2[Nvcd * isite];
 
  504     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  506   m_fw_recv[idir]->wait_thread(itask);
 
  510   for (
int it = 0; it < m_Mt; ++it) {
 
  511     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  512       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  513         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  514         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  516           int is = ix + m_Nx2 * iyzt;
 
  519           int ix1 = Nvc2 * ibf;
 
  520           int ix2 = ix1 + m_Nvc;
 
  522           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  524             int ici = 2 * ic + 1;
 
  525             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  526             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  527             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  528             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  529             w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  530             w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  531             w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  532             w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  544   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  546   int Nvcd = m_Nvc * 
m_Nd;
 
  555   double vt1[m_Nvc], vt2[m_Nvc];
 
  556   double wt1r, wt1i, wt2r, wt2i;
 
  558   int isite = m_arg[itask].isite;
 
  559   int iyzt0 = isite / m_Nx2;
 
  561   double       *w2 = &v2[Nvcd * isite];
 
  562   const double *w1 = &v1[Nvcd * isite];
 
  563   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
  565   for (
int it = 0; it < m_Mt; ++it) {
 
  566     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  567       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  568         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  569         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  571         for (
int ix = Meo; ix < m_Nx2; ++ix) {
 
  572           int is = ix + m_Nx2 * iyzt;
 
  574           int in = Nvcd * (is - 
Meo);
 
  575           int ig = m_Ndf * (is - 
Meo);
 
  577           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  578             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  579             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  580             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  581             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  584           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  587             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  588             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  589             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  590             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  592             w2[2 * ic + id1 + iv]     += wt1r;
 
  593             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  594             w2[2 * ic + id2 + iv]     += wt2r;
 
  595             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  596             w2[2 * ic + id3 + iv]     += -wt2i;
 
  597             w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  598             w2[2 * ic + id4 + iv]     += -wt1i;
 
  599             w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  610   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  612   int Nvc2  = 2 * m_Nvc;
 
  613   int Nvcd  = m_Nvc * 
m_Nd;
 
  614   int Nvcd2 = Nvcd / 2;
 
  623   int isite    = m_arg[itask].isite;
 
  624   int isite_cp = m_arg[itask].isite_cpy;
 
  628     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  629   const double *w1 = &v1[Nvcd * isite];
 
  631   double bc2 = m_boundary2[idir];
 
  635   for (
int it = 0; it < m_Mt; ++it) {
 
  636     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  637       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  638         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  639         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  641         int ix1 = Nvc2 * is2;
 
  642         int ix2 = ix1 + m_Nvc;
 
  644         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  645           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  646           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  647           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  648           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  654   m_bw_send[idir]->start_thread(itask);
 
  660   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  662   int Nvc2  = 2 * m_Nvc;
 
  663   int Nvcd  = m_Nvc * 
m_Nd;
 
  664   int Nvcd2 = Nvcd / 2;
 
  673   double wt1r, wt1i, wt2r, wt2i;
 
  675   int isite    = m_arg[itask].isite;
 
  676   int isite_cp = m_arg[itask].isite_cpy;
 
  678   double *w2 = &v2[Nvcd * isite];
 
  681     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  682   const double *u = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
  684   m_bw_recv[idir]->wait_thread(itask);
 
  687   for (
int it = 0; it < m_Mt; ++it) {
 
  688     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  689       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  690         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  691         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  694         int ix1 = Nvc2 * is2;
 
  695         int ix2 = ix1 + m_Nvc;
 
  697         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  698           int ic2 = ic * m_Nvc;
 
  700           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  701           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  702           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  703           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  705           w2[2 * ic + id1 + iv]     += wt1r;
 
  706           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  707           w2[2 * ic + id2 + iv]     += wt2r;
 
  708           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  709           w2[2 * ic + id3 + iv]     += -wt2r;
 
  710           w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  711           w2[2 * ic + id4 + iv]     += wt1r;
 
  712           w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  722   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  724   int Nvcd = m_Nvc * 
m_Nd;
 
  733   double vt1[m_Nvc], vt2[m_Nvc];
 
  734   double wt1r, wt1i, wt2r, wt2i;
 
  736   int isite = m_arg[itask].isite;
 
  738   double       *w2 = &v2[Nvcd * isite];
 
  739   const double *w1 = &v1[Nvcd * isite];
 
  740   const double *u  = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
// Fragment: site loop of a y-direction kernel that reads the neighbour at
// is + m_Nx2. The iy loop stops at m_Ny - 2, so the last y row is not
// processed here.
// NOTE(review): presumably that row is handled by the kernel that consumes
// the receive buffer -- confirm against the full source.
  742   for (
int it = 0; it < m_Mt; ++it) {
 
  743     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  744       for (
int iy = 0; iy < m_Ny - 1; ++iy) {
 
  745         for (
int ix = 0; ix < m_Nx2; ++ix) {
 
// is: task-local site index (x fastest, then y, z, t with stride m_Nz in t).
  746           int is = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
// in: offset (in doubles) of the site one row further in y.
  748           int in = Nvcd * (is + m_Nx2);
 
  751           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  752             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  753             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  754             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  755             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  758           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  759             int ic2 = ic * m_Nvc;
 
  761             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  762             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  763             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  764             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  766             w2[2 * ic + id1 + iv]     += wt1r;
 
  767             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  768             w2[2 * ic + id2 + iv]     += wt2r;
 
  769             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  770             w2[2 * ic + id3 + iv]     += -wt2r;
 
  771             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  772             w2[2 * ic + id4 + iv]     += wt1r;
 
  773             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  784   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  786   int Nvc2  = 2 * m_Nvc;
 
  787   int Nvcd  = m_Nvc * 
m_Nd;
 
  788   int Nvcd2 = Nvcd / 2;
 
  797   int isite    = m_arg[itask].isite;
 
  798   int isite_cp = m_arg[itask].isite_cpy;
 
  802     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  803   const double *w1 = &v1[Nvcd * isite];
 
  804   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
  806   double vt1[m_Nvc], vt2[m_Nvc];
 
  810   for (
int it = 0; it < m_Mt; ++it) {
 
  811     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  812       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  813         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  814         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  817         int ix1 = Nvc2 * is2;
 
  818         int ix2 = ix1 + m_Nvc;
 
  820         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  821           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  822           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  823           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  824           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  827         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  829           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  830           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  831           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  832           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  838   m_fw_send[idir]->start_thread(itask);
 
  844   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  846   int Nvc2  = 2 * m_Nvc;
 
  847   int Nvcd  = m_Nvc * 
m_Nd;
 
  848   int Nvcd2 = Nvcd / 2;
 
  856   double bc2  = m_boundary2[idir];
 
  858   double wt1r, wt1i, wt2r, wt2i;
 
  860   int isite    = m_arg[itask].isite;
 
  861   int isite_cp = m_arg[itask].isite_cpy;
 
  863   double *w2 = &v2[Nvcd * isite];
 
  866     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  868   m_fw_recv[idir]->wait_thread(itask);
 
  871   for (
int it = 0; it < m_Mt; ++it) {
 
  872     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  873       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  874         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  875         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  877         int ix1 = Nvc2 * is2;
 
  878         int ix2 = ix1 + m_Nvc;
 
  880         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  882           int ici = 2 * ic + 1;
 
  883           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  884           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  885           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  886           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  887           w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  888           w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  889           w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  890           w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  900   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  902   int Nvcd = m_Nvc * 
m_Nd;
 
  911   double vt1[m_Nvc], vt2[m_Nvc];
 
  912   double wt1r, wt1i, wt2r, wt2i;
 
  914   int isite = m_arg[itask].isite;
 
  916   double       *w2 = &v2[Nvcd * isite];
 
  917   const double *w1 = &v1[Nvcd * isite];
 
  918   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
// Fragment: site loop of a y-direction kernel that reads the neighbour at
// is - m_Nx2. The iy loop starts at 1, so the first y row is not processed
// here.
// NOTE(review): presumably that row is handled by the kernel that consumes
// the receive buffer -- confirm against the full source.
  920   for (
int it = 0; it < m_Mt; ++it) {
 
  921     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  922       for (
int iy = 1; iy < m_Ny; ++iy) {
 
  923         for (
int ix = 0; ix < m_Nx2; ++ix) {
 
// is: task-local site index (x fastest, then y, z, t with stride m_Nz in t).
  924           int is = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
// Both offsets refer to the site one row back in y: in is scaled by Nvcd,
// ig by m_Ndf.
  926           int in = Nvcd * (is - m_Nx2);
 
  927           int ig = m_Ndf * (is - m_Nx2);
 
  929           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  930             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  931             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  932             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  933             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  936           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  938             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  939             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  940             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  941             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  943             w2[ic2 + id1 + iv]     += wt1r;
 
  944             w2[ic2 + 1 + id1 + iv] += wt1i;
 
  945             w2[ic2 + id2 + iv]     += wt2r;
 
  946             w2[ic2 + 1 + id2 + iv] += wt2i;
 
  947             w2[ic2 + id3 + iv]     += wt2r;
 
  948             w2[ic2 + 1 + id3 + iv] += wt2i;
 
  949             w2[ic2 + id4 + iv]     += -wt1r;
 
  950             w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  961   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  963   int Nvc2  = 2 * m_Nvc;
 
  964   int Nvcd  = m_Nvc * 
m_Nd;
 
  965   int Nvcd2 = Nvcd / 2;
 
  974   int isite    = m_arg[itask].isite;
 
  975   int isite_cp = m_arg[itask].isite_cpz;
 
  979     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  980   const double *w1 = &v1[Nvcd * isite];
 
  982   double bc2 = m_boundary2[idir];
 
  984   if (m_arg[itask].kz0 == 1) {
 
  985     int Nxy = m_Nx2 * m_Ny;
 
  987     for (
int it = 0; it < m_Mt; ++it) {
 
  988       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  989         int is  = ixy + Nxy * (iz + m_Nz * it);
 
  990         int is2 = ixy + Nxy * it;
 
  993         int ix1 = Nvc2 * is2;
 
  994         int ix2 = ix1 + m_Nvc;
 
  996         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  997           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  998           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
  999           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1000           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
 1006   m_bw_send[idir]->start_thread(itask);
 
 1012   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1014   int Nvc2  = 2 * m_Nvc;
 
 1015   int Nvcd  = m_Nvc * 
m_Nd;
 
 1016   int Nvcd2 = Nvcd / 2;
 
 1020   int id3 = m_Nvc * 2;
 
 1021   int id4 = m_Nvc * 3;
 
 1025   double wt1r, wt1i, wt2r, wt2i;
 
 1027   int isite    = m_arg[itask].isite;
 
 1028   int isite_cp = m_arg[itask].isite_cpz;
 
 1030   double *w2 = &v2[Nvcd * isite];
 
 1033     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1034   const double *u = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1036   m_bw_recv[idir]->wait_thread(itask);
 
 1038   if (m_arg[itask].kz1 == 1) {
 
 1039     int Nxy = m_Nx2 * m_Ny;
 
 1041     for (
int it = 0; it < m_Mt; ++it) {
 
 1042       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1043         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1044         int is2 = ixy + Nxy * it;
 
 1046         int ig  = m_Ndf * is;
 
 1047         int ix1 = Nvc2 * is2;
 
 1048         int ix2 = ix1 + m_Nvc;
 
 1050         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1051           int ic2 = ic * m_Nvc;
 
 1053           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1054           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1055           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1056           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1058           w2[2 * ic + id1 + iv]     += wt1r;
 
 1059           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1060           w2[2 * ic + id2 + iv]     += wt2r;
 
 1061           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1062           w2[2 * ic + id3 + iv]     += wt1i;
 
 1063           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1064           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1065           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1075   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1077   int Nvcd = m_Nvc * 
m_Nd;
 
 1081   int id3 = m_Nvc * 2;
 
 1082   int id4 = m_Nvc * 3;
 
 1086   double vt1[m_Nvc], vt2[m_Nvc];
 
 1087   double wt1r, wt1i, wt2r, wt2i;
 
 1089   int isite = m_arg[itask].isite;
 
 1091   double       *w2 = &v2[Nvcd * isite];
 
 1092   const double *w1 = &v1[Nvcd * isite];
 
 1093   const double *u  = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
// kz1 is the per-task flag set to 1 where ith_z == m_Ntask_z - 1 (the last
// z slot). It shortens the iz loop below by one plane for such a task.
 1095   int kz1 = m_arg[itask].kz1;
 
// Number of sites in one xy-plane.
 1096   int Nxy = m_Nx2 * m_Ny;
 
 1098   for (
int it = 0; it < m_Mt; ++it) {
 
// A task with kz1 == 1 skips its topmost z-plane in this loop.
// NOTE(review): presumably that plane is filled in by the kernel guarded by
// m_arg[itask].kz1 == 1 that reads the receive buffer -- confirm.
 1099     for (
int iz = 0; iz < m_Mz - kz1; ++iz) {
 
 1100       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
// is: task-local site index (xy fastest, then z, t with stride m_Nz in t).
 1101         int is = ixy + Nxy * (iz + m_Nz * it);
 
// in: offset (in doubles) of the site one xy-plane further in z;
// ig: offset at the current site, scaled by m_Ndf.
 1103         int in = Nvcd * (is + Nxy);
 
 1104         int ig = m_Ndf * is;
 
 1106         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1107           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1108           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1109           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1110           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1113         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1114           int ic2 = ic * m_Nvc;
 
 1116           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1117           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1118           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1119           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1121           w2[2 * ic + id1 + iv]     += wt1r;
 
 1122           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1123           w2[2 * ic + id2 + iv]     += wt2r;
 
 1124           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1125           w2[2 * ic + id3 + iv]     += wt1i;
 
 1126           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1127           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1128           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1138   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1140   int Nvc2  = 2 * m_Nvc;
 
 1141   int Nvcd  = m_Nvc * 
m_Nd;
 
 1142   int Nvcd2 = Nvcd / 2;
 
 1146   int id3 = m_Nvc * 2;
 
 1147   int id4 = m_Nvc * 3;
 
 1151   int isite    = m_arg[itask].isite;
 
 1152   int isite_cp = m_arg[itask].isite_cpz;
 
 1156     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1157   const double *w1 = &v1[Nvcd * isite];
 
 1158   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1160   double vt1[m_Nvc], vt2[m_Nvc];
 
 1162   if (m_arg[itask].kz1 == 1) {
 
 1163     int Nxy = m_Nx2 * m_Ny;
 
 1165     for (
int it = 0; it < m_Mt; ++it) {
 
 1166       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1167         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1168         int is2 = ixy + Nxy * it;
 
 1170         int ig  = m_Ndf * is;
 
 1171         int ix1 = Nvc2 * is2;
 
 1172         int ix2 = ix1 + m_Nvc;
 
 1174         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1175           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1176           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1177           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1178           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1181         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1183           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1184           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1185           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1186           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1192   m_fw_send[idir]->start_thread(itask);
 
 // NOTE(review): the function-name line is absent from this listing; m_fw_recv,
 // isite_cpz and the kz0 flag suggest mult_zm2_thread -- confirm.
 // Visible behaviour: waits on m_fw_recv[idir] for this task, then, for a task
 // flagged kz0, adds bc2-scaled values read through w1 into the four blocks of w2.
 1198   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1200   int Nvc2  = 2 * m_Nvc;
 
 1201   int Nvcd  = m_Nvc * 
m_Nd;
 
 1202   int Nvcd2 = Nvcd / 2;
 
 1206   int id3 = m_Nvc * 2;
 
 1207   int id4 = m_Nvc * 3;
 
 1210   double bc2  = m_boundary2[idir];
 
 1212   double wt1r, wt1i, wt2r, wt2i;
 
 1214   int isite    = m_arg[itask].isite;
 
 1215   int isite_cp = m_arg[itask].isite_cpz;
 
 1217   double *w2 = &v2[Nvcd * isite];
 
 // Pointer into the forward receive channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w1).
 1220     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 // The wait precedes every read of the received data below.
 1222   m_fw_recv[idir]->wait_thread(itask);
 
 1224   if (m_arg[itask].kz0 == 1) {
 
 1225     int Nxy = m_Nx2 * m_Ny;
 
 1228     for (
int it = 0; it < m_Mt; ++it) {
 
 1229       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1230         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1231         int is2 = ixy + Nxy * it;
 
 1233         int ix1 = Nvc2 * is2;
 
 1234         int ix2 = ix1 + m_Nvc;
 
 // id1/id2 take the received values directly; id3/id4 take them with real and
 // imaginary parts exchanged and the signs shown.
 1236         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1238           int ici = 2 * ic + 1;
 
 1239           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1240           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1241           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1242           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1243           w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1244           w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1245           w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1246           w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 // NOTE(review): the function-name line is absent from this listing; the kz0 start
 // index and the (is - Nxy) neighbour suggest mult_zmb_thread -- confirm.
 // Visible behaviour: for each site of the task (iz starting at kz0) reads the site
 // one Nxy stride lower, combines its spinor blocks into vt1/vt2, applies
 // mult_udagv_r/_i with u and accumulates the results into w2.
 1256   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1258   int Nvcd = m_Nvc * 
m_Nd;
 
 1262   int id3 = m_Nvc * 2;
 
 1263   int id4 = m_Nvc * 3;
 
 1267   double vt1[m_Nvc], vt2[m_Nvc];
 
 1268   double wt1r, wt1i, wt2r, wt2i;
 
 1270   int isite = m_arg[itask].isite;
 
 1272   double       *w2 = &v2[Nvcd * isite];
 
 1273   const double *w1 = &v1[Nvcd * isite];
 
 1274   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1276   int kz0 = m_arg[itask].kz0;
 
 1277   int Nxy = m_Nx2 * m_Ny;
 
 1279   for (
int it = 0; it < m_Mt; ++it) {
 
 // When kz0 is 1 the iz = 0 slice is skipped here (presumably it is handled by the
 // kernel that consumes received boundary data -- confirm).
 1280     for (
int iz = kz0; iz < m_Mz; ++iz) {
 
 1281       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1282         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the m_U data are taken from site is - Nxy.
 1284         int in = Nvcd * (is - Nxy);
 
 1285         int ig = m_Ndf * (is - Nxy);
 
 1287         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1288           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1289           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1290           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1291           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1294         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1296           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1297           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1298           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1299           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 // id1/id2 receive the products directly; id3/id4 receive them with real and
 // imaginary parts exchanged and the signs shown.
 1301           w2[ic2 + id1 + iv]     += wt1r;
 
 1302           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1303           w2[ic2 + id2 + iv]     += wt2r;
 
 1304           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1305           w2[ic2 + id3 + iv]     += -wt1i;
 
 1306           w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1307           w2[ic2 + id4 + iv]     += wt2i;
 
 1308           w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 // NOTE(review): the function-name line is absent from this listing; m_bw_send,
 // isite_cpt, the kt0 flag and the use of only the id3/id4 blocks suggest
 // mult_tp1_dirac_thread -- confirm.
 // Visible behaviour: for a task flagged kt0, writes 2.0 * bc2 times the id3 and id4
 // blocks of v1 through w2, then calls m_bw_send[idir]->start_thread(itask).
 1318   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1320   int Nvc2  = 2 * m_Nvc;
 
 1321   int Nvcd  = m_Nvc * 
m_Nd;
 
 1322   int Nvcd2 = Nvcd / 2;
 
 1326   int id3 = m_Nvc * 2;
 
 1327   int id4 = m_Nvc * 3;
 
 1331   int isite    = m_arg[itask].isite;
 
 1332   int isite_cp = m_arg[itask].isite_cpt;
 
 // Pointer into the backward send channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w2).
 1336     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1337   const double *w1 = &v1[Nvcd * isite];
 
 1339   double bc2 = m_boundary2[idir];
 
 // Only tasks whose kt0 flag is set fill the buffer.
 1341   if (m_arg[itask].kt0 == 1) {
 
 1342     int Nxy = m_Nx2 * m_Ny;
 
 1344     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1345       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is indexes the full local volume, is2 the (ixy, iz) slice used for the buffer.
 1346         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1347         int is2 = ixy + Nxy * iz;
 
 1350         int ix1 = Nvc2 * is2;
 
 1351         int ix2 = ix1 + m_Nvc;
 
 // No m_U multiplication on the send side of this kernel; the factor bc2 is
 // applied here.
 1353         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1354           w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1355           w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1356           w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1357           w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 // Indentation suggests this runs for every task, outside the kt0 branch -- confirm.
 1363   m_bw_send[idir]->start_thread(itask);
 
 // NOTE(review): the function-name line is absent from this listing; m_bw_recv,
 // isite_cpt, the kt1 flag and updates to only id3/id4 suggest
 // mult_tp2_dirac_thread -- confirm.
 // Visible behaviour: waits on m_bw_recv[idir] for this task, then, for a task
 // flagged kt1, applies mult_uv_r/_i with u to the received vectors and adds the
 // results to the id3 and id4 blocks of w2.
 1369   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1371   int Nvc2  = 2 * m_Nvc;
 
 1372   int Nvcd  = m_Nvc * 
m_Nd;
 
 1373   int Nvcd2 = Nvcd / 2;
 
 1377   int id3 = m_Nvc * 2;
 
 1378   int id4 = m_Nvc * 3;
 
 1382   double wt1r, wt1i, wt2r, wt2i;
 
 1384   int isite    = m_arg[itask].isite;
 
 1385   int isite_cp = m_arg[itask].isite_cpt;
 
 1387   double *w2 = &v2[Nvcd * isite];
 
 // Pointer into the backward receive channel at this task's byte offset. The
 // left-hand side is on a line missing here (presumably it declares w1).
 1390     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 // Here the m_U offset uses ieo, not (1 - ieo) as in the mult_udagv_* kernels.
 1391   const double *u = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 // The wait precedes every read of the received data below.
 1393   m_bw_recv[idir]->wait_thread(itask);
 
 1395   if (m_arg[itask].kt1 == 1) {
 
 1396     int Nxy = m_Nx2 * m_Ny;
 
 1398     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1399       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1400         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1401         int is2 = ixy + Nxy * iz;
 
 1403         int ig  = m_Ndf * is;
 
 1404         int ix1 = Nvc2 * is2;
 
 1405         int ix2 = ix1 + m_Nvc;
 
 1407         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // Row offset into the m_U data for output colour ic.
 1408           int ic2 = ic * m_Nvc;
 
 1410           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1411           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1412           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1413           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1415           w2[2 * ic + id3 + iv]     += wt1r;
 
 1416           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1417           w2[2 * ic + id4 + iv]     += wt2r;
 
 1418           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; the
 // (m_Mt - kt1) bound, the (is + Nxyz) neighbour and the id3/id4-only update suggest
 // mult_tpb_dirac_thread -- confirm.
 // Visible behaviour: for each site of the task reads the id3/id4 blocks of the site
 // one Nxyz stride higher, doubles them, applies mult_uv_r/_i with u and adds the
 // results to the id3 and id4 blocks of w2.
 1428   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1430   int Nvcd = m_Nvc * 
m_Nd;
 
 1434   int id3 = m_Nvc * 2;
 
 1435   int id4 = m_Nvc * 3;
 
 1439   double vt1[m_Nvc], vt2[m_Nvc];
 
 1440   double wt1r, wt1i, wt2r, wt2i;
 
 1442   int isite = m_arg[itask].isite;
 
 1444   double       *w2 = &v2[Nvcd * isite];
 
 1445   const double *w1 = &v1[Nvcd * isite];
 
 1446   const double *u  = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1448   int kt1  = m_arg[itask].kt1;
 
 1449   int Nxy  = m_Nx2 * m_Ny;
 
 1450   int Nxyz = Nxy * m_Nz;
 
 // When kt1 is 1 the last it slice is left out of this loop (presumably it is
 // handled by the kernel that consumes received boundary data -- confirm).
 1452   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1453     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1454       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1455         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Input comes from site is + Nxyz; the m_U data from site is itself.
 1457         int in = Nvcd * (is + Nxyz);
 
 1458         int ig = m_Ndf * is;
 
 1460         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1461           vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1462           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1463           vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1464           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1467         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1468           int ic2 = ic * m_Nvc;
 
 1470           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1471           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1472           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1473           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1475           w2[2 * ic + id3 + iv]     += wt1r;
 
 1476           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1477           w2[2 * ic + id4 + iv]     += wt2r;
 
 1478           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; m_fw_send,
 // isite_cpt, the kt1 flag and the use of only the id1/id2 blocks suggest
 // mult_tm1_dirac_thread -- confirm.
 // Visible behaviour: for a task flagged kt1, doubles the id1 and id2 blocks of v1,
 // applies mult_udagv_r/_i with u and writes the results through w2; afterwards
 // m_fw_send[idir]->start_thread(itask) is called.
 1488   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1490   int Nvc2  = 2 * m_Nvc;
 
 1491   int Nvcd  = m_Nvc * 
m_Nd;
 
 1492   int Nvcd2 = Nvcd / 2;
 
 1496   int id3 = m_Nvc * 2;
 
 1497   int id4 = m_Nvc * 3;
 
 1501   int isite    = m_arg[itask].isite;
 
 1502   int isite_cp = m_arg[itask].isite_cpt;
 
 // Pointer into the forward send channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w2).
 1506     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1507   const double *w1 = &v1[Nvcd * isite];
 
 1508   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1510   double vt1[m_Nvc], vt2[m_Nvc];
 
 // Only tasks whose kt1 flag is set fill the buffer.
 1512   if (m_arg[itask].kt1 == 1) {
 
 1513     int Nxy = m_Nx2 * m_Ny;
 
 1515     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1516       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is indexes the full local volume, is2 the (ixy, iz) slice used for the buffer.
 1517         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1518         int is2 = ixy + Nxy * iz;
 
 1520         int ig  = m_Ndf * is;
 
 1521         int ix1 = Nvc2 * is2;
 
 1522         int ix2 = ix1 + m_Nvc;
 
 1524         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1525           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1526           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1527           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1528           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 // The m_U multiplication happens before sending in this kernel.
 1531         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1533           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1534           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1535           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1536           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Indentation suggests this runs for every task, outside the kt1 branch -- confirm.
 1542   m_fw_send[idir]->start_thread(itask);
 
 // NOTE(review): the function-name line is absent from this listing; m_fw_recv,
 // isite_cpt, the kt0 flag and updates to only id1/id2 suggest
 // mult_tm2_dirac_thread -- confirm.
 // Visible behaviour: waits on m_fw_recv[idir] for this task, then, for a task
 // flagged kt0, adds bc2 times the received values to the id1 and id2 blocks of w2.
 1548   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1550   int Nvc2  = 2 * m_Nvc;
 
 1551   int Nvcd  = m_Nvc * 
m_Nd;
 
 1552   int Nvcd2 = Nvcd / 2;
 
 1556   int id3 = m_Nvc * 2;
 
 1557   int id4 = m_Nvc * 3;
 
 1560   double bc2  = m_boundary2[idir];
 
 1562   double wt1r, wt1i, wt2r, wt2i;
 
 1564   int isite    = m_arg[itask].isite;
 
 1565   int isite_cp = m_arg[itask].isite_cpt;
 
 1567   double *w2 = &v2[Nvcd * isite];
 
 // Pointer into the forward receive channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w1).
 1570     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 // The wait precedes every read of the received data below.
 1572   m_fw_recv[idir]->wait_thread(itask);
 
 1574   if (m_arg[itask].kt0 == 1) {
 
 1575     int Nxy = m_Nx2 * m_Ny;
 
 1577     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1578       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1579         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1580         int is2 = ixy + Nxy * iz;
 
 1582         int ix1 = Nvc2 * is2;
 
 1583         int ix2 = ix1 + m_Nvc;
 
 // Received values are only scaled by bc2 here; no m_U multiplication is visible
 // on the receive side of this kernel.
 1585         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1587           int ici = 2 * ic + 1;
 
 1588           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1589           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1590           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1591           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 // NOTE(review): the function-name line is absent from this listing; the kt0 start
 // index, the (is - Nxyz) neighbour and the id1/id2-only update suggest
 // mult_tmb_dirac_thread -- confirm.
 // Visible behaviour: for each site of the task (it starting at kt0) reads the
 // id1/id2 blocks of the site one Nxyz stride lower, doubles them, applies
 // mult_udagv_r/_i with u and adds the results to the id1 and id2 blocks of w2.
 1601   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1603   int Nvcd = m_Nvc * 
m_Nd;
 
 1607   int id3 = m_Nvc * 2;
 
 1608   int id4 = m_Nvc * 3;
 
 1612   double vt1[m_Nvc], vt2[m_Nvc];
 
 1613   double wt1r, wt1i, wt2r, wt2i;
 
 1615   int isite = m_arg[itask].isite;
 
 1617   double       *w2 = &v2[Nvcd * isite];
 
 1618   const double *w1 = &v1[Nvcd * isite];
 
 1619   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1621   int kt0  = m_arg[itask].kt0;
 
 1622   int Nxy  = m_Nx2 * m_Ny;
 
 1623   int Nxyz = Nxy * m_Nz;
 
 // When kt0 is 1 the it = 0 slice is skipped here (presumably it is handled by the
 // kernel that consumes received boundary data -- confirm).
 1625   for (
int it = kt0; it < m_Mt; ++it) {
 
 1626     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1627       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1628         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the m_U data are taken from site is - Nxyz.
 1630         int in = Nvcd * (is - Nxyz);
 
 1631         int ig = m_Ndf * (is - Nxyz);
 
 1633         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1634           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1635           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1636           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1637           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1640         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1642           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1643           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1644           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1645           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1647           w2[ic2 + id1 + iv]     += wt1r;
 
 1648           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1649           w2[ic2 + id2 + iv]     += wt2r;
 
 1650           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; m_bw_send,
 // isite_cpt, the kt0 flag and the id1+id3 / id2+id4 sums suggest
 // mult_tp1_chiral_thread -- confirm.
 // Visible behaviour: for a task flagged kt0, writes bc2 times the sums of the
 // id1+id3 and id2+id4 blocks of v1 through w2, then calls
 // m_bw_send[idir]->start_thread(itask).
 1660   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1662   int Nvc2  = 2 * m_Nvc;
 
 1663   int Nvcd  = m_Nvc * 
m_Nd;
 
 1664   int Nvcd2 = Nvcd / 2;
 
 1668   int id3 = m_Nvc * 2;
 
 1669   int id4 = m_Nvc * 3;
 
 1673   int isite    = m_arg[itask].isite;
 
 1674   int isite_cp = m_arg[itask].isite_cpt;
 
 // Pointer into the backward send channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w2).
 1678     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1679   const double *w1 = &v1[Nvcd * isite];
 
 1681   double bc2 = m_boundary2[idir];
 
 // Only tasks whose kt0 flag is set fill the buffer.
 1683   if (m_arg[itask].kt0 == 1) {
 
 1684     int Nxy = m_Nx2 * m_Ny;
 
 1686     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1687       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is indexes the full local volume, is2 the (ixy, iz) slice used for the buffer.
 1688         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1689         int is2 = ixy + Nxy * iz;
 
 1692         int ix1 = Nvc2 * is2;
 
 1693         int ix2 = ix1 + m_Nvc;
 
 // No m_U multiplication on the send side of this kernel; the factor bc2 is
 // applied here.
 1695         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1696           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1697           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1698           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1699           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 // Indentation suggests this runs for every task, outside the kt0 branch -- confirm.
 1705   m_bw_send[idir]->start_thread(itask);
 
 // NOTE(review): the function-name line is absent from this listing; m_bw_recv,
 // isite_cpt, the kt1 flag and updates to all four blocks suggest
 // mult_tp2_chiral_thread -- confirm.
 // Visible behaviour: waits on m_bw_recv[idir] for this task, then, for a task
 // flagged kt1, applies mult_uv_r/_i with u to the received vectors and adds each
 // result to two blocks of w2 (first vector to id1 and id3, second to id2 and id4).
 1711   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1713   int Nvc2  = 2 * m_Nvc;
 
 1714   int Nvcd  = m_Nvc * 
m_Nd;
 
 1715   int Nvcd2 = Nvcd / 2;
 
 1719   int id3 = m_Nvc * 2;
 
 1720   int id4 = m_Nvc * 3;
 
 1724   double wt1r, wt1i, wt2r, wt2i;
 
 1726   int isite    = m_arg[itask].isite;
 
 1727   int isite_cp = m_arg[itask].isite_cpt;
 
 1729   double *w2 = &v2[Nvcd * isite];
 
 // Pointer into the backward receive channel at this task's byte offset. The
 // left-hand side is on a line missing here (presumably it declares w1).
 1732     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1733   const double *u = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 // The wait precedes every read of the received data below.
 1735   m_bw_recv[idir]->wait_thread(itask);
 
 1737   if (m_arg[itask].kt1 == 1) {
 
 1738     int Nxy = m_Nx2 * m_Ny;
 
 1740     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1741       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1742         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1743         int is2 = ixy + Nxy * iz;
 
 1745         int ig  = m_Ndf * is;
 
 1746         int ix1 = Nvc2 * is2;
 
 1747         int ix2 = ix1 + m_Nvc;
 
 1749         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // Row offset into the m_U data for output colour ic.
 1750           int ic2 = ic * m_Nvc;
 
 1752           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1753           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1754           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1755           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1757           w2[2 * ic + id1 + iv]     += wt1r;
 
 1758           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1759           w2[2 * ic + id2 + iv]     += wt2r;
 
 1760           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1761           w2[2 * ic + id3 + iv]     += wt1r;
 
 1762           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1763           w2[2 * ic + id4 + iv]     += wt2r;
 
 1764           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; the
 // (m_Mt - kt1) bound, the (is + Nxyz) neighbour and the id1+id3 / id2+id4 sums
 // suggest mult_tpb_chiral_thread -- confirm.
 // Visible behaviour: for each site of the task sums block pairs of the site one
 // Nxyz stride higher into vt1/vt2, applies mult_uv_r/_i with u and adds each result
 // to two blocks of w2 (vt1 to id1 and id3, vt2 to id2 and id4).
 1774   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1776   int Nvcd = m_Nvc * 
m_Nd;
 
 1780   int id3 = m_Nvc * 2;
 
 1781   int id4 = m_Nvc * 3;
 
 1785   double vt1[m_Nvc], vt2[m_Nvc];
 
 1786   double wt1r, wt1i, wt2r, wt2i;
 
 1788   int isite = m_arg[itask].isite;
 
 1790   double       *w2 = &v2[Nvcd * isite];
 
 1791   const double *w1 = &v1[Nvcd * isite];
 
 1792   const double *u  = m_U->ptr(m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1794   int kt1  = m_arg[itask].kt1;
 
 1795   int Nxy  = m_Nx2 * m_Ny;
 
 1796   int Nxyz = Nxy * m_Nz;
 
 // When kt1 is 1 the last it slice is left out of this loop (presumably it is
 // handled by the kernel that consumes received boundary data -- confirm).
 1798   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1799     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1800       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1801         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Input comes from site is + Nxyz; the m_U data from site is itself.
 1803         int in = Nvcd * (is + Nxyz);
 
 1804         int ig = m_Ndf * is;
 
 1806         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1807           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1808           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1809           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1810           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1813         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1814           int ic2 = ic * m_Nvc;
 
 1816           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1817           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1818           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1819           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1821           w2[2 * ic + id1 + iv]     += wt1r;
 
 1822           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1823           w2[2 * ic + id2 + iv]     += wt2r;
 
 1824           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1825           w2[2 * ic + id3 + iv]     += wt1r;
 
 1826           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1827           w2[2 * ic + id4 + iv]     += wt2r;
 
 1828           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; m_fw_send,
 // isite_cpt, the kt1 flag and the id1-id3 / id2-id4 differences suggest
 // mult_tm1_chiral_thread -- confirm.
 // Visible behaviour: for a task flagged kt1, forms the differences of block pairs
 // of v1 in vt1/vt2, applies mult_udagv_r/_i with u and writes the results through
 // w2; afterwards m_fw_send[idir]->start_thread(itask) is called.
 1838   int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1840   int Nvc2  = 2 * m_Nvc;
 
 1841   int Nvcd  = m_Nvc * 
m_Nd;
 
 1842   int Nvcd2 = Nvcd / 2;
 
 1846   int id3 = m_Nvc * 2;
 
 1847   int id4 = m_Nvc * 3;
 
 1851   int isite    = m_arg[itask].isite;
 
 1852   int isite_cp = m_arg[itask].isite_cpt;
 
 // Pointer into the forward send channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w2).
 1856     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1857   const double *w1 = &v1[Nvcd * isite];
 
 1858   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1860   double vt1[m_Nvc], vt2[m_Nvc];
 
 // Only tasks whose kt1 flag is set fill the buffer.
 1862   if (m_arg[itask].kt1 == 1) {
 
 1863     int Nxy = m_Nx2 * m_Ny;
 
 1865     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1866       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is indexes the full local volume, is2 the (ixy, iz) slice used for the buffer.
 1867         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1868         int is2 = ixy + Nxy * iz;
 
 1870         int ig  = m_Ndf * is;
 
 1871         int ix1 = Nvc2 * is2;
 
 1872         int ix2 = ix1 + m_Nvc;
 
 1874         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1875           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1876           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1877           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1878           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 // The m_U multiplication happens before sending in this kernel.
 1881         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1883           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1884           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1885           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1886           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Indentation suggests this runs for every task, outside the kt1 branch -- confirm.
 1892   m_fw_send[idir]->start_thread(itask);
 
 // NOTE(review): the function-name line is absent from this listing; m_fw_recv,
 // isite_cpt, the kt0 flag and the +=/-= pattern over all four blocks suggest
 // mult_tm2_chiral_thread -- confirm.
 // Visible behaviour: waits on m_fw_recv[idir] for this task, then, for a task
 // flagged kt0, adds bc2 times the received values to the id1/id2 blocks of w2 and
 // subtracts the same values from the id3/id4 blocks.
 1898   int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1900   int Nvc2  = 2 * m_Nvc;
 
 1901   int Nvcd  = m_Nvc * 
m_Nd;
 
 1902   int Nvcd2 = Nvcd / 2;
 
 1906   int id3 = m_Nvc * 2;
 
 1907   int id4 = m_Nvc * 3;
 
 1910   double bc2  = m_boundary2[idir];
 
 1912   double wt1r, wt1i, wt2r, wt2i;
 
 1914   int isite    = m_arg[itask].isite;
 
 1915   int isite_cp = m_arg[itask].isite_cpt;
 
 1917   double *w2 = &v2[Nvcd * isite];
 
 // Pointer into the forward receive channel at this task's byte offset. The left-hand
 // side is on a line missing here (presumably it declares w1).
 1920     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 // The wait precedes every read of the received data below.
 1922   m_fw_recv[idir]->wait_thread(itask);
 
 1924   if (m_arg[itask].kt0 == 1) {
 
 1925     int Nxy = m_Nx2 * m_Ny;
 
 1927     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1928       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1929         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1930         int is2 = ixy + Nxy * iz;
 
 1932         int ix1 = Nvc2 * is2;
 
 1933         int ix2 = ix1 + m_Nvc;
 
 1935         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1937           int ici = 2 * ic + 1;
 
 1938           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1939           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1940           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1941           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1942           w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1943           w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1944           w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1945           w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 // NOTE(review): the function-name line is absent from this listing; the kt0 start
 // index, the (is - Nxyz) neighbour and the id1-id3 / id2-id4 differences suggest
 // mult_tmb_chiral_thread -- confirm.
 // Visible behaviour: for each site of the task (it starting at kt0) forms the
 // differences of block pairs of the site one Nxyz stride lower, applies
 // mult_udagv_r/_i with u, adds the results to id1/id2 of w2 and subtracts them from
 // id3/id4.
 1955   int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1957   int Nvcd = m_Nvc * 
m_Nd;
 
 1961   int id3 = m_Nvc * 2;
 
 1962   int id4 = m_Nvc * 3;
 
 1966   double vt1[m_Nvc], vt2[m_Nvc];
 
 1967   double wt1r, wt1i, wt2r, wt2i;
 
 1969   int isite = m_arg[itask].isite;
 
 1971   double       *w2 = &v2[Nvcd * isite];
 
 1972   const double *w1 = &v1[Nvcd * isite];
 
 1973   const double *u  = m_U->ptr(m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1975   int kt0  = m_arg[itask].kt0;
 
 1976   int Nxy  = m_Nx2 * m_Ny;
 
 1977   int Nxyz = Nxy * m_Nz;
 
 // When kt0 is 1 the it = 0 slice is skipped here (presumably it is handled by the
 // kernel that consumes received boundary data -- confirm).
 1979   for (
int it = kt0; it < m_Mt; ++it) {
 
 1980     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1981       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1982         int is = ixy + Nxy * (iz + m_Nz * it);
 
 // Both the input vector and the m_U data are taken from site is - Nxyz.
 1984         int in = Nvcd * (is - Nxyz);
 
 1985         int ig = m_Ndf * (is - Nxyz);
 
 1987         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1988           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1989           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1990           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1991           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1994         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1996           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1997           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1998           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1999           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 2001           w2[ic2 + id1 + iv]     += wt1r;
 
 2002           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 2003           w2[ic2 + id2 + iv]     += wt2r;
 
 2004           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 2005           w2[ic2 + id3 + iv]     -= wt1r;
 
 2006           w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 2007           w2[ic2 + id4 + iv]     -= wt2r;
 
 2008           w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 // NOTE(review): the function-name line is absent from this listing; the block
 // exchange below suggests gm5_dirac_thread(int, double *, const double *) -- confirm.
 // Visible behaviour: for every site of the task, copies v1 into v2 with the id1 and
 // id3 blocks exchanged and the id2 and id4 blocks exchanged.
 2018   int itask, 
double *v2, 
const double *v1)
 
 2020   int Nvcd = m_Nvc * 
m_Nd;
 
 2021   int Nxy  = m_Nx2 * m_Ny;
 
 2025   int id3 = m_Nvc * 2;
 
 2026   int id4 = m_Nvc * 3;
 
 // Both pointers are advanced to the first site of this task.
 2028   int          isite = m_arg[itask].isite;
 
 2029   double       *w2   = &v2[Nvcd * isite];
 
 2030   const double *w1   = &v1[Nvcd * isite];
 
 2032   for (
int it = 0; it < m_Mt; ++it) {
 
 2033     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2034       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // Offset of the site's first element; the it stride uses m_Nz, not m_Mz.
 2035         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2036         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2037           w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2038           w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2039           w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2040           w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 // NOTE(review): the function-name line is absent from this listing; the sign flip
 // below suggests gm5_chiral_thread(int, double *, const double *) -- confirm.
 // Visible behaviour: for every site of the task, copies the id1 and id2 blocks of v1
 // to v2 unchanged and the id3 and id4 blocks with their sign reversed.
 2050   int itask, 
double *v2, 
const double *v1)
 
 2052   int Nvcd = m_Nvc * 
m_Nd;
 
 2053   int Nxy  = m_Nx2 * m_Ny;
 
 2057   int id3 = m_Nvc * 2;
 
 2058   int id4 = m_Nvc * 3;
 
 // Both pointers are advanced to the first site of this task.
 2060   int          isite = m_arg[itask].isite;
 
 2061   double       *w2   = &v2[Nvcd * isite];
 
 2062   const double *w1   = &v1[Nvcd * isite];
 
 2064   for (
int it = 0; it < m_Mt; ++it) {
 
 2065     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2066       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // Offset of the site's first element; the it stride uses m_Nz, not m_Mz.
 2067         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2068         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2069           w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2070           w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2071           w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2072           w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
 // NOTE(review): the signature lines of this function are absent from this listing.
 // v1 is written through w1, so this looks like an in-place counterpart of the
 // block-exchange kernel -- confirm the name and parameter list in the real file.
 // Visible behaviour: for every site of the task, exchanges the id1 and id3 blocks
 // and the id2 and id4 blocks of v1.
 2084   int Nvcd = m_Nvc * 
m_Nd;
 
 2085   int Nxy  = m_Nx2 * m_Ny;
 
 2089   int id3 = m_Nvc * 2;
 
 2090   int id4 = m_Nvc * 3;
 
 2092   int    isite = m_arg[itask].isite;
 
 2093   double *w1   = &v1[Nvcd * isite];
 
 2095   for (
int it = 0; it < m_Mt; ++it) {
 
 2096     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2097       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2098         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2099         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 // Save the id1/id2 values before they are overwritten, then complete the swap.
 2100           double wt1 = w1[ivc + id1 + iv];
 
 2101           double wt2 = w1[ivc + id2 + iv];
 
 2102           w1[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2103           w1[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2104           w1[ivc + id3 + iv] = wt1;
 
 2105           w1[ivc + id4 + iv] = wt2;
 
 // NOTE(review): the signature lines of this function are absent from this listing.
 // v1 is written through w1, so this looks like an in-place counterpart of the
 // sign-flip kernel -- confirm the name and parameter list in the real file.
 // Visible behaviour: for every site of the task, reverses the sign of the id3 and
 // id4 blocks of v1; the other blocks are not touched in the visible lines.
 2117   int Nvcd = m_Nvc * 
m_Nd;
 
 2118   int Nxy  = m_Nx2 * m_Ny;
 
 2122   int id3 = m_Nvc * 2;
 
 2123   int id4 = m_Nvc * 3;
 
 2125   int    isite = m_arg[itask].isite;
 
 2126   double *w1   = &v1[Nvcd * isite];
 
 2128   for (
int it = 0; it < m_Mt; ++it) {
 
 2129     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2130       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // Offset of the site's first element; the it stride uses m_Nz, not m_Mz.
 2131         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2132         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2133           w1[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2134           w1[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_xpb_thread(int, double *, const double *, int)
 
void mult_tpb_dirac_thread(int, double *, const double *, int)
 
void mult_tmb_chiral_thread(int, double *, const double *, int)
 
void mult_tp2_chiral_thread(int, double *, const double *, int)
 
void mult_ym1_thread(int, double *, const double *, int)
 
void mult_ymb_thread(int, double *, const double *, int)
 
void mult_tmb_dirac_thread(int, double *, const double *, int)
 
void general(const char *format,...)
 
void mult_xm2_thread(int, double *, const double *, int)
 
void mult_zm2_thread(int, double *, const double *, int)
 
void mult_zm1_thread(int, double *, const double *, int)
 
void clear_thread(int, double *)
 
void mult_zp1_thread(int, double *, const double *, int)
 
void mult_tm2_chiral_thread(int, double *, const double *, int)
 
void mult_tp2_dirac_thread(int, double *, const double *, int)
 
void mult_ypb_thread(int, double *, const double *, int)
 
void mult_tm1_dirac_thread(int, double *, const double *, int)
 
void mult_tm2_dirac_thread(int, double *, const double *, int)
 
void mult_tp1_chiral_thread(int, double *, const double *, int)
 
void mult_yp1_thread(int, double *, const double *, int)
 
void mult_zmb_thread(int, double *, const double *, int)
 
std::vector< Channel * > m_bw_recv
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
void mult_zpb_thread(int, double *, const double *, int)
 
void mult_xm1_thread(int, double *, const double *, int)
 
void mult_yp2_thread(int, double *, const double *, int)
 
void crucial(const char *format,...)
 
void mult_tm1_chiral_thread(int, double *, const double *, int)
 
std::vector< mult_arg > m_arg
 
void mult_xmb_thread(int, double *, const double *, int)
 
std::vector< Channel * > m_fw_send
 
std::vector< Channel * > m_fw_recv
 
void gm5_dirac_thread(int, double *, const double *)
 
void Meo(Field &, const Field &, const int ieo)
even-odd operator: ieo=0: even <-- odd, ieo=1: odd <-- even 
 
void mult_tp1_dirac_thread(int, double *, const double *, int)
 
Bridge::VerboseLevel m_vl
 
void mult_zp2_thread(int, double *, const double *, int)
 
void mult_tpb_chiral_thread(int, double *, const double *, int)
 
std::vector< Channel * > m_bw_send
 
void mult_xp2_thread(int, double *, const double *, int)
 
void scal_thread(int, double *, double)
 
void mult_ym2_thread(int, double *, const double *, int)
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_xp1_thread(int, double *, const double *, int)