22 #if defined USE_GROUP_SU3 
   23 #include "fopr_Wilson_impl_SU3.inc" 
   24 #elif defined USE_GROUP_SU2 
   25 #include "fopr_Wilson_impl_SU2.inc" 
   26 #elif defined USE_GROUP_SU_N 
   27 #include "fopr_Wilson_impl_SU_N.inc" 
   63     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   64       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   65         int itask = ith_z + m_Ntask_z * ith_t;
 
   73         if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   74         if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   75         if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   76         if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
   80         m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy2;
 
   81         m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy2;
 
   88     int Nvcd2 = 2 * Nc * Nd / 2;
 
   90     std::vector<int> destid(
m_Ntask);
 
   91     std::vector<int> offset(
m_Ntask);
 
   92     std::vector<int> datasize(
m_Ntask);
 
   93     std::vector<int> offset_up(
m_Ntask);
 
   94     std::vector<int> offset_lw(
m_Ntask);
 
   95     std::vector<int> datasize_up(
m_Ntask);
 
   96     std::vector<int> datasize_lw(
m_Ntask);
 
   99     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  100       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  102         int isite_cp = itask * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  103         destid[itask]   = itask;
 
  104         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  105         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  114     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  115       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  118         destid[itask]   = itask;
 
  119         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  120         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx2;
 
  129     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  130       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  131         int itask = ith_z + m_Ntask_z * ith_t;
 
  133         offset_up[itask]   = 0;
 
  134         offset_lw[itask]   = 0;
 
  135         datasize_up[itask] = 0;
 
  136         datasize_lw[itask] = 0;
 
  138           destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  139           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  140           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  142         if (ith_z == m_Ntask_z - 1) {
 
  144           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  145           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  155     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  156       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  157         int itask = ith_z + m_Ntask_z * ith_t;
 
  159         offset_up[itask]   = 0;
 
  160         offset_lw[itask]   = 0;
 
  161         datasize_up[itask] = 0;
 
  162         datasize_lw[itask] = 0;
 
  164           destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  165           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  166           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
  168         if (ith_t == m_Ntask_t - 1) {
 
  169           destid[itask]      = ith_z;
 
  170           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  171           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
  184                                    double *w, 
double fac)
 
  189     int    isite = 
m_arg[itask].isite;
 
  190     double *wp   = &w[Nvcd * isite];
 
  192     for (
int it = 0; it < 
m_Mt; ++it) {
 
  193       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  194         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  195           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  196           wp[iv] = fac * wp[iv];
 
  210     int    isite = 
m_arg[itask].isite;
 
  211     double *wp   = &v[Nvcd * isite];
 
  213     for (
int it = 0; it < 
m_Mt; ++it) {
 
  214       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  215         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  216           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  226     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  228     int Nvc2  = 2 * 
m_Nvc;
 
  230     int Nvcd2 = Nvcd / 2;
 
  239     int isite    = 
m_arg[itask].isite;
 
  240     int isite_cp = 
m_arg[itask].isite_cpx;
 
  241     int iyzt0    = isite / 
m_Nx2;
 
  245       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  246     const double *w1 = &v1[Nvcd * isite];
 
  253     for (
int it = 0; it < 
m_Mt; ++it) {
 
  254       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  255         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  256           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  257           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  259             int is = ix + 
m_Nx2 * iyzt;
 
  262             int ix1 = Nvc2 * ibf;
 
  263             int ix2 = ix1 + 
m_Nvc;
 
  265             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  266               w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  267               w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  268               w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  269               w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  283     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  285     int Nvc2  = 2 * 
m_Nvc;
 
  287     int Nvcd2 = Nvcd / 2;
 
  296     double wt1r, wt1i, wt2r, wt2i;
 
  298     int isite    = 
m_arg[itask].isite;
 
  299     int isite_cp = 
m_arg[itask].isite_cpx;
 
  300     int iyzt0    = isite / 
m_Nx2;
 
  302     double *w2 = &v2[Nvcd * isite];
 
  305       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  312     for (
int it = 0; it < 
m_Mt; ++it) {
 
  313       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  314         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  315           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  316           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  319             int is  = ix + 
m_Nx2 * iyzt;
 
  322             int ix1 = Nvc2 * ibf;
 
  323             int ix2 = ix1 + 
m_Nvc;
 
  325             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  326               int ic2 = ic * 
m_Nvc;
 
  327               wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  328               wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  329               wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  330               wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  331               w2[2 * ic + id1 + iv]     += wt1r;
 
  332               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  333               w2[2 * ic + id2 + iv]     += wt2r;
 
  334               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  335               w2[2 * ic + id3 + iv]     += wt2i;
 
  336               w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  337               w2[2 * ic + id4 + iv]     += wt1i;
 
  338               w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  350     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  362     double wt1r, wt1i, wt2r, wt2i;
 
  364     int isite = 
m_arg[itask].isite;
 
  365     int iyzt0 = isite / 
m_Nx2;
 
  367     double       *w2 = &v2[Nvcd * isite];
 
  368     const double *w1 = &v1[Nvcd * isite];
 
  371     for (
int it = 0; it < 
m_Mt; ++it) {
 
  372       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  373         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  374           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  375           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  376           for (
int ix = 0; ix < 
m_Nx2 - Leo; ++ix) {
 
  377             int is = ix + 
m_Nx2 * iyzt;
 
  379             int in = Nvcd * (is + Leo);
 
  382             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  383               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  384               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  385               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  386               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  389             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  390               int ic2 = ic * 
m_Nvc;
 
  392               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  393               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  394               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  395               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  397               w2[2 * ic + id1 + iv]     += wt1r;
 
  398               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  399               w2[2 * ic + id2 + iv]     += wt2r;
 
  400               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  401               w2[2 * ic + id3 + iv]     += wt2i;
 
  402               w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  403               w2[2 * ic + id4 + iv]     += wt1i;
 
  404               w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  415     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  417     int Nvc2  = 2 * 
m_Nvc;
 
  419     int Nvcd2 = Nvcd / 2;
 
  428     int isite    = 
m_arg[itask].isite;
 
  429     int isite_cp = 
m_arg[itask].isite_cpx;
 
  430     int iyzt0    = isite / 
m_Nx2;
 
  434       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  435     const double *w1 = &v1[Nvcd * isite];
 
  443     for (
int it = 0; it < 
m_Mt; ++it) {
 
  444       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  445         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  446           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  447           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  449             int is = ix + 
m_Nx2 * iyzt;
 
  453             int ix1 = Nvc2 * ibf;
 
  454             int ix2 = ix1 + 
m_Nvc;
 
  456             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  457               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  458               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  459               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  460               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  463             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  465               w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  466               w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  467               w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  468               w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  482     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  484     int Nvc2  = 2 * 
m_Nvc;
 
  486     int Nvcd2 = Nvcd / 2;
 
  496     double wt1r, wt1i, wt2r, wt2i;
 
  498     int isite    = 
m_arg[itask].isite;
 
  499     int isite_cp = 
m_arg[itask].isite_cpx;
 
  500     int iyzt0    = isite / 
m_Nx2;
 
  502     double *w2 = &v2[Nvcd * isite];
 
  505       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  511     for (
int it = 0; it < 
m_Mt; ++it) {
 
  512       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  513         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  514           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  515           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  517             int is = ix + 
m_Nx2 * iyzt;
 
  520             int ix1 = Nvc2 * ibf;
 
  521             int ix2 = ix1 + 
m_Nvc;
 
  523             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  525               int ici = 2 * ic + 1;
 
  526               w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  527               w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  528               w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  529               w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  530               w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  531               w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  532               w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  533               w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  545     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  557     double wt1r, wt1i, wt2r, wt2i;
 
  559     int isite = 
m_arg[itask].isite;
 
  560     int iyzt0 = isite / 
m_Nx2;
 
  562     double       *w2 = &v2[Nvcd * isite];
 
  563     const double *w1 = &v1[Nvcd * isite];
 
  566     for (
int it = 0; it < 
m_Mt; ++it) {
 
  567       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  568         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  569           int iyzt = iy + m_Ny * (iz + 
m_Nz * it);
 
  570           int Leo  = ieo + (1 - 2 * ieo) * 
m_Leo[iyzt0 + iyzt];
 
  572           for (
int ix = Meo; ix < 
m_Nx2; ++ix) {
 
  573             int is = ix + m_Nx2 * iyzt;
 
  575             int in = Nvcd * (is - 
Meo);
 
  578             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  579               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  580               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  581               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  582               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  585             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  588               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  589               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  590               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  591               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  593               w2[2 * ic + id1 + iv]     += wt1r;
 
  594               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  595               w2[2 * ic + id2 + iv]     += wt2r;
 
  596               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  597               w2[2 * ic + id3 + iv]     += -wt2i;
 
  598               w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  599               w2[2 * ic + id4 + iv]     += -wt1i;
 
  600               w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  611     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  613     int Nvc2  = 2 * 
m_Nvc;
 
  615     int Nvcd2 = Nvcd / 2;
 
  624     int isite    = 
m_arg[itask].isite;
 
  625     int isite_cp = 
m_arg[itask].isite_cpy;
 
  629       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  630     const double *w1 = &v1[Nvcd * isite];
 
  636     for (
int it = 0; it < 
m_Mt; ++it) {
 
  637       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  638         for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  639           int is  = ix + m_Nx2 * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  640           int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  642           int ix1 = Nvc2 * is2;
 
  643           int ix2 = ix1 + 
m_Nvc;
 
  645           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  646             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  647             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  648             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  649             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  661     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  663     int Nvc2  = 2 * 
m_Nvc;
 
  665     int Nvcd2 = Nvcd / 2;
 
  674     double wt1r, wt1i, wt2r, wt2i;
 
  676     int isite    = 
m_arg[itask].isite;
 
  677     int isite_cp = 
m_arg[itask].isite_cpy;
 
  679     double *w2 = &v2[Nvcd * isite];
 
  682       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  688     for (
int it = 0; it < 
m_Mt; ++it) {
 
  689       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  690         for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  691           int is  = ix + m_Nx2 * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  692           int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  695           int ix1 = Nvc2 * is2;
 
  696           int ix2 = ix1 + 
m_Nvc;
 
  698           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  699             int ic2 = ic * 
m_Nvc;
 
  701             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  702             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  703             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  704             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  706             w2[2 * ic + id1 + iv]     += wt1r;
 
  707             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  708             w2[2 * ic + id2 + iv]     += wt2r;
 
  709             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  710             w2[2 * ic + id3 + iv]     += -wt2r;
 
  711             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  712             w2[2 * ic + id4 + iv]     += wt1r;
 
  713             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  723     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  735     double wt1r, wt1i, wt2r, wt2i;
 
  737     int isite = 
m_arg[itask].isite;
 
  739     double       *w2 = &v2[Nvcd * isite];
 
  740     const double *w1 = &v1[Nvcd * isite];
 
  743     for (
int it = 0; it < 
m_Mt; ++it) {
 
  744       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  745         for (
int iy = 0; iy < 
m_Ny - 1; ++iy) {
 
  746           for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  747             int is = ix + m_Nx2 * (iy + m_Ny * (iz + 
m_Nz * it));
 
  749             int in = Nvcd * (is + 
m_Nx2);
 
  752             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  753               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  754               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  755               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  756               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  759             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  760               int ic2 = ic * 
m_Nvc;
 
  762               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  763               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  764               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  765               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  767               w2[2 * ic + id1 + iv]     += wt1r;
 
  768               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  769               w2[2 * ic + id2 + iv]     += wt2r;
 
  770               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  771               w2[2 * ic + id3 + iv]     += -wt2r;
 
  772               w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  773               w2[2 * ic + id4 + iv]     += wt1r;
 
  774               w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  785     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  787     int Nvc2  = 2 * 
m_Nvc;
 
  789     int Nvcd2 = Nvcd / 2;
 
  798     int isite    = 
m_arg[itask].isite;
 
  799     int isite_cp = 
m_arg[itask].isite_cpy;
 
  803       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  804     const double *w1 = &v1[Nvcd * isite];
 
  811     for (
int it = 0; it < 
m_Mt; ++it) {
 
  812       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  813         for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  814           int is  = ix + m_Nx2 * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  815           int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  818           int ix1 = Nvc2 * is2;
 
  819           int ix2 = ix1 + 
m_Nvc;
 
  821           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  822             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  823             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  824             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  825             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  828           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  830             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  831             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  832             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  833             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  845     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
  847     int Nvc2  = 2 * 
m_Nvc;
 
  849     int Nvcd2 = Nvcd / 2;
 
  859     double wt1r, wt1i, wt2r, wt2i;
 
  861     int isite    = 
m_arg[itask].isite;
 
  862     int isite_cp = 
m_arg[itask].isite_cpy;
 
  864     double *w2 = &v2[Nvcd * isite];
 
  867       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  872     for (
int it = 0; it < 
m_Mt; ++it) {
 
  873       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  874         for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  875           int is  = ix + m_Nx2 * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  876           int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  878           int ix1 = Nvc2 * is2;
 
  879           int ix2 = ix1 + 
m_Nvc;
 
  881           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  883             int ici = 2 * ic + 1;
 
  884             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  885             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  886             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  887             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  888             w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  889             w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  890             w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  891             w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  901     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
  913     double wt1r, wt1i, wt2r, wt2i;
 
  915     int isite = 
m_arg[itask].isite;
 
  917     double       *w2 = &v2[Nvcd * isite];
 
  918     const double *w1 = &v1[Nvcd * isite];
 
  921     for (
int it = 0; it < 
m_Mt; ++it) {
 
  922       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  923         for (
int iy = 1; iy < 
m_Ny; ++iy) {
 
  924           for (
int ix = 0; ix < 
m_Nx2; ++ix) {
 
  925             int is = ix + m_Nx2 * (iy + m_Ny * (iz + 
m_Nz * it));
 
  927             int in = Nvcd * (is - 
m_Nx2);
 
  930             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  931               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  932               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  933               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  934               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  937             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  939               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  940               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  941               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  942               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  944               w2[ic2 + id1 + iv]     += wt1r;
 
  945               w2[ic2 + 1 + id1 + iv] += wt1i;
 
  946               w2[ic2 + id2 + iv]     += wt2r;
 
  947               w2[ic2 + 1 + id2 + iv] += wt2i;
 
  948               w2[ic2 + id3 + iv]     += wt2r;
 
  949               w2[ic2 + 1 + id3 + iv] += wt2i;
 
  950               w2[ic2 + id4 + iv]     += -wt1r;
 
  951               w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  962     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
  964     int Nvc2  = 2 * 
m_Nvc;
 
  966     int Nvcd2 = Nvcd / 2;
 
  975     int isite    = 
m_arg[itask].isite;
 
  976     int isite_cp = 
m_arg[itask].isite_cpz;
 
  980       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  981     const double *w1 = &v1[Nvcd * isite];
 
  985     if (
m_arg[itask].kz0 == 1) {
 
  988       for (
int it = 0; it < 
m_Mt; ++it) {
 
  989         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  990           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
  991           int is2 = ixy + Nxy * it;
 
  994           int ix1 = Nvc2 * is2;
 
  995           int ix2 = ix1 + 
m_Nvc;
 
  997           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  998             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  999             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
 1000             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1001             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
 1013     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1015     int Nvc2  = 2 * 
m_Nvc;
 
 1017     int Nvcd2 = Nvcd / 2;
 
 1021     int id3 = 
m_Nvc * 2;
 
 1022     int id4 = 
m_Nvc * 3;
 
 1026     double wt1r, wt1i, wt2r, wt2i;
 
 1028     int isite    = 
m_arg[itask].isite;
 
 1029     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1031     double *w2 = &v2[Nvcd * isite];
 
 1034       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1039     if (
m_arg[itask].kz1 == 1) {
 
 1042       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1043         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1044           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1045           int is2 = ixy + Nxy * it;
 
 1047           int ig  = 
m_Ndf * is;
 
 1048           int ix1 = Nvc2 * is2;
 
 1049           int ix2 = ix1 + 
m_Nvc;
 
 1051           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1052             int ic2 = ic * 
m_Nvc;
 
 1054             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1055             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1056             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1057             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1059             w2[2 * ic + id1 + iv]     += wt1r;
 
 1060             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1061             w2[2 * ic + id2 + iv]     += wt2r;
 
 1062             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1063             w2[2 * ic + id3 + iv]     += wt1i;
 
 1064             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1065             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1066             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1076     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1082     int id3 = 
m_Nvc * 2;
 
 1083     int id4 = 
m_Nvc * 3;
 
 1088     double wt1r, wt1i, wt2r, wt2i;
 
 1090     int isite = 
m_arg[itask].isite;
 
 1092     double       *w2 = &v2[Nvcd * isite];
 
 1093     const double *w1 = &v1[Nvcd * isite];
 
 1096     int kz1 = 
m_arg[itask].kz1;
 
 1099     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1100       for (
int iz = 0; iz < 
m_Mz - kz1; ++iz) {
 
 1101         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1102           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1104           int in = Nvcd * (is + Nxy);
 
 1105           int ig = 
m_Ndf * is;
 
 1107           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1108             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1109             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1110             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1111             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1114           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1115             int ic2 = ic * 
m_Nvc;
 
 1117             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1118             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1119             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1120             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1122             w2[2 * ic + id1 + iv]     += wt1r;
 
 1123             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1124             w2[2 * ic + id2 + iv]     += wt2r;
 
 1125             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1126             w2[2 * ic + id3 + iv]     += wt1i;
 
 1127             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1128             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1129             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1139     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1141     int Nvc2  = 2 * 
m_Nvc;
 
 1143     int Nvcd2 = Nvcd / 2;
 
 1147     int id3 = 
m_Nvc * 2;
 
 1148     int id4 = 
m_Nvc * 3;
 
 1152     int isite    = 
m_arg[itask].isite;
 
 1153     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1157       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1158     const double *w1 = &v1[Nvcd * isite];
 
 1163     if (
m_arg[itask].kz1 == 1) {
 
 1166       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1167         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1168           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1169           int is2 = ixy + Nxy * it;
 
 1171           int ig  = 
m_Ndf * is;
 
 1172           int ix1 = Nvc2 * is2;
 
 1173           int ix2 = ix1 + 
m_Nvc;
 
 1175           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1176             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1177             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1178             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1179             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1182           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1184             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1185             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1186             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1187             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1199     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1201     int Nvc2  = 2 * 
m_Nvc;
 
 1203     int Nvcd2 = Nvcd / 2;
 
 1207     int id3 = 
m_Nvc * 2;
 
 1208     int id4 = 
m_Nvc * 3;
 
 1213     double wt1r, wt1i, wt2r, wt2i;
 
 1215     int isite    = 
m_arg[itask].isite;
 
 1216     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1218     double *w2 = &v2[Nvcd * isite];
 
 1221       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1225     if (
m_arg[itask].kz0 == 1) {
 
 1229       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1230         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1231           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1232           int is2 = ixy + Nxy * it;
 
 1234           int ix1 = Nvc2 * is2;
 
 1235           int ix2 = ix1 + 
m_Nvc;
 
 1237           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1239             int ici = 2 * ic + 1;
 
 1240             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1241             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1242             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1243             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1244             w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1245             w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1246             w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1247             w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 // Fragment of a z-direction bulk kernel. The function-name line is missing
 // from this listing; presumably mult_zmb_thread -- TODO confirm.
 // For each site of the task's block it reads v1 at the site one z-step back
 // (is - Nxy), combines the four spinor blocks into vt1/vt2, applies the link
 // stored at that neighbouring site through mult_udagv_r/_i, and accumulates
 // the result into v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   v1    : input field
 //   ieo   : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, iv, ic2, u, vt1 and vt2 are declared on
 // lines this listing dropped.
 1257     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1263     int id3 = 
m_Nvc * 2;
 
 1264     int id4 = 
m_Nvc * 3;
 
 1269     double wt1r, wt1i, wt2r, wt2i;
 
 1271     int isite = 
m_arg[itask].isite;
 
 1273     double       *w2 = &v2[Nvcd * isite];
 
 1274     const double *w1 = &v1[Nvcd * isite];
 
          // iz starts at kz0, so a task with kz0 == 1 skips its iz = 0 layer.
 1277     int kz0 = 
m_arg[itask].kz0;
 
 1280     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1281       for (
int iz = kz0; iz < 
m_Mz; ++iz) {
 
 1282         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1283           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
                // Spinor and link offsets of the site at iz - 1.
 1285           int in = Nvcd * (is - Nxy);
 
 1286           int ig = 
m_Ndf * (is - Nxy);
 
 1288           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // With even/odd slots read as real/imaginary parts:
                  // vt1 = block1 - i * block3, vt2 = block2 + i * block4.
 1289             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1290             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1291             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1292             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1295           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // mult_udagv_r/_i: presumably real/imaginary part of one colour
                  // row of U^dagger * v -- helper not shown, confirm.
 1297             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1298             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1299             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1300             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1302             w2[ic2 + id1 + iv]     += wt1r;
 
 1303             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1304             w2[ic2 + id2 + iv]     += wt2r;
 
 1305             w2[ic2 + 1 + id2 + iv] += wt2i;
 
                  // Blocks id3/id4 receive +i * wt1 and -i * wt2 respectively.
 1306             w2[ic2 + id3 + iv]     += -wt1i;
 
 1307             w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1308             w2[ic2 + id4 + iv]     += wt2i;
 
 1309             w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 // Fragment of a t-direction boundary "pack" kernel. The function-name line is
 // missing from this listing; judging by m_bw_send and kt0 it is presumably
 // mult_tp1_dirac_thread -- TODO confirm.
 // For a task whose kt0 flag is 1, copies blocks id3 and id4 of v1, scaled by
 // 2.0 * bc2, into the backward-send buffer.
 //   itask : index into m_arg
 //   v1    : input field
 //   vcp1, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, idir, bc2, it, in and the declaration of the
 // buffer pointer (used as w2 below) are on lines this listing dropped.
 1319     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
          // Nvc2: stride of one buffer site (two blocks of m_Nvc doubles).
 1321     int Nvc2  = 2 * 
m_Nvc;
 
 1323     int Nvcd2 = Nvcd / 2;
 
 1327     int id3 = 
m_Nvc * 2;
 
 1328     int id4 = 
m_Nvc * 3;
 
          // Per-task offsets: isite into the field, isite_cp into the t buffer.
 1332     int isite    = 
m_arg[itask].isite;
 
 1333     int isite_cp = 
m_arg[itask].isite_cpt;
 
          // Right-hand side of the send-buffer pointer; ptr() takes a byte offset.
 1337       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1338     const double *w1 = &v1[Nvcd * isite];
 
          // Only tasks flagged kt0 fill the buffer.
 1342     if (
m_arg[itask].kt0 == 1) {
 
 1345       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1346         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
                // is: site inside the field block; is2: site inside the buffer,
                // which has no t extent.
 1347           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1348           int is2 = ixy + Nxy * iz;
 
 1351           int ix1 = Nvc2 * is2;
 
 1352           int ix2 = ix1 + 
m_Nvc;
 
 1354           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // First buffer half <- 2*bc2*block3, second half <- 2*bc2*block4.
 1355             w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1356             w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1357             w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1358             w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 // Fragment of a t-direction boundary "receive" kernel. The function-name line
 // is missing from this listing; judging by m_bw_recv and kt1 it is presumably
 // mult_tp2_dirac_thread -- TODO confirm.
 // For a task whose kt1 flag is 1, multiplies the two halves of the
 // backward-receive buffer by the local link (mult_uv_r/_i) and adds the
 // results to blocks id3 and id4 of v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   vcp2, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, idir, it, iv, u and the declaration of the buffer
 // pointer (used as w1 below) are on lines this listing dropped.
 1370     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1372     int Nvc2  = 2 * 
m_Nvc;
 
 1374     int Nvcd2 = Nvcd / 2;
 
 1378     int id3 = 
m_Nvc * 2;
 
 1379     int id4 = 
m_Nvc * 3;
 
 1383     double wt1r, wt1i, wt2r, wt2i;
 
 1385     int isite    = 
m_arg[itask].isite;
 
 1386     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1388     double *w2 = &v2[Nvcd * isite];
 
          // Right-hand side of the receive-buffer pointer; ptr() takes a byte offset.
 1391       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
          // Only tasks flagged kt1 apply the received boundary data.
 1396     if (
m_arg[itask].kt1 == 1) {
 
 1399       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1400         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1401           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1402           int is2 = ixy + Nxy * iz;
 
                // The link is taken at the local site is.
 1404           int ig  = 
m_Ndf * is;
 
 1405           int ix1 = Nvc2 * is2;
 
 1406           int ix2 = ix1 + 
m_Nvc;
 
 1408           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // ic2 selects the ic-th group of m_Nvc doubles inside the link.
 1409             int ic2 = ic * 
m_Nvc;
 
                  // mult_uv_r/_i: presumably real/imaginary part of one colour row
                  // of U * v -- helper not shown, confirm.
 1411             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1412             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1413             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1414             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1416             w2[2 * ic + id3 + iv]     += wt1r;
 
 1417             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1418             w2[2 * ic + id4 + iv]     += wt2r;
 
 1419             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a t-direction bulk kernel. The function-name line is missing
 // from this listing; presumably mult_tpb_dirac_thread -- TODO confirm.
 // For each site it reads blocks id3/id4 of v1 at the site one t-step forward
 // (is + Nxyz), doubles them, multiplies by the local link via mult_uv_r/_i
 // and accumulates into blocks id3/id4 of v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   v1    : input field
 //   ieo   : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, iv, u, vt1 and vt2 are declared on lines this
 // listing dropped.
 1429     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1435     int id3 = 
m_Nvc * 2;
 
 1436     int id4 = 
m_Nvc * 3;
 
 1441     double wt1r, wt1i, wt2r, wt2i;
 
 1443     int isite = 
m_arg[itask].isite;
 
 1445     double       *w2 = &v2[Nvcd * isite];
 
 1446     const double *w1 = &v1[Nvcd * isite];
 
          // The it loop stops at m_Mt - kt1, so a task with kt1 == 1 skips its
          // last time slice.
 1449     int kt1  = 
m_arg[itask].kt1;
 
          // Nxyz: number of sites in one time slice of the index used here.
 1451     int Nxyz = Nxy * 
m_Nz;
 
 1453     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1454       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1455         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1456           int is = ixy + Nxy * (iz + m_Nz * it);
 
                // in: spinor offset of the site at it + 1; ig: link at the local site.
 1458           int in = Nvcd * (is + Nxyz);
 
 1459           int ig = 
m_Ndf * is;
 
 1461           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1462             vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1463             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1464             vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1465             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1468           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1469             int ic2 = ic * 
m_Nvc;
 
                  // mult_uv_r/_i: presumably real/imaginary part of one colour row
                  // of U * v -- helper not shown, confirm.
 1471             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1472             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1473             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1474             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1476             w2[2 * ic + id3 + iv]     += wt1r;
 
 1477             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1478             w2[2 * ic + id4 + iv]     += wt2r;
 
 1479             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a t-direction boundary "pack" kernel. The function-name line is
 // missing from this listing; judging by m_fw_send and kt1 it is presumably
 // mult_tm1_dirac_thread -- TODO confirm.
 // For a task whose kt1 flag is 1, doubles blocks id1/id2 of v1, applies the
 // local link through mult_udagv_r/_i and stores the result in the
 // forward-send buffer.
 //   itask : index into m_arg
 //   v1    : input field
 //   vcp1, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, it, in, icr, u, vt1, vt2 and the
 // declaration of the buffer pointer (used as w2 below) are on lines this
 // listing dropped.
 1489     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1491     int Nvc2  = 2 * 
m_Nvc;
 
 1493     int Nvcd2 = Nvcd / 2;
 
 1497     int id3 = 
m_Nvc * 2;
 
 1498     int id4 = 
m_Nvc * 3;
 
 1502     int isite    = 
m_arg[itask].isite;
 
 1503     int isite_cp = 
m_arg[itask].isite_cpt;
 
          // Right-hand side of the send-buffer pointer; ptr() takes a byte offset.
 1507       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1508     const double *w1 = &v1[Nvcd * isite];
 
          // Only tasks flagged kt1 fill the buffer.
 1513     if (
m_arg[itask].kt1 == 1) {
 
 1516       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1517         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1518           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1519           int is2 = ixy + Nxy * iz;
 
 1521           int ig  = 
m_Ndf * is;
 
 1522           int ix1 = Nvc2 * is2;
 
 1523           int ix2 = ix1 + 
m_Nvc;
 
 1525           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1526             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1527             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1528             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1529             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1532           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // The buffer is overwritten (plain assignment, not +=).
                  // mult_udagv_r/_i: presumably real/imaginary part of one colour
                  // row of U^dagger * v -- helper not shown, confirm.
 1534             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1535             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1536             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1537             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Fragment of a t-direction boundary "receive" kernel. The function-name line
 // is missing from this listing; judging by m_fw_recv and kt0 it is presumably
 // mult_tm2_dirac_thread -- TODO confirm.
 // For a task whose kt0 flag is 1, adds the two halves of the forward-receive
 // buffer, scaled by bc2, to blocks id1 and id2 of v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   vcp2, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, bc2, it, iv, icr and the
 // declaration of the buffer pointer (used as w1 below) are on lines this
 // listing dropped.
 1549     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1551     int Nvc2  = 2 * 
m_Nvc;
 
 1553     int Nvcd2 = Nvcd / 2;
 
 1557     int id3 = 
m_Nvc * 2;
 
 1558     int id4 = 
m_Nvc * 3;
 
 1563     double wt1r, wt1i, wt2r, wt2i;
 
 1565     int isite    = 
m_arg[itask].isite;
 
 1566     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1568     double *w2 = &v2[Nvcd * isite];
 
          // Right-hand side of the receive-buffer pointer; ptr() takes a byte offset.
 1571       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
          // Only tasks flagged kt0 apply the received boundary data.
 1575     if (
m_arg[itask].kt0 == 1) {
 
 1578       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1579         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1580           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1581           int is2 = ixy + Nxy * iz;
 
 1583           int ix1 = Nvc2 * is2;
 
 1584           int ix2 = ix1 + 
m_Nvc;
 
 1586           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1588             int ici = 2 * ic + 1;
 
                  // No link multiplication on this side: the buffer is added as is.
 1589             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1590             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1591             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1592             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 // Fragment of a t-direction bulk kernel. The function-name line is missing
 // from this listing; presumably mult_tmb_dirac_thread -- TODO confirm.
 // For each site it reads blocks id1/id2 of v1 at the site one t-step back
 // (is - Nxyz), doubles them, applies the link stored at that neighbouring
 // site via mult_udagv_r/_i and accumulates into blocks id1/id2 of v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   v1    : input field
 //   ieo   : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, iv, ic2, u, vt1 and vt2 are declared on
 // lines this listing dropped.
 1602     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1608     int id3 = 
m_Nvc * 2;
 
 1609     int id4 = 
m_Nvc * 3;
 
 1614     double wt1r, wt1i, wt2r, wt2i;
 
 1616     int isite = 
m_arg[itask].isite;
 
 1618     double       *w2 = &v2[Nvcd * isite];
 
 1619     const double *w1 = &v1[Nvcd * isite];
 
          // The it loop starts at kt0, so a task with kt0 == 1 skips its first
          // time slice.
 1622     int kt0  = 
m_arg[itask].kt0;
 
 1624     int Nxyz = Nxy * 
m_Nz;
 
 1626     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1627       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1628         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1629           int is = ixy + Nxy * (iz + m_Nz * it);
 
                // Spinor and link offsets of the site at it - 1.
 1631           int in = Nvcd * (is - Nxyz);
 
 1632           int ig = 
m_Ndf * (is - Nxyz);
 
 1634           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1635             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1636             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1637             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1638             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1641           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // mult_udagv_r/_i: presumably real/imaginary part of one colour
                  // row of U^dagger * v -- helper not shown, confirm.
 1643             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1644             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1645             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1646             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1648             w2[ic2 + id1 + iv]     += wt1r;
 
 1649             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1650             w2[ic2 + id2 + iv]     += wt2r;
 
 1651             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 // Fragment of a t-direction boundary "pack" kernel. The function-name line is
 // missing from this listing; judging by m_bw_send, kt0 and the block sums it
 // is presumably mult_tp1_chiral_thread -- TODO confirm.
 // For a task whose kt0 flag is 1, writes bc2 * (block1 + block3) and
 // bc2 * (block2 + block4) of v1 into the backward-send buffer.
 //   itask : index into m_arg
 //   v1    : input field
 //   vcp1, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, bc2, it, in and the declaration of
 // the buffer pointer (used as w2 below) are on lines this listing dropped.
 1661     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1663     int Nvc2  = 2 * 
m_Nvc;
 
 1665     int Nvcd2 = Nvcd / 2;
 
 1669     int id3 = 
m_Nvc * 2;
 
 1670     int id4 = 
m_Nvc * 3;
 
 1674     int isite    = 
m_arg[itask].isite;
 
 1675     int isite_cp = 
m_arg[itask].isite_cpt;
 
          // Right-hand side of the send-buffer pointer; ptr() takes a byte offset.
 1679       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1680     const double *w1 = &v1[Nvcd * isite];
 
          // Only tasks flagged kt0 fill the buffer.
 1684     if (
m_arg[itask].kt0 == 1) {
 
 1687       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1688         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1689           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1690           int is2 = ixy + Nxy * iz;
 
 1693           int ix1 = Nvc2 * is2;
 
 1694           int ix2 = ix1 + 
m_Nvc;
 
 1696           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // The buffer is overwritten (plain assignment, not +=).
 1697             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1698             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1699             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1700             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 // Fragment of a t-direction boundary "receive" kernel. The function-name line
 // is missing from this listing; judging by m_bw_recv, kt1 and the update of
 // all four blocks it is presumably mult_tp2_chiral_thread -- TODO confirm.
 // For a task whose kt1 flag is 1, multiplies the two halves of the
 // backward-receive buffer by the local link (mult_uv_r/_i) and adds the first
 // product to blocks id1 and id3 of v2, the second to blocks id2 and id4.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   vcp2, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, it, iv, u and the declaration of
 // the buffer pointer (used as w1 below) are on lines this listing dropped.
 1712     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1714     int Nvc2  = 2 * 
m_Nvc;
 
 1716     int Nvcd2 = Nvcd / 2;
 
 1720     int id3 = 
m_Nvc * 2;
 
 1721     int id4 = 
m_Nvc * 3;
 
 1725     double wt1r, wt1i, wt2r, wt2i;
 
 1727     int isite    = 
m_arg[itask].isite;
 
 1728     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1730     double *w2 = &v2[Nvcd * isite];
 
          // Right-hand side of the receive-buffer pointer; ptr() takes a byte offset.
 1733       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
          // Only tasks flagged kt1 apply the received boundary data.
 1738     if (
m_arg[itask].kt1 == 1) {
 
 1741       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1742         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1743           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1744           int is2 = ixy + Nxy * iz;
 
 1746           int ig  = 
m_Ndf * is;
 
 1747           int ix1 = Nvc2 * is2;
 
 1748           int ix2 = ix1 + 
m_Nvc;
 
 1750           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1751             int ic2 = ic * 
m_Nvc;
 
                  // mult_uv_r/_i: presumably real/imaginary part of one colour row
                  // of U * v -- helper not shown, confirm.
 1753             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1754             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1755             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1756             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
                  // The same products go to the upper (id1/id2) and lower (id3/id4)
                  // blocks with a plus sign.
 1758             w2[2 * ic + id1 + iv]     += wt1r;
 
 1759             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1760             w2[2 * ic + id2 + iv]     += wt2r;
 
 1761             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1762             w2[2 * ic + id3 + iv]     += wt1r;
 
 1763             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1764             w2[2 * ic + id4 + iv]     += wt2r;
 
 1765             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a t-direction bulk kernel. The function-name line is missing
 // from this listing; presumably mult_tpb_chiral_thread -- TODO confirm.
 // For each site it forms vt1 = block1 + block3 and vt2 = block2 + block4 of
 // v1 at the site one t-step forward (is + Nxyz), multiplies by the local link
 // via mult_uv_r/_i, and adds the products to all four blocks of v2.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   v1    : input field
 //   ieo   : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, iv, u, vt1 and vt2 are declared on lines
 // this listing dropped.
 1775     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1781     int id3 = 
m_Nvc * 2;
 
 1782     int id4 = 
m_Nvc * 3;
 
 1787     double wt1r, wt1i, wt2r, wt2i;
 
 1789     int isite = 
m_arg[itask].isite;
 
 1791     double       *w2 = &v2[Nvcd * isite];
 
 1792     const double *w1 = &v1[Nvcd * isite];
 
          // The it loop stops at m_Mt - kt1, so a task with kt1 == 1 skips its
          // last time slice.
 1795     int kt1  = 
m_arg[itask].kt1;
 
 1797     int Nxyz = Nxy * 
m_Nz;
 
 1799     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1800       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1801         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1802           int is = ixy + Nxy * (iz + m_Nz * it);
 
                // in: spinor offset of the site at it + 1; ig: link at the local site.
 1804           int in = Nvcd * (is + Nxyz);
 
 1805           int ig = 
m_Ndf * is;
 
 1807           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1808             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1809             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1810             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1811             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1814           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1815             int ic2 = ic * 
m_Nvc;
 
                  // mult_uv_r/_i: presumably real/imaginary part of one colour row
                  // of U * v -- helper not shown, confirm.
 1817             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1818             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1819             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1820             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
                  // Upper and lower blocks receive the same products, plus sign.
 1822             w2[2 * ic + id1 + iv]     += wt1r;
 
 1823             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1824             w2[2 * ic + id2 + iv]     += wt2r;
 
 1825             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1826             w2[2 * ic + id3 + iv]     += wt1r;
 
 1827             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1828             w2[2 * ic + id4 + iv]     += wt2r;
 
 1829             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Fragment of a t-direction boundary "pack" kernel. The function-name line is
 // missing from this listing; judging by m_fw_send, kt1 and the block
 // differences it is presumably mult_tm1_chiral_thread -- TODO confirm.
 // For a task whose kt1 flag is 1, forms vt1 = block1 - block3 and
 // vt2 = block2 - block4 of v1, applies the local link through
 // mult_udagv_r/_i and stores the result in the forward-send buffer.
 //   itask : index into m_arg
 //   v1    : input field
 //   vcp1, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, it, in, icr, u, vt1, vt2 and the
 // declaration of the buffer pointer (used as w2 below) are on lines this
 // listing dropped.
 1839     int itask, 
double *vcp1, 
const double *v1, 
int ieo)
 
 1841     int Nvc2  = 2 * 
m_Nvc;
 
 1843     int Nvcd2 = Nvcd / 2;
 
 1847     int id3 = 
m_Nvc * 2;
 
 1848     int id4 = 
m_Nvc * 3;
 
 1852     int isite    = 
m_arg[itask].isite;
 
 1853     int isite_cp = 
m_arg[itask].isite_cpt;
 
          // Right-hand side of the send-buffer pointer; ptr() takes a byte offset.
 1857       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1858     const double *w1 = &v1[Nvcd * isite];
 
          // Only tasks flagged kt1 fill the buffer.
 1863     if (
m_arg[itask].kt1 == 1) {
 
 1866       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1867         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1868           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1869           int is2 = ixy + Nxy * iz;
 
 1871           int ig  = 
m_Ndf * is;
 
 1872           int ix1 = Nvc2 * is2;
 
 1873           int ix2 = ix1 + 
m_Nvc;
 
 1875           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1876             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1877             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1878             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1879             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1882           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // The buffer is overwritten (plain assignment, not +=).
                  // mult_udagv_r/_i: presumably real/imaginary part of one colour
                  // row of U^dagger * v -- helper not shown, confirm.
 1884             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1885             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1886             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1887             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Fragment of a t-direction boundary "receive" kernel. The function-name line
 // is missing from this listing; judging by m_fw_recv, kt0 and the signed
 // update of all four blocks it is presumably mult_tm2_chiral_thread -- TODO
 // confirm.
 // For a task whose kt0 flag is 1, adds bc2 times the two halves of the
 // forward-receive buffer to blocks id1/id2 of v2 and subtracts the same
 // quantities from blocks id3/id4.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   vcp2, ieo : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, idir, bc2, it, iv, icr and the
 // declaration of the buffer pointer (used as w1 below) are on lines this
 // listing dropped.
 1899     int itask, 
double *v2, 
const double *vcp2, 
int ieo)
 
 1901     int Nvc2  = 2 * 
m_Nvc;
 
 1903     int Nvcd2 = Nvcd / 2;
 
 1907     int id3 = 
m_Nvc * 2;
 
 1908     int id4 = 
m_Nvc * 3;
 
 1913     double wt1r, wt1i, wt2r, wt2i;
 
 1915     int isite    = 
m_arg[itask].isite;
 
 1916     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1918     double *w2 = &v2[Nvcd * isite];
 
          // Right-hand side of the receive-buffer pointer; ptr() takes a byte offset.
 1921       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
          // Only tasks flagged kt0 apply the received boundary data.
 1925     if (
m_arg[itask].kt0 == 1) {
 
 1928       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1929         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1930           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1931           int is2 = ixy + Nxy * iz;
 
 1933           int ix1 = Nvc2 * is2;
 
 1934           int ix2 = ix1 + 
m_Nvc;
 
 1936           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1938             int ici = 2 * ic + 1;
 
                  // Upper blocks: += bc2 * buffer.
 1939             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1940             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1941             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1942             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
                  // Lower blocks: -= bc2 * buffer.
 1943             w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1944             w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1945             w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1946             w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 // Fragment of a t-direction bulk kernel. The function-name line is missing
 // from this listing; presumably mult_tmb_chiral_thread -- TODO confirm.
 // For each site it forms vt1 = block1 - block3 and vt2 = block2 - block4 of
 // v1 at the site one t-step back (is - Nxyz), applies the link stored at that
 // neighbouring site via mult_udagv_r/_i, adds the products to blocks id1/id2
 // of v2 and subtracts them from blocks id3/id4.
 //   itask : index into m_arg
 //   v2    : output field, accumulated into
 //   v1    : input field
 //   ieo   : not referenced in the visible lines
 // NOTE(review): Nvcd, Nxy, id1, id2, iv, ic2, u, vt1 and vt2 are declared on
 // lines this listing dropped.
 1956     int itask, 
double *v2, 
const double *v1, 
int ieo)
 
 1962     int id3 = 
m_Nvc * 2;
 
 1963     int id4 = 
m_Nvc * 3;
 
 1968     double wt1r, wt1i, wt2r, wt2i;
 
 1970     int isite = 
m_arg[itask].isite;
 
 1972     double       *w2 = &v2[Nvcd * isite];
 
 1973     const double *w1 = &v1[Nvcd * isite];
 
          // The it loop starts at kt0, so a task with kt0 == 1 skips its first
          // time slice.
 1976     int kt0  = 
m_arg[itask].kt0;
 
 1978     int Nxyz = Nxy * 
m_Nz;
 
 1980     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1981       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1982         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1983           int is = ixy + Nxy * (iz + m_Nz * it);
 
                // Spinor and link offsets of the site at it - 1.
 1985           int in = Nvcd * (is - Nxyz);
 
 1986           int ig = 
m_Ndf * (is - Nxyz);
 
 1988           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1989             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1990             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1991             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1992             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1995           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
                  // mult_udagv_r/_i: presumably real/imaginary part of one colour
                  // row of U^dagger * v -- helper not shown, confirm.
 1997             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1998             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1999             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 2000             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
                  // Upper blocks: += product; lower blocks: -= product.
 2002             w2[ic2 + id1 + iv]     += wt1r;
 
 2003             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 2004             w2[ic2 + id2 + iv]     += wt2r;
 
 2005             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 2006             w2[ic2 + id3 + iv]     -= wt1r;
 
 2007             w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 2008             w2[ic2 + id4 + iv]     -= wt2r;
 
 2009             w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 // Fragment of an out-of-place per-site kernel. The function-name line is
 // missing from this listing; the block swap below suggests gm5_dirac_thread
 // -- TODO confirm.
 // For every site of the task's block, writes v1 to v2 with the upper pair of
 // blocks (id1, id2) exchanged with the lower pair (id3, id4).
 //   itask : index into m_arg (supplies the site offset)
 //   v2    : output field, overwritten for the task's sites
 //   v1    : input field
 // NOTE(review): Nvcd, Nxy, id1 and id2 are declared on lines this listing
 // dropped.
 2019     int itask, 
double *v2, 
const double *v1)
 
 2026     int id3 = 
m_Nvc * 2;
 
 2027     int id4 = 
m_Nvc * 3;
 
 2029     int          isite = 
m_arg[itask].isite;
 
 2030     double       *w2   = &v2[Nvcd * isite];
 
 2031     const double *w1   = &v1[Nvcd * isite];
 
 2033     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2034       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2035         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
                // iv: offset of the site's first double, relative to w1/w2.
 2036           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2037           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
                  // block1 <- block3, block2 <- block4, block3 <- block1,
                  // block4 <- block2.
 2038             w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2039             w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2040             w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2041             w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 // Fragment of an out-of-place per-site kernel. The function-name line is
 // missing from this listing; the sign pattern below suggests
 // gm5_chiral_thread -- TODO confirm.
 // For every site of the task's block, copies blocks id1/id2 of v1 to v2
 // unchanged and blocks id3/id4 with the sign flipped.
 //   itask : index into m_arg (supplies the site offset)
 //   v2    : output field, overwritten for the task's sites
 //   v1    : input field
 // NOTE(review): Nvcd, Nxy, id1 and id2 are declared on lines this listing
 // dropped.
 2051     int itask, 
double *v2, 
const double *v1)
 
 2058     int id3 = 
m_Nvc * 2;
 
 2059     int id4 = 
m_Nvc * 3;
 
 2061     int          isite = 
m_arg[itask].isite;
 
 2062     double       *w2   = &v2[Nvcd * isite];
 
 2063     const double *w1   = &v1[Nvcd * isite];
 
 2065     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2066       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2067         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
                // iv: offset of the site's first double, relative to w1/w2.
 2068           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2069           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
                  // Upper blocks copied, lower blocks negated.
 2070             w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2071             w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2072             w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2073             w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
 // Fragment of an in-place per-site kernel; both the name line and the
 // parameter list are missing from this listing. The body uses itask and a
 // writable double array v1 -- presumably an in-place overload of
 // gm5_dirac_thread, TODO confirm.
 // For every site of the task's block, exchanges the upper pair of blocks
 // (id1, id2) of v1 with the lower pair (id3, id4).
 // NOTE(review): Nvcd, Nxy, id1 and id2 are declared on lines this listing
 // dropped.
 2090     int id3 = 
m_Nvc * 2;
 
 2091     int id4 = 
m_Nvc * 3;
 
 2093     int    isite = 
m_arg[itask].isite;
 
 2094     double *w1   = &v1[Nvcd * isite];
 
 2096     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2097       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2098         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2099           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2100           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
                  // Save the upper blocks before they are overwritten, so the
                  // swap can be done in place.
 2101             double wt1 = w1[ivc + id1 + iv];
 
 2102             double wt2 = w1[ivc + id2 + iv];
 
 2103             w1[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2104             w1[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2105             w1[ivc + id3 + iv] = wt1;
 
 2106             w1[ivc + id4 + iv] = wt2;
 
 // Fragment of an in-place per-site kernel; both the name line and the
 // parameter list are missing from this listing. The body uses itask and a
 // writable double array v1 -- presumably an in-place overload of
 // gm5_chiral_thread, TODO confirm.
 // For every site of the task's block, flips the sign of blocks id3 and id4 of
 // v1; no other element is written in the visible lines.
 // NOTE(review): Nvcd and Nxy are declared on lines this listing dropped.
 2123     int id3 = 
m_Nvc * 2;
 
 2124     int id4 = 
m_Nvc * 3;
 
 2126     int    isite = 
m_arg[itask].isite;
 
 2127     double *w1   = &v1[Nvcd * isite];
 
 2129     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2130       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2131         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
                // iv: offset of the site's first double, relative to w1.
 2132           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2133           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2134             w1[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2135             w1[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
std::vector< Channel * > m_fw_send
 
void mult_tm1_chiral_thread(int, double *, const double *, int)
 
void mult_tm1_dirac_thread(int, double *, const double *, int)
 
std::vector< double > m_boundary2
b.c. for each node. 
 
const double * ptr(const int jin, const int site, const int jex) const 
 
void mult_yp2_thread(int, double *, const double *, int)
 
void general(const char *format,...)
 
static const std::string class_name
 
void mult_tp2_dirac_thread(int, double *, const double *, int)
 
std::vector< mult_arg > m_arg
 
void mult_ypb_thread(int, double *, const double *, int)
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_yp1_thread(int, double *, const double *, int)
 
void Meo(Field &, const Field &, const int ieo)
 
void mult_tm2_chiral_thread(int, double *, const double *, int)
 
void mult_ym2_thread(int, double *, const double *, int)
 
void scal_thread(int, double *, double)
 
void mult_xm1_thread(int, double *, const double *, int)
 
void mult_zp2_thread(int, double *, const double *, int)
 
void mult_tp1_dirac_thread(int, double *, const double *, int)
 
void gm5_dirac_thread(int, double *, const double *)
 
void mult_xp2_thread(int, double *, const double *, int)
 
void mult_zp1_thread(int, double *, const double *, int)
 
Bridge::VerboseLevel m_vl
 
void mult_tpb_dirac_thread(int, double *, const double *, int)
 
void mult_zmb_thread(int, double *, const double *, int)
 
void mult_tp1_chiral_thread(int, double *, const double *, int)
 
void mult_zpb_thread(int, double *, const double *, int)
 
void mult_tm2_dirac_thread(int, double *, const double *, int)
 
void mult_xmb_thread(int, double *, const double *, int)
 
void mult_tpb_chiral_thread(int, double *, const double *, int)
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
Field_G * m_U
dummy: pointing m_Ueo. 
 
void crucial(const char *format,...)
 
std::vector< Channel * > m_bw_recv
 
void mult_tp2_chiral_thread(int, double *, const double *, int)
 
void mult_ymb_thread(int, double *, const double *, int)
 
void clear_thread(int, double *)
 
void mult_tmb_dirac_thread(int, double *, const double *, int)
 
void mult_ym1_thread(int, double *, const double *, int)
 
void mult_xm2_thread(int, double *, const double *, int)
 
void mult_zm2_thread(int, double *, const double *, int)
 
void mult_xpb_thread(int, double *, const double *, int)
 
void mult_zm1_thread(int, double *, const double *, int)
 
std::vector< Channel * > m_fw_recv
 
void mult_tmb_chiral_thread(int, double *, const double *, int)
 
void mult_xp1_thread(int, double *, const double *, int)
 
std::vector< Channel * > m_bw_send