19 #if defined USE_GROUP_SU3 
   20 #include "fopr_Wilson_impl_SU3.inc" 
   21 #elif defined USE_GROUP_SU2 
   22 #include "fopr_Wilson_impl_SU2.inc" 
   23 #elif defined USE_GROUP_SU_N 
   24 #include "fopr_Wilson_impl_SU_N.inc" 
   77     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   78       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   79         int itask = ith_z + m_Ntask_z * ith_t;
 
   87         if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   88         if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   89         if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   90         if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
   94         m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy;
 
   95         m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy;
 
  102     int Nvcd2 = 2 * Nc * Nd / 2;
 
  104     std::vector<int> destid(
m_Ntask);
 
  105     std::vector<int> offset(
m_Ntask);
 
  106     std::vector<int> datasize(
m_Ntask);
 
  107     std::vector<int> offset_up(
m_Ntask);
 
  108     std::vector<int> offset_lw(
m_Ntask);
 
  109     std::vector<int> datasize_up(
m_Ntask);
 
  110     std::vector<int> datasize_lw(
m_Ntask);
 
  113     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  114       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  117         destid[itask]   = itask;
 
  118         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  119         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Ny;
 
  128     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  129       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  132         destid[itask]   = itask;
 
  133         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  134         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx;
 
  143     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  144       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  145         int itask = ith_z + m_Ntask_z * ith_t;
 
  147         offset_up[itask]   = 0;
 
  148         offset_lw[itask]   = 0;
 
  149         datasize_up[itask] = 0;
 
  150         datasize_lw[itask] = 0;
 
  152           destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  153           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  154           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  156         if (ith_z == m_Ntask_z - 1) {
 
  158           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  159           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  169     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  170       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  171         int itask = ith_z + m_Ntask_z * ith_t;
 
  173         offset_up[itask]   = 0;
 
  174         offset_lw[itask]   = 0;
 
  175         datasize_up[itask] = 0;
 
  176         datasize_lw[itask] = 0;
 
  178           destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  179           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  180           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  182         if (ith_t == m_Ntask_t - 1) {
 
  183           destid[itask]      = ith_z;
 
  184           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  185           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  198     int itask, 
double *v2, 
double fac, 
const double *v1)
 
  203     int isite = 
m_arg[itask].isite;
 
  205     const double *w1 = &v1[Nvcd * isite];
 
  206     double       *w2 = &v2[Nvcd * isite];
 
  208     for (
int it = 0; it < 
m_Mt; ++it) {
 
  209       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  210         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  211           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  212           w2[iv] = fac * w2[iv] + w1[iv];
 
  226     int    isite = 
m_arg[itask].isite;
 
  227     double *w2   = &v2[Nvcd * isite];
 
  229     for (
int it = 0; it < 
m_Mt; ++it) {
 
  230       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  231         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  232           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  242     int itask, 
double *vcp1, 
const double *v1)
 
  244     int Nvc2  = 2 * 
m_Nvc;
 
  246     int Nvcd2 = Nvcd / 2;
 
  256     int isite    = 
m_arg[itask].isite;
 
  257     int isite_cp = 
m_arg[itask].isite_cpx;
 
  260     const double *w1 = &v1[Nvcd * isite];
 
  262       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  266     for (
int it = 0; it < 
m_Mt; ++it) {
 
  267       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  268         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  269           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  270           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  272           int ix1 = Nvc2 * is2;
 
  273           int ix2 = ix1 + 
m_Nvc;
 
  275           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  276             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  277             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  278             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  279             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  291     int itask, 
double *v2, 
const double *vcp2)
 
  293     int Nvc2  = 2 * 
m_Nvc;
 
  295     int Nvcd2 = Nvcd / 2;
 
  304     double wt1r, wt1i, wt2r, wt2i;
 
  306     int isite    = 
m_arg[itask].isite;
 
  307     int isite_cp = 
m_arg[itask].isite_cpx;
 
  309     double *w2 = &v2[Nvcd * isite];
 
  312       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  318     for (
int it = 0; it < 
m_Mt; ++it) {
 
  319       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  320         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  321           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  322           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  325           int ix1 = Nvc2 * is2;
 
  326           int ix2 = ix1 + 
m_Nvc;
 
  328           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  329             int ic2 = ic * 
m_Nvc;
 
  331             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  332             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  333             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  334             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  336             w2[2 * ic + id1 + iv]     += wt1r;
 
  337             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  338             w2[2 * ic + id2 + iv]     += wt2r;
 
  339             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  340             w2[2 * ic + id3 + iv]     += wt2i;
 
  341             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  342             w2[2 * ic + id4 + iv]     += wt1i;
 
  343             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  353     int itask, 
double *v2, 
const double *v1)
 
  365     double wt1r, wt1i, wt2r, wt2i;
 
  367     int isite = 
m_arg[itask].isite;
 
  369     const double *w1 = &v1[Nvcd * isite];
 
  370     double       *w2 = &v2[Nvcd * isite];
 
  373     for (
int it = 0; it < 
m_Mt; ++it) {
 
  374       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  375         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  376           for (
int ix = 0; ix < 
m_Nx - 1; ++ix) {
 
  377             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  379             int in = Nvcd * (is + 1);
 
  382             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  383               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  384               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  385               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  386               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  389             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  390               int ic2 = ic * 
m_Nvc;
 
  392               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  393               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  394               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  395               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  397               w2[2 * ic + id1 + iv]     += wt1r;
 
  398               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  399               w2[2 * ic + id2 + iv]     += wt2r;
 
  400               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  401               w2[2 * ic + id3 + iv]     += wt2i;
 
  402               w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  403               w2[2 * ic + id4 + iv]     += wt1i;
 
  404               w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  415     int itask, 
double *vcp1, 
const double *v1)
 
  417     int Nvc2  = 2 * 
m_Nvc;
 
  419     int Nvcd2 = Nvcd / 2;
 
  428     int isite    = 
m_arg[itask].isite;
 
  429     int isite_cp = 
m_arg[itask].isite_cpx;
 
  431     const double *w1 = &v1[Nvcd * isite];
 
  434       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  441     for (
int it = 0; it < 
m_Mt; ++it) {
 
  442       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  443         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  444           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  445           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  448           int ix1 = Nvc2 * is2;
 
  449           int ix2 = ix1 + 
m_Nvc;
 
  451           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  452             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  453             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  454             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  455             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  458           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  460             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  461             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  462             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  463             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  475     int itask, 
double *v2, 
const double *vcp2)
 
  477     int Nvc2  = 2 * 
m_Nvc;
 
  479     int Nvcd2 = Nvcd / 2;
 
  489     double wt1r, wt1i, wt2r, wt2i;
 
  491     int isite    = 
m_arg[itask].isite;
 
  492     int isite_cp = 
m_arg[itask].isite_cpx;
 
  494     double *w2 = &v2[Nvcd * isite];
 
  497       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  502     for (
int it = 0; it < 
m_Mt; ++it) {
 
  503       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  504         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  505           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  506           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  508           int ix1 = Nvc2 * is2;
 
  509           int ix2 = ix1 + 
m_Nvc;
 
  511           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  513             int ici = 2 * ic + 1;
 
  514             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  515             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  516             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  517             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  518             w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  519             w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  520             w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  521             w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  531     int itask, 
double *v2, 
const double *v1)
 
  543     double wt1r, wt1i, wt2r, wt2i;
 
  545     int isite = 
m_arg[itask].isite;
 
  547     const double *w1 = &v1[Nvcd * isite];
 
  548     double       *w2 = &v2[Nvcd * isite];
 
  551     for (
int it = 0; it < 
m_Mt; ++it) {
 
  552       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  553         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  554           for (
int ix = 1; ix < 
m_Nx; ++ix) {
 
  555             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  557             int in = Nvcd * (is - 1);
 
  558             int ig = 
m_Ndf * (is - 1);
 
  560             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  561               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  562               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  563               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  564               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  567             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  570               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  571               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  572               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  573               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  575               w2[2 * ic + id1 + iv]     += wt1r;
 
  576               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  577               w2[2 * ic + id2 + iv]     += wt2r;
 
  578               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  579               w2[2 * ic + id3 + iv]     += -wt2i;
 
  580               w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  581               w2[2 * ic + id4 + iv]     += -wt1i;
 
  582               w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  593     int itask, 
double *vcp1, 
const double *v1)
 
  595     int Nvc2  = 2 * 
m_Nvc;
 
  597     int Nvcd2 = Nvcd / 2;
 
  604     int isite    = 
m_arg[itask].isite;
 
  605     int isite_cp = 
m_arg[itask].isite_cpy;
 
  610     const double *w1 = &v1[Nvcd * isite];
 
  613       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  618     for (
int it = 0; it < 
m_Mt; ++it) {
 
  619       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  620         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  621           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  622           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  624           int ix1 = Nvc2 * is2;
 
  625           int ix2 = ix1 + 
m_Nvc;
 
  627           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  628             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  629             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  630             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  631             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  643     int itask, 
double *v2, 
const double *vcp2)
 
  645     int Nvc2  = 2 * 
m_Nvc;
 
  647     int Nvcd2 = Nvcd / 2;
 
  656     double wt1r, wt1i, wt2r, wt2i;
 
  658     int isite    = 
m_arg[itask].isite;
 
  659     int isite_cp = 
m_arg[itask].isite_cpy;
 
  661     double *w2 = &v2[Nvcd * isite];
 
  664       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  670     for (
int it = 0; it < 
m_Mt; ++it) {
 
  671       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  672         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  673           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  674           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  677           int ix1 = Nvc2 * is2;
 
  678           int ix2 = ix1 + 
m_Nvc;
 
  680           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  681             int ic2 = ic * 
m_Nvc;
 
  683             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  684             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  685             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  686             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  688             w2[2 * ic + id1 + iv]     += wt1r;
 
  689             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  690             w2[2 * ic + id2 + iv]     += wt2r;
 
  691             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  692             w2[2 * ic + id3 + iv]     += -wt2r;
 
  693             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  694             w2[2 * ic + id4 + iv]     += wt1r;
 
  695             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  705     int itask, 
double *v2, 
const double *v1)
 
  717     double wt1r, wt1i, wt2r, wt2i;
 
  719     int isite = 
m_arg[itask].isite;
 
  721     double       *w2 = &v2[Nvcd * isite];
 
  722     const double *w1 = &v1[Nvcd * isite];
 
  725     for (
int it = 0; it < 
m_Mt; ++it) {
 
  726       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  727         for (
int iy = 0; iy < 
m_Ny - 1; ++iy) {
 
  728           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  729             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  731             int in = Nvcd * (is + 
m_Nx);
 
  734             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  735               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  736               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  737               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  738               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  741             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  742               int ic2 = ic * 
m_Nvc;
 
  744               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  745               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  746               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  747               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  749               w2[2 * ic + id1 + iv]     += wt1r;
 
  750               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  751               w2[2 * ic + id2 + iv]     += wt2r;
 
  752               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  753               w2[2 * ic + id3 + iv]     += -wt2r;
 
  754               w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  755               w2[2 * ic + id4 + iv]     += wt1r;
 
  756               w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  767     int itask, 
double *vcp1, 
const double *v1)
 
  769     int Nvc2  = 2 * 
m_Nvc;
 
  771     int Nvcd2 = Nvcd / 2;
 
  780     int isite    = 
m_arg[itask].isite;
 
  781     int isite_cp = 
m_arg[itask].isite_cpy;
 
  785       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  787     const double *w1 = &v1[Nvcd * isite];
 
  794     for (
int it = 0; it < 
m_Mt; ++it) {
 
  795       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  796         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  797           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  798           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  801           int ix1 = Nvc2 * is2;
 
  802           int ix2 = ix1 + 
m_Nvc;
 
  804           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  805             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  806             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  807             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  808             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  811           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  813             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  814             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  815             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  816             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  828     int itask, 
double *v2, 
const double *vcp2)
 
  830     int Nvc2  = 2 * 
m_Nvc;
 
  832     int Nvcd2 = Nvcd / 2;
 
  842     double wt1r, wt1i, wt2r, wt2i;
 
  844     int isite    = 
m_arg[itask].isite;
 
  845     int isite_cp = 
m_arg[itask].isite_cpy;
 
  847     double *w2 = &v2[Nvcd * isite];
 
  850       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  855     for (
int it = 0; it < 
m_Mt; ++it) {
 
  856       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  857         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  858           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  859           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  861           int ix1 = Nvc2 * is2;
 
  862           int ix2 = ix1 + 
m_Nvc;
 
  864           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  866             int ici = 2 * ic + 1;
 
  867             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  868             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  869             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  870             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  871             w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  872             w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  873             w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  874             w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  884     int itask, 
double *v2, 
const double *v1)
 
  896     double wt1r, wt1i, wt2r, wt2i;
 
  898     int isite = 
m_arg[itask].isite;
 
  900     double       *w2 = &v2[Nvcd * isite];
 
  901     const double *w1 = &v1[Nvcd * isite];
 
  904     for (
int it = 0; it < 
m_Mt; ++it) {
 
  905       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  906         for (
int iy = 1; iy < 
m_Ny; ++iy) {
 
  907           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  908             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  910             int in = Nvcd * (is - 
m_Nx);
 
  913             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  914               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  915               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  916               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  917               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  920             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  922               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  923               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  924               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  925               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  927               w2[ic2 + id1 + iv]     += wt1r;
 
  928               w2[ic2 + 1 + id1 + iv] += wt1i;
 
  929               w2[ic2 + id2 + iv]     += wt2r;
 
  930               w2[ic2 + 1 + id2 + iv] += wt2i;
 
  931               w2[ic2 + id3 + iv]     += wt2r;
 
  932               w2[ic2 + 1 + id3 + iv] += wt2i;
 
  933               w2[ic2 + id4 + iv]     += -wt1r;
 
  934               w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  945     int itask, 
double *vcp1, 
const double *v1)
 
  947     int Nvc2  = 2 * 
m_Nvc;
 
  949     int Nvcd2 = Nvcd / 2;
 
  956     int isite    = 
m_arg[itask].isite;
 
  957     int isite_cp = 
m_arg[itask].isite_cpz;
 
  964       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  965     const double *w1 = &v1[Nvcd * isite];
 
  967     if (
m_arg[itask].kz0 == 1) {
 
  970       for (
int it = 0; it < 
m_Mt; ++it) {
 
  971         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  972           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
  973           int is2 = ixy + Nxy * it;
 
  976           int ix1 = Nvc2 * is2;
 
  977           int ix2 = ix1 + 
m_Nvc;
 
  979           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  980             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  981             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
  982             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
  983             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
  995     int itask, 
double *v2, 
const double *vcp2)
 
  997     int Nvc2  = 2 * 
m_Nvc;
 
  999     int Nvcd2 = Nvcd / 2;
 
 1003     int id3 = 
m_Nvc * 2;
 
 1004     int id4 = 
m_Nvc * 3;
 
 1008     double wt1r, wt1i, wt2r, wt2i;
 
 1010     int isite    = 
m_arg[itask].isite;
 
 1011     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1013     double *w2 = &v2[Nvcd * isite];
 
 1016       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1021     if (
m_arg[itask].kz1 == 1) {
 
 1024       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1025         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1026           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1027           int is2 = ixy + Nxy * it;
 
 1029           int ig  = 
m_Ndf * is;
 
 1030           int ix1 = Nvc2 * is2;
 
 1031           int ix2 = ix1 + 
m_Nvc;
 
 1033           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1034             int ic2 = ic * 
m_Nvc;
 
 1036             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1037             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1038             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1039             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1041             w2[2 * ic + id1 + iv]     += wt1r;
 
 1042             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1043             w2[2 * ic + id2 + iv]     += wt2r;
 
 1044             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1045             w2[2 * ic + id3 + iv]     += wt1i;
 
 1046             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1047             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1048             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1058     int itask, 
double *v2, 
const double *v1)
 
 1064     int id3 = 
m_Nvc * 2;
 
 1065     int id4 = 
m_Nvc * 3;
 
 1070     double wt1r, wt1i, wt2r, wt2i;
 
 1072     int isite = 
m_arg[itask].isite;
 
 1074     double       *w2 = &v2[Nvcd * isite];
 
 1075     const double *w1 = &v1[Nvcd * isite];
 
 1078     int kz1 = 
m_arg[itask].kz1;
 
 1081     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1082       for (
int iz = 0; iz < 
m_Mz - kz1; ++iz) {
 
 1083         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1084           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1086           int in = Nvcd * (is + Nxy);
 
 1087           int ig = 
m_Ndf * is;
 
 1089           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1090             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1091             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1092             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1093             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1096           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1097             int ic2 = ic * 
m_Nvc;
 
 1099             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1100             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1101             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1102             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1104             w2[2 * ic + id1 + iv]     += wt1r;
 
 1105             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1106             w2[2 * ic + id2 + iv]     += wt2r;
 
 1107             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1108             w2[2 * ic + id3 + iv]     += wt1i;
 
 1109             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1110             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1111             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1121     int itask, 
double *vcp1, 
const double *v1)
 
 1123     int Nvc2  = 2 * 
m_Nvc;
 
 1125     int Nvcd2 = Nvcd / 2;
 
 1129     int id3 = 
m_Nvc * 2;
 
 1130     int id4 = 
m_Nvc * 3;
 
 1134     int isite    = 
m_arg[itask].isite;
 
 1135     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1139       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1140     const double *w1 = &v1[Nvcd * isite];
 
 1145     if (
m_arg[itask].kz1 == 1) {
 
 1148       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1149         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1150           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1151           int is2 = ixy + Nxy * it;
 
 1153           int ig  = 
m_Ndf * is;
 
 1154           int ix1 = Nvc2 * is2;
 
 1155           int ix2 = ix1 + 
m_Nvc;
 
 1157           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1158             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1159             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1160             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1161             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1164           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1166             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1167             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1168             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1169             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1181     int itask, 
double *v2, 
const double *vcp2)
 
 1183     int Nvc2  = 2 * 
m_Nvc;
 
 1185     int Nvcd2 = Nvcd / 2;
 
 1189     int id3 = 
m_Nvc * 2;
 
 1190     int id4 = 
m_Nvc * 3;
 
 1195     double wt1r, wt1i, wt2r, wt2i;
 
 1197     int isite    = 
m_arg[itask].isite;
 
 1198     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1200     double *w2 = &v2[Nvcd * isite];
 
 1203       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1207     if (
m_arg[itask].kz0 == 1) {
 
 1211       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1212         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1213           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1214           int is2 = ixy + Nxy * it;
 
 1216           int ix1 = Nvc2 * is2;
 
 1217           int ix2 = ix1 + 
m_Nvc;
 
 1219           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1221             int ici = 2 * ic + 1;
 
 1222             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1223             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1224             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1225             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1226             w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1227             w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1228             w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1229             w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 1239     int itask, 
double *v2, 
const double *v1)
 
 1245     int id3 = 
m_Nvc * 2;
 
 1246     int id4 = 
m_Nvc * 3;
 
 1251     double wt1r, wt1i, wt2r, wt2i;
 
 1253     int isite = 
m_arg[itask].isite;
 
 1255     double       *w2 = &v2[Nvcd * isite];
 
 1256     const double *w1 = &v1[Nvcd * isite];
 
 1259     int kz0 = 
m_arg[itask].kz0;
 
 1262     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1263       for (
int iz = kz0; iz < 
m_Mz; ++iz) {
 
 1264         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1265           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1267           int in = Nvcd * (is - Nxy);
 
 1268           int ig = 
m_Ndf * (is - Nxy);
 
 1270           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1271             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1272             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1273             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1274             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1277           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1279             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1280             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1281             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1282             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1284             w2[ic2 + id1 + iv]     += wt1r;
 
 1285             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1286             w2[ic2 + id2 + iv]     += wt2r;
 
 1287             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1288             w2[ic2 + id3 + iv]     += -wt1i;
 
 1289             w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1290             w2[ic2 + id4 + iv]     += wt2i;
 
 1291             w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 1301     int itask, 
double *vcp1, 
const double *v1)
 
 1303     int Nvc2  = 2 * 
m_Nvc;
 
 1305     int Nvcd2 = Nvcd / 2;
 
 1309     int id3 = 
m_Nvc * 2;
 
 1310     int id4 = 
m_Nvc * 3;
 
 1312     int isite    = 
m_arg[itask].isite;
 
 1313     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1320       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1321     const double *w1 = &v1[Nvcd * isite];
 
 1323     if (
m_arg[itask].kt0 == 1) {
 
 1326       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1327         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1328           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1329           int is2 = ixy + Nxy * iz;
 
 1332           int ix1 = Nvc2 * is2;
 
 1333           int ix2 = ix1 + 
m_Nvc;
 
 1335           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1336             w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1337             w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1338             w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1339             w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 1351     int itask, 
double *v2, 
const double *vcp2)
 
 1353     int Nvc2  = 2 * 
m_Nvc;
 
 1355     int Nvcd2 = Nvcd / 2;
 
 1359     int id3 = 
m_Nvc * 2;
 
 1360     int id4 = 
m_Nvc * 3;
 
 1364     double wt1r, wt1i, wt2r, wt2i;
 
 1366     int isite    = 
m_arg[itask].isite;
 
 1367     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1369     double *w2 = &v2[Nvcd * isite];
 
 1372       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1377     if (
m_arg[itask].kt1 == 1) {
 
 1380       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1381         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1382           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1383           int is2 = ixy + Nxy * iz;
 
 1385           int ig  = 
m_Ndf * is;
 
 1386           int ix1 = Nvc2 * is2;
 
 1387           int ix2 = ix1 + 
m_Nvc;
 
 1389           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1390             int ic2 = ic * 
m_Nvc;
 
 1392             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1393             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1394             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1395             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1397             w2[2 * ic + id3 + iv]     += wt1r;
 
 1398             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1399             w2[2 * ic + id4 + iv]     += wt2r;
 
 1400             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1410     int itask, 
double *v2, 
const double *v1)
 
 1416     int id3 = 
m_Nvc * 2;
 
 1417     int id4 = 
m_Nvc * 3;
 
 1422     double wt1r, wt1i, wt2r, wt2i;
 
 1424     int isite = 
m_arg[itask].isite;
 
 1426     double       *w2 = &v2[Nvcd * isite];
 
 1427     const double *w1 = &v1[Nvcd * isite];
 
 1430     int kt1  = 
m_arg[itask].kt1;
 
 1432     int Nxyz = Nxy * 
m_Nz;
 
 1434     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1435       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1436         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1437           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1439           int in = Nvcd * (is + Nxyz);
 
 1440           int ig = 
m_Ndf * is;
 
 1442           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1443             vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1444             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1445             vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1446             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1449           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1450             int ic2 = ic * 
m_Nvc;
 
 1452             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1453             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1454             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1455             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1457             w2[2 * ic + id3 + iv]     += wt1r;
 
 1458             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1459             w2[2 * ic + id4 + iv]     += wt2r;
 
 1460             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1470     int itask, 
double *vcp1, 
const double *v1)
 
 1472     int Nvc2  = 2 * 
m_Nvc;
 
 1474     int Nvcd2 = Nvcd / 2;
 
 1478     int id3 = 
m_Nvc * 2;
 
 1479     int id4 = 
m_Nvc * 3;
 
 1483     int isite    = 
m_arg[itask].isite;
 
 1484     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1488       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1489     const double *w1 = &v1[Nvcd * isite];
 
 1494     if (
m_arg[itask].kt1 == 1) {
 
 1497       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1498         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1499           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1500           int is2 = ixy + Nxy * iz;
 
 1502           int ig  = 
m_Ndf * is;
 
 1503           int ix1 = Nvc2 * is2;
 
 1504           int ix2 = ix1 + 
m_Nvc;
 
 1506           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1507             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1508             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1509             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1510             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1513           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1515             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1516             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1517             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1518             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1530     int itask, 
double *v2, 
const double *vcp2)
 
 1532     int Nvc2  = 2 * 
m_Nvc;
 
 1534     int Nvcd2 = Nvcd / 2;
 
 1538     int id3 = 
m_Nvc * 2;
 
 1539     int id4 = 
m_Nvc * 3;
 
 1544     double wt1r, wt1i, wt2r, wt2i;
 
 1546     int isite    = 
m_arg[itask].isite;
 
 1547     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1549     double *w2 = &v2[Nvcd * isite];
 
 1552       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1556     if (
m_arg[itask].kt0 == 1) {
 
 1559       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1560         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1561           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1562           int is2 = ixy + Nxy * iz;
 
 1564           int ix1 = Nvc2 * is2;
 
 1565           int ix2 = ix1 + 
m_Nvc;
 
 1567           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1569             int ici = 2 * ic + 1;
 
 1570             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1571             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1572             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1573             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1583     int itask, 
double *v2, 
const double *v1)
 
 1589     int id3 = 
m_Nvc * 2;
 
 1590     int id4 = 
m_Nvc * 3;
 
 1595     double wt1r, wt1i, wt2r, wt2i;
 
 1597     int isite = 
m_arg[itask].isite;
 
 1599     double       *w2 = &v2[Nvcd * isite];
 
 1600     const double *w1 = &v1[Nvcd * isite];
 
 1603     int kt0  = 
m_arg[itask].kt0;
 
 1605     int Nxyz = Nxy * 
m_Nz;
 
 1607     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1608       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1609         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1610           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1612           int in = Nvcd * (is - Nxyz);
 
 1613           int ig = 
m_Ndf * (is - Nxyz);
 
 1615           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1616             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1617             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1618             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1619             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1622           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1624             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1625             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1626             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1627             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1629             w2[ic2 + id1 + iv]     += wt1r;
 
 1630             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1631             w2[ic2 + id2 + iv]     += wt2r;
 
 1632             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1642     int itask, 
double *vcp1, 
const double *v1)
 
 1644     int Nvc2  = 2 * 
m_Nvc;
 
 1646     int Nvcd2 = Nvcd / 2;
 
 1650     int id3 = 
m_Nvc * 2;
 
 1651     int id4 = 
m_Nvc * 3;
 
 1653     int isite    = 
m_arg[itask].isite;
 
 1654     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1661       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1662     const double *w1 = &v1[Nvcd * isite];
 
 1664     if (
m_arg[itask].kt0 == 1) {
 
 1667       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1668         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1669           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1670           int is2 = ixy + Nxy * iz;
 
 1673           int ix1 = Nvc2 * is2;
 
 1674           int ix2 = ix1 + 
m_Nvc;
 
 1676           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1677             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1678             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1679             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1680             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1692     int itask, 
double *v2, 
const double *vcp2)
 
 1694     int Nvc2  = 2 * 
m_Nvc;
 
 1696     int Nvcd2 = Nvcd / 2;
 
 1700     int id3 = 
m_Nvc * 2;
 
 1701     int id4 = 
m_Nvc * 3;
 
 1705     double wt1r, wt1i, wt2r, wt2i;
 
 1707     int isite    = 
m_arg[itask].isite;
 
 1708     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1710     double *w2 = &v2[Nvcd * isite];
 
 1713       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1718     if (
m_arg[itask].kt1 == 1) {
 
 1721       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1722         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1723           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1724           int is2 = ixy + Nxy * iz;
 
 1726           int ig  = 
m_Ndf * is;
 
 1727           int ix1 = Nvc2 * is2;
 
 1728           int ix2 = ix1 + 
m_Nvc;
 
 1730           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1731             int ic2 = ic * 
m_Nvc;
 
 1733             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1734             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1735             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1736             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1738             w2[2 * ic + id1 + iv]     += wt1r;
 
 1739             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1740             w2[2 * ic + id2 + iv]     += wt2r;
 
 1741             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1742             w2[2 * ic + id3 + iv]     += wt1r;
 
 1743             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1744             w2[2 * ic + id4 + iv]     += wt2r;
 
 1745             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1755     int itask, 
double *v2, 
const double *v1)
 
 1761     int id3 = 
m_Nvc * 2;
 
 1762     int id4 = 
m_Nvc * 3;
 
 1767     double wt1r, wt1i, wt2r, wt2i;
 
 1769     int isite = 
m_arg[itask].isite;
 
 1771     double       *w2 = &v2[Nvcd * isite];
 
 1772     const double *w1 = &v1[Nvcd * isite];
 
 1775     int kt1  = 
m_arg[itask].kt1;
 
 1777     int Nxyz = Nxy * 
m_Nz;
 
 1779     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1780       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1781         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1782           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1784           int in = Nvcd * (is + Nxyz);
 
 1785           int ig = 
m_Ndf * is;
 
 1787           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1788             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1789             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1790             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1791             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1794           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1795             int ic2 = ic * 
m_Nvc;
 
 1797             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1798             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1799             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1800             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1802             w2[2 * ic + id1 + iv]     += wt1r;
 
 1803             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1804             w2[2 * ic + id2 + iv]     += wt2r;
 
 1805             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1806             w2[2 * ic + id3 + iv]     += wt1r;
 
 1807             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1808             w2[2 * ic + id4 + iv]     += wt2r;
 
 1809             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1819     int itask, 
double *vcp1, 
const double *v1)
 
 1821     int Nvc2  = 2 * 
m_Nvc;
 
 1823     int Nvcd2 = Nvcd / 2;
 
 1827     int id3 = 
m_Nvc * 2;
 
 1828     int id4 = 
m_Nvc * 3;
 
 1832     int isite    = 
m_arg[itask].isite;
 
 1833     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1837       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1838     const double *w1 = &v1[Nvcd * isite];
 
 1843     if (
m_arg[itask].kt1 == 1) {
 
 1846       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1847         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1848           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1849           int is2 = ixy + Nxy * iz;
 
 1851           int ig  = 
m_Ndf * is;
 
 1852           int ix1 = Nvc2 * is2;
 
 1853           int ix2 = ix1 + 
m_Nvc;
 
 1855           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1856             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1857             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1858             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1859             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1862           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1864             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1865             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1866             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1867             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1879     int itask, 
double *v2, 
const double *vcp2)
 
 1881     int Nvc2  = 2 * 
m_Nvc;
 
 1883     int Nvcd2 = Nvcd / 2;
 
 1887     int id3 = 
m_Nvc * 2;
 
 1888     int id4 = 
m_Nvc * 3;
 
 1893     double wt1r, wt1i, wt2r, wt2i;
 
 1895     int isite    = 
m_arg[itask].isite;
 
 1896     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1898     double *w2 = &v2[Nvcd * isite];
 
 1901       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1905     if (
m_arg[itask].kt0 == 1) {
 
 1908       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1909         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1910           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1911           int is2 = ixy + Nxy * iz;
 
 1913           int ix1 = Nvc2 * is2;
 
 1914           int ix2 = ix1 + 
m_Nvc;
 
 1916           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1918             int ici = 2 * ic + 1;
 
 1919             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1920             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1921             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1922             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1923             w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1924             w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1925             w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1926             w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 1936     int itask, 
double *v2, 
const double *v1)
 
 1942     int id3 = 
m_Nvc * 2;
 
 1943     int id4 = 
m_Nvc * 3;
 
 1948     double wt1r, wt1i, wt2r, wt2i;
 
 1950     int isite = 
m_arg[itask].isite;
 
 1952     double       *w2 = &v2[Nvcd * isite];
 
 1953     const double *w1 = &v1[Nvcd * isite];
 
 1956     int kt0  = 
m_arg[itask].kt0;
 
 1958     int Nxyz = Nxy * 
m_Nz;
 
 1960     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1961       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1962         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1963           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1965           int in = Nvcd * (is - Nxyz);
 
 1966           int ig = 
m_Ndf * (is - Nxyz);
 
 1968           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1969             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1970             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1971             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1972             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1975           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1977             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1978             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1979             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1980             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1982             w2[ic2 + id1 + iv]     += wt1r;
 
 1983             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1984             w2[ic2 + id2 + iv]     += wt2r;
 
 1985             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1986             w2[ic2 + id3 + iv]     -= wt1r;
 
 1987             w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 1988             w2[ic2 + id4 + iv]     -= wt2r;
 
 1989             w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 1999     int itask, 
double *v2, 
const double *v1)
 
 2006     int id3 = 
m_Nvc * 2;
 
 2007     int id4 = 
m_Nvc * 3;
 
 2009     int          isite = 
m_arg[itask].isite;
 
 2010     double       *w2   = &v2[Nvcd * isite];
 
 2011     const double *w1   = &v1[Nvcd * isite];
 
 2013     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2014       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2015         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2016           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2017           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2018             w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2019             w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2020             w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2021             w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 2031     int itask, 
double *v2, 
const double *v1)
 
 2038     int id3 = 
m_Nvc * 2;
 
 2039     int id4 = 
m_Nvc * 3;
 
 2041     int          isite = 
m_arg[itask].isite;
 
 2042     double       *w2   = &v2[Nvcd * isite];
 
 2043     const double *w1   = &v1[Nvcd * isite];
 
 2045     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2046       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2047         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2048           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2049           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2050             w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2051             w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2052             w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2053             w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_zpb_thread(int, double *, const double *)
 
void mult_tp1_chiral_thread(int, double *, const double *)
 
const double * ptr(const int jin, const int site, const int jex) const 
 
void mult_tm1_dirac_thread(int, double *, const double *)
 
const Field_G * m_U
gauge configuration. 
 
void mult_yp1_thread(int, double *, const double *)
 
void mult_yp2_thread(int, double *, const double *)
 
void mult_zp2_thread(int, double *, const double *)
 
void general(const char *format,...)
 
std::vector< mult_arg > m_arg
 
void mult_ym2_thread(int, double *, const double *)
 
void clear_thread(int, double *)
 
void mult_tpb_chiral_thread(int, double *, const double *)
 
void gm5_chiral_thread(int, double *, const double *)
 
std::vector< Channel * > m_bw_recv
 
void mult_tpb_dirac_thread(int, double *, const double *)
 
void mult_tp2_dirac_thread(int, double *, const double *)
 
void mult_zm2_thread(int, double *, const double *)
 
void mult_xm2_thread(int, double *, const double *)
 
void mult_tm1_chiral_thread(int, double *, const double *)
 
void mult_tmb_dirac_thread(int, double *, const double *)
 
void mult_tm2_dirac_thread(int, double *, const double *)
 
void mult_tp1_dirac_thread(int, double *, const double *)
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
void mult_zmb_thread(int, double *, const double *)
 
void crucial(const char *format,...)
 
void mult_xmb_thread(int, double *, const double *)
 
void gm5_dirac_thread(int, double *, const double *)
 
std::vector< double > m_boundary2
b.c. for each node. 
 
void daypx_thread(int, double *, double, const double *)
 
void mult_tmb_chiral_thread(int, double *, const double *)
 
void mult_ypb_thread(int, double *, const double *)
 
void mult_ymb_thread(int, double *, const double *)
 
void mult_xpb_thread(int, double *, const double *)
 
void mult_ym1_thread(int, double *, const double *)
 
void mult_xp1_thread(int, double *, const double *)
 
void mult_tm2_chiral_thread(int, double *, const double *)
 
static const std::string class_name
 
Bridge::VerboseLevel m_vl
 
void mult_tp2_chiral_thread(int, double *, const double *)
 
void mult_zp1_thread(int, double *, const double *)
 
std::vector< Channel * > m_fw_send
 
void mult_xm1_thread(int, double *, const double *)
 
void mult_xp2_thread(int, double *, const double *)
 
void mult_zm1_thread(int, double *, const double *)
 
std::vector< Channel * > m_fw_recv
 
std::vector< Channel * > m_bw_send