22 #if defined USE_GROUP_SU3 
   23 #include "fopr_Wilson_impl_SU3.inc" 
   24 #elif defined USE_GROUP_SU2 
   25 #include "fopr_Wilson_impl_SU2.inc" 
   26 #elif defined USE_GROUP_SU_N 
   27 #include "fopr_Wilson_impl_SU_N.inc" 
   79     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   80       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   81         int itask = ith_z + m_Ntask_z * ith_t;
 
   89         if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   90         if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   91         if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   92         if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
   96         m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy;
 
   97         m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy;
 
  104     int Nvcd2 = 2 * Nc * Nd / 2;
 
  106     std::vector<int> destid(
m_Ntask);
 
  107     std::vector<int> offset(
m_Ntask);
 
  108     std::vector<int> datasize(
m_Ntask);
 
  109     std::vector<int> offset_up(
m_Ntask);
 
  110     std::vector<int> offset_lw(
m_Ntask);
 
  111     std::vector<int> datasize_up(
m_Ntask);
 
  112     std::vector<int> datasize_lw(
m_Ntask);
 
  115     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  116       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  119         destid[itask]   = itask;
 
  120         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  121         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Ny;
 
  130     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  131       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  134         destid[itask]   = itask;
 
  135         offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  136         datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx;
 
  145     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  146       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  147         int itask = ith_z + m_Ntask_z * ith_t;
 
  149         offset_up[itask]   = 0;
 
  150         offset_lw[itask]   = 0;
 
  151         datasize_up[itask] = 0;
 
  152         datasize_lw[itask] = 0;
 
  154           destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  155           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  156           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  158         if (ith_z == m_Ntask_z - 1) {
 
  160           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx * m_Ny;
 
  161           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx * m_Ny;
 
  171     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  172       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  173         int itask = ith_z + m_Ntask_z * ith_t;
 
  175         offset_up[itask]   = 0;
 
  176         offset_lw[itask]   = 0;
 
  177         datasize_up[itask] = 0;
 
  178         datasize_lw[itask] = 0;
 
  180           destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  181           offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  182           datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  184         if (ith_t == m_Ntask_t - 1) {
 
  185           destid[itask]      = ith_z;
 
  186           offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx * m_Ny;
 
  187           datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx * m_Ny;
 
  200     int itask, 
double *v2, 
double fac, 
const double *v1)
 
  205     int isite = 
m_arg[itask].isite;
 
  207     double       *w2 = &v2[Nvcd * isite];
 
  208     const double *w1 = &v1[Nvcd * isite];
 
  210     for (
int it = 0; it < 
m_Mt; ++it) {
 
  211       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  212         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  213           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  214           w2[iv] = fac * w2[iv] + w1[iv];
 
  228     int    isite = 
m_arg[itask].isite;
 
  229     double *w2   = &v2[Nvcd * isite];
 
  231     for (
int it = 0; it < 
m_Mt; ++it) {
 
  232       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  233         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  234           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  244     int itask, 
double *vcp1, 
const double *v1)
 
  246     int Nvc2  = 2 * 
m_Nvc;
 
  248     int Nvcd2 = Nvcd / 2;
 
  258     int isite    = 
m_arg[itask].isite;
 
  259     int isite_cp = 
m_arg[itask].isite_cpx;
 
  263       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  264     const double *w1 = &v1[Nvcd * isite];
 
  268     for (
int it = 0; it < 
m_Mt; ++it) {
 
  269       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  270         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  271           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  272           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  274           int ix1 = Nvc2 * is2;
 
  275           int ix2 = ix1 + 
m_Nvc;
 
  277           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  278             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  279             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  280             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  281             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  293     int itask, 
double *v2, 
const double *vcp2)
 
  295     int Nvc2  = 2 * 
m_Nvc;
 
  297     int Nvcd2 = Nvcd / 2;
 
  306     double wt1r, wt1i, wt2r, wt2i;
 
  308     int isite    = 
m_arg[itask].isite;
 
  309     int isite_cp = 
m_arg[itask].isite_cpx;
 
  311     double *w2 = &v2[Nvcd * isite];
 
  314       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  320     for (
int it = 0; it < 
m_Mt; ++it) {
 
  321       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  322         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  323           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  324           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  327           int ix1 = Nvc2 * is2;
 
  328           int ix2 = ix1 + 
m_Nvc;
 
  330           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  331             int ic2 = ic * 
m_Nvc;
 
  333             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  334             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  335             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  336             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  338             w2[2 * ic + id1 + iv]     += wt1r;
 
  339             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  340             w2[2 * ic + id2 + iv]     += wt2r;
 
  341             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  342             w2[2 * ic + id3 + iv]     += wt2i;
 
  343             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  344             w2[2 * ic + id4 + iv]     += wt1i;
 
  345             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  355     int itask, 
double *v2, 
const double *v1)
 
  367     double wt1r, wt1i, wt2r, wt2i;
 
  369     int isite = 
m_arg[itask].isite;
 
  371     double       *w2 = &v2[Nvcd * isite];
 
  372     const double *w1 = &v1[Nvcd * isite];
 
  375     for (
int it = 0; it < 
m_Mt; ++it) {
 
  376       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  377         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  378           for (
int ix = 0; ix < 
m_Nx - 1; ++ix) {
 
  379             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  381             int in = Nvcd * (is + 1);
 
  384             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  385               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  386               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  387               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  388               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  391             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  392               int ic2 = ic * 
m_Nvc;
 
  394               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  395               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  396               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  397               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  399               w2[2 * ic + id1 + iv]     += wt1r;
 
  400               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  401               w2[2 * ic + id2 + iv]     += wt2r;
 
  402               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  403               w2[2 * ic + id3 + iv]     += wt2i;
 
  404               w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  405               w2[2 * ic + id4 + iv]     += wt1i;
 
  406               w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  417     int itask, 
double *vcp1, 
const double *v1)
 
  419     int Nvc2  = 2 * 
m_Nvc;
 
  421     int Nvcd2 = Nvcd / 2;
 
  430     int isite    = 
m_arg[itask].isite;
 
  431     int isite_cp = 
m_arg[itask].isite_cpx;
 
  435       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  436     const double *w1 = &v1[Nvcd * isite];
 
  443     for (
int it = 0; it < 
m_Mt; ++it) {
 
  444       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  445         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  446           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  447           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  450           int ix1 = Nvc2 * is2;
 
  451           int ix2 = ix1 + 
m_Nvc;
 
  453           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  454             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  455             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  456             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  457             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  460           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  462             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  463             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  464             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  465             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  477     int itask, 
double *v2, 
const double *vcp2)
 
  479     int Nvc2  = 2 * 
m_Nvc;
 
  481     int Nvcd2 = Nvcd / 2;
 
  491     double wt1r, wt1i, wt2r, wt2i;
 
  493     int isite    = 
m_arg[itask].isite;
 
  494     int isite_cp = 
m_arg[itask].isite_cpx;
 
  496     double *w2 = &v2[Nvcd * isite];
 
  499       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  504     for (
int it = 0; it < 
m_Mt; ++it) {
 
  505       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  506         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  507           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  508           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  510           int ix1 = Nvc2 * is2;
 
  511           int ix2 = ix1 + 
m_Nvc;
 
  513           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  515             int ici = 2 * ic + 1;
 
  516             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  517             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  518             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  519             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  520             w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  521             w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  522             w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  523             w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  533     int itask, 
double *v2, 
const double *v1)
 
  545     double wt1r, wt1i, wt2r, wt2i;
 
  547     int isite = 
m_arg[itask].isite;
 
  549     double       *w2 = &v2[Nvcd * isite];
 
  550     const double *w1 = &v1[Nvcd * isite];
 
  553     for (
int it = 0; it < 
m_Mt; ++it) {
 
  554       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  555         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  556           for (
int ix = 1; ix < 
m_Nx; ++ix) {
 
  557             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  559             int in = Nvcd * (is - 1);
 
  560             int ig = 
m_Ndf * (is - 1);
 
  562             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  563               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  564               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  565               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  566               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  569             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  572               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  573               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  574               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  575               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  577               w2[2 * ic + id1 + iv]     += wt1r;
 
  578               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  579               w2[2 * ic + id2 + iv]     += wt2r;
 
  580               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  581               w2[2 * ic + id3 + iv]     += -wt2i;
 
  582               w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  583               w2[2 * ic + id4 + iv]     += -wt1i;
 
  584               w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  595     int itask, 
double *vcp1, 
const double *v1)
 
  597     int Nvc2  = 2 * 
m_Nvc;
 
  599     int Nvcd2 = Nvcd / 2;
 
  606     int isite    = 
m_arg[itask].isite;
 
  607     int isite_cp = 
m_arg[itask].isite_cpy;
 
  614       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  615     const double *w1 = &v1[Nvcd * isite];
 
  619     for (
int it = 0; it < 
m_Mt; ++it) {
 
  620       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  621         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  622           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  623           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  625           int ix1 = Nvc2 * is2;
 
  626           int ix2 = ix1 + 
m_Nvc;
 
  628           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  629             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  630             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  631             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  632             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  644     int itask, 
double *v2, 
const double *vcp2)
 
  646     int Nvc2  = 2 * 
m_Nvc;
 
  648     int Nvcd2 = Nvcd / 2;
 
  657     double wt1r, wt1i, wt2r, wt2i;
 
  659     int isite    = 
m_arg[itask].isite;
 
  660     int isite_cp = 
m_arg[itask].isite_cpy;
 
  662     double *w2 = &v2[Nvcd * isite];
 
  665       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  671     for (
int it = 0; it < 
m_Mt; ++it) {
 
  672       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  673         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  674           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  675           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  678           int ix1 = Nvc2 * is2;
 
  679           int ix2 = ix1 + 
m_Nvc;
 
  681           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  682             int ic2 = ic * 
m_Nvc;
 
  684             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  685             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  686             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  687             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  689             w2[2 * ic + id1 + iv]     += wt1r;
 
  690             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  691             w2[2 * ic + id2 + iv]     += wt2r;
 
  692             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  693             w2[2 * ic + id3 + iv]     += -wt2r;
 
  694             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  695             w2[2 * ic + id4 + iv]     += wt1r;
 
  696             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  706     int itask, 
double *v2, 
const double *v1)
 
  718     double wt1r, wt1i, wt2r, wt2i;
 
  720     int isite = 
m_arg[itask].isite;
 
  722     double       *w2 = &v2[Nvcd * isite];
 
  723     const double *w1 = &v1[Nvcd * isite];
 
  726     for (
int it = 0; it < 
m_Mt; ++it) {
 
  727       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  728         for (
int iy = 0; iy < 
m_Ny - 1; ++iy) {
 
  729           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  730             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  732             int in = Nvcd * (is + 
m_Nx);
 
  735             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  736               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  737               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  738               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  739               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  742             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  743               int ic2 = ic * 
m_Nvc;
 
  745               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  746               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  747               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  748               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  750               w2[2 * ic + id1 + iv]     += wt1r;
 
  751               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  752               w2[2 * ic + id2 + iv]     += wt2r;
 
  753               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  754               w2[2 * ic + id3 + iv]     += -wt2r;
 
  755               w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  756               w2[2 * ic + id4 + iv]     += wt1r;
 
  757               w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  768     int itask, 
double *vcp1, 
const double *v1)
 
  770     int Nvc2  = 2 * 
m_Nvc;
 
  772     int Nvcd2 = Nvcd / 2;
 
  781     int isite    = 
m_arg[itask].isite;
 
  782     int isite_cp = 
m_arg[itask].isite_cpy;
 
  786       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  788     const double *w1 = &v1[Nvcd * isite];
 
  795     for (
int it = 0; it < 
m_Mt; ++it) {
 
  796       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  797         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  798           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  799           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  802           int ix1 = Nvc2 * is2;
 
  803           int ix2 = ix1 + 
m_Nvc;
 
  805           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  806             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  807             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  808             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  809             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  812           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  814             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  815             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  816             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  817             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  829     int itask, 
double *v2, 
const double *vcp2)
 
  831     int Nvc2  = 2 * 
m_Nvc;
 
  833     int Nvcd2 = Nvcd / 2;
 
  843     double wt1r, wt1i, wt2r, wt2i;
 
  845     int isite    = 
m_arg[itask].isite;
 
  846     int isite_cp = 
m_arg[itask].isite_cpy;
 
  848     double *w2 = &v2[Nvcd * isite];
 
  851       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  856     for (
int it = 0; it < 
m_Mt; ++it) {
 
  857       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  858         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  859           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  860           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  862           int ix1 = Nvc2 * is2;
 
  863           int ix2 = ix1 + 
m_Nvc;
 
  865           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  867             int ici = 2 * ic + 1;
 
  868             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  869             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  870             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  871             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  872             w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  873             w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  874             w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  875             w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  885     int itask, 
double *v2, 
const double *v1)
 
  897     double wt1r, wt1i, wt2r, wt2i;
 
  899     int isite = 
m_arg[itask].isite;
 
  901     double       *w2 = &v2[Nvcd * isite];
 
  902     const double *w1 = &v1[Nvcd * isite];
 
  905     for (
int it = 0; it < 
m_Mt; ++it) {
 
  906       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  907         for (
int iy = 1; iy < 
m_Ny; ++iy) {
 
  908           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  909             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  911             int in = Nvcd * (is - 
m_Nx);
 
  914             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  915               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  916               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  917               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  918               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  921             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  923               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  924               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  925               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  926               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  928               w2[ic2 + id1 + iv]     += wt1r;
 
  929               w2[ic2 + 1 + id1 + iv] += wt1i;
 
  930               w2[ic2 + id2 + iv]     += wt2r;
 
  931               w2[ic2 + 1 + id2 + iv] += wt2i;
 
  932               w2[ic2 + id3 + iv]     += wt2r;
 
  933               w2[ic2 + 1 + id3 + iv] += wt2i;
 
  934               w2[ic2 + id4 + iv]     += -wt1r;
 
  935               w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  946     int itask, 
double *vcp1, 
const double *v1)
 
  948     int Nvc2  = 2 * 
m_Nvc;
 
  950     int Nvcd2 = Nvcd / 2;
 
  957     int isite    = 
m_arg[itask].isite;
 
  958     int isite_cp = 
m_arg[itask].isite_cpz;
 
  965       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  966     const double *w1 = &v1[Nvcd * isite];
 
  968     if (
m_arg[itask].kz0 == 1) {
 
  971       for (
int it = 0; it < 
m_Mt; ++it) {
 
  972         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  973           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
  974           int is2 = ixy + Nxy * it;
 
  977           int ix1 = Nvc2 * is2;
 
  978           int ix2 = ix1 + 
m_Nvc;
 
  980           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  981             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  982             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
  983             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
  984             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
  996     int itask, 
double *v2, 
const double *vcp2)
 
  998     int Nvc2  = 2 * 
m_Nvc;
 
 1000     int Nvcd2 = Nvcd / 2;
 
 1004     int id3 = 
m_Nvc * 2;
 
 1005     int id4 = 
m_Nvc * 3;
 
 1009     double wt1r, wt1i, wt2r, wt2i;
 
 1011     int isite    = 
m_arg[itask].isite;
 
 1012     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1014     double *w2 = &v2[Nvcd * isite];
 
 1017       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1022     if (
m_arg[itask].kz1 == 1) {
 
 1025       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1026         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1027           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1028           int is2 = ixy + Nxy * it;
 
 1030           int ig  = 
m_Ndf * is;
 
 1031           int ix1 = Nvc2 * is2;
 
 1032           int ix2 = ix1 + 
m_Nvc;
 
 1034           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1035             int ic2 = ic * 
m_Nvc;
 
 1037             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1038             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1039             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1040             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1042             w2[2 * ic + id1 + iv]     += wt1r;
 
 1043             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1044             w2[2 * ic + id2 + iv]     += wt2r;
 
 1045             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1046             w2[2 * ic + id3 + iv]     += wt1i;
 
 1047             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1048             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1049             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1059     int itask, 
double *v2, 
const double *v1)
 
 1065     int id3 = 
m_Nvc * 2;
 
 1066     int id4 = 
m_Nvc * 3;
 
 1071     double wt1r, wt1i, wt2r, wt2i;
 
 1073     int isite = 
m_arg[itask].isite;
 
 1075     double       *w2 = &v2[Nvcd * isite];
 
 1076     const double *w1 = &v1[Nvcd * isite];
 
 1079     int kz1 = 
m_arg[itask].kz1;
 
 1082     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1083       for (
int iz = 0; iz < 
m_Mz - kz1; ++iz) {
 
 1084         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1085           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1087           int in = Nvcd * (is + Nxy);
 
 1088           int ig = 
m_Ndf * is;
 
 1090           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1091             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1092             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1093             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1094             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1097           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1098             int ic2 = ic * 
m_Nvc;
 
 1100             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1101             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1102             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1103             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1105             w2[2 * ic + id1 + iv]     += wt1r;
 
 1106             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1107             w2[2 * ic + id2 + iv]     += wt2r;
 
 1108             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1109             w2[2 * ic + id3 + iv]     += wt1i;
 
 1110             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1111             w2[2 * ic + id4 + iv]     += -wt2i;
 
 1112             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1122     int itask, 
double *vcp1, 
const double *v1)
 
 1124     int Nvc2  = 2 * 
m_Nvc;
 
 1126     int Nvcd2 = Nvcd / 2;
 
 1130     int id3 = 
m_Nvc * 2;
 
 1131     int id4 = 
m_Nvc * 3;
 
 1135     int isite    = 
m_arg[itask].isite;
 
 1136     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1140       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1141     const double *w1 = &v1[Nvcd * isite];
 
 1146     if (
m_arg[itask].kz1 == 1) {
 
 1149       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1150         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1151           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1152           int is2 = ixy + Nxy * it;
 
 1154           int ig  = 
m_Ndf * is;
 
 1155           int ix1 = Nvc2 * is2;
 
 1156           int ix2 = ix1 + 
m_Nvc;
 
 1158           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1159             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1160             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1161             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1162             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1165           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1167             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1168             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1169             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1170             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1182     int itask, 
double *v2, 
const double *vcp2)
 
 1184     int Nvc2  = 2 * 
m_Nvc;
 
 1186     int Nvcd2 = Nvcd / 2;
 
 1190     int id3 = 
m_Nvc * 2;
 
 1191     int id4 = 
m_Nvc * 3;
 
 1196     double wt1r, wt1i, wt2r, wt2i;
 
 1198     int isite    = 
m_arg[itask].isite;
 
 1199     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1201     double *w2 = &v2[Nvcd * isite];
 
 1204       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1208     if (
m_arg[itask].kz0 == 1) {
 
 1212       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1213         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1214           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1215           int is2 = ixy + Nxy * it;
 
 1217           int ix1 = Nvc2 * is2;
 
 1218           int ix2 = ix1 + 
m_Nvc;
 
 1220           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1222             int ici = 2 * ic + 1;
 
 1223             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1224             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1225             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1226             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1227             w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1228             w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1229             w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1230             w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 // Accumulates into v2, for every site of task `itask`'s block, a term built
 // from the site one step back in iz (site index is - Nxy) of v1.
 // NOTE(review): the line carrying the function name is missing from this
 // listing; the kz0 usage suggests mult_zmb_thread -- confirm.
 // Nvcd, Nxy, id1, id2, vt1, vt2, u, iv and ic2 are used below but are declared
 // on lines that are not part of this listing.
 1240     int itask, 
double *v2, 
const double *v1)
 
 // id3 / id4: offsets of the third and fourth m_Nvc-sized blocks inside one site.
 1246     int id3 = 
m_Nvc * 2;
 
 1247     int id4 = 
m_Nvc * 3;
 
 1252     double wt1r, wt1i, wt2r, wt2i;
 
 // First site of this task; w1 / w2 are v1 / v2 shifted to that site.
 1254     int isite = 
m_arg[itask].isite;
 
 1256     double       *w2 = &v2[Nvcd * isite];
 
 1257     const double *w1 = &v1[Nvcd * isite];
 
 // iz starts at kz0, so the iz = 0 slice is skipped when kz0 == 1.
 // Presumably that slice is covered by the boundary-exchange routines -- confirm.
 1260     int kz0 = 
m_arg[itask].kz0;
 
 1263     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1264       for (
int iz = kz0; iz < 
m_Mz; ++iz) {
 
 1265         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is: site index relative to isite (ixy fastest, iz stride Nxy, it stride Nxy * m_Nz).
 1266           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 // in / ig: offsets into w1 and u for the site is - Nxy (one step back in iz).
 1268           int in = Nvcd * (is - Nxy);
 
 1269           int ig = 
m_Ndf * (is - Nxy);
 
 // Assuming 2*ic / 2*ic+1 hold real / imaginary parts -- TODO confirm:
 // vt1 = block(id1) - i * block(id3), vt2 = block(id2) + i * block(id4).
 1271           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1272             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1273             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1274             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1275             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 // mult_udagv_r / _i are evaluated with the data at u[ic2 + ig]; the results are
 // added to all four blocks of w2, blocks id3 / id4 receiving (-wt1i, wt1r) and
 // (wt2i, -wt2r) respectively.
 1278           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1280             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1281             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1282             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1283             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1285             w2[ic2 + id1 + iv]     += wt1r;
 
 1286             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1287             w2[ic2 + id2 + iv]     += wt2r;
 
 1288             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1289             w2[ic2 + id3 + iv]     += -wt1i;
 
 1290             w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1291             w2[ic2 + id4 + iv]     += wt2i;
 
 1292             w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 // Writes 2.0 * bc2 times the id3 and id4 blocks of v1 into the buffer obtained
 // from m_bw_send[idir]; the visible packing loop sits under
 // if (m_arg[itask].kt0 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_bw_send / kt0 usage suggests mult_tp1_dirac_thread -- confirm.
 // Nvcd, Nxy, idir, bc2, it, in and the declaration that receives the buffer
 // pointer (apparently w2) are on lines not part of this listing.
 // vcp1 is not referenced in any visible line.
 1302     int itask, 
double *vcp1, 
const double *v1)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1304     int Nvc2  = 2 * 
m_Nvc;
 
 1306     int Nvcd2 = Nvcd / 2;
 
 1310     int id3 = 
m_Nvc * 2;
 
 1311     int id4 = 
m_Nvc * 3;
 
 1313     int isite    = 
m_arg[itask].isite;
 
 1314     int isite_cp = 
m_arg[itask].isite_cpt;
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the send buffer.
 1321       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1322     const double *w1 = &v1[Nvcd * isite];
 
 1324     if (
m_arg[itask].kt0 == 1) {
 
 1327       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1328         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1329           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1330           int is2 = ixy + Nxy * iz;
 
 // ix1 / ix2: start of the first / second m_Nvc block of buffer site is2.
 1333           int ix1 = Nvc2 * is2;
 
 1334           int ix2 = ix1 + 
m_Nvc;
 
 1336           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1337             w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1338             w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1339             w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1340             w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 // Adds to the id3 and id4 blocks of v2 the result of mult_uv_r / _i applied to
 // the data read from the m_bw_recv[idir] buffer; the visible update loop sits
 // under if (m_arg[itask].kt1 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_bw_recv / kt1 usage suggests mult_tp2_dirac_thread -- confirm.
 // Nvcd, Nxy, idir, u, it, iv and the declaration that receives the buffer
 // pointer (apparently w1) are on lines not part of this listing.
 // vcp2 is not referenced in any visible line.
 1352     int itask, 
double *v2, 
const double *vcp2)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1354     int Nvc2  = 2 * 
m_Nvc;
 
 1356     int Nvcd2 = Nvcd / 2;
 
 1360     int id3 = 
m_Nvc * 2;
 
 1361     int id4 = 
m_Nvc * 3;
 
 1365     double wt1r, wt1i, wt2r, wt2i;
 
 1367     int isite    = 
m_arg[itask].isite;
 
 1368     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1370     double *w2 = &v2[Nvcd * isite];
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the receive buffer.
 1373       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1378     if (
m_arg[itask].kt1 == 1) {
 
 1381       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1382         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1383           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1384           int is2 = ixy + Nxy * iz;
 
 // ig: offset into u for the current site is.
 1386           int ig  = 
m_Ndf * is;
 
 1387           int ix1 = Nvc2 * is2;
 
 1388           int ix2 = ix1 + 
m_Nvc;
 
 1390           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ic2: offset of entry ic inside the data at u[ig], in steps of m_Nvc.
 1391             int ic2 = ic * 
m_Nvc;
 
 1393             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1394             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1395             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1396             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1398             w2[2 * ic + id3 + iv]     += wt1r;
 
 1399             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1400             w2[2 * ic + id4 + iv]     += wt2r;
 
 1401             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // For it in [0, m_Mt - kt1): takes the id3 / id4 blocks of v1 at the site one
 // step forward in it (is + Nxyz), doubles them, applies mult_uv_r / _i with
 // the data at u for the current site, and adds the result to the id3 / id4
 // blocks of v2.
 // NOTE(review): the function-name line is missing from this listing; the
 // kt1 / is + Nxyz usage suggests mult_tpb_dirac_thread -- confirm.
 // Nvcd, Nxy, vt1, vt2, u and iv are declared on lines not part of this listing.
 1411     int itask, 
double *v2, 
const double *v1)
 
 1417     int id3 = 
m_Nvc * 2;
 
 1418     int id4 = 
m_Nvc * 3;
 
 1423     double wt1r, wt1i, wt2r, wt2i;
 
 1425     int isite = 
m_arg[itask].isite;
 
 1427     double       *w2 = &v2[Nvcd * isite];
 
 1428     const double *w1 = &v1[Nvcd * isite];
 
 // With kt1 == 1 the last it slice of the block is left out of the loop below.
 // Presumably that slice is covered by the boundary-exchange routines -- confirm.
 1431     int kt1  = 
m_arg[itask].kt1;
 
 // Nxyz: site-index stride of one step in it.
 1433     int Nxyz = Nxy * 
m_Nz;
 
 1435     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1436       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1437         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1438           int is = ixy + Nxy * (iz + m_Nz * it);
 
 // in: offset into w1 of the site one step forward in it; ig: offset into u
 // for the current site.
 1440           int in = Nvcd * (is + Nxyz);
 
 1441           int ig = 
m_Ndf * is;
 
 1443           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1444             vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1445             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1446             vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1447             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1450           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ic2: offset of entry ic inside the data at u[ig], in steps of m_Nvc.
 1451             int ic2 = ic * 
m_Nvc;
 
 1453             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1454             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1455             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1456             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1458             w2[2 * ic + id3 + iv]     += wt1r;
 
 1459             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1460             w2[2 * ic + id4 + iv]     += wt2r;
 
 1461             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Doubles the id1 / id2 blocks of v1, applies mult_udagv_r / _i with the data
 // at u for the same site, and stores the result in the buffer obtained from
 // m_fw_send[idir]; the visible loop sits under if (m_arg[itask].kt1 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_fw_send / kt1 usage suggests mult_tm1_dirac_thread -- confirm.
 // Nvcd, Nxy, idir, id1, id2, vt1, vt2, u, it, in, icr and the declaration that
 // receives the buffer pointer (apparently w2) are on lines not part of this
 // listing. vcp1 is not referenced in any visible line.
 1471     int itask, 
double *vcp1, 
const double *v1)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1473     int Nvc2  = 2 * 
m_Nvc;
 
 1475     int Nvcd2 = Nvcd / 2;
 
 1479     int id3 = 
m_Nvc * 2;
 
 1480     int id4 = 
m_Nvc * 3;
 
 1484     int isite    = 
m_arg[itask].isite;
 
 1485     int isite_cp = 
m_arg[itask].isite_cpt;
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the send buffer.
 1489       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1490     const double *w1 = &v1[Nvcd * isite];
 
 1495     if (
m_arg[itask].kt1 == 1) {
 
 1498       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1499         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1500           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1501           int is2 = ixy + Nxy * iz;
 
 1503           int ig  = 
m_Ndf * is;
 
 1504           int ix1 = Nvc2 * is2;
 
 1505           int ix2 = ix1 + 
m_Nvc;
 
 1507           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1508             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1509             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1510             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1511             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 // Buffer entries are overwritten (plain assignment), not accumulated.
 1514           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1516             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1517             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1518             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1519             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Adds bc2 times the contents of the m_fw_recv[idir] buffer to the id1 and id2
 // blocks of v2; the visible loop sits under if (m_arg[itask].kt0 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_fw_recv / kt0 usage suggests mult_tm2_dirac_thread -- confirm.
 // Nvcd, Nxy, idir, id1, id2, bc2, it, iv, icr and the declaration that
 // receives the buffer pointer (apparently w1) are on lines not part of this
 // listing. vcp2, id3, id4 and wt1r..wt2i are not used in any visible line.
 1531     int itask, 
double *v2, 
const double *vcp2)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1533     int Nvc2  = 2 * 
m_Nvc;
 
 1535     int Nvcd2 = Nvcd / 2;
 
 1539     int id3 = 
m_Nvc * 2;
 
 1540     int id4 = 
m_Nvc * 3;
 
 1545     double wt1r, wt1i, wt2r, wt2i;
 
 1547     int isite    = 
m_arg[itask].isite;
 
 1548     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1550     double *w2 = &v2[Nvcd * isite];
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the receive buffer.
 1553       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1557     if (
m_arg[itask].kt0 == 1) {
 
 1560       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1561         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1562           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1563           int is2 = ixy + Nxy * iz;
 
 1565           int ix1 = Nvc2 * is2;
 
 1566           int ix2 = ix1 + 
m_Nvc;
 
 1568           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ici: index of the odd (2*ic + 1) entry; icr comes from a line not shown.
 1570             int ici = 2 * ic + 1;
 
 1571             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1572             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1573             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1574             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 // For it in [kt0, m_Mt): takes the id1 / id2 blocks of v1 at the site one step
 // back in it (is - Nxyz), doubles them, applies mult_udagv_r / _i with the
 // data at u for that same neighbour site, and adds the result to the id1 / id2
 // blocks of v2.
 // NOTE(review): the function-name line is missing from this listing; the
 // kt0 / is - Nxyz usage suggests mult_tmb_dirac_thread -- confirm.
 // Nvcd, Nxy, id1, id2, vt1, vt2, u, iv and ic2 are declared on lines not part
 // of this listing. id3 and id4 are not used in any visible line.
 1584     int itask, 
double *v2, 
const double *v1)
 
 1590     int id3 = 
m_Nvc * 2;
 
 1591     int id4 = 
m_Nvc * 3;
 
 1596     double wt1r, wt1i, wt2r, wt2i;
 
 1598     int isite = 
m_arg[itask].isite;
 
 1600     double       *w2 = &v2[Nvcd * isite];
 
 1601     const double *w1 = &v1[Nvcd * isite];
 
 // With kt0 == 1 the it = 0 slice of the block is left out of the loop below.
 // Presumably that slice is covered by the boundary-exchange routines -- confirm.
 1604     int kt0  = 
m_arg[itask].kt0;
 
 // Nxyz: site-index stride of one step in it.
 1606     int Nxyz = Nxy * 
m_Nz;
 
 1608     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1609       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1610         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1611           int is = ixy + Nxy * (iz + m_Nz * it);
 
 // in / ig: offsets into w1 and u for the site is - Nxyz.
 1613           int in = Nvcd * (is - Nxyz);
 
 1614           int ig = 
m_Ndf * (is - Nxyz);
 
 1616           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1617             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1618             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1619             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1620             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1623           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1625             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1626             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1627             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1628             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1630             w2[ic2 + id1 + iv]     += wt1r;
 
 1631             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1632             w2[ic2 + id2 + iv]     += wt2r;
 
 1633             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 // Writes bc2 * (block id1 + block id3) and bc2 * (block id2 + block id4) of v1
 // into the buffer obtained from m_bw_send[idir]; the visible packing loop sits
 // under if (m_arg[itask].kt0 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_bw_send / kt0 usage and the id1 + id3 combination suggest
 // mult_tp1_chiral_thread -- confirm.
 // Nvcd, Nxy, idir, id1, id2, bc2, it, in and the declaration that receives the
 // buffer pointer (apparently w2) are on lines not part of this listing.
 // vcp1 is not referenced in any visible line.
 1643     int itask, 
double *vcp1, 
const double *v1)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1645     int Nvc2  = 2 * 
m_Nvc;
 
 1647     int Nvcd2 = Nvcd / 2;
 
 1651     int id3 = 
m_Nvc * 2;
 
 1652     int id4 = 
m_Nvc * 3;
 
 1654     int isite    = 
m_arg[itask].isite;
 
 1655     int isite_cp = 
m_arg[itask].isite_cpt;
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the send buffer.
 1662       = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1663     const double *w1 = &v1[Nvcd * isite];
 
 1665     if (
m_arg[itask].kt0 == 1) {
 
 1668       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1669         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1670           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1671           int is2 = ixy + Nxy * iz;
 
 1674           int ix1 = Nvc2 * is2;
 
 1675           int ix2 = ix1 + 
m_Nvc;
 
 1677           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1678             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1679             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1680             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1681             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 // Applies mult_uv_r / _i to the data read from the m_bw_recv[idir] buffer and
 // adds the result to v2: the first buffer block goes to blocks id1 and id3,
 // the second to blocks id2 and id4. The visible update loop sits under
 // if (m_arg[itask].kt1 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_bw_recv / kt1 usage with all four blocks updated suggests
 // mult_tp2_chiral_thread -- confirm.
 // Nvcd, Nxy, idir, id1, id2, u, it, iv and the declaration that receives the
 // buffer pointer (apparently w1) are on lines not part of this listing.
 // vcp2 is not referenced in any visible line.
 1693     int itask, 
double *v2, 
const double *vcp2)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1695     int Nvc2  = 2 * 
m_Nvc;
 
 1697     int Nvcd2 = Nvcd / 2;
 
 1701     int id3 = 
m_Nvc * 2;
 
 1702     int id4 = 
m_Nvc * 3;
 
 1706     double wt1r, wt1i, wt2r, wt2i;
 
 1708     int isite    = 
m_arg[itask].isite;
 
 1709     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1711     double *w2 = &v2[Nvcd * isite];
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the receive buffer.
 1714       = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1719     if (
m_arg[itask].kt1 == 1) {
 
 1722       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1723         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1724           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1725           int is2 = ixy + Nxy * iz;
 
 // ig: offset into u for the current site is.
 1727           int ig  = 
m_Ndf * is;
 
 1728           int ix1 = Nvc2 * is2;
 
 1729           int ix2 = ix1 + 
m_Nvc;
 
 1731           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ic2: offset of entry ic inside the data at u[ig], in steps of m_Nvc.
 1732             int ic2 = ic * 
m_Nvc;
 
 1734             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1735             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1736             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1737             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1739             w2[2 * ic + id1 + iv]     += wt1r;
 
 1740             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1741             w2[2 * ic + id2 + iv]     += wt2r;
 
 1742             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1743             w2[2 * ic + id3 + iv]     += wt1r;
 
 1744             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1745             w2[2 * ic + id4 + iv]     += wt2r;
 
 1746             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // For it in [0, m_Mt - kt1): forms (block id1 + block id3) and
 // (block id2 + block id4) of v1 at the site one step forward in it
 // (is + Nxyz), applies mult_uv_r / _i with the data at u for the current site,
 // and adds the first result to blocks id1 and id3 of v2 and the second to
 // blocks id2 and id4.
 // NOTE(review): the function-name line is missing from this listing; the
 // kt1 / is + Nxyz usage with the id1 + id3 combination suggests
 // mult_tpb_chiral_thread -- confirm.
 // Nvcd, Nxy, id1, id2, vt1, vt2, u and iv are declared on lines not part of
 // this listing.
 1756     int itask, 
double *v2, 
const double *v1)
 
 1762     int id3 = 
m_Nvc * 2;
 
 1763     int id4 = 
m_Nvc * 3;
 
 1768     double wt1r, wt1i, wt2r, wt2i;
 
 1770     int isite = 
m_arg[itask].isite;
 
 1772     double       *w2 = &v2[Nvcd * isite];
 
 1773     const double *w1 = &v1[Nvcd * isite];
 
 // With kt1 == 1 the last it slice of the block is left out of the loop below.
 // Presumably that slice is covered by the boundary-exchange routines -- confirm.
 1776     int kt1  = 
m_arg[itask].kt1;
 
 // Nxyz: site-index stride of one step in it.
 1778     int Nxyz = Nxy * 
m_Nz;
 
 1780     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1781       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1782         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1783           int is = ixy + Nxy * (iz + m_Nz * it);
 
 // in: offset into w1 of the site one step forward in it; ig: offset into u
 // for the current site.
 1785           int in = Nvcd * (is + Nxyz);
 
 1786           int ig = 
m_Ndf * is;
 
 1788           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1789             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1790             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1791             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1792             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1795           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ic2: offset of entry ic inside the data at u[ig], in steps of m_Nvc.
 1796             int ic2 = ic * 
m_Nvc;
 
 1798             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1799             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1800             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1801             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1803             w2[2 * ic + id1 + iv]     += wt1r;
 
 1804             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1805             w2[2 * ic + id2 + iv]     += wt2r;
 
 1806             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1807             w2[2 * ic + id3 + iv]     += wt1r;
 
 1808             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1809             w2[2 * ic + id4 + iv]     += wt2r;
 
 1810             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 // Forms (block id1 - block id3) and (block id2 - block id4) of v1, applies
 // mult_udagv_r / _i with the data at u for the same site, and stores the
 // result in the buffer obtained from m_fw_send[idir]; the visible loop sits
 // under if (m_arg[itask].kt1 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_fw_send / kt1 usage with the id1 - id3 combination suggests
 // mult_tm1_chiral_thread -- confirm.
 // Nvcd, Nxy, idir, id1, id2, vt1, vt2, u, it, in, icr and the declaration that
 // receives the buffer pointer (apparently w2) are on lines not part of this
 // listing. vcp1 is not referenced in any visible line.
 1820     int itask, 
double *vcp1, 
const double *v1)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1822     int Nvc2  = 2 * 
m_Nvc;
 
 1824     int Nvcd2 = Nvcd / 2;
 
 1828     int id3 = 
m_Nvc * 2;
 
 1829     int id4 = 
m_Nvc * 3;
 
 1833     int isite    = 
m_arg[itask].isite;
 
 1834     int isite_cp = 
m_arg[itask].isite_cpt;
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the send buffer.
 1838       = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1839     const double *w1 = &v1[Nvcd * isite];
 
 1844     if (
m_arg[itask].kt1 == 1) {
 
 1847       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1848         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1849           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1850           int is2 = ixy + Nxy * iz;
 
 1852           int ig  = 
m_Ndf * is;
 
 1853           int ix1 = Nvc2 * is2;
 
 1854           int ix2 = ix1 + 
m_Nvc;
 
 1856           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1857             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1858             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1859             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1860             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 // Buffer entries are overwritten (plain assignment), not accumulated.
 1863           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1865             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1866             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1867             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1868             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 // Combines bc2 times the contents of the m_fw_recv[idir] buffer into v2: the
 // first buffer block is added to block id1 and subtracted from block id3, the
 // second is added to block id2 and subtracted from block id4. The visible
 // loop sits under if (m_arg[itask].kt0 == 1).
 // NOTE(review): the function-name line is missing from this listing; the
 // m_fw_recv / kt0 usage with the +/- pattern suggests mult_tm2_chiral_thread
 // -- confirm.
 // Nvcd, Nxy, idir, id1, id2, bc2, it, iv, icr and the declaration that
 // receives the buffer pointer (apparently w1) are on lines not part of this
 // listing. vcp2 and wt1r..wt2i are not used in any visible line.
 1880     int itask, 
double *v2, 
const double *vcp2)
 
 // Nvc2: number of doubles stored per site in the buffer (two m_Nvc blocks).
 1882     int Nvc2  = 2 * 
m_Nvc;
 
 1884     int Nvcd2 = Nvcd / 2;
 
 1888     int id3 = 
m_Nvc * 2;
 
 1889     int id4 = 
m_Nvc * 3;
 
 1894     double wt1r, wt1i, wt2r, wt2i;
 
 1896     int isite    = 
m_arg[itask].isite;
 
 1897     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1899     double *w2 = &v2[Nvcd * isite];
 
 // ptr() is called with sizeof(double) * Nvcd2 * isite_cp, i.e. this task's
 // offset into the receive buffer.
 1902       = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1906     if (
m_arg[itask].kt0 == 1) {
 
 1909       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1910         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is addresses the field (using `it` from a line not shown); is2 addresses
 // the buffer, which has no `it` dimension.
 1911           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1912           int is2 = ixy + Nxy * iz;
 
 1914           int ix1 = Nvc2 * is2;
 
 1915           int ix2 = ix1 + 
m_Nvc;
 
 1917           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 // ici: index of the odd (2*ic + 1) entry; icr comes from a line not shown.
 1919             int ici = 2 * ic + 1;
 
 1920             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1921             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1922             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1923             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1924             w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1925             w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1926             w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1927             w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 // For it in [kt0, m_Mt): forms (block id1 - block id3) and
 // (block id2 - block id4) of v1 at the site one step back in it (is - Nxyz),
 // applies mult_udagv_r / _i with the data at u for that same neighbour site,
 // then adds the results to blocks id1 / id2 of v2 and subtracts them from
 // blocks id3 / id4.
 // NOTE(review): the function-name line is missing from this listing; the
 // kt0 / is - Nxyz usage with the id1 - id3 combination suggests
 // mult_tmb_chiral_thread -- confirm.
 // Nvcd, Nxy, id1, id2, vt1, vt2, u, iv and ic2 are declared on lines not part
 // of this listing.
 1937     int itask, 
double *v2, 
const double *v1)
 
 1943     int id3 = 
m_Nvc * 2;
 
 1944     int id4 = 
m_Nvc * 3;
 
 1949     double wt1r, wt1i, wt2r, wt2i;
 
 1951     int isite = 
m_arg[itask].isite;
 
 1953     double       *w2 = &v2[Nvcd * isite];
 
 1954     const double *w1 = &v1[Nvcd * isite];
 
 // With kt0 == 1 the it = 0 slice of the block is left out of the loop below.
 // Presumably that slice is covered by the boundary-exchange routines -- confirm.
 1957     int kt0  = 
m_arg[itask].kt0;
 
 // Nxyz: site-index stride of one step in it.
 1959     int Nxyz = Nxy * 
m_Nz;
 
 1961     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1962       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1963         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1964           int is = ixy + Nxy * (iz + m_Nz * it);
 
 // in / ig: offsets into w1 and u for the site is - Nxyz.
 1966           int in = Nvcd * (is - Nxyz);
 
 1967           int ig = 
m_Ndf * (is - Nxyz);
 
 1969           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1970             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1971             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1972             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1973             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1976           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1978             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1979             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1980             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1981             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1983             w2[ic2 + id1 + iv]     += wt1r;
 
 1984             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1985             w2[ic2 + id2 + iv]     += wt2r;
 
 1986             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1987             w2[ic2 + id3 + iv]     -= wt1r;
 
 1988             w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 1989             w2[ic2 + id4 + iv]     -= wt2r;
 
 1990             w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 // Copies v1 to v2 site by site with the m_Nvc-sized blocks exchanged pairwise:
 // block id1 <-> block id3 and block id2 <-> block id4.
 // NOTE(review): the function-name line is missing from this listing; the
 // block swap suggests gm5_dirac_thread -- confirm.
 // Nvcd, Nxy, id1 and id2 are declared on lines not part of this listing.
 2000     int itask, 
double *v2, 
const double *v1)
 
 2007     int id3 = 
m_Nvc * 2;
 
 2008     int id4 = 
m_Nvc * 3;
 
 // First site of this task; w1 / w2 are v1 / v2 shifted to that site.
 2010     int          isite = 
m_arg[itask].isite;
 
 2011     double       *w2   = &v2[Nvcd * isite];
 
 2012     const double *w1   = &v1[Nvcd * isite];
 
 2014     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2015       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2016         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // iv: offset of the site's first entry (ixy fastest, iz stride Nxy, it stride Nxy * m_Nz).
 2017           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2018           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2019             w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2020             w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2021             w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2022             w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 // Copies v1 to v2 site by site, keeping blocks id1 and id2 unchanged and
 // negating blocks id3 and id4.
 // NOTE(review): the function-name line is missing from this listing; the
 // sign pattern suggests gm5_chiral_thread -- confirm.
 // Nvcd, Nxy, id1 and id2 are declared on lines not part of this listing.
 2032     int itask, 
double *v2, 
const double *v1)
 
 2039     int id3 = 
m_Nvc * 2;
 
 2040     int id4 = 
m_Nvc * 3;
 
 // First site of this task; w1 / w2 are v1 / v2 shifted to that site.
 2042     int          isite = 
m_arg[itask].isite;
 
 2043     double       *w2   = &v2[Nvcd * isite];
 
 2044     const double *w1   = &v1[Nvcd * isite];
 
 2046     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2047       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2048         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // iv: offset of the site's first entry (ixy fastest, iz stride Nxy, it stride Nxy * m_Nz).
 2049           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2050           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2051             w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2052             w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2053             w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2054             w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void clear_thread(int, double *)
 
const double * ptr(const int jin, const int site, const int jex) const 
 
void mult_ym1_thread(int, double *, const double *)
 
void mult_yp2_thread(int, double *, const double *)
 
void mult_xm1_thread(int, double *, const double *)
 
std::vector< Channel * > m_bw_recv
 
void mult_zm1_thread(int, double *, const double *)
 
static const std::string class_name
 
std::vector< Channel * > m_bw_send
 
void general(const char *format,...)
 
void gm5_dirac_thread(int, double *, const double *)
 
std::vector< mult_arg > m_arg
 
void mult_tp2_chiral_thread(int, double *, const double *)
 
void mult_tp2_dirac_thread(int, double *, const double *)
 
void mult_ymb_thread(int, double *, const double *)
 
void mult_ypb_thread(int, double *, const double *)
 
void mult_zm2_thread(int, double *, const double *)
 
void mult_tmb_dirac_thread(int, double *, const double *)
 
void mult_tp1_dirac_thread(int, double *, const double *)
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_tmb_chiral_thread(int, double *, const double *)
 
std::vector< double > m_boundary2
Boundary condition for each node. 
 
void mult_yp1_thread(int, double *, const double *)
 
Bridge::VerboseLevel m_vl
 
void mult_xp1_thread(int, double *, const double *)
 
void mult_zp1_thread(int, double *, const double *)
 
std::vector< Channel * > m_fw_recv
 
void mult_tm1_dirac_thread(int, double *, const double *)
 
static int get_num_threads_available()
Returns the number of available threads (works outside of a parallel region). 
 
const Field_G * m_U
gauge configuration. 
 
void mult_ym2_thread(int, double *, const double *)
 
void crucial(const char *format,...)
 
void mult_tp1_chiral_thread(int, double *, const double *)
 
void mult_xpb_thread(int, double *, const double *)
 
void daypx_thread(int, double *, double, const double *)
 
void mult_tpb_chiral_thread(int, double *, const double *)
 
std::vector< Channel * > m_fw_send
 
void mult_xp2_thread(int, double *, const double *)
 
void mult_zpb_thread(int, double *, const double *)
 
void mult_zp2_thread(int, double *, const double *)
 
void mult_tm2_dirac_thread(int, double *, const double *)
 
void mult_xmb_thread(int, double *, const double *)
 
void mult_tm2_chiral_thread(int, double *, const double *)
 
void mult_tm1_chiral_thread(int, double *, const double *)
 
void mult_tpb_dirac_thread(int, double *, const double *)
 
void mult_xm2_thread(int, double *, const double *)
 
void mult_zmb_thread(int, double *, const double *)