22 #if defined USE_GROUP_SU3 
   23 #include "fopr_Wilson_impl_SU3.inc" 
   24 #elif defined USE_GROUP_SU2 
   25 #include "fopr_Wilson_impl_SU2.inc" 
   26 #elif defined USE_GROUP_SU_N 
   27 #include "fopr_Wilson_impl_SU_N.inc" 
   50       vout.
crucial(
m_vl, 
"Error at %s:  Nz = %d and Nt = %d do not match Nthread = %d\n",
 
   60       vout.
crucial(
m_vl, 
"Error at %s:  Mz = %d and Ntask_z = %d do not match Nz = %d\n",
 
   66       vout.
crucial(
m_vl, 
"Error at %s:  Mt = %d and Ntask_t = %d do not match Nt = %d\n",
 
   96     for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   97       for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   98         int itask = ith_z + m_Ntask_z * ith_t;
 
  102         m_arg[itask].kt0 = 0;
 
  103         m_arg[itask].kt1 = 0;
 
  104         m_arg[itask].kz0 = 0;
 
  105         m_arg[itask].kz1 = 0;
 
  106         if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
  107         if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
  108         if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
  109         if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
  113         m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy;
 
  114         m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy;
 
  122                                  double *v2, 
double fac, 
const double *v1)
 
  127     int          isite = 
m_arg[itask].isite;
 
  128     double       *w2   = &v2[Nvcd * isite];
 
  129     const double *w1   = &v1[Nvcd * isite];
 
  131     for (
int it = 0; it < 
m_Mt; ++it) {
 
  132       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  133         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  134           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  135           w2[iv] = fac * w2[iv] + w1[iv];
 
  149     int    isite = 
m_arg[itask].isite;
 
  150     double *w2   = &v2[Nvcd * isite];
 
  152     for (
int it = 0; it < 
m_Mt; ++it) {
 
  153       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  154         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  155           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  165                                     double *vcp1, 
const double *v1)
 
  167     int Nvc2  = 2 * 
m_Nvc;
 
  169     int Nvcd2 = Nvcd / 2;
 
  176     int isite    = 
m_arg[itask].isite;
 
  177     int isite_cp = 
m_arg[itask].isite_cpx;
 
  179     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
  180     const double *w1 = &v1[Nvcd * isite];
 
  187     for (
int it = 0; it < 
m_Mt; ++it) {
 
  188       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  189         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  190           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  191           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  193           int ix1 = Nvc2 * is2;
 
  194           int ix2 = ix1 + 
m_Nvc;
 
  196           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  197             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  198             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  199             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  200             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  210                                     double *v2, 
const double *vcp2)
 
  212     int Nvc2  = 2 * 
m_Nvc;
 
  214     int Nvcd2 = Nvcd / 2;
 
  223     double wt1r, wt1i, wt2r, wt2i;
 
  225     int isite    = 
m_arg[itask].isite;
 
  226     int isite_cp = 
m_arg[itask].isite_cpx;
 
  228     double       *w2 = &v2[Nvcd * isite];
 
  229     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
  234     for (
int it = 0; it < 
m_Mt; ++it) {
 
  235       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  236         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  237           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  238           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  241           int ix1 = Nvc2 * is2;
 
  242           int ix2 = ix1 + 
m_Nvc;
 
  244           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  245             int ic2 = ic * 
m_Nvc;
 
  247             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  248             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  249             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  250             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  252             w2[2 * ic + id1 + iv]     += wt1r;
 
  253             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  254             w2[2 * ic + id2 + iv]     += wt2r;
 
  255             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  256             w2[2 * ic + id3 + iv]     += wt2i;
 
  257             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  258             w2[2 * ic + id4 + iv]     += wt1i;
 
  259             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  269                                     double *v2, 
const double *v1)
 
  281     double wt1r, wt1i, wt2r, wt2i;
 
  283     int isite = 
m_arg[itask].isite;
 
  285     double       *w2 = &v2[Nvcd * isite];
 
  286     const double *w1 = &v1[Nvcd * isite];
 
  289     for (
int it = 0; it < 
m_Mt; ++it) {
 
  290       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  291         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  292           for (
int ix = 0; ix < 
m_Nx - 1; ++ix) {
 
  293             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  295             int in = Nvcd * (is + 1);
 
  298             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  299               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  300               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  301               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  302               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  305             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  306               int ic2 = ic * 
m_Nvc;
 
  308               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  309               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  310               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  311               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  313               w2[2 * ic + id1 + iv]     += wt1r;
 
  314               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  315               w2[2 * ic + id2 + iv]     += wt2r;
 
  316               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  317               w2[2 * ic + id3 + iv]     += wt2i;
 
  318               w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  319               w2[2 * ic + id4 + iv]     += wt1i;
 
  320               w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  331                                     double *vcp1, 
const double *v1)
 
  333     int Nvc2  = 2 * 
m_Nvc;
 
  335     int Nvcd2 = Nvcd / 2;
 
  344     int isite    = 
m_arg[itask].isite;
 
  345     int isite_cp = 
m_arg[itask].isite_cpx;
 
  347     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
  348     const double *w1 = &v1[Nvcd * isite];
 
  355     for (
int it = 0; it < 
m_Mt; ++it) {
 
  356       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  357         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  358           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  359           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  362           int ix1 = Nvc2 * is2;
 
  363           int ix2 = ix1 + 
m_Nvc;
 
  365           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  366             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  367             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  368             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  369             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  372           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  374             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  375             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  376             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  377             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  387                                     double *v2, 
const double *vcp2)
 
  389     int Nvc2  = 2 * 
m_Nvc;
 
  391     int Nvcd2 = Nvcd / 2;
 
  403     int isite    = 
m_arg[itask].isite;
 
  404     int isite_cp = 
m_arg[itask].isite_cpx;
 
  406     double       *w2 = &v2[Nvcd * isite];
 
  407     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
  411     for (
int it = 0; it < 
m_Mt; ++it) {
 
  412       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  413         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  414           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  415           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  417           int ix1 = Nvc2 * is2;
 
  418           int ix2 = ix1 + 
m_Nvc;
 
  420           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  422             int ici = 2 * ic + 1;
 
  423             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  424             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  425             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  426             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  427             w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  428             w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  429             w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  430             w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  440                                     double *v2, 
const double *v1)
 
  452     double wt1r, wt1i, wt2r, wt2i;
 
  454     int isite = 
m_arg[itask].isite;
 
  456     double       *w2 = &v2[Nvcd * isite];
 
  457     const double *w1 = &v1[Nvcd * isite];
 
  460     for (
int it = 0; it < 
m_Mt; ++it) {
 
  461       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  462         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  463           for (
int ix = 1; ix < 
m_Nx; ++ix) {
 
  464             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  466             int in = Nvcd * (is - 1);
 
  467             int ig = 
m_Ndf * (is - 1);
 
  469             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  470               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  471               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  472               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  473               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  476             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  479               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  480               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  481               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  482               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  484               w2[2 * ic + id1 + iv]     += wt1r;
 
  485               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  486               w2[2 * ic + id2 + iv]     += wt2r;
 
  487               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  488               w2[2 * ic + id3 + iv]     += -wt2i;
 
  489               w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  490               w2[2 * ic + id4 + iv]     += -wt1i;
 
  491               w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  502                                     double *vcp1, 
const double *v1)
 
  504     int Nvc2  = 2 * 
m_Nvc;
 
  506     int Nvcd2 = Nvcd / 2;
 
  513     int isite    = 
m_arg[itask].isite;
 
  514     int isite_cp = 
m_arg[itask].isite_cpy;
 
  516     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
  517     const double *w1 = &v1[Nvcd * isite];
 
  524     for (
int it = 0; it < 
m_Mt; ++it) {
 
  525       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  526         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  527           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  528           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  530           int ix1 = Nvc2 * is2;
 
  531           int ix2 = ix1 + 
m_Nvc;
 
  533           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  534             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  535             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  536             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  537             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  547                                     double *v2, 
const double *vcp2)
 
  549     int Nvc2  = 2 * 
m_Nvc;
 
  551     int Nvcd2 = Nvcd / 2;
 
  560     double wt1r, wt1i, wt2r, wt2i;
 
  562     int isite    = 
m_arg[itask].isite;
 
  563     int isite_cp = 
m_arg[itask].isite_cpy;
 
  565     double       *w2 = &v2[Nvcd * isite];
 
  566     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
  571     for (
int it = 0; it < 
m_Mt; ++it) {
 
  572       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  573         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  574           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  575           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  578           int ix1 = Nvc2 * is2;
 
  579           int ix2 = ix1 + 
m_Nvc;
 
  581           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  582             int ic2 = ic * 
m_Nvc;
 
  584             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  585             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  586             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  587             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  589             w2[2 * ic + id1 + iv]     += wt1r;
 
  590             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  591             w2[2 * ic + id2 + iv]     += wt2r;
 
  592             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  593             w2[2 * ic + id3 + iv]     += -wt2r;
 
  594             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  595             w2[2 * ic + id4 + iv]     += wt1r;
 
  596             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  606                                     double *v2, 
const double *v1)
 
  618     double wt1r, wt1i, wt2r, wt2i;
 
  620     int isite = 
m_arg[itask].isite;
 
  622     double       *w2 = &v2[Nvcd * isite];
 
  623     const double *w1 = &v1[Nvcd * isite];
 
  626     for (
int it = 0; it < 
m_Mt; ++it) {
 
  627       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  628         for (
int iy = 0; iy < 
m_Ny - 1; ++iy) {
 
  629           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  630             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  632             int in = Nvcd * (is + 
m_Nx);
 
  635             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  636               vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  637               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  638               vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  639               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  642             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  643               int ic2 = ic * 
m_Nvc;
 
  645               wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  646               wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  647               wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  648               wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  650               w2[2 * ic + id1 + iv]     += wt1r;
 
  651               w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  652               w2[2 * ic + id2 + iv]     += wt2r;
 
  653               w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  654               w2[2 * ic + id3 + iv]     += -wt2r;
 
  655               w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  656               w2[2 * ic + id4 + iv]     += wt1r;
 
  657               w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  668                                     double *vcp1, 
const double *v1)
 
  670     int Nvc2  = 2 * 
m_Nvc;
 
  672     int Nvcd2 = Nvcd / 2;
 
  681     int isite    = 
m_arg[itask].isite;
 
  682     int isite_cp = 
m_arg[itask].isite_cpy;
 
  684     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
  685     const double *w1 = &v1[Nvcd * isite];
 
  692     for (
int it = 0; it < 
m_Mt; ++it) {
 
  693       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  694         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  695           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  696           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  699           int ix1 = Nvc2 * is2;
 
  700           int ix2 = ix1 + 
m_Nvc;
 
  702           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  703             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  704             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  705             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  706             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  709           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  711             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  712             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  713             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  714             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  724                                     double *v2, 
const double *vcp2)
 
  726     int Nvc2  = 2 * 
m_Nvc;
 
  728     int Nvcd2 = Nvcd / 2;
 
  740     int isite    = 
m_arg[itask].isite;
 
  741     int isite_cp = 
m_arg[itask].isite_cpy;
 
  743     double       *w2 = &v2[Nvcd * isite];
 
  744     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
  748     for (
int it = 0; it < 
m_Mt; ++it) {
 
  749       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  750         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  751           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  752           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  754           int ix1 = Nvc2 * is2;
 
  755           int ix2 = ix1 + 
m_Nvc;
 
  757           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  759             int ici = 2 * ic + 1;
 
  760             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  761             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  762             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  763             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  764             w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  765             w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  766             w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  767             w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  777                                     double *v2, 
const double *v1)
 
  789     double wt1r, wt1i, wt2r, wt2i;
 
  791     int isite = 
m_arg[itask].isite;
 
  793     double       *w2 = &v2[Nvcd * isite];
 
  794     const double *w1 = &v1[Nvcd * isite];
 
  797     for (
int it = 0; it < 
m_Mt; ++it) {
 
  798       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  799         for (
int iy = 1; iy < 
m_Ny; ++iy) {
 
  800           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  801             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  803             int in = Nvcd * (is - 
m_Nx);
 
  806             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  807               vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  808               vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  809               vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  810               vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  813             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  815               wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  816               wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  817               wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  818               wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  820               w2[ic2 + id1 + iv]     += wt1r;
 
  821               w2[ic2 + 1 + id1 + iv] += wt1i;
 
  822               w2[ic2 + id2 + iv]     += wt2r;
 
  823               w2[ic2 + 1 + id2 + iv] += wt2i;
 
  824               w2[ic2 + id3 + iv]     += wt2r;
 
  825               w2[ic2 + 1 + id3 + iv] += wt2i;
 
  826               w2[ic2 + id4 + iv]     += -wt1r;
 
  827               w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  838                                     double *vcp1, 
const double *v1)
 
  840     int Nvc2  = 2 * 
m_Nvc;
 
  842     int Nvcd2 = Nvcd / 2;
 
  849     int isite    = 
m_arg[itask].isite;
 
  850     int isite_cp = 
m_arg[itask].isite_cpz;
 
  852     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
  853     const double *w1 = &v1[Nvcd * isite];
 
  858     if (
m_arg[itask].kz0 == 1) {
 
  861       for (
int it = 0; it < 
m_Mt; ++it) {
 
  862         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  863           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
  864           int is2 = ixy + Nxy * it;
 
  867           int ix1 = Nvc2 * is2;
 
  868           int ix2 = ix1 + 
m_Nvc;
 
  870           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  871             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
  872             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
  873             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
  874             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
  884                                     double *v2, 
const double *vcp2)
 
  886     int Nvc2  = 2 * 
m_Nvc;
 
  888     int Nvcd2 = Nvcd / 2;
 
  897     double wt1r, wt1i, wt2r, wt2i;
 
  899     int isite    = 
m_arg[itask].isite;
 
  900     int isite_cp = 
m_arg[itask].isite_cpz;
 
  902     double       *w2 = &v2[Nvcd * isite];
 
  903     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
  906     if (
m_arg[itask].kz1 == 1) {
 
  909       for (
int it = 0; it < 
m_Mt; ++it) {
 
  910         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  911           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
  912           int is2 = ixy + Nxy * it;
 
  915           int ix1 = Nvc2 * is2;
 
  916           int ix2 = ix1 + 
m_Nvc;
 
  918           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  919             int ic2 = ic * 
m_Nvc;
 
  921             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  922             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  923             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  924             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  926             w2[2 * ic + id1 + iv]     += wt1r;
 
  927             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  928             w2[2 * ic + id2 + iv]     += wt2r;
 
  929             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  930             w2[2 * ic + id3 + iv]     += wt1i;
 
  931             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
  932             w2[2 * ic + id4 + iv]     += -wt2i;
 
  933             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
  943                                     double *v2, 
const double *v1)
 
  955     double wt1r, wt1i, wt2r, wt2i;
 
  957     int isite = 
m_arg[itask].isite;
 
  959     double       *w2 = &v2[Nvcd * isite];
 
  960     const double *w1 = &v1[Nvcd * isite];
 
  963     int kz1 = 
m_arg[itask].kz1;
 
  966     for (
int it = 0; it < 
m_Mt; ++it) {
 
  967       for (
int iz = 0; iz < 
m_Mz - kz1; ++iz) {
 
  968         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  969           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
  971           int in = Nvcd * (is + Nxy);
 
  974           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  975             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
  976             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
  977             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
  978             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
  981           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  982             int ic2 = ic * 
m_Nvc;
 
  984             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  985             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  986             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  987             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  989             w2[2 * ic + id1 + iv]     += wt1r;
 
  990             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  991             w2[2 * ic + id2 + iv]     += wt2r;
 
  992             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  993             w2[2 * ic + id3 + iv]     += wt1i;
 
  994             w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
  995             w2[2 * ic + id4 + iv]     += -wt2i;
 
  996             w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1006                                     double *vcp1, 
const double *v1)
 
 1008     int Nvc2  = 2 * 
m_Nvc;
 
 1010     int Nvcd2 = Nvcd / 2;
 
 1014     int id3 = 
m_Nvc * 2;
 
 1015     int id4 = 
m_Nvc * 3;
 
 1019     int isite    = 
m_arg[itask].isite;
 
 1020     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1022     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1023     const double *w1 = &v1[Nvcd * isite];
 
 1028     if (
m_arg[itask].kz1 == 1) {
 
 1031       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1032         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1033           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1034           int is2 = ixy + Nxy * it;
 
 1036           int ig  = 
m_Ndf * is;
 
 1037           int ix1 = Nvc2 * is2;
 
 1038           int ix2 = ix1 + 
m_Nvc;
 
 1040           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1041             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1042             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1043             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1044             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1047           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1049             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1050             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1051             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1052             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1062                                     double *v2, 
const double *vcp2)
 
 1064     int Nvc2  = 2 * 
m_Nvc;
 
 1066     int Nvcd2 = Nvcd / 2;
 
 1070     int id3 = 
m_Nvc * 2;
 
 1071     int id4 = 
m_Nvc * 3;
 
 1078     int isite    = 
m_arg[itask].isite;
 
 1079     int isite_cp = 
m_arg[itask].isite_cpz;
 
 1081     double       *w2 = &v2[Nvcd * isite];
 
 1082     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1084     if (
m_arg[itask].kz0 == 1) {
 
 1088       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1089         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1090           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1091           int is2 = ixy + Nxy * it;
 
 1093           int ix1 = Nvc2 * is2;
 
 1094           int ix2 = ix1 + 
m_Nvc;
 
 1096           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1098             int ici = 2 * ic + 1;
 
 1099             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1100             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1101             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1102             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1103             w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1104             w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1105             w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1106             w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 1116                                     double *v2, 
const double *v1)
 
 1122     int id3 = 
m_Nvc * 2;
 
 1123     int id4 = 
m_Nvc * 3;
 
 1128     double wt1r, wt1i, wt2r, wt2i;
 
 1130     int isite = 
m_arg[itask].isite;
 
 1132     double       *w2 = &v2[Nvcd * isite];
 
 1133     const double *w1 = &v1[Nvcd * isite];
 
 1136     int kz0 = 
m_arg[itask].kz0;
 
 1139     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1140       for (
int iz = kz0; iz < 
m_Mz; ++iz) {
 
 1141         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1142           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1144           int in = Nvcd * (is - Nxy);
 
 1145           int ig = 
m_Ndf * (is - Nxy);
 
 1147           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1148             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1149             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1150             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1151             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1154           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1156             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1157             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1158             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1159             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1161             w2[ic2 + id1 + iv]     += wt1r;
 
 1162             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1163             w2[ic2 + id2 + iv]     += wt2r;
 
 1164             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1165             w2[ic2 + id3 + iv]     += -wt1i;
 
 1166             w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1167             w2[ic2 + id4 + iv]     += wt2i;
 
 1168             w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 1178                                           double *vcp1, 
const double *v1)
 
 1180     int Nvc2  = 2 * 
m_Nvc;
 
 1182     int Nvcd2 = Nvcd / 2;
 
 1186     int id3 = 
m_Nvc * 2;
 
 1187     int id4 = 
m_Nvc * 3;
 
 1189     int isite    = 
m_arg[itask].isite;
 
 1190     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1192     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1193     const double *w1 = &v1[Nvcd * isite];
 
 1198     if (
m_arg[itask].kt0 == 1) {
 
 1201       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1202         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1203           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1204           int is2 = ixy + Nxy * iz;
 
 1207           int ix1 = Nvc2 * is2;
 
 1208           int ix2 = ix1 + 
m_Nvc;
 
 1210           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1211             w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1212             w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1213             w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1214             w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 1224                                           double *v2, 
const double *vcp2)
 
 1226     int Nvc2  = 2 * 
m_Nvc;
 
 1228     int Nvcd2 = Nvcd / 2;
 
 1232     int id3 = 
m_Nvc * 2;
 
 1233     int id4 = 
m_Nvc * 3;
 
 1237     double wt1r, wt1i, wt2r, wt2i;
 
 1239     int isite    = 
m_arg[itask].isite;
 
 1240     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1242     double       *w2 = &v2[Nvcd * isite];
 
 1243     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1246     if (
m_arg[itask].kt1 == 1) {
 
 1249       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1250         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1251           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1252           int is2 = ixy + Nxy * iz;
 
 1254           int ig  = 
m_Ndf * is;
 
 1255           int ix1 = Nvc2 * is2;
 
 1256           int ix2 = ix1 + 
m_Nvc;
 
 1258           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1259             int ic2 = ic * 
m_Nvc;
 
 1261             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1262             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1263             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1264             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1266             w2[2 * ic + id3 + iv]     += wt1r;
 
 1267             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1268             w2[2 * ic + id4 + iv]     += wt2r;
 
 1269             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1279                                           double *v2, 
const double *v1)
 
 1285     int id3 = 
m_Nvc * 2;
 
 1286     int id4 = 
m_Nvc * 3;
 
 1291     double wt1r, wt1i, wt2r, wt2i;
 
 1293     int isite = 
m_arg[itask].isite;
 
 1295     double       *w2 = &v2[Nvcd * isite];
 
 1296     const double *w1 = &v1[Nvcd * isite];
 
 1299     int kt1  = 
m_arg[itask].kt1;
 
 1301     int Nxyz = Nxy * 
m_Nz;
 
 1303     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1304       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1305         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1306           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1308           int in = Nvcd * (is + Nxyz);
 
 1309           int ig = 
m_Ndf * is;
 
 1311           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1312             vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1313             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1314             vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1315             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1318           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1319             int ic2 = ic * 
m_Nvc;
 
 1321             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1322             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1323             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1324             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1326             w2[2 * ic + id3 + iv]     += wt1r;
 
 1327             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1328             w2[2 * ic + id4 + iv]     += wt2r;
 
 1329             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1339                                           double *vcp1, 
const double *v1)
 
 1341     int Nvc2  = 2 * 
m_Nvc;
 
 1343     int Nvcd2 = Nvcd / 2;
 
 1352     int isite    = 
m_arg[itask].isite;
 
 1353     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1355     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1356     const double *w1 = &v1[Nvcd * isite];
 
 1361     if (
m_arg[itask].kt1 == 1) {
 
 1364       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1365         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1366           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1367           int is2 = ixy + Nxy * iz;
 
 1369           int ig  = 
m_Ndf * is;
 
 1370           int ix1 = Nvc2 * is2;
 
 1371           int ix2 = ix1 + 
m_Nvc;
 
 1373           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1374             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1375             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1376             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1377             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1380           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1382             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1383             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1384             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1385             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1395                                           double *v2, 
const double *vcp2)
 
 1397     int Nvc2  = 2 * 
m_Nvc;
 
 1399     int Nvcd2 = Nvcd / 2;
 
 1411     int isite    = 
m_arg[itask].isite;
 
 1412     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1414     double       *w2 = &v2[Nvcd * isite];
 
 1415     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1417     if (
m_arg[itask].kt0 == 1) {
 
 1420       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1421         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1422           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1423           int is2 = ixy + Nxy * iz;
 
 1425           int ix1 = Nvc2 * is2;
 
 1426           int ix2 = ix1 + 
m_Nvc;
 
 1428           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1430             int ici = 2 * ic + 1;
 
 1431             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1432             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1433             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1434             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1444                                           double *v2, 
const double *v1)
 
 1456     double wt1r, wt1i, wt2r, wt2i;
 
 1458     int isite = 
m_arg[itask].isite;
 
 1460     double       *w2 = &v2[Nvcd * isite];
 
 1461     const double *w1 = &v1[Nvcd * isite];
 
 1464     int kt0  = 
m_arg[itask].kt0;
 
 1466     int Nxyz = Nxy * 
m_Nz;
 
 1468     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1469       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1470         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1471           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1473           int in = Nvcd * (is - Nxyz);
 
 1474           int ig = 
m_Ndf * (is - Nxyz);
 
 1476           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1477             vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1478             vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1479             vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1480             vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1483           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1485             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1486             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1487             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1488             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1490             w2[ic2 + id1 + iv]     += wt1r;
 
 1491             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1492             w2[ic2 + id2 + iv]     += wt2r;
 
 1493             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1503                                            double *vcp1, 
const double *v1)
 
 1505     int Nvc2  = 2 * 
m_Nvc;
 
 1507     int Nvcd2 = Nvcd / 2;
 
 1511     int id3 = 
m_Nvc * 2;
 
 1512     int id4 = 
m_Nvc * 3;
 
 1514     int isite    = 
m_arg[itask].isite;
 
 1515     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1517     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1518     const double *w1 = &v1[Nvcd * isite];
 
 1523     if (
m_arg[itask].kt0 == 1) {
 
 1526       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1527         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1528           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1529           int is2 = ixy + Nxy * iz;
 
 1532           int ix1 = Nvc2 * is2;
 
 1533           int ix2 = ix1 + 
m_Nvc;
 
 1535           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1536             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1537             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1538             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1539             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1549                                            double *v2, 
const double *vcp2)
 
 1551     int Nvc2  = 2 * 
m_Nvc;
 
 1553     int Nvcd2 = Nvcd / 2;
 
 1557     int id3 = 
m_Nvc * 2;
 
 1558     int id4 = 
m_Nvc * 3;
 
 1562     double wt1r, wt1i, wt2r, wt2i;
 
 1564     int isite    = 
m_arg[itask].isite;
 
 1565     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1567     double       *w2 = &v2[Nvcd * isite];
 
 1568     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1571     if (
m_arg[itask].kt1 == 1) {
 
 1574       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1575         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1576           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1577           int is2 = ixy + Nxy * iz;
 
 1579           int ig  = 
m_Ndf * is;
 
 1580           int ix1 = Nvc2 * is2;
 
 1581           int ix2 = ix1 + 
m_Nvc;
 
 1583           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1584             int ic2 = ic * 
m_Nvc;
 
 1586             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1587             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1588             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1589             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1591             w2[2 * ic + id1 + iv]     += wt1r;
 
 1592             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1593             w2[2 * ic + id2 + iv]     += wt2r;
 
 1594             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1595             w2[2 * ic + id3 + iv]     += wt1r;
 
 1596             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1597             w2[2 * ic + id4 + iv]     += wt2r;
 
 1598             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1608                                            double *v2, 
const double *v1)
 
 1614     int id3 = 
m_Nvc * 2;
 
 1615     int id4 = 
m_Nvc * 3;
 
 1620     double wt1r, wt1i, wt2r, wt2i;
 
 1622     int isite = 
m_arg[itask].isite;
 
 1624     double       *w2 = &v2[Nvcd * isite];
 
 1625     const double *w1 = &v1[Nvcd * isite];
 
 1628     int kt1  = 
m_arg[itask].kt1;
 
 1630     int Nxyz = Nxy * 
m_Nz;
 
 1632     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1633       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1634         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1635           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1637           int in = Nvcd * (is + Nxyz);
 
 1638           int ig = 
m_Ndf * is;
 
 1640           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1641             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1642             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1643             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1644             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1647           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1648             int ic2 = ic * 
m_Nvc;
 
 1650             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1651             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1652             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1653             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1655             w2[2 * ic + id1 + iv]     += wt1r;
 
 1656             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1657             w2[2 * ic + id2 + iv]     += wt2r;
 
 1658             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1659             w2[2 * ic + id3 + iv]     += wt1r;
 
 1660             w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1661             w2[2 * ic + id4 + iv]     += wt2r;
 
 1662             w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1672                                            double *vcp1, 
const double *v1)
 
 1674     int Nvc2  = 2 * 
m_Nvc;
 
 1676     int Nvcd2 = Nvcd / 2;
 
 1680     int id3 = 
m_Nvc * 2;
 
 1681     int id4 = 
m_Nvc * 3;
 
 1685     int isite    = 
m_arg[itask].isite;
 
 1686     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1688     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1689     const double *w1 = &v1[Nvcd * isite];
 
 1694     if (
m_arg[itask].kt1 == 1) {
 
 1697       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1698         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1699           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1700           int is2 = ixy + Nxy * iz;
 
 1702           int ig  = 
m_Ndf * is;
 
 1703           int ix1 = Nvc2 * is2;
 
 1704           int ix2 = ix1 + 
m_Nvc;
 
 1706           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1707             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1708             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1709             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1710             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1713           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1715             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1716             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1717             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1718             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1728                                            double *v2, 
const double *vcp2)
 
 1730     int Nvc2  = 2 * 
m_Nvc;
 
 1732     int Nvcd2 = Nvcd / 2;
 
 1736     int id3 = 
m_Nvc * 2;
 
 1737     int id4 = 
m_Nvc * 3;
 
 1744     int isite    = 
m_arg[itask].isite;
 
 1745     int isite_cp = 
m_arg[itask].isite_cpt;
 
 1747     double       *w2 = &v2[Nvcd * isite];
 
 1748     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1750     if (
m_arg[itask].kt0 == 1) {
 
 1753       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1754         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1755           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1756           int is2 = ixy + Nxy * iz;
 
 1758           int ix1 = Nvc2 * is2;
 
 1759           int ix2 = ix1 + 
m_Nvc;
 
 1761           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1763             int ici = 2 * ic + 1;
 
 1764             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1765             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1766             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1767             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1768             w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1769             w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1770             w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1771             w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 1781                                            double *v2, 
const double *v1)
 
 1787     int id3 = 
m_Nvc * 2;
 
 1788     int id4 = 
m_Nvc * 3;
 
 1793     double wt1r, wt1i, wt2r, wt2i;
 
 1795     int isite = 
m_arg[itask].isite;
 
 1797     double       *w2 = &v2[Nvcd * isite];
 
 1798     const double *w1 = &v1[Nvcd * isite];
 
 1801     int kt0  = 
m_arg[itask].kt0;
 
 1803     int Nxyz = Nxy * 
m_Nz;
 
 1805     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1806       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1807         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1808           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1810           int in = Nvcd * (is - Nxyz);
 
 1811           int ig = 
m_Ndf * (is - Nxyz);
 
 1813           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1814             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1815             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1816             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1817             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1820           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1822             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1823             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1824             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1825             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1827             w2[ic2 + id1 + iv]     += wt1r;
 
 1828             w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1829             w2[ic2 + id2 + iv]     += wt2r;
 
 1830             w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1831             w2[ic2 + id3 + iv]     -= wt1r;
 
 1832             w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 1833             w2[ic2 + id4 + iv]     -= wt2r;
 
 1834             w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 1844                                      double *v2, 
const double *v1)
 
 1851     int id3 = 
m_Nvc * 2;
 
 1852     int id4 = 
m_Nvc * 3;
 
 1854     int          isite = 
m_arg[itask].isite;
 
 1855     double       *w2   = &v2[Nvcd * isite];
 
 1856     const double *w1   = &v1[Nvcd * isite];
 
 1858     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1859       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1860         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1861           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 1862           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 1863             w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 1864             w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 1865             w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 1866             w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 1876                                       double *v2, 
const double *v1)
 
 1883     int id3 = 
m_Nvc * 2;
 
 1884     int id4 = 
m_Nvc * 3;
 
 1886     int          isite = 
m_arg[itask].isite;
 
 1887     double       *w2   = &v2[Nvcd * isite];
 
 1888     const double *w1   = &v1[Nvcd * isite];
 
 1890     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1891       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1892         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1893           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 1894           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 1895             w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 1896             w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 1897             w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 1898             w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_zm1_thread(int, double *, const double *)
 
void mult_zpb_thread(int, double *, const double *)
 
void mult_ymb_thread(int, double *, const double *)
 
void mult_tp2_chiral_thread(int, double *, const double *)
 
const double * ptr(const int jin, const int site, const int jex) const 
 
void mult_ym2_thread(int, double *, const double *)
 
void mult_zmb_thread(int, double *, const double *)
 
void mult_xpb_thread(int, double *, const double *)
 
void general(const char *format,...)
 
void mult_tm1_chiral_thread(int, double *, const double *)
 
Bridge::VerboseLevel m_vl
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_ypb_thread(int, double *, const double *)
 
const Field_G * m_U
gauge configuration. 
 
void mult_tm1_dirac_thread(int, double *, const double *)
 
void mult_tp1_chiral_thread(int, double *, const double *)
 
void gm5_dirac_thread(int, double *, const double *)
 
void mult_tmb_chiral_thread(int, double *, const double *)
 
void mult_zp1_thread(int, double *, const double *)
 
void mult_ym1_thread(int, double *, const double *)
 
void daypx_thread(int, double *, double, const double *)
 
void mult_xp1_thread(int, double *, const double *)
 
void mult_tpb_chiral_thread(int, double *, const double *)
 
void mult_tm2_dirac_thread(int, double *, const double *)
 
std::vector< mult_arg > m_arg
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
void crucial(const char *format,...)
 
void mult_zp2_thread(int, double *, const double *)
 
std::vector< double > m_boundary2
b.c. for each node. 
 
void mult_tm2_chiral_thread(int, double *, const double *)
 
void mult_tp2_dirac_thread(int, double *, const double *)
 
void mult_xm2_thread(int, double *, const double *)
 
void clear_thread(int, double *)
 
void mult_yp2_thread(int, double *, const double *)
 
void mult_xm1_thread(int, double *, const double *)
 
void mult_tpb_dirac_thread(int, double *, const double *)
 
void mult_xmb_thread(int, double *, const double *)
 
void mult_zm2_thread(int, double *, const double *)
 
void mult_tp1_dirac_thread(int, double *, const double *)
 
void mult_yp1_thread(int, double *, const double *)
 
void mult_tmb_dirac_thread(int, double *, const double *)
 
static const std::string class_name
 
void mult_xp2_thread(int, double *, const double *)