17 #if defined USE_GROUP_SU3 
   18 #include "fopr_Wilson_impl_SU3.inc" 
   19 #elif defined USE_GROUP_SU2 
   20 #include "fopr_Wilson_impl_SU2.inc" 
   21 #elif defined USE_GROUP_SU_N 
   22 #include "fopr_Wilson_impl_SU_N.inc" 
   // NOTE: this listing is an excerpt; the enclosing function header, the
   // conditions guarding the diagnostics and several statements are not visible.
   // Visible part: three consistency diagnostics, then the loop that fills the
   // per-task descriptors m_arg[].
   // Diagnostic for an Nz/Nt vs. Nthread mismatch (guarding condition not shown).
   45       vout.
crucial(
m_vl, 
"Error at %s:  Nz = %d and Nt = %d do not match Nthread = %d\n",
 
   // Diagnostic for Mz and Ntask_z being inconsistent with Nz (condition not shown).
   55       vout.
crucial(
m_vl, 
"Error at %s:  Mz = %d and Ntask_z = %d do not match Nz = %d\n",
 
   // Diagnostic for Mt and Ntask_t being inconsistent with Nt (condition not shown).
   61       vout.
crucial(
m_vl, 
"Error at %s:  Mt = %d and Ntask_t = %d do not match Nt = %d\n",
 
   // Tasks form an m_Ntask_z x m_Ntask_t grid; itask is the linear index with
   // the z task index running fastest.
   91     for (
int ithread_t = 0; ithread_t < 
m_Ntask_t; ++ithread_t) {
 
   92       for (
int ithread_z = 0; ithread_z < 
m_Ntask_z; ++ithread_z) {
 
   93         int itask = ithread_z + m_Ntask_z * ithread_t;
 
  100         m_arg[itask].kz1 = 0;
 
   // Edge flags: kt0/kz0 are set to 1 for tasks on the lower edge of the task
   // grid in t/z, kt1/kz1 for tasks on the upper edge.
  101         if (ithread_t == 0) 
m_arg[itask].kt0 = 1;
 
  102         if (ithread_z == 0) 
m_arg[itask].kz0 = 1;
 
  103         if (ithread_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
  104         if (ithread_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
   // Buffer offsets (in sites): the z-direction offset depends only on the
   // task's t index, the t-direction offset only on its z index.
  108         m_arg[itask].isite_cp_z = ithread_t * 
m_Mt * Nxy;
 
  109         m_arg[itask].isite_cp_t = ithread_z * 
m_Mz * Nxy;
 
  // Per-task update w2[iv] += fac * w1[iv] (v2 += fac * v1 on this task's slab).
  // itask selects the descriptor m_arg[itask]; its isite is the first site of
  // the slab. The loops visit m_Mt x m_Mz (t,z) slices of Nvxy doubles each;
  // the t-stride is Nvxy * m_Nz, i.e. the full local z extent.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd and
  // Nvxy are not visible in this listing.
  117     int itask, 
double *v2, 
double fac, 
const double *v1)
 
  122     int isite = 
m_arg[itask].isite;
 
  // Shift both pointers to the first double of the task's first site.
  124     const double *w1 = &v1[Nvcd * isite];
 
  125     double       *w2 = &v2[Nvcd * isite];
 
  127     for (
int it = 0; it < 
m_Mt; ++it) {
 
  128       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  129         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  130           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  131           w2[iv] += fac * w1[iv];
 
  // Per-task update w2[iv] = fac * w2[iv] + w1[iv] (v2 = fac * v2 + v1 on this
  // task's slab). Same slab traversal as the other element-wise task kernels:
  // m_Mt x m_Mz (t,z) slices of Nvxy doubles, t-stride Nvxy * m_Nz, starting
  // at site m_arg[itask].isite.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd and
  // Nvxy are not visible in this listing.
  140     int itask, 
double *v2, 
double fac, 
const double *v1)
 
  145     int          isite = 
m_arg[itask].isite;
 
  146     const double *w1   = &v1[Nvcd * isite];
 
  147     double       *w2   = &v2[Nvcd * isite];
 
  149     for (
int it = 0; it < 
m_Mt; ++it) {
 
  150       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  151         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  152           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  153           w2[iv] = fac * w2[iv] + w1[iv];
 
  // Per-task in-place kernel on v with a scalar fac. The slab traversal is
  // visible (m_Mt x m_Mz slices of Nvxy doubles starting at site
  // m_arg[itask].isite), but the statement applied to w[iv] is NOT part of
  // this excerpt - presumably w[iv] is scaled by fac; TODO confirm against
  // the full source.
  162                                        double *v, 
double fac)
 
  167     int    isite = 
m_arg[itask].isite;
 
  168     double *w    = &v[Nvcd * isite];
 
  170     for (
int it = 0; it < 
m_Mt; ++it) {
 
  171       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  172         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  173           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  // Per-task kernel writing through w2 into v2. Neither the signature nor the
  // statement applied to w2[iv] is visible in this excerpt, so the operation
  // itself cannot be stated here (NOTE(review): confirm against the full
  // source). Visible: the usual slab traversal - m_Mt x m_Mz slices of Nvxy
  // doubles, t-stride Nvxy * m_Nz, starting at site m_arg[itask].isite.
  188     int    isite = 
m_arg[itask].isite;
 
  189     double *w2   = &v2[Nvcd * isite];
 
  191     for (
int it = 0; it < 
m_Mt; ++it) {
 
  192       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  193         for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  194           int iv = ivxy + Nvxy * (iz + 
m_Nz * it);
 
  // Packs, for one task, spin-combined data from field v1 into buffer vcp1
  // (buffer offset taken from m_arg[itask].isite_cp_x). For every (it, iz, iy)
  // of the task at a fixed ix, four colour vectors are written at buffer
  // offsets ix1..ix4, each scaled by bc2.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1,
  // id2, id3, id4, bc2, ix, in and ic_r are not visible in this listing.
  204     int itask, 
double *vcp1, 
const double *v1)
 
  213     int isite    = 
m_arg[itask].isite;
 
  214     int isite_cp = 
m_arg[itask].isite_cp_x;
 
  216     const double *w1 = &v1[Nvcd * isite];
 
  217     double       *w2 = &vcp1[Nvcd * isite_cp];
 
  224     for (
int it = 0; it < 
m_Mt; ++it) {
 
  225       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  226         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  // is  : site index in the field (t-stride uses the full m_Nz).
  // is2 : site index in the buffer, which has no x extent.
  // NOTE(review): is2 uses m_Mz as the t-stride whereas is uses m_Nz - confirm
  // this matches how isite_cp_x is laid out (its definition is not shown here).
  227           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  228           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  // ix1..ix4: start of the four consecutive m_Nvc-sized spin blocks at is2.
  230           int ix1 = Nvcd * is2;
 
  231           int ix2 = ix1 + 
m_Nvc;
 
  232           int ix3 = ix2 + 
m_Nvc;
 
  233           int ix4 = ix3 + 
m_Nvc;
 
  235           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  237             int ic_i = 2 * ic + 1;
 
  // Assuming ic_r (defined on a line not shown) is the real-part index paired
  // with ic_i, and writing psiK for the block at idK, the stores below are:
  //   block 1 = bc2 * (m_r_s*psi1 + i*m_nu_s*psi4)
  //   block 2 = bc2 * (m_r_s*psi2 + i*m_nu_s*psi3)
  //   block 3 = bc2 * (m_r_s*psi3 - i*m_nu_s*psi2)
  //   block 4 = bc2 * (m_r_s*psi4 - i*m_nu_s*psi1)
  239             w2[ic_r + ix1] = bc2 * (
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_i + id4 + in]);
 
  240             w2[ic_i + ix1] = bc2 * (
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_r + id4 + in]);
 
  241             w2[ic_r + ix2] = bc2 * (
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_i + id3 + in]);
 
  242             w2[ic_i + ix2] = bc2 * (
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_r + id3 + in]);
 
  244             w2[ic_r + ix3] = bc2 * (
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_i + id2 + in]);
 
  245             w2[ic_i + ix3] = bc2 * (
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_r + id2 + in]);
 
  246             w2[ic_r + ix4] = bc2 * (
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_i + id1 + in]);
 
  247             w2[ic_i + ix4] = bc2 * (
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_r + id1 + in]);
 
  // Consumes, for one task, the buffer vcp2 (offset m_arg[itask].isite_cp_x):
  // each of the four colour vectors stored at ix1..ix4 is passed with the link
  // data at u[ic2 + ig] to mult_uv_r / mult_uv_i, and the results are ADDED to
  // the four spin blocks of v2 at offset iv.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, ix, iv, ig and ic_r are not visible in this listing. mult_uv_r/_i come
  // from the group-specific include; presumably they return the real/imaginary
  // part of one row of U times a colour vector - confirm there.
  257     int itask, 
double *v2, 
const double *vcp2)
 
  268     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  270     int isite    = 
m_arg[itask].isite;
 
  271     int isite_cp = 
m_arg[itask].isite_cp_x;
 
  273     const double *w1 = &vcp2[Nvcd * isite_cp];
 
  274     double       *w2 = &v2[Nvcd * isite];
 
  279     for (
int it = 0; it < 
m_Mt; ++it) {
 
  280       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  281         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  // is: field site index; is2: buffer site index (no x extent, t-stride m_Mz).
  282           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  283           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  286           int ix1 = Nvcd * is2;
 
  287           int ix2 = ix1 + 
m_Nvc;
 
  288           int ix3 = ix2 + 
m_Nvc;
 
  289           int ix4 = ix3 + 
m_Nvc;
 
  291           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  // ic2 selects the part of the link data used for output colour ic.
  292             int ic2 = ic * 
m_Nvc;
 
  294             wt1_r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  295             wt1_i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  296             wt2_r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  297             wt2_i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  299             wt3_r = mult_uv_r(&u[ic2 + ig], &w1[ix3], m_Nc);
 
  300             wt3_i = mult_uv_i(&u[ic2 + ig], &w1[ix3], m_Nc);
 
  301             wt4_r = mult_uv_r(&u[ic2 + ig], &w1[ix4], m_Nc);
 
  302             wt4_i = mult_uv_i(&u[ic2 + ig], &w1[ix4], m_Nc);
 
  305             int ic_i = 2 * ic + 1;
 
  // Accumulate (not overwrite) into the output field.
  307             w2[ic_r + id1 + iv] += wt1_r;
 
  308             w2[ic_i + id1 + iv] += wt1_i;
 
  309             w2[ic_r + id2 + iv] += wt2_r;
 
  310             w2[ic_i + id2 + iv] += wt2_i;
 
  312             w2[ic_r + id3 + iv] += wt3_r;
 
  313             w2[ic_i + id3 + iv] += wt3_i;
 
  314             w2[ic_r + id4 + iv] += wt4_r;
 
  315             w2[ic_i + id4 + iv] += wt4_i;
 
  // Interior (no-buffer) part of the forward-x term for one task: for every
  // site with ix in [0, m_Nx-2], reads the neighbour at is+1 from v1, forms
  // four spin combinations vt1..vt4, passes them with the link data at
  // u[ic2 + ig] to mult_uv_r / mult_uv_i and ADDS the results to v2 at iv.
  // The last x site (ix = m_Nx-1) is deliberately excluded by the loop bound.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, iv, ig and ic_r are not visible in this listing.
  325     int itask, 
double *v2, 
const double *v1)
 
  337     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  339     int isite = 
m_arg[itask].isite;
 
  341     const double *w1 = &v1[Nvcd * isite];
 
  342     double       *w2 = &v2[Nvcd * isite];
 
  345     for (
int it = 0; it < 
m_Mt; ++it) {
 
  346       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  347         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  348           for (
int ix = 0; ix < 
m_Nx - 1; ++ix) {
 
  349             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  // in: offset of the +x neighbour site in w1.
  351             int in = Nvcd * (is + 1);
 
  354             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  356               int ic_i = 2 * ic + 1;
 
  // Assuming ic_r (defined on a line not shown) is the real-part index:
  //   vt1 = m_r_s*psi1 + i*m_nu_s*psi4,  vt2 = m_r_s*psi2 + i*m_nu_s*psi3,
  //   vt3 = m_r_s*psi3 - i*m_nu_s*psi2,  vt4 = m_r_s*psi4 - i*m_nu_s*psi1.
  358               vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_i + id4 + in];
 
  359               vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_r + id4 + in];
 
  360               vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_i + id3 + in];
 
  361               vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_r + id3 + in];
 
  363               vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_i + id2 + in];
 
  364               vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_r + id2 + in];
 
  365               vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_i + id1 + in];
 
  366               vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_r + id1 + in];
 
  // Second colour loop: combine vt1..vt4 with the link data, accumulate into w2.
  369             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  370               int ic2 = ic * 
m_Nvc;
 
  372               wt1_r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  373               wt1_i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  374               wt2_r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  375               wt2_i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  377               wt3_r = mult_uv_r(&u[ic2 + ig], vt3, m_Nc);
 
  378               wt3_i = mult_uv_i(&u[ic2 + ig], vt3, m_Nc);
 
  379               wt4_r = mult_uv_r(&u[ic2 + ig], vt4, m_Nc);
 
  380               wt4_i = mult_uv_i(&u[ic2 + ig], vt4, m_Nc);
 
  383               int ic_i = 2 * ic + 1;
 
  385               w2[ic_r + id1 + iv] += wt1_r;
 
  386               w2[ic_i + id1 + iv] += wt1_i;
 
  387               w2[ic_r + id2 + iv] += wt2_r;
 
  388               w2[ic_i + id2 + iv] += wt2_i;
 
  390               w2[ic_r + id3 + iv] += wt3_r;
 
  391               w2[ic_i + id3 + iv] += wt3_i;
 
  392               w2[ic_r + id4 + iv] += wt4_r;
 
  393               w2[ic_i + id4 + iv] += wt4_i;
 
  // Packs, for one task, data for the backward-x term into buffer vcp1 (offset
  // m_arg[itask].isite_cp_x): for every (it, iz, iy) at a fixed ix it forms
  // four spin combinations vt1..vt4 from v1, passes them with the link data at
  // u[ic2 + ig] to mult_udagv_r / mult_udagv_i, and stores the results at
  // buffer offsets ix1..ix4. Unlike the forward pack, the link data is applied
  // before the data goes into the buffer.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, ix, in, ig, ic2 and ic_r are not visible in this listing.
  // mult_udagv_r/_i come from the group-specific include; presumably they apply
  // the Hermitian conjugate of U - confirm there.
  404     int itask, 
double *vcp1, 
const double *v1)
 
  415     int isite    = 
m_arg[itask].isite;
 
  416     int isite_cp = 
m_arg[itask].isite_cp_x;
 
  418     const double *w1 = &v1[Nvcd * isite];
 
  419     double       *w2 = &vcp1[Nvcd * isite_cp];
 
  426     for (
int it = 0; it < 
m_Mt; ++it) {
 
  427       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  428         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  // is: field site index; is2: buffer site index (no x extent, t-stride m_Mz).
  429           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  430           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  433           int ix1 = Nvcd * is2;
 
  434           int ix2 = ix1 + 
m_Nvc;
 
  435           int ix3 = ix2 + 
m_Nvc;
 
  436           int ix4 = ix3 + 
m_Nvc;
 
  438           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  440             int ic_i = 2 * ic + 1;
 
  // Assuming ic_r (defined on a line not shown) is the real-part index:
  //   vt1 = m_r_s*psi1 - i*m_nu_s*psi4,  vt2 = m_r_s*psi2 - i*m_nu_s*psi3,
  //   vt3 = m_r_s*psi3 + i*m_nu_s*psi2,  vt4 = m_r_s*psi4 + i*m_nu_s*psi1
  // (signs of the m_nu_s terms are opposite to the forward-x fragments).
  442             vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_i + id4 + in];
 
  443             vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_r + id4 + in];
 
  444             vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
  445             vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_r + id3 + in];
 
  447             vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_i + id2 + in];
 
  448             vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_r + id2 + in];
 
  449             vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
  450             vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_r + id1 + in];
 
  // Second colour loop: results are stored (assigned) into the buffer.
  453           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  457             int ic_i = 2 * ic + 1;
 
  459             w2[ic_r + ix1] = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  460             w2[ic_i + ix1] = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  461             w2[ic_r + ix2] = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  462             w2[ic_i + ix2] = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  464             w2[ic_r + ix3] = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
  465             w2[ic_i + ix3] = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
  466             w2[ic_r + ix4] = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
  467             w2[ic_i + ix4] = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
  // Consumes, for one task, the buffer vcp2 (offset m_arg[itask].isite_cp_x)
  // for the backward-x term: the four colour vectors at ix1..ix4 are scaled by
  // bc2 and ADDED to the four spin blocks of v2 at offset iv. No link data is
  // used in the visible statements.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // bc2, ix, iv and ic_r are not visible in this listing.
  477     int itask, 
double *v2, 
const double *vcp2)
 
  491     int isite    = 
m_arg[itask].isite;
 
  492     int isite_cp = 
m_arg[itask].isite_cp_x;
 
  494     const double *w1 = &vcp2[Nvcd * isite_cp];
 
  495     double       *w2 = &v2[Nvcd * isite];
 
  499     for (
int it = 0; it < 
m_Mt; ++it) {
 
  500       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  501         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  // is: field site index; is2: buffer site index (no x extent, t-stride m_Mz).
  502           int is  = ix + 
m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  503           int is2 = iy + m_Ny * (iz + m_Mz * it);
 
  505           int ix1 = Nvcd * is2;
 
  506           int ix2 = ix1 + 
m_Nvc;
 
  507           int ix3 = ix2 + 
m_Nvc;
 
  508           int ix4 = ix3 + 
m_Nvc;
 
  510           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  512             int ic_i = 2 * ic + 1;
 
  514             w2[ic_r + id1 + iv] += bc2 * w1[ic_r + ix1];
 
  515             w2[ic_i + id1 + iv] += bc2 * w1[ic_i + ix1];
 
  516             w2[ic_r + id2 + iv] += bc2 * w1[ic_r + ix2];
 
  517             w2[ic_i + id2 + iv] += bc2 * w1[ic_i + ix2];
 
  519             w2[ic_r + id3 + iv] += bc2 * w1[ic_r + ix3];
 
  520             w2[ic_i + id3 + iv] += bc2 * w1[ic_i + ix3];
 
  521             w2[ic_r + id4 + iv] += bc2 * w1[ic_r + ix4];
 
  522             w2[ic_i + id4 + iv] += bc2 * w1[ic_i + ix4];
 
  // Interior (no-buffer) part of the backward-x term for one task: for every
  // site with ix in [1, m_Nx-1], reads the neighbour at is-1 from v1, forms
  // four spin combinations vt1..vt4, passes them with the link data of that
  // neighbour site (ig = m_Ndf * (is - 1)) to mult_udagv_r / mult_udagv_i and
  // ADDS the results to v2 at iv. The first x site (ix = 0) is excluded by the
  // loop start.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, iv, ic2 and ic_r are not visible in this listing.
  532     int itask, 
double *v2, 
const double *v1)
 
  544     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  546     int isite = 
m_arg[itask].isite;
 
  548     const double *w1 = &v1[Nvcd * isite];
 
  549     double       *w2 = &v2[Nvcd * isite];
 
  552     for (
int it = 0; it < 
m_Mt; ++it) {
 
  553       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  554         for (
int iy = 0; iy < 
m_Ny; ++iy) {
 
  555           for (
int ix = 1; ix < 
m_Nx; ++ix) {
 
  556             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  // Both the spinor (in) and the link data (ig) are taken at the -x neighbour.
  558             int in = Nvcd * (is - 1);
 
  559             int ig = 
m_Ndf * (is - 1);
 
  561             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  563               int ic_i = 2 * ic + 1;
 
  // Assuming ic_r (defined on a line not shown) is the real-part index:
  //   vt1 = m_r_s*psi1 - i*m_nu_s*psi4,  vt2 = m_r_s*psi2 - i*m_nu_s*psi3,
  //   vt3 = m_r_s*psi3 + i*m_nu_s*psi2,  vt4 = m_r_s*psi4 + i*m_nu_s*psi1.
  565               vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_i + id4 + in];
 
  566               vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_r + id4 + in];
 
  567               vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
  568               vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_r + id3 + in];
 
  570               vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_i + id2 + in];
 
  571               vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_r + id2 + in];
 
  572               vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
  573               vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_r + id1 + in];
 
  // Second colour loop: combine vt1..vt4 with the link data, accumulate into w2.
  576             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  579               wt1_r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  580               wt1_i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  581               wt2_r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  582               wt2_i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  584               wt3_r = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
  585               wt3_i = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
  586               wt4_r = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
  587               wt4_i = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
  590               int ic_i = 2 * ic + 1;
 
  592               w2[ic_r + id1 + iv] += wt1_r;
 
  593               w2[ic_i + id1 + iv] += wt1_i;
 
  594               w2[ic_r + id2 + iv] += wt2_r;
 
  595               w2[ic_i + id2 + iv] += wt2_i;
 
  597               w2[ic_r + id3 + iv] += wt3_r;
 
  598               w2[ic_i + id3 + iv] += wt3_i;
 
  599               w2[ic_r + id4 + iv] += wt4_r;
 
  600               w2[ic_i + id4 + iv] += wt4_i;
 
  // Packs, for one task, spin-combined data from field v1 into buffer vcp1
  // (buffer offset taken from m_arg[itask].isite_cp_y). For every (it, iz, ix)
  // of the task at a fixed iy, four colour vectors are written at buffer
  // offsets ix1..ix4, each scaled by bc2. Unlike the x direction, the
  // combinations here mix real with real and imaginary with imaginary parts.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // bc2, iy, in and ic_r are not visible in this listing.
  611     int itask, 
double *vcp1, 
const double *v1)
 
  620     int isite    = 
m_arg[itask].isite;
 
  621     int isite_cp = 
m_arg[itask].isite_cp_y;
 
  623     const double *w1 = &v1[Nvcd * isite];
 
  624     double       *w2 = &vcp1[Nvcd * isite_cp];
 
  631     for (
int it = 0; it < 
m_Mt; ++it) {
 
  632       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  633         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  // is: field site index; is2: buffer site index (no y extent, t-stride m_Mz).
  634           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  635           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  637           int ix1 = Nvcd * is2;
 
  638           int ix2 = ix1 + 
m_Nvc;
 
  639           int ix3 = ix2 + 
m_Nvc;
 
  640           int ix4 = ix3 + 
m_Nvc;
 
  642           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  644             int ic_i = 2 * ic + 1;
 
  // Writing psiK for the block at idK, the stores below are:
  //   block 1 = bc2 * (m_r_s*psi1 + m_nu_s*psi4)
  //   block 2 = bc2 * (m_r_s*psi2 - m_nu_s*psi3)
  //   block 3 = bc2 * (m_r_s*psi3 - m_nu_s*psi2)
  //   block 4 = bc2 * (m_r_s*psi4 + m_nu_s*psi1)
  646             w2[ic_r + ix1] = bc2 * (
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_r + id4 + in]);
 
  647             w2[ic_i + ix1] = bc2 * (
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_i + id4 + in]);
 
  648             w2[ic_r + ix2] = bc2 * (
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_r + id3 + in]);
 
  649             w2[ic_i + ix2] = bc2 * (
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_i + id3 + in]);
 
  651             w2[ic_r + ix3] = bc2 * (
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_r + id2 + in]);
 
  652             w2[ic_i + ix3] = bc2 * (
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_i + id2 + in]);
 
  653             w2[ic_r + ix4] = bc2 * (
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_r + id1 + in]);
 
  654             w2[ic_i + ix4] = bc2 * (
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_i + id1 + in]);
 
  // Consumes, for one task, the buffer vcp2 (offset m_arg[itask].isite_cp_y):
  // each of the four colour vectors stored at ix1..ix4 is passed with the link
  // data at u[ic2 + ig] to mult_uv_r / mult_uv_i, and the results are ADDED to
  // the four spin blocks of v2 at offset iv.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, iy, iv, ig and ic_r are not visible in this listing.
  664     int itask, 
double *v2, 
const double *vcp2)
 
  675     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  677     int isite    = 
m_arg[itask].isite;
 
  678     int isite_cp = 
m_arg[itask].isite_cp_y;
 
  680     const double *w1 = &vcp2[Nvcd * isite_cp];
 
  681     double       *w2 = &v2[Nvcd * isite];
 
  686     for (
int it = 0; it < 
m_Mt; ++it) {
 
  687       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  688         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  // is: field site index; is2: buffer site index (no y extent, t-stride m_Mz).
  689           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  690           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  693           int ix1 = Nvcd * is2;
 
  694           int ix2 = ix1 + 
m_Nvc;
 
  695           int ix3 = ix2 + 
m_Nvc;
 
  696           int ix4 = ix3 + 
m_Nvc;
 
  698           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  699             int ic2 = ic * 
m_Nvc;
 
  701             wt1_r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  702             wt1_i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  703             wt2_r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  704             wt2_i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  706             wt3_r = mult_uv_r(&u[ic2 + ig], &w1[ix3], m_Nc);
 
  707             wt3_i = mult_uv_i(&u[ic2 + ig], &w1[ix3], m_Nc);
 
  708             wt4_r = mult_uv_r(&u[ic2 + ig], &w1[ix4], m_Nc);
 
  709             wt4_i = mult_uv_i(&u[ic2 + ig], &w1[ix4], m_Nc);
 
  712             int ic_i = 2 * ic + 1;
 
  // Accumulate (not overwrite) into the output field.
  714             w2[ic_r + id1 + iv] += wt1_r;
 
  715             w2[ic_i + id1 + iv] += wt1_i;
 
  716             w2[ic_r + id2 + iv] += wt2_r;
 
  717             w2[ic_i + id2 + iv] += wt2_i;
 
  719             w2[ic_r + id3 + iv] += wt3_r;
 
  720             w2[ic_i + id3 + iv] += wt3_i;
 
  721             w2[ic_r + id4 + iv] += wt4_r;
 
  722             w2[ic_i + id4 + iv] += wt4_i;
 
  // Interior (no-buffer) part of the forward-y term for one task: for every
  // site with iy in [0, m_Ny-2], reads the neighbour at is + m_Nx from v1,
  // forms four spin combinations vt1..vt4, passes them with the link data at
  // u[ic2 + ig] to mult_uv_r / mult_uv_i and ADDS the results to v2 at iv.
  // The last y row (iy = m_Ny-1) is excluded by the loop bound.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, iv, ig and ic_r are not visible in this listing.
  732     int itask, 
double *v2, 
const double *v1)
 
  744     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  746     int isite = 
m_arg[itask].isite;
 
  748     const double *w1 = &v1[Nvcd * isite];
 
  749     double       *w2 = &v2[Nvcd * isite];
 
  752     for (
int it = 0; it < 
m_Mt; ++it) {
 
  753       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  754         for (
int iy = 0; iy < 
m_Ny - 1; ++iy) {
 
  755           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  756             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  // in: offset of the +y neighbour site (one row of m_Nx sites further) in w1.
  758             int in = Nvcd * (is + 
m_Nx);
 
  761             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  763               int ic_i = 2 * ic + 1;
 
  // vt1 = m_r_s*psi1 + m_nu_s*psi4,  vt2 = m_r_s*psi2 - m_nu_s*psi3,
  // vt3 = m_r_s*psi3 - m_nu_s*psi2,  vt4 = m_r_s*psi4 + m_nu_s*psi1.
  765               vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_r + id4 + in];
 
  766               vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_i + id4 + in];
 
  767               vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_r + id3 + in];
 
  768               vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_i + id3 + in];
 
  770               vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_r + id2 + in];
 
  771               vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_i + id2 + in];
 
  772               vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_r + id1 + in];
 
  773               vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_i + id1 + in];
 
  // Second colour loop: combine vt1..vt4 with the link data, accumulate into w2.
  776             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  777               int ic2 = ic * 
m_Nvc;
 
  779               wt1_r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  780               wt1_i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  781               wt2_r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  782               wt2_i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  784               wt3_r = mult_uv_r(&u[ic2 + ig], vt3, m_Nc);
 
  785               wt3_i = mult_uv_i(&u[ic2 + ig], vt3, m_Nc);
 
  786               wt4_r = mult_uv_r(&u[ic2 + ig], vt4, m_Nc);
 
  787               wt4_i = mult_uv_i(&u[ic2 + ig], vt4, m_Nc);
 
  790               int ic_i = 2 * ic + 1;
 
  792               w2[ic_r + id1 + iv] += wt1_r;
 
  793               w2[ic_i + id1 + iv] += wt1_i;
 
  794               w2[ic_r + id2 + iv] += wt2_r;
 
  795               w2[ic_i + id2 + iv] += wt2_i;
 
  797               w2[ic_r + id3 + iv] += wt3_r;
 
  798               w2[ic_i + id3 + iv] += wt3_i;
 
  799               w2[ic_r + id4 + iv] += wt4_r;
 
  800               w2[ic_i + id4 + iv] += wt4_i;
 
  // Packs, for one task, data for the backward-y term into buffer vcp1 (offset
  // m_arg[itask].isite_cp_y): for every (it, iz, ix) at a fixed iy it forms
  // four spin combinations vt1..vt4 from v1, passes them with the link data at
  // u[ic2 + ig] to mult_udagv_r / mult_udagv_i, and stores the results at
  // buffer offsets ix1..ix4.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, iy, in, ig, ic2 and ic_r are not visible in this listing.
  811     int itask, 
double *vcp1, 
const double *v1)
 
  822     int isite    = 
m_arg[itask].isite;
 
  823     int isite_cp = 
m_arg[itask].isite_cp_y;
 
  825     const double *w1 = &v1[Nvcd * isite];
 
  826     double       *w2 = &vcp1[Nvcd * isite_cp];
 
  833     for (
int it = 0; it < 
m_Mt; ++it) {
 
  834       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  835         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  // is: field site index; is2: buffer site index (no y extent, t-stride m_Mz).
  836           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  837           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  840           int ix1 = Nvcd * is2;
 
  841           int ix2 = ix1 + 
m_Nvc;
 
  842           int ix3 = ix2 + 
m_Nvc;
 
  843           int ix4 = ix3 + 
m_Nvc;
 
  845           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  847             int ic_i = 2 * ic + 1;
 
  // vt1 = m_r_s*psi1 - m_nu_s*psi4,  vt2 = m_r_s*psi2 + m_nu_s*psi3,
  // vt3 = m_r_s*psi3 + m_nu_s*psi2,  vt4 = m_r_s*psi4 - m_nu_s*psi1
  // (signs of the m_nu_s terms are opposite to the forward-y fragments).
  849             vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_r + id4 + in];
 
  850             vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_i + id4 + in];
 
  851             vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_r + id3 + in];
 
  852             vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
  854             vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_r + id2 + in];
 
  855             vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_i + id2 + in];
 
  856             vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_r + id1 + in];
 
  857             vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
  // Second colour loop: results are stored (assigned) into the buffer.
  860           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  864             int ic_i = 2 * ic + 1;
 
  866             w2[ic_r + ix1] = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  867             w2[ic_i + ix1] = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  868             w2[ic_r + ix2] = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  869             w2[ic_i + ix2] = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  871             w2[ic_r + ix3] = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
  872             w2[ic_i + ix3] = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
  873             w2[ic_r + ix4] = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
  874             w2[ic_i + ix4] = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
  // Consumes, for one task, the buffer vcp2 (offset m_arg[itask].isite_cp_y)
  // for the backward-y term: the four colour vectors at ix1..ix4 are scaled by
  // bc2 and ADDED to the four spin blocks of v2 at offset iv. No link data is
  // used in the visible statements.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // bc2, iy, iv and ic_r are not visible in this listing.
  884     int itask, 
double *v2, 
const double *vcp2)
 
  898     int isite    = 
m_arg[itask].isite;
 
  899     int isite_cp = 
m_arg[itask].isite_cp_y;
 
  901     const double *w1 = &vcp2[Nvcd * isite_cp];
 
  902     double       *w2 = &v2[Nvcd * isite];
 
  906     for (
int it = 0; it < 
m_Mt; ++it) {
 
  907       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  908         for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  // is: field site index; is2: buffer site index (no y extent, t-stride m_Mz).
  909           int is  = ix + m_Nx * (iy + 
m_Ny * (iz + 
m_Nz * it));
 
  910           int is2 = ix + m_Nx * (iz + m_Mz * it);
 
  912           int ix1 = Nvcd * is2;
 
  913           int ix2 = ix1 + 
m_Nvc;
 
  914           int ix3 = ix2 + 
m_Nvc;
 
  915           int ix4 = ix3 + 
m_Nvc;
 
  917           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  919             int ic_i = 2 * ic + 1;
 
  921             w2[ic_r + id1 + iv] += bc2 * w1[ic_r + ix1];
 
  922             w2[ic_i + id1 + iv] += bc2 * w1[ic_i + ix1];
 
  923             w2[ic_r + id2 + iv] += bc2 * w1[ic_r + ix2];
 
  924             w2[ic_i + id2 + iv] += bc2 * w1[ic_i + ix2];
 
  926             w2[ic_r + id3 + iv] += bc2 * w1[ic_r + ix3];
 
  927             w2[ic_i + id3 + iv] += bc2 * w1[ic_i + ix3];
 
  928             w2[ic_r + id4 + iv] += bc2 * w1[ic_r + ix4];
 
  929             w2[ic_i + id4 + iv] += bc2 * w1[ic_i + ix4];
 
  // Interior (no-buffer) part of the backward-y term for one task: for every
  // site with iy in [1, m_Ny-1], reads the neighbour at is - m_Nx from v1,
  // forms four spin combinations vt1..vt4, passes them with the link data at
  // u[ic2 + ig] to mult_udagv_r / mult_udagv_i and ADDS the results to v2 at
  // iv. The first y row (iy = 0) is excluded by the loop start.
  // NOTE: excerpt - the function-name line and the definitions of Nvcd, id1..id4,
  // u, vt1..vt4, iv, ig, ic2 and ic_r are not visible in this listing.
  939     int itask, 
double *v2, 
const double *v1)
 
  951     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
  953     int isite = 
m_arg[itask].isite;
 
  955     const double *w1 = &v1[Nvcd * isite];
 
  956     double       *w2 = &v2[Nvcd * isite];
 
  959     for (
int it = 0; it < 
m_Mt; ++it) {
 
  960       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
  961         for (
int iy = 1; iy < 
m_Ny; ++iy) {
 
  962           for (
int ix = 0; ix < 
m_Nx; ++ix) {
 
  963             int is = ix + m_Nx * (iy + m_Ny * (iz + 
m_Nz * it));
 
  // in: offset of the -y neighbour site (one row of m_Nx sites back) in w1.
  965             int in = Nvcd * (is - 
m_Nx);
 
  968             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  970               int ic_i = 2 * ic + 1;
 
  // vt1 = m_r_s*psi1 - m_nu_s*psi4,  vt2 = m_r_s*psi2 + m_nu_s*psi3,
  // vt3 = m_r_s*psi3 + m_nu_s*psi2,  vt4 = m_r_s*psi4 - m_nu_s*psi1.
  972               vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_r + id4 + in];
 
  973               vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_i + id4 + in];
 
  974               vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_r + id3 + in];
 
  975               vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
  977               vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_r + id2 + in];
 
  978               vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_i + id2 + in];
 
  979               vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_r + id1 + in];
 
  980               vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
  // Second colour loop: combine vt1..vt4 with the link data, accumulate into w2.
  983             for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  986               wt1_r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  987               wt1_i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  988               wt2_r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  989               wt2_i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  991               wt3_r = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
  992               wt3_i = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
  993               wt4_r = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
  994               wt4_i = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
  997               int ic_i = 2 * ic + 1;
 
  999               w2[ic_r + id1 + iv] += wt1_r;
 
 1000               w2[ic_i + id1 + iv] += wt1_i;
 
 1001               w2[ic_r + id2 + iv] += wt2_r;
 
 1002               w2[ic_i + id2 + iv] += wt2_i;
 
 1004               w2[ic_r + id3 + iv] += wt3_r;
 
 1005               w2[ic_i + id3 + iv] += wt3_i;
 
 1006               w2[ic_r + id4 + iv] += wt4_r;
 
 1007               w2[ic_i + id4 + iv] += wt4_i;
 
 // Packs, for one task, spin-combined data from field v1 into buffer vcp1
 // (buffer offset taken from m_arg[itask].isite_cp_z). The work is done only
 // when the task's kz0 flag is 1; then, for every (it, ixy) at a fixed iz, four
 // colour vectors are written at buffer offsets ix1..ix4, each scaled by bc2.
 // NOTE: excerpt - the function-name line and the definitions of Nvcd, Nxy, id1,
 // id2, bc2, iz, in and ic_r are not visible in this listing.
 1018     int itask, 
double *vcp1, 
const double *v1)
 
 // Offsets of the 3rd and 4th m_Nvc-sized spin blocks within a site.
 1024     int id3 = 
m_Nvc * 2;
 
 1025     int id4 = 
m_Nvc * 3;
 
 1027     int isite    = 
m_arg[itask].isite;
 
 1028     int isite_cp = 
m_arg[itask].isite_cp_z;
 
 1030     const double *w1 = &v1[Nvcd * isite];
 
 1031     double       *w2 = &vcp1[Nvcd * isite_cp];
 
 // Only tasks flagged kz0 == 1 contribute to this buffer.
 1036     if (
m_arg[itask].kz0 == 1) {
 
 1039       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1040         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is: field site index; is2: buffer site index (one xy plane per it).
 1041           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1042           int is2 = ixy + Nxy * it;
 
 1045           int ix1 = Nvcd * is2;
 
 1046           int ix2 = ix1 + 
m_Nvc;
 
 1047           int ix3 = ix2 + 
m_Nvc;
 
 1048           int ix4 = ix3 + 
m_Nvc;
 
 1050           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1052             int ic_i = 2 * ic + 1;
 
 // Assuming ic_r (defined on a line not shown) is the real-part index paired
 // with ic_i, and writing psiK for the block at idK, the stores below are:
 //   block 1 = bc2 * (m_r_s*psi1 + i*m_nu_s*psi3)
 //   block 2 = bc2 * (m_r_s*psi2 - i*m_nu_s*psi4)
 //   block 3 = bc2 * (m_r_s*psi3 - i*m_nu_s*psi1)
 //   block 4 = bc2 * (m_r_s*psi4 + i*m_nu_s*psi2)
 1054             w2[ic_r + ix1] = bc2 * (
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_i + id3 + in]);
 
 1055             w2[ic_i + ix1] = bc2 * (
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_r + id3 + in]);
 
 1056             w2[ic_r + ix2] = bc2 * (
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_i + id4 + in]);
 
 1057             w2[ic_i + ix2] = bc2 * (
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_r + id4 + in]);
 
 1059             w2[ic_r + ix3] = bc2 * (
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_i + id1 + in]);
 
 1060             w2[ic_i + ix3] = bc2 * (
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_r + id1 + in]);
 
 1061             w2[ic_r + ix4] = bc2 * (
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_i + id2 + in]);
 
 1062             w2[ic_i + ix4] = bc2 * (
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_r + id2 + in]);
 
 // Consumes, for one task, the buffer vcp2 (offset m_arg[itask].isite_cp_z).
 // The work is done only when the task's kz1 flag is 1; then each of the four
 // colour vectors stored at ix1..ix4 is passed with the link data of the field
 // site (ig = m_Ndf * is) to mult_uv_r / mult_uv_i, and the results are ADDED
 // to the four spin blocks of v2 at offset iv.
 // NOTE: excerpt - the function-name line and the definitions of Nvcd, Nxy, id1,
 // id2, u, iz, iv and ic_r are not visible in this listing.
 1072     int itask, 
double *v2, 
const double *vcp2)
 
 // Offsets of the 3rd and 4th m_Nvc-sized spin blocks within a site.
 1078     int id3 = 
m_Nvc * 2;
 
 1079     int id4 = 
m_Nvc * 3;
 
 1083     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
 1085     int isite    = 
m_arg[itask].isite;
 
 1086     int isite_cp = 
m_arg[itask].isite_cp_z;
 
 1088     const double *w1 = &vcp2[Nvcd * isite_cp];
 
 1089     double       *w2 = &v2[Nvcd * isite];
 
 // Only tasks flagged kz1 == 1 read from this buffer.
 1092     if (
m_arg[itask].kz1 == 1) {
 
 1095       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1096         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 // is: field site index; is2: buffer site index (one xy plane per it).
 1097           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1098           int is2 = ixy + Nxy * it;
 
 1100           int ig  = 
m_Ndf * is;
 
 1101           int ix1 = Nvcd * is2;
 
 1102           int ix2 = ix1 + 
m_Nvc;
 
 1103           int ix3 = ix2 + 
m_Nvc;
 
 1104           int ix4 = ix3 + 
m_Nvc;
 
 1106           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1107             int ic2 = ic * 
m_Nvc;
 
 1109             wt1_r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1110             wt1_i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1111             wt2_r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1112             wt2_i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1114             wt3_r = mult_uv_r(&u[ic2 + ig], &w1[ix3], m_Nc);
 
 1115             wt3_i = mult_uv_i(&u[ic2 + ig], &w1[ix3], m_Nc);
 
 1116             wt4_r = mult_uv_r(&u[ic2 + ig], &w1[ix4], m_Nc);
 
 1117             wt4_i = mult_uv_i(&u[ic2 + ig], &w1[ix4], m_Nc);
 
 1120             int ic_i = 2 * ic + 1;
 
 // Accumulate (not overwrite) into the output field.
 1122             w2[ic_r + id1 + iv] += wt1_r;
 
 1123             w2[ic_i + id1 + iv] += wt1_i;
 
 1124             w2[ic_r + id2 + iv] += wt2_r;
 
 1125             w2[ic_i + id2 + iv] += wt2_i;
 
 1127             w2[ic_r + id3 + iv] += wt3_r;
 
 1128             w2[ic_i + id3 + iv] += wt3_i;
 
 1129             w2[ic_r + id4 + iv] += wt4_r;
 
 1130             w2[ic_i + id4 + iv] += wt4_i;
 
 // Interior (no-buffer) part of the forward-z term for one task: for every
 // site of the task's slab except the top z slice of tasks flagged kz1 (loop
 // bound m_Mz - kz1), reads the neighbour one xy plane further (is + Nxy) from
 // v1, forms four spin combinations vt1..vt4, passes them with the link data
 // of the current site (ig = m_Ndf * is) to mult_uv_r / mult_uv_i and ADDS the
 // results to v2 at iv.
 // NOTE: excerpt - the function-name line and the definitions of Nvcd, Nxy, id1,
 // id2, u, vt1..vt4, iv and ic_r are not visible in this listing.
 1140     int itask, 
double *v2, 
const double *v1)
 
 // Offsets of the 3rd and 4th m_Nvc-sized spin blocks within a site.
 1146     int id3 = 
m_Nvc * 2;
 
 1147     int id4 = 
m_Nvc * 3;
 
 1152     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
 1154     int isite = 
m_arg[itask].isite;
 
 1156     const double *w1 = &v1[Nvcd * isite];
 
 1157     double       *w2 = &v2[Nvcd * isite];
 
 // kz1 is 1 for tasks on the upper z edge of the task grid; it shortens the
 // iz loop by one slice for those tasks.
 1160     int kz1 = 
m_arg[itask].kz1;
 
 1163     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1164       for (
int iz = 0; iz < 
m_Mz - kz1; ++iz) {
 
 1165         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1166           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 // in: offset of the +z neighbour site in w1; ig: link data of this site.
 1168           int in = Nvcd * (is + Nxy);
 
 1169           int ig = 
m_Ndf * is;
 
 1171           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1173             int ic_i = 2 * ic + 1;
 
 // Assuming ic_r (defined on a line not shown) is the real-part index:
 //   vt1 = m_r_s*psi1 + i*m_nu_s*psi3,  vt2 = m_r_s*psi2 - i*m_nu_s*psi4,
 //   vt3 = m_r_s*psi3 - i*m_nu_s*psi1,  vt4 = m_r_s*psi4 + i*m_nu_s*psi2.
 1175             vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] - 
m_nu_s * w1[ic_i + id3 + in];
 
 1176             vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] + 
m_nu_s * w1[ic_r + id3 + in];
 
 1177             vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] + 
m_nu_s * w1[ic_i + id4 + in];
 
 1178             vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] - 
m_nu_s * w1[ic_r + id4 + in];
 
 1180             vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] + 
m_nu_s * w1[ic_i + id1 + in];
 
 1181             vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] - 
m_nu_s * w1[ic_r + id1 + in];
 
 1182             vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] - 
m_nu_s * w1[ic_i + id2 + in];
 
 1183             vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] + 
m_nu_s * w1[ic_r + id2 + in];
 
 // Second colour loop: combine vt1..vt4 with the link data, accumulate into w2.
 1186           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1187             int ic2 = ic * 
m_Nvc;
 
 1189             wt1_r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1190             wt1_i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1191             wt2_r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1192             wt2_i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1194             wt3_r = mult_uv_r(&u[ic2 + ig], vt3, m_Nc);
 
 1195             wt3_i = mult_uv_i(&u[ic2 + ig], vt3, m_Nc);
 
 1196             wt4_r = mult_uv_r(&u[ic2 + ig], vt4, m_Nc);
 
 1197             wt4_i = mult_uv_i(&u[ic2 + ig], vt4, m_Nc);
 
 1200             int ic_i = 2 * ic + 1;
 
 1202             w2[ic_r + id1 + iv] += wt1_r;
 
 1203             w2[ic_i + id1 + iv] += wt1_i;
 
 1204             w2[ic_r + id2 + iv] += wt2_r;
 
 1205             w2[ic_i + id2 + iv] += wt2_i;
 
 1207             w2[ic_r + id3 + iv] += wt3_r;
 
 1208             w2[ic_i + id3 + iv] += wt3_i;
 
 1209             w2[ic_r + id4 + iv] += wt4_r;
 
 1210             w2[ic_i + id4 + iv] += wt4_i;
 
 1220     int itask, 
double *vcp1, 
const double *v1)
 
 1226     int id3 = 
m_Nvc * 2;
 
 1227     int id4 = 
m_Nvc * 3;
 
 1231     int isite    = 
m_arg[itask].isite;
 
 1232     int isite_cp = 
m_arg[itask].isite_cp_z;
 
 1234     const double *w1 = &v1[Nvcd * isite];
 
 1235     double       *w2 = &vcp1[Nvcd * isite_cp];
 
 1240     if (
m_arg[itask].kz1 == 1) {
 
 1243       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1244         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1245           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1246           int is2 = ixy + Nxy * it;
 
 1248           int ig  = 
m_Ndf * is;
 
 1249           int ix1 = Nvcd * is2;
 
 1250           int ix2 = ix1 + 
m_Nvc;
 
 1251           int ix3 = ix2 + 
m_Nvc;
 
 1252           int ix4 = ix3 + 
m_Nvc;
 
 1254           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1256             int ic_i = 2 * ic + 1;
 
 1258             vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
 1259             vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_r + id3 + in];
 
 1260             vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_i + id4 + in];
 
 1261             vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_r + id4 + in];
 
 1263             vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
 1264             vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_r + id1 + in];
 
 1265             vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_i + id2 + in];
 
 1266             vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_r + id2 + in];
 
 1269           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1273             int ic_i = 2 * ic + 1;
 
 1275             w2[ic_r + ix1] = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1276             w2[ic_i + ix1] = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1277             w2[ic_r + ix2] = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1278             w2[ic_i + ix2] = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1280             w2[ic_r + ix3] = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
 1281             w2[ic_i + ix3] = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
 1282             w2[ic_r + ix4] = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
 1283             w2[ic_i + ix4] = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
 1293     int itask, 
double *v2, 
const double *vcp2)
 
 1299     int id3 = 
m_Nvc * 2;
 
 1300     int id4 = 
m_Nvc * 3;
 
 1307     int isite    = 
m_arg[itask].isite;
 
 1308     int isite_cp = 
m_arg[itask].isite_cp_z;
 
 1310     const double *w1 = &vcp2[Nvcd * isite_cp];
 
 1311     double       *w2 = &v2[Nvcd * isite];
 
 1313     if (
m_arg[itask].kz0 == 1) {
 
 1317       for (
int it = 0; it < 
m_Mt; ++it) {
 
 1318         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1319           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1320           int is2 = ixy + Nxy * it;
 
 1322           int ix1 = Nvcd * is2;
 
 1323           int ix2 = ix1 + 
m_Nvc;
 
 1324           int ix3 = ix2 + 
m_Nvc;
 
 1325           int ix4 = ix3 + 
m_Nvc;
 
 1327           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1329             int ic_i = 2 * ic + 1;
 
 1331             w2[ic_r + id1 + iv] += bc2 * w1[ic_r + ix1];
 
 1332             w2[ic_i + id1 + iv] += bc2 * w1[ic_i + ix1];
 
 1333             w2[ic_r + id2 + iv] += bc2 * w1[ic_r + ix2];
 
 1334             w2[ic_i + id2 + iv] += bc2 * w1[ic_i + ix2];
 
 1336             w2[ic_r + id3 + iv] += bc2 * w1[ic_r + ix3];
 
 1337             w2[ic_i + id3 + iv] += bc2 * w1[ic_i + ix3];
 
 1338             w2[ic_r + id4 + iv] += bc2 * w1[ic_r + ix4];
 
 1339             w2[ic_i + id4 + iv] += bc2 * w1[ic_i + ix4];
 
 1349     int itask, 
double *v2, 
const double *v1)
 
 1355     int id3 = 
m_Nvc * 2;
 
 1356     int id4 = 
m_Nvc * 3;
 
 1361     double wt1_r, wt1_i, wt2_r, wt2_i, wt3_r, wt3_i, wt4_r, wt4_i;
 
 1363     int isite = 
m_arg[itask].isite;
 
 1365     const double *w1 = &v1[Nvcd * isite];
 
 1366     double       *w2 = &v2[Nvcd * isite];
 
 1369     int kz0 = 
m_arg[itask].kz0;
 
 1372     for (
int it = 0; it < 
m_Mt; ++it) {
 
 1373       for (
int iz = kz0; iz < 
m_Mz; ++iz) {
 
 1374         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1375           int is = ixy + Nxy * (iz + 
m_Nz * it);
 
 1377           int in = Nvcd * (is - Nxy);
 
 1378           int ig = 
m_Ndf * (is - Nxy);
 
 1380           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1382             int ic_i = 2 * ic + 1;
 
 1384             vt1[ic_r] = 
m_r_s * w1[ic_r + id1 + in] + 
m_nu_s * w1[ic_i + id3 + in];
 
 1385             vt1[ic_i] = 
m_r_s * w1[ic_i + id1 + in] - 
m_nu_s * w1[ic_r + id3 + in];
 
 1386             vt2[ic_r] = 
m_r_s * w1[ic_r + id2 + in] - 
m_nu_s * w1[ic_i + id4 + in];
 
 1387             vt2[ic_i] = 
m_r_s * w1[ic_i + id2 + in] + 
m_nu_s * w1[ic_r + id4 + in];
 
 1389             vt3[ic_r] = 
m_r_s * w1[ic_r + id3 + in] - 
m_nu_s * w1[ic_i + id1 + in];
 
 1390             vt3[ic_i] = 
m_r_s * w1[ic_i + id3 + in] + 
m_nu_s * w1[ic_r + id1 + in];
 
 1391             vt4[ic_r] = 
m_r_s * w1[ic_r + id4 + in] + 
m_nu_s * w1[ic_i + id2 + in];
 
 1392             vt4[ic_i] = 
m_r_s * w1[ic_i + id4 + in] - 
m_nu_s * w1[ic_r + id2 + in];
 
 1395           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1398             wt1_r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1399             wt1_i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1400             wt2_r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1401             wt2_i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1403             wt3_r = mult_udagv_r(&u[ic2 + ig], vt3, m_Nc);
 
 1404             wt3_i = mult_udagv_i(&u[ic2 + ig], vt3, m_Nc);
 
 1405             wt4_r = mult_udagv_r(&u[ic2 + ig], vt4, m_Nc);
 
 1406             wt4_i = mult_udagv_i(&u[ic2 + ig], vt4, m_Nc);
 
 1409             int ic_i = 2 * ic + 1;
 
 1411             w2[ic_r + id1 + iv] += wt1_r;
 
 1412             w2[ic_i + id1 + iv] += wt1_i;
 
 1413             w2[ic_r + id2 + iv] += wt2_r;
 
 1414             w2[ic_i + id2 + iv] += wt2_i;
 
 1416             w2[ic_r + id3 + iv] += wt3_r;
 
 1417             w2[ic_i + id3 + iv] += wt3_i;
 
 1418             w2[ic_r + id4 + iv] += wt4_r;
 
 1419             w2[ic_i + id4 + iv] += wt4_i;
 
 1429     int itask, 
double *vcp1, 
const double *v1)
 
 1431     int Nvc2  = 2 * 
m_Nvc;
 
 1433     int Nvcd2 = Nvcd / 2;
 
 1437     int id3 = 
m_Nvc * 2;
 
 1438     int id4 = 
m_Nvc * 3;
 
 1440     int isite    = 
m_arg[itask].isite;
 
 1441     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1443     const double *w1 = &v1[Nvcd * isite];
 
 1444     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1449     if (
m_arg[itask].kt0 == 1) {
 
 1452       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1453         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1454           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1455           int is2 = ixy + Nxy * iz;
 
 1458           int ix1 = Nvc2 * is2;
 
 1459           int ix2 = ix1 + 
m_Nvc;
 
 1461           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1463             int ic_i = 2 * ic + 1;
 
 1465             w2[ic_r + ix1] = 2.0 * bc2 * w1[ic_r + id3 + in];
 
 1466             w2[ic_i + ix1] = 2.0 * bc2 * w1[ic_i + id3 + in];
 
 1467             w2[ic_r + ix2] = 2.0 * bc2 * w1[ic_r + id4 + in];
 
 1468             w2[ic_i + ix2] = 2.0 * bc2 * w1[ic_i + id4 + in];
 
 1478     int itask, 
double *v2, 
const double *vcp2)
 
 1480     int Nvc2  = 2 * 
m_Nvc;
 
 1482     int Nvcd2 = Nvcd / 2;
 
 1486     int id3 = 
m_Nvc * 2;
 
 1487     int id4 = 
m_Nvc * 3;
 
 1491     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 1493     int isite    = 
m_arg[itask].isite;
 
 1494     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1496     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1497     double       *w2 = &v2[Nvcd * isite];
 
 1500     if (
m_arg[itask].kt1 == 1) {
 
 1503       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1504         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1505           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1506           int is2 = ixy + Nxy * iz;
 
 1508           int ig  = 
m_Ndf * is;
 
 1509           int ix1 = Nvc2 * is2;
 
 1510           int ix2 = ix1 + 
m_Nvc;
 
 1512           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1513             int ic2 = ic * 
m_Nvc;
 
 1515             wt1_r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1516             wt1_i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1517             wt2_r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1518             wt2_i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1521             int ic_i = 2 * ic + 1;
 
 1523             w2[ic_r + id3 + iv] += wt1_r;
 
 1524             w2[ic_i + id3 + iv] += wt1_i;
 
 1525             w2[ic_r + id4 + iv] += wt2_r;
 
 1526             w2[ic_i + id4 + iv] += wt2_i;
 
 1536     int itask, 
double *v2, 
const double *v1)
 
 1542     int id3 = 
m_Nvc * 2;
 
 1543     int id4 = 
m_Nvc * 3;
 
 1548     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 1550     int isite = 
m_arg[itask].isite;
 
 1552     const double *w1 = &v1[Nvcd * isite];
 
 1553     double       *w2 = &v2[Nvcd * isite];
 
 1556     int kt1  = 
m_arg[itask].kt1;
 
 1558     int Nxyz = Nxy * 
m_Nz;
 
 1560     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1561       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1562         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1563           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1565           int in = Nvcd * (is + Nxyz);
 
 1566           int ig = 
m_Ndf * is;
 
 1568           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1570             int ic_i = 2 * ic + 1;
 
 1572             vt1[ic_r] = 2.0 * w1[ic_r + id3 + in];
 
 1573             vt1[ic_i] = 2.0 * w1[ic_i + id3 + in];
 
 1574             vt2[ic_r] = 2.0 * w1[ic_r + id4 + in];
 
 1575             vt2[ic_i] = 2.0 * w1[ic_i + id4 + in];
 
 1578           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1579             int ic2 = ic * 
m_Nvc;
 
 1581             wt1_r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1582             wt1_i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1583             wt2_r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1584             wt2_i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1587             int ic_i = 2 * ic + 1;
 
 1589             w2[ic_r + id3 + iv] += wt1_r;
 
 1590             w2[ic_i + id3 + iv] += wt1_i;
 
 1591             w2[ic_r + id4 + iv] += wt2_r;
 
 1592             w2[ic_i + id4 + iv] += wt2_i;
 
 1602     int itask, 
double *vcp1, 
const double *v1)
 
 1604     int Nvc2  = 2 * 
m_Nvc;
 
 1606     int Nvcd2 = Nvcd / 2;
 
 1615     int isite    = 
m_arg[itask].isite;
 
 1616     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1618     const double *w1 = &v1[Nvcd * isite];
 
 1619     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1624     if (
m_arg[itask].kt1 == 1) {
 
 1627       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1628         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1629           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1630           int is2 = ixy + Nxy * iz;
 
 1632           int ig  = 
m_Ndf * is;
 
 1633           int ix1 = Nvc2 * is2;
 
 1634           int ix2 = ix1 + 
m_Nvc;
 
 1636           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1638             int ic_i = 2 * ic + 1;
 
 1640             vt1[ic_r] = 2.0 * w1[ic_r + id1 + in];
 
 1641             vt1[ic_i] = 2.0 * w1[ic_i + id1 + in];
 
 1642             vt2[ic_r] = 2.0 * w1[ic_r + id2 + in];
 
 1643             vt2[ic_i] = 2.0 * w1[ic_i + id2 + in];
 
 1646           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1650             int ic_i = 2 * ic + 1;
 
 1652             w2[ic_r + ix1] = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1653             w2[ic_i + ix1] = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1654             w2[ic_r + ix2] = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1655             w2[ic_i + ix2] = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1665     int itask, 
double *v2, 
const double *vcp2)
 
 1667     int Nvc2  = 2 * 
m_Nvc;
 
 1669     int Nvcd2 = Nvcd / 2;
 
 1681     int isite    = 
m_arg[itask].isite;
 
 1682     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1684     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1685     double       *w2 = &v2[Nvcd * isite];
 
 1687     if (
m_arg[itask].kt0 == 1) {
 
 1690       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1691         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1692           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1693           int is2 = ixy + Nxy * iz;
 
 1695           int ix1 = Nvc2 * is2;
 
 1696           int ix2 = ix1 + 
m_Nvc;
 
 1698           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1700             int ic_i = 2 * ic + 1;
 
 1702             w2[ic_r + id1 + iv] += bc2 * w1[ic_r + ix1];
 
 1703             w2[ic_i + id1 + iv] += bc2 * w1[ic_i + ix1];
 
 1704             w2[ic_r + id2 + iv] += bc2 * w1[ic_r + ix2];
 
 1705             w2[ic_i + id2 + iv] += bc2 * w1[ic_i + ix2];
 
 1715     int itask, 
double *v2, 
const double *v1)
 
 1727     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 1729     int isite = 
m_arg[itask].isite;
 
 1731     const double *w1 = &v1[Nvcd * isite];
 
 1732     double       *w2 = &v2[Nvcd * isite];
 
 1735     int kt0  = 
m_arg[itask].kt0;
 
 1737     int Nxyz = Nxy * 
m_Nz;
 
 1739     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 1740       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1741         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1742           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1744           int in = Nvcd * (is - Nxyz);
 
 1745           int ig = 
m_Ndf * (is - Nxyz);
 
 1747           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1749             int ic_i = 2 * ic + 1;
 
 1751             vt1[ic_r] = 2.0 * w1[ic_r + id1 + in];
 
 1752             vt1[ic_i] = 2.0 * w1[ic_i + id1 + in];
 
 1753             vt2[ic_r] = 2.0 * w1[ic_r + id2 + in];
 
 1754             vt2[ic_i] = 2.0 * w1[ic_i + id2 + in];
 
 1757           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1760             wt1_r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1761             wt1_i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1762             wt2_r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1763             wt2_i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1766             int ic_i = 2 * ic + 1;
 
 1768             w2[ic_r + id1 + iv] += wt1_r;
 
 1769             w2[ic_i + id1 + iv] += wt1_i;
 
 1770             w2[ic_r + id2 + iv] += wt2_r;
 
 1771             w2[ic_i + id2 + iv] += wt2_i;
 
 1781     int itask, 
double *vcp1, 
const double *v1)
 
 1783     int Nvc2  = 2 * 
m_Nvc;
 
 1785     int Nvcd2 = Nvcd / 2;
 
 1789     int id3 = 
m_Nvc * 2;
 
 1790     int id4 = 
m_Nvc * 3;
 
 1792     int isite    = 
m_arg[itask].isite;
 
 1793     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1795     const double *w1 = &v1[Nvcd * isite];
 
 1796     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1801     if (
m_arg[itask].kt0 == 1) {
 
 1804       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1805         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1806           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1807           int is2 = ixy + Nxy * iz;
 
 1810           int ix1 = Nvc2 * is2;
 
 1811           int ix2 = ix1 + 
m_Nvc;
 
 1813           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1815             int ic_i = 2 * ic + 1;
 
 1817             w2[ic_r + ix1] = bc2 * (w1[ic_r + id1 + in] + w1[ic_r + id3 + in]);
 
 1818             w2[ic_i + ix1] = bc2 * (w1[ic_i + id1 + in] + w1[ic_i + id3 + in]);
 
 1819             w2[ic_r + ix2] = bc2 * (w1[ic_r + id2 + in] + w1[ic_r + id4 + in]);
 
 1820             w2[ic_i + ix2] = bc2 * (w1[ic_i + id2 + in] + w1[ic_i + id4 + in]);
 
 1830     int itask, 
double *v2, 
const double *vcp2)
 
 1832     int Nvc2  = 2 * 
m_Nvc;
 
 1834     int Nvcd2 = Nvcd / 2;
 
 1838     int id3 = 
m_Nvc * 2;
 
 1839     int id4 = 
m_Nvc * 3;
 
 1843     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 1845     int isite    = 
m_arg[itask].isite;
 
 1846     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1849     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 1850     double       *w2 = &v2[Nvcd * isite];
 
 1853     if (
m_arg[itask].kt1 == 1) {
 
 1856       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1857         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1858           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1859           int is2 = ixy + Nxy * iz;
 
 1861           int ig  = 
m_Ndf * is;
 
 1862           int ix1 = Nvc2 * is2;
 
 1863           int ix2 = ix1 + 
m_Nvc;
 
 1865           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1866             int ic2 = ic * 
m_Nvc;
 
 1868             wt1_r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1869             wt1_i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1870             wt2_r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1871             wt2_i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1874             int ic_i = 2 * ic + 1;
 
 1876             w2[ic_r + id1 + iv] += wt1_r;
 
 1877             w2[ic_i + id1 + iv] += wt1_i;
 
 1878             w2[ic_r + id2 + iv] += wt2_r;
 
 1879             w2[ic_i + id2 + iv] += wt2_i;
 
 1881             w2[ic_r + id3 + iv] += wt1_r;
 
 1882             w2[ic_i + id3 + iv] += wt1_i;
 
 1883             w2[ic_r + id4 + iv] += wt2_r;
 
 1884             w2[ic_i + id4 + iv] += wt2_i;
 
 1894     int itask, 
double *v2, 
const double *v1)
 
 1900     int id3 = 
m_Nvc * 2;
 
 1901     int id4 = 
m_Nvc * 3;
 
 1906     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 1908     int isite = 
m_arg[itask].isite;
 
 1910     const double *w1 = &v1[Nvcd * isite];
 
 1911     double       *w2 = &v2[Nvcd * isite];
 
 1914     int kt1  = 
m_arg[itask].kt1;
 
 1916     int Nxyz = Nxy * 
m_Nz;
 
 1918     for (
int it = 0; it < 
m_Mt - kt1; ++it) {
 
 1919       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1920         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1921           int is = ixy + Nxy * (iz + m_Nz * it);
 
 1923           int in = Nvcd * (is + Nxyz);
 
 1924           int ig = 
m_Ndf * is;
 
 1926           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1928             int ic_i = 2 * ic + 1;
 
 1930             vt1[ic_r] = w1[ic_r + id1 + in] + w1[ic_r + id3 + in];
 
 1931             vt1[ic_i] = w1[ic_i + id1 + in] + w1[ic_i + id3 + in];
 
 1932             vt2[ic_r] = w1[ic_r + id2 + in] + w1[ic_r + id4 + in];
 
 1933             vt2[ic_i] = w1[ic_i + id2 + in] + w1[ic_i + id4 + in];
 
 1936           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1937             int ic2 = ic * 
m_Nvc;
 
 1939             wt1_r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1940             wt1_i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1941             wt2_r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1942             wt2_i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1945             int ic_i = 2 * ic + 1;
 
 1947             w2[ic_r + id1 + iv] += wt1_r;
 
 1948             w2[ic_i + id1 + iv] += wt1_i;
 
 1949             w2[ic_r + id2 + iv] += wt2_r;
 
 1950             w2[ic_i + id2 + iv] += wt2_i;
 
 1952             w2[ic_r + id3 + iv] += wt1_r;
 
 1953             w2[ic_i + id3 + iv] += wt1_i;
 
 1954             w2[ic_r + id4 + iv] += wt2_r;
 
 1955             w2[ic_i + id4 + iv] += wt2_i;
 
 1965     int itask, 
double *vcp1, 
const double *v1)
 
 1967     int Nvc2  = 2 * 
m_Nvc;
 
 1969     int Nvcd2 = Nvcd / 2;
 
 1973     int id3 = 
m_Nvc * 2;
 
 1974     int id4 = 
m_Nvc * 3;
 
 1978     int isite    = 
m_arg[itask].isite;
 
 1979     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 1982     const double *w1 = &v1[Nvcd * isite];
 
 1983     double       *w2 = &vcp1[Nvcd2 * isite_cp];
 
 1988     if (
m_arg[itask].kt1 == 1) {
 
 1991       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 1992         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1993           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 1994           int is2 = ixy + Nxy * iz;
 
 1996           int ig  = 
m_Ndf * is;
 
 1997           int ix1 = Nvc2 * is2;
 
 1998           int ix2 = ix1 + 
m_Nvc;
 
 2000           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2002             int ic_i = 2 * ic + 1;
 
 2004             vt1[ic_r] = w1[ic_r + id1 + in] - w1[ic_r + id3 + in];
 
 2005             vt1[ic_i] = w1[ic_i + id1 + in] - w1[ic_i + id3 + in];
 
 2006             vt2[ic_r] = w1[ic_r + id2 + in] - w1[ic_r + id4 + in];
 
 2007             vt2[ic_i] = w1[ic_i + id2 + in] - w1[ic_i + id4 + in];
 
 2010           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2014             int ic_i = 2 * ic + 1;
 
 2016             w2[ic_r + ix1] = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 2017             w2[ic_i + ix1] = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 2018             w2[ic_r + ix2] = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 2019             w2[ic_i + ix2] = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 2029     int itask, 
double *v2, 
const double *vcp2)
 
 2031     int Nvc2  = 2 * 
m_Nvc;
 
 2033     int Nvcd2 = Nvcd / 2;
 
 2037     int id3 = 
m_Nvc * 2;
 
 2038     int id4 = 
m_Nvc * 3;
 
 2045     int isite    = 
m_arg[itask].isite;
 
 2046     int isite_cp = 
m_arg[itask].isite_cp_t;
 
 2048     const double *w1 = &vcp2[Nvcd2 * isite_cp];
 
 2049     double       *w2 = &v2[Nvcd * isite];
 
 2051     if (
m_arg[itask].kt0 == 1) {
 
 2054       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2055         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2056           int is  = ixy + Nxy * (iz + 
m_Nz * it);
 
 2057           int is2 = ixy + Nxy * iz;
 
 2059           int ix1 = Nvc2 * is2;
 
 2060           int ix2 = ix1 + 
m_Nvc;
 
 2062           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2064             int ic_i = 2 * ic + 1;
 
 2066             w2[ic_r + id1 + iv] += bc2 * w1[ic_r + ix1];
 
 2067             w2[ic_i + id1 + iv] += bc2 * w1[ic_i + ix1];
 
 2068             w2[ic_r + id2 + iv] += bc2 * w1[ic_r + ix2];
 
 2069             w2[ic_i + id2 + iv] += bc2 * w1[ic_i + ix2];
 
 2071             w2[ic_r + id3 + iv] -= bc2 * w1[ic_r + ix1];
 
 2072             w2[ic_i + id3 + iv] -= bc2 * w1[ic_i + ix1];
 
 2073             w2[ic_r + id4 + iv] -= bc2 * w1[ic_r + ix2];
 
 2074             w2[ic_i + id4 + iv] -= bc2 * w1[ic_i + ix2];
 
 2084     int itask, 
double *v2, 
const double *v1)
 
 2090     int id3 = 
m_Nvc * 2;
 
 2091     int id4 = 
m_Nvc * 3;
 
 2096     double wt1_r, wt1_i, wt2_r, wt2_i;
 
 2098     int isite = 
m_arg[itask].isite;
 
 2100     const double *w1 = &v1[Nvcd * isite];
 
 2101     double       *w2 = &v2[Nvcd * isite];
 
 2104     int kt0  = 
m_arg[itask].kt0;
 
 2106     int Nxyz = Nxy * 
m_Nz;
 
 2108     for (
int it = kt0; it < 
m_Mt; ++it) {
 
 2109       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2110         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2111           int is = ixy + Nxy * (iz + m_Nz * it);
 
 2113           int in = Nvcd * (is - Nxyz);
 
 2114           int ig = 
m_Ndf * (is - Nxyz);
 
 2116           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2118             int ic_i = 2 * ic + 1;
 
 2120             vt1[ic_r] = w1[ic_r + id1 + in] - w1[ic_r + id3 + in];
 
 2121             vt1[ic_i] = w1[ic_i + id1 + in] - w1[ic_i + id3 + in];
 
 2122             vt2[ic_r] = w1[ic_r + id2 + in] - w1[ic_r + id4 + in];
 
 2123             vt2[ic_i] = w1[ic_i + id2 + in] - w1[ic_i + id4 + in];
 
 2126           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2129             wt1_r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 2130             wt1_i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 2131             wt2_r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 2132             wt2_i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 2135             int ic_i = 2 * ic + 1;
 
 2137             w2[ic_r + id1 + iv] += wt1_r;
 
 2138             w2[ic_i + id1 + iv] += wt1_i;
 
 2139             w2[ic_r + id2 + iv] += wt2_r;
 
 2140             w2[ic_i + id2 + iv] += wt2_i;
 
 2142             w2[ic_r + id3 + iv] -= wt1_r;
 
 2143             w2[ic_i + id3 + iv] -= wt1_i;
 
 2144             w2[ic_r + id4 + iv] -= wt2_r;
 
 2145             w2[ic_i + id4 + iv] -= wt2_i;
 
 2155     int itask, 
double *v2, 
const double *v1)
 
 2162     int id3 = 
m_Nvc * 2;
 
 2163     int id4 = 
m_Nvc * 3;
 
 2165     int isite = 
m_arg[itask].isite;
 
 2167     const double *w1 = &v1[Nvcd * isite];
 
 2168     double       *w2 = &v2[Nvcd * isite];
 
 2170     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2171       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2172         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2173           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2174           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2175             w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2176             w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2177             w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2178             w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 2188     int itask, 
double *v2, 
const double *v1)
 
 2195     int id3 = 
m_Nvc * 2;
 
 2196     int id4 = 
m_Nvc * 3;
 
 2198     int isite = 
m_arg[itask].isite;
 
 2200     const double *w1 = &v1[Nvcd * isite];
 
 2201     double       *w2 = &v2[Nvcd * isite];
 
 2203     for (
int it = 0; it < 
m_Mt; ++it) {
 
 2204       for (
int iz = 0; iz < 
m_Mz; ++iz) {
 
 2205         for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2206           int iv = Nvcd * (ixy + Nxy * (iz + 
m_Nz * it));
 
 2207           for (
int ivc = 0; ivc < 
m_Nvc; ++ivc) {
 
 2208             w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2209             w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2210             w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2211             w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_t_plus1_chiral_thread(int, double *, const double *)
 
const double * ptr(const int jin, const int site, const int jex) const 
 
void mult_y_plus2_thread(int, double *, const double *)
 
void mult_t_plus2_dirac_thread(int, double *, const double *)
 
void general(const char *format,...)
 
Bridge::VerboseLevel m_vl
 
void mult_x_minus_bulk_thread(int, double *, const double *)
 
void mult_x_plus2_thread(int, double *, const double *)
 
void mult_t_plus_bulk_dirac_thread(int, double *, const double *)
 
void mult_t_minus2_chiral_thread(int, double *, const double *)
 
void gm5_dirac_thread(int, double *, const double *)
 
void daxpy_thread(int, double *, double, const double *)
 
void gm5_chiral_thread(int, double *, const double *)
 
void mult_x_minus1_thread(int, double *, const double *)
 
void mult_y_plus_bulk_thread(int, double *, const double *)
 
void mult_x_minus2_thread(int, double *, const double *)
 
void mult_z_plus_bulk_thread(int, double *, const double *)
 
void mult_y_minus_bulk_thread(int, double *, const double *)
 
void mult_z_minus1_thread(int, double *, const double *)
 
void mult_t_minus1_dirac_thread(int, double *, const double *)
 
void mult_t_minus_bulk_chiral_thread(int, double *, const double *)
 
void mult_x_plus1_thread(int, double *, const double *)
 
const Field_G * m_U
gauge configuration. 
 
void daypx_thread(int, double *, double, const double *)
 
std::vector< double > m_boundary2
b.c. for each node. 
 
void mult_t_plus2_chiral_thread(int, double *, const double *)
 
void mult_x_plus_bulk_thread(int, double *, const double *)
 
void clear_thread(int, double *)
 
void mult_t_minus2_dirac_thread(int, double *, const double *)
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
void mult_y_minus1_thread(int, double *, const double *)
 
void crucial(const char *format,...)
 
void mult_y_minus2_thread(int, double *, const double *)
 
void mult_z_plus2_thread(int, double *, const double *)
 
void scal_thread(int, double *, double)
 
static const std::string class_name
 
void mult_t_plus_bulk_chiral_thread(int, double *, const double *)
 
void mult_z_minus_bulk_thread(int, double *, const double *)
 
void mult_z_plus1_thread(int, double *, const double *)
 
std::vector< mult_arg > m_arg
 
void mult_y_plus1_thread(int, double *, const double *)
 
void mult_t_minus1_chiral_thread(int, double *, const double *)
 
void mult_t_plus1_dirac_thread(int, double *, const double *)
 
void mult_t_minus_bulk_dirac_thread(int, double *, const double *)
 
void mult_z_minus2_thread(int, double *, const double *)