22 #if defined USE_GROUP_SU3 
   23 #include "fopr_Wilson_impl_SU3.inc" 
   24 #elif defined USE_GROUP_SU2 
   25 #include "fopr_Wilson_impl_SU2.inc" 
   26 #elif defined USE_GROUP_SU_N 
   27 #include "fopr_Wilson_impl_SU_N.inc" 
   62   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   63     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
   64       int itask = ith_z + m_Ntask_z * ith_t;
 
   72       if (ith_t == 0) 
m_arg[itask].kt0 = 1;
 
   73       if (ith_z == 0) 
m_arg[itask].kz0 = 1;
 
   74       if (ith_t == m_Ntask_t - 1) 
m_arg[itask].kt1 = 1;
 
   75       if (ith_z == m_Ntask_z - 1) 
m_arg[itask].kz1 = 1;
 
   79       m_arg[itask].isite_cpz = ith_t * 
m_Mt * Nxy2;
 
   80       m_arg[itask].isite_cpt = ith_z * 
m_Mz * Nxy2;
 
   87   int Nvcd2 = 2 * Nc * Nd / 2;
 
   91   valarray<int> datasize(
m_Ntask);
 
   92   valarray<int> offset_up(
m_Ntask);
 
   93   valarray<int> offset_lw(
m_Ntask);
 
   94   valarray<int> datasize_up(
m_Ntask);
 
   95   valarray<int> datasize_lw(
m_Ntask);
 
   98   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
   99     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  101       int isite_cp = itask * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  102       destid[itask]   = itask;
 
  103       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  104       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * (m_Ny / 2);
 
  113   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  114     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  117       destid[itask]   = itask;
 
  118       offset[itask]   = 
sizeof(double) * Nvcd2 * isite_cp;
 
  119       datasize[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Mt * m_Nx2;
 
  128   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  129     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  130       int itask = ith_z + m_Ntask_z * ith_t;
 
  132       offset_up[itask]   = 0;
 
  133       offset_lw[itask]   = 0;
 
  134       datasize_up[itask] = 0;
 
  135       datasize_lw[itask] = 0;
 
  137         destid[itask]      = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
 
  138         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  139         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  141       if (ith_z == m_Ntask_z - 1) {
 
  143         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_t * 
m_Mt * 
m_Nx2 * m_Ny;
 
  144         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mt * 
m_Nx2 * m_Ny;
 
  154   for (
int ith_t = 0; ith_t < 
m_Ntask_t; ++ith_t) {
 
  155     for (
int ith_z = 0; ith_z < 
m_Ntask_z; ++ith_z) {
 
  156       int itask = ith_z + m_Ntask_z * ith_t;
 
  158       offset_up[itask]   = 0;
 
  159       offset_lw[itask]   = 0;
 
  160       datasize_up[itask] = 0;
 
  161       datasize_lw[itask] = 0;
 
  163         destid[itask]      = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
 
  164         offset_lw[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  165         datasize_lw[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
  167       if (ith_t == m_Ntask_t - 1) {
 
  168         destid[itask]      = ith_z;
 
  169         offset_up[itask]   = 
sizeof(double) * Nvcd2 * ith_z * 
m_Mz * 
m_Nx2 * m_Ny;
 
  170         datasize_up[itask] = 
sizeof(double) * Nvcd2 * 
m_Mz * 
m_Nx2 * m_Ny;
 
  182                                                       double *w, 
double fac)
 
  184   int Nvcd = m_Nvc * 
m_Nd;
 
  185   int Nvxy = Nvcd * m_Nx2 * m_Ny;
 
  187   int    isite = m_arg[itask].isite;
 
  188   double *wp   = &w[Nvcd * isite];
 
  190   for (
int it = 0; it < m_Mt; ++it) {
 
  191     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  192       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  193         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  194         wp[iv] = fac * wp[iv];
 
  205   int Nvcd = m_Nvc * 
m_Nd;
 
  206   int Nvxy = Nvcd * m_Nx2 * m_Ny;
 
  208   int    isite = m_arg[itask].isite;
 
  209   double *wp   = &v[Nvcd * isite];
 
  211   for (
int it = 0; it < m_Mt; ++it) {
 
  212     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  213       for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
 
  214         int iv = ivxy + Nvxy * (iz + m_Nz * it);
 
  224                                                           double *vcp1, 
double *
v1, 
int ieo)
 
  226   int Nvc2  = 2 * m_Nvc;
 
  227   int Nvcd  = m_Nvc * 
m_Nd;
 
  228   int Nvcd2 = Nvcd / 2;
 
  237   int isite    = m_arg[itask].isite;
 
  238   int isite_cp = m_arg[itask].isite_cpx;
 
  239   int iyzt0    = isite / m_Nx2;
 
  243     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  244   double *w1 = &v1[Nvcd * isite];
 
  246   double bc2 = m_boundary2[idir];
 
  251   for (
int it = 0; it < m_Mt; ++it) {
 
  252     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  253       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  254         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  255         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  257           int is = ix + m_Nx2 * iyzt;
 
  260           int ix1 = Nvc2 * ibf;
 
  261           int ix2 = ix1 + m_Nvc;
 
  263           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  264             w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
 
  265             w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
 
  266             w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  267             w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
 
  275   m_bw_send[idir]->start_thread(itask);
 
  280                                                           double *
v2, 
double *vcp2, 
int ieo)
 
  282   int Nvc2  = 2 * m_Nvc;
 
  283   int Nvcd  = m_Nvc * 
m_Nd;
 
  284   int Nvcd2 = Nvcd / 2;
 
  293   double wt1r, wt1i, wt2r, wt2i;
 
  295   int isite    = m_arg[itask].isite;
 
  296   int isite_cp = m_arg[itask].isite_cpx;
 
  297   int iyzt0    = isite / m_Nx2;
 
  299   double *w2 = &v2[Nvcd * isite];
 
  302     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  303   double *u = 
const_cast<Field_G *
>(m_U)->ptr(
 
  306   m_bw_recv[idir]->wait_thread(itask);
 
  310   for (
int it = 0; it < m_Mt; ++it) {
 
  311     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  312       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  313         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  314         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  317           int is  = ix + m_Nx2 * iyzt;
 
  320           int ix1 = Nvc2 * ibf;
 
  321           int ix2 = ix1 + m_Nvc;
 
  323           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  324             int ic2 = ic * m_Nvc;
 
  325             wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  326             wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  327             wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  328             wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  329             w2[2 * ic + id1 + iv]     += wt1r;
 
  330             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  331             w2[2 * ic + id2 + iv]     += wt2r;
 
  332             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  333             w2[2 * ic + id3 + iv]     += wt2i;
 
  334             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  335             w2[2 * ic + id4 + iv]     += wt1i;
 
  336             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  348                                                           double *v2, 
double *v1, 
int ieo)
 
  350   int Nvcd = m_Nvc * 
m_Nd;
 
  359   double vt1[m_Nvc], vt2[m_Nvc];
 
  360   double wt1r, wt1i, wt2r, wt2i;
 
  362   int isite = m_arg[itask].isite;
 
  363   int iyzt0 = isite / m_Nx2;
 
  365   double *w2 = &v2[Nvcd * isite];
 
  366   double *w1 = &v1[Nvcd * isite];
 
  367   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  368     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
  370   for (
int it = 0; it < m_Mt; ++it) {
 
  371     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  372       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  373         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  374         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  375         for (
int ix = 0; ix < m_Nx2 - Leo; ++ix) {
 
  376           int is = ix + m_Nx2 * iyzt;
 
  378           int in = Nvcd * (is + Leo);
 
  381           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  382             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  383             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
 
  384             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  385             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
 
  388           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  389             int ic2 = ic * m_Nvc;
 
  391             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  392             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  393             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  394             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  396             w2[2 * ic + id1 + iv]     += wt1r;
 
  397             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  398             w2[2 * ic + id2 + iv]     += wt2r;
 
  399             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  400             w2[2 * ic + id3 + iv]     += wt2i;
 
  401             w2[2 * ic + 1 + id3 + iv] += -wt2r;
 
  402             w2[2 * ic + id4 + iv]     += wt1i;
 
  403             w2[2 * ic + 1 + id4 + iv] += -wt1r;
 
  414                                                           double *vcp1, 
double *v1, 
int ieo)
 
  416   int Nvc2  = 2 * m_Nvc;
 
  417   int Nvcd  = m_Nvc * 
m_Nd;
 
  418   int Nvcd2 = Nvcd / 2;
 
  427   int isite    = m_arg[itask].isite;
 
  428   int isite_cp = m_arg[itask].isite_cpx;
 
  429   int iyzt0    = isite / m_Nx2;
 
  433     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  434   double *w1 = &v1[Nvcd * isite];
 
  435   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  436     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * 
m_Nvol));
 
  438   double vt1[m_Nvc], vt2[m_Nvc];
 
  443   for (
int it = 0; it < m_Mt; ++it) {
 
  444     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  445       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  446         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  447         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  449           int is = ix + m_Nx2 * iyzt;
 
  453           int ix1 = Nvc2 * ibf;
 
  454           int ix2 = ix1 + m_Nvc;
 
  456           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  457             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  458             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  459             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  460             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  463           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  465             w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  466             w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  467             w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  468             w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  476   m_fw_send[idir]->start_thread(itask);
 
  481                                                           double *v2, 
double *vcp2, 
int ieo)
 
  483   int Nvc2  = 2 * m_Nvc;
 
  484   int Nvcd  = m_Nvc * 
m_Nd;
 
  485   int Nvcd2 = Nvcd / 2;
 
  493   double bc2  = m_boundary2[idir];
 
  495   double wt1r, wt1i, wt2r, wt2i;
 
  497   int isite    = m_arg[itask].isite;
 
  498   int isite_cp = m_arg[itask].isite_cpx;
 
  499   int iyzt0    = isite / m_Nx2;
 
  501   double *w2 = &v2[Nvcd * isite];
 
  504     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  506   m_fw_recv[idir]->wait_thread(itask);
 
  510   for (
int it = 0; it < m_Mt; ++it) {
 
  511     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  512       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  513         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  514         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  516           int is = ix + m_Nx2 * iyzt;
 
  519           int ix1 = Nvc2 * ibf;
 
  520           int ix2 = ix1 + m_Nvc;
 
  522           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  524             int ici = 2 * ic + 1;
 
  525             w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  526             w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  527             w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  528             w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  529             w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
 
  530             w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
 
  531             w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
 
  532             w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
 
  544                                                           double *v2, 
double *v1, 
int ieo)
 
  546   int Nvcd = m_Nvc * 
m_Nd;
 
  555   double vt1[m_Nvc], vt2[m_Nvc];
 
  556   double wt1r, wt1i, wt2r, wt2i;
 
  558   int isite = m_arg[itask].isite;
 
  559   int iyzt0 = isite / m_Nx2;
 
  561   double *w2 = &v2[Nvcd * isite];
 
  562   double *w1 = &v1[Nvcd * isite];
 
  563   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  564     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
  566   for (
int it = 0; it < m_Mt; ++it) {
 
  567     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  568       for (
int iy = 0; iy < m_Ny; ++iy) {
 
  569         int iyzt = iy + m_Ny * (iz + m_Nz * it);
 
  570         int Leo  = ieo + (1 - 2 * ieo) * m_Leo[iyzt0 + iyzt];
 
  572         for (
int ix = Meo; ix < m_Nx2; ++ix) {
 
  573           int is = ix + m_Nx2 * iyzt;
 
  575           int in = Nvcd * (is - 
Meo);
 
  576           int ig = m_Ndf * (is - 
Meo);
 
  578           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  579             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  580             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
 
  581             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  582             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
 
  585           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  588             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  589             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  590             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  591             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  593             w2[2 * ic + id1 + iv]     += wt1r;
 
  594             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  595             w2[2 * ic + id2 + iv]     += wt2r;
 
  596             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  597             w2[2 * ic + id3 + iv]     += -wt2i;
 
  598             w2[2 * ic + 1 + id3 + iv] += +wt2r;
 
  599             w2[2 * ic + id4 + iv]     += -wt1i;
 
  600             w2[2 * ic + 1 + id4 + iv] += +wt1r;
 
  611                                                           double *vcp1, 
double *v1, 
int ieo)
 
  613   int Nvc2  = 2 * m_Nvc;
 
  614   int Nvcd  = m_Nvc * 
m_Nd;
 
  615   int Nvcd2 = Nvcd / 2;
 
  624   int isite    = m_arg[itask].isite;
 
  625   int isite_cp = m_arg[itask].isite_cpy;
 
  629     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  630   double *w1 = &v1[Nvcd * isite];
 
  632   double bc2 = m_boundary2[idir];
 
  636   for (
int it = 0; it < m_Mt; ++it) {
 
  637     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  638       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  639         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  640         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  642         int ix1 = Nvc2 * is2;
 
  643         int ix2 = ix1 + m_Nvc;
 
  645         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  646           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
 
  647           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
 
  648           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
 
  649           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
 
  655   m_bw_send[idir]->start_thread(itask);
 
  660                                                           double *v2, 
double *vcp2, 
int ieo)
 
  662   int Nvc2  = 2 * m_Nvc;
 
  663   int Nvcd  = m_Nvc * 
m_Nd;
 
  664   int Nvcd2 = Nvcd / 2;
 
  673   double wt1r, wt1i, wt2r, wt2i;
 
  675   int isite    = m_arg[itask].isite;
 
  676   int isite_cp = m_arg[itask].isite_cpy;
 
  678   double *w2 = &v2[Nvcd * isite];
 
  681     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  682   double *u = 
const_cast<Field_G *
>(m_U)->ptr(
 
  683     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
  685   m_bw_recv[idir]->wait_thread(itask);
 
  688   for (
int it = 0; it < m_Mt; ++it) {
 
  689     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  690       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  691         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  692         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  695         int ix1 = Nvc2 * is2;
 
  696         int ix2 = ix1 + m_Nvc;
 
  698         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  699           int ic2 = ic * m_Nvc;
 
  701           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  702           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
  703           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  704           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
  706           w2[2 * ic + id1 + iv]     += wt1r;
 
  707           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  708           w2[2 * ic + id2 + iv]     += wt2r;
 
  709           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  710           w2[2 * ic + id3 + iv]     += -wt2r;
 
  711           w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  712           w2[2 * ic + id4 + iv]     += wt1r;
 
  713           w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  723                                                           double *v2, 
double *v1, 
int ieo)
 
  725   int Nvcd = m_Nvc * 
m_Nd;
 
  734   double vt1[m_Nvc], vt2[m_Nvc];
 
  735   double wt1r, wt1i, wt2r, wt2i;
 
  737   int isite = m_arg[itask].isite;
 
  739   double *w2 = &v2[Nvcd * isite];
 
  740   double *w1 = &v1[Nvcd * isite];
 
  741   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  742     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
  744   for (
int it = 0; it < m_Mt; ++it) {
 
  745     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  746       for (
int iy = 0; iy < m_Ny - 1; ++iy) {
 
  747         for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  748           int is = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  750           int in = Nvcd * (is + m_Nx2);
 
  753           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  754             vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
 
  755             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
 
  756             vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
 
  757             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
 
  760           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  761             int ic2 = ic * m_Nvc;
 
  763             wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
  764             wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
  765             wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
  766             wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
  768             w2[2 * ic + id1 + iv]     += wt1r;
 
  769             w2[2 * ic + 1 + id1 + iv] += wt1i;
 
  770             w2[2 * ic + id2 + iv]     += wt2r;
 
  771             w2[2 * ic + 1 + id2 + iv] += wt2i;
 
  772             w2[2 * ic + id3 + iv]     += -wt2r;
 
  773             w2[2 * ic + 1 + id3 + iv] += -wt2i;
 
  774             w2[2 * ic + id4 + iv]     += wt1r;
 
  775             w2[2 * ic + 1 + id4 + iv] += wt1i;
 
  786                                                           double *vcp1, 
double *v1, 
int ieo)
 
  788   int Nvc2  = 2 * m_Nvc;
 
  789   int Nvcd  = m_Nvc * 
m_Nd;
 
  790   int Nvcd2 = Nvcd / 2;
 
  799   int isite    = m_arg[itask].isite;
 
  800   int isite_cp = m_arg[itask].isite_cpy;
 
  804     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  805   double *w1 = &v1[Nvcd * isite];
 
  806   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  807     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * 
m_Nvol));
 
  809   double vt1[m_Nvc], vt2[m_Nvc];
 
  813   for (
int it = 0; it < m_Mt; ++it) {
 
  814     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  815       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  816         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  817         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  820         int ix1 = Nvc2 * is2;
 
  821         int ix2 = ix1 + m_Nvc;
 
  823         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  824           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  825           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  826           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  827           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  830         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  832           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
  833           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
  834           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
  835           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
  841   m_fw_send[idir]->start_thread(itask);
 
  846                                                           double *v2, 
double *vcp2, 
int ieo)
 
  848   int Nvc2  = 2 * m_Nvc;
 
  849   int Nvcd  = m_Nvc * 
m_Nd;
 
  850   int Nvcd2 = Nvcd / 2;
 
  858   double bc2  = m_boundary2[idir];
 
  860   double wt1r, wt1i, wt2r, wt2i;
 
  862   int isite    = m_arg[itask].isite;
 
  863   int isite_cp = m_arg[itask].isite_cpy;
 
  865   double *w2 = &v2[Nvcd * isite];
 
  868     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  870   m_fw_recv[idir]->wait_thread(itask);
 
  873   for (
int it = 0; it < m_Mt; ++it) {
 
  874     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  875       for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  876         int is  = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  877         int is2 = ix + m_Nx2 * (iz + m_Mz * it);
 
  879         int ix1 = Nvc2 * is2;
 
  880         int ix2 = ix1 + m_Nvc;
 
  882         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  884           int ici = 2 * ic + 1;
 
  885           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
  886           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
  887           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
  888           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
  889           w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
 
  890           w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
 
  891           w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
 
  892           w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
 
  902                                                           double *v2, 
double *v1, 
int ieo)
 
  904   int Nvcd = m_Nvc * 
m_Nd;
 
  913   double vt1[m_Nvc], vt2[m_Nvc];
 
  914   double wt1r, wt1i, wt2r, wt2i;
 
  916   int isite = m_arg[itask].isite;
 
  918   double *w2 = &v2[Nvcd * isite];
 
  919   double *w1 = &v1[Nvcd * isite];
 
  920   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
  921     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
  923   for (
int it = 0; it < m_Mt; ++it) {
 
  924     for (
int iz = 0; iz < m_Mz; ++iz) {
 
  925       for (
int iy = 1; iy < m_Ny; ++iy) {
 
  926         for (
int ix = 0; ix < m_Nx2; ++ix) {
 
  927           int is = ix + m_Nx2 * (iy + m_Ny * (iz + m_Nz * it));
 
  929           int in = Nvcd * (is - m_Nx2);
 
  930           int ig = m_Ndf * (is - m_Nx2);
 
  932           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  933             vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
 
  934             vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
 
  935             vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
 
  936             vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
 
  939           for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
  941             wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
  942             wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
  943             wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
  944             wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
  946             w2[ic2 + id1 + iv]     += wt1r;
 
  947             w2[ic2 + 1 + id1 + iv] += wt1i;
 
  948             w2[ic2 + id2 + iv]     += wt2r;
 
  949             w2[ic2 + 1 + id2 + iv] += wt2i;
 
  950             w2[ic2 + id3 + iv]     += wt2r;
 
  951             w2[ic2 + 1 + id3 + iv] += wt2i;
 
  952             w2[ic2 + id4 + iv]     += -wt1r;
 
  953             w2[ic2 + 1 + id4 + iv] += -wt1i;
 
  964                                                           double *vcp1, 
double *v1, 
int ieo)
 
  966   int Nvc2  = 2 * m_Nvc;
 
  967   int Nvcd  = m_Nvc * 
m_Nd;
 
  968   int Nvcd2 = Nvcd / 2;
 
  977   int isite    = m_arg[itask].isite;
 
  978   int isite_cp = m_arg[itask].isite_cpz;
 
  982     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
  983   double *w1 = &v1[Nvcd * isite];
 
  985   double bc2 = m_boundary2[idir];
 
  987   if (m_arg[itask].kz0 == 1) {
 
  988     int Nxy = m_Nx2 * m_Ny;
 
  990     for (
int it = 0; it < m_Mt; ++it) {
 
  991       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
  992         int is  = ixy + Nxy * (iz + m_Nz * it);
 
  993         int is2 = ixy + Nxy * it;
 
  996         int ix1 = Nvc2 * is2;
 
  997         int ix2 = ix1 + m_Nvc;
 
  999         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1000           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
 
 1001           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
 
 1002           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1003           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
 
 1009   m_bw_send[idir]->start_thread(itask);
 
 1014                                                           double *v2, 
double *vcp2, 
int ieo)
 
 1016   int Nvc2  = 2 * m_Nvc;
 
 1017   int Nvcd  = m_Nvc * 
m_Nd;
 
 1018   int Nvcd2 = Nvcd / 2;
 
 1022   int id3 = m_Nvc * 2;
 
 1023   int id4 = m_Nvc * 3;
 
 1027   double wt1r, wt1i, wt2r, wt2i;
 
 1029   int isite    = m_arg[itask].isite;
 
 1030   int isite_cp = m_arg[itask].isite_cpz;
 
 1032   double *w2 = &v2[Nvcd * isite];
 
 1035     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1036   double *u = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1037     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1039   m_bw_recv[idir]->wait_thread(itask);
 
 1041   if (m_arg[itask].kz1 == 1) {
 
 1042     int Nxy = m_Nx2 * m_Ny;
 
 1044     for (
int it = 0; it < m_Mt; ++it) {
 
 1045       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1046         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1047         int is2 = ixy + Nxy * it;
 
 1049         int ig  = m_Ndf * is;
 
 1050         int ix1 = Nvc2 * is2;
 
 1051         int ix2 = ix1 + m_Nvc;
 
 1053         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1054           int ic2 = ic * m_Nvc;
 
 1056           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1057           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1058           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1059           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1061           w2[2 * ic + id1 + iv]     += wt1r;
 
 1062           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1063           w2[2 * ic + id2 + iv]     += wt2r;
 
 1064           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1065           w2[2 * ic + id3 + iv]     += wt1i;
 
 1066           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1067           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1068           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1078                                                           double *v2, 
double *v1, 
int ieo)
 
 1080   int Nvcd = m_Nvc * 
m_Nd;
 
 1084   int id3 = m_Nvc * 2;
 
 1085   int id4 = m_Nvc * 3;
 
 1089   double vt1[m_Nvc], vt2[m_Nvc];
 
 1090   double wt1r, wt1i, wt2r, wt2i;
 
 1092   int isite = m_arg[itask].isite;
 
 1094   double *w2 = &v2[Nvcd * isite];
 
 1095   double *w1 = &v1[Nvcd * isite];
 
 1096   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1097     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1099   int kz1 = m_arg[itask].kz1;
 
 1100   int Nxy = m_Nx2 * m_Ny;
 
 1102   for (
int it = 0; it < m_Mt; ++it) {
 
 1103     for (
int iz = 0; iz < m_Mz - kz1; ++iz) {
 
 1104       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1105         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1107         int in = Nvcd * (is + Nxy);
 
 1108         int ig = m_Ndf * is;
 
 1110         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1111           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1112           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
 
 1113           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1114           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
 
 1117         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1118           int ic2 = ic * m_Nvc;
 
 1120           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1121           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1122           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1123           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1125           w2[2 * ic + id1 + iv]     += wt1r;
 
 1126           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1127           w2[2 * ic + id2 + iv]     += wt2r;
 
 1128           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1129           w2[2 * ic + id3 + iv]     += wt1i;
 
 1130           w2[2 * ic + 1 + id3 + iv] += -wt1r;
 
 1131           w2[2 * ic + id4 + iv]     += -wt2i;
 
 1132           w2[2 * ic + 1 + id4 + iv] += wt2r;
 
 1142                                                           double *vcp1, 
double *v1, 
int ieo)
 
 1144   int Nvc2  = 2 * m_Nvc;
 
 1145   int Nvcd  = m_Nvc * 
m_Nd;
 
 1146   int Nvcd2 = Nvcd / 2;
 
 1150   int id3 = m_Nvc * 2;
 
 1151   int id4 = m_Nvc * 3;
 
 1155   int isite    = m_arg[itask].isite;
 
 1156   int isite_cp = m_arg[itask].isite_cpz;
 
 1160     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1161   double *w1 = &v1[Nvcd * isite];
 
 1162   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1163     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * 
m_Nvol));
 
 1165   double vt1[m_Nvc], vt2[m_Nvc];
 
 1167   if (m_arg[itask].kz1 == 1) {
 
 1168     int Nxy = m_Nx2 * m_Ny;
 
 1170     for (
int it = 0; it < m_Mt; ++it) {
 
 1171       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1172         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1173         int is2 = ixy + Nxy * it;
 
 1175         int ig  = m_Ndf * is;
 
 1176         int ix1 = Nvc2 * is2;
 
 1177         int ix2 = ix1 + m_Nvc;
 
 1179         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1180           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1181           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1182           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1183           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1186         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1188           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1189           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1190           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1191           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1197   m_fw_send[idir]->start_thread(itask);
 
 1202                                                           double *v2, 
double *vcp2, 
int ieo)
 
 1204   int Nvc2  = 2 * m_Nvc;
 
 1205   int Nvcd  = m_Nvc * 
m_Nd;
 
 1206   int Nvcd2 = Nvcd / 2;
 
 1210   int id3 = m_Nvc * 2;
 
 1211   int id4 = m_Nvc * 3;
 
 1214   double bc2  = m_boundary2[idir];
 
 1216   double wt1r, wt1i, wt2r, wt2i;
 
 1218   int isite    = m_arg[itask].isite;
 
 1219   int isite_cp = m_arg[itask].isite_cpz;
 
 1221   double *w2 = &v2[Nvcd * isite];
 
 1224     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1226   m_fw_recv[idir]->wait_thread(itask);
 
 1228   if (m_arg[itask].kz0 == 1) {
 
 1229     int Nxy = m_Nx2 * m_Ny;
 
 1232     for (
int it = 0; it < m_Mt; ++it) {
 
 1233       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1234         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1235         int is2 = ixy + Nxy * it;
 
 1237         int ix1 = Nvc2 * is2;
 
 1238         int ix2 = ix1 + m_Nvc;
 
 1240         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1242           int ici = 2 * ic + 1;
 
 1243           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1244           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1245           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1246           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1247           w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
 
 1248           w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
 
 1249           w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
 
 1250           w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
 
 1260                                                           double *v2, 
double *v1, 
int ieo)
 
 1262   int Nvcd = m_Nvc * 
m_Nd;
 
 1266   int id3 = m_Nvc * 2;
 
 1267   int id4 = m_Nvc * 3;
 
 1271   double vt1[m_Nvc], vt2[m_Nvc];
 
 1272   double wt1r, wt1i, wt2r, wt2i;
 
 1274   int isite = m_arg[itask].isite;
 
 1276   double *w2 = &v2[Nvcd * isite];
 
 1277   double *w1 = &v1[Nvcd * isite];
 
 1278   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1279     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1281   int kz0 = m_arg[itask].kz0;
 
 1282   int Nxy = m_Nx2 * m_Ny;
 
 1284   for (
int it = 0; it < m_Mt; ++it) {
 
 1285     for (
int iz = kz0; iz < m_Mz; ++iz) {
 
 1286       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1287         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1289         int in = Nvcd * (is - Nxy);
 
 1290         int ig = m_Ndf * (is - Nxy);
 
 1292         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1293           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1294           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
 
 1295           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1296           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
 
 1299         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1301           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1302           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1303           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1304           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1306           w2[ic2 + id1 + iv]     += wt1r;
 
 1307           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1308           w2[ic2 + id2 + iv]     += wt2r;
 
 1309           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1310           w2[ic2 + id3 + iv]     += -wt1i;
 
 1311           w2[ic2 + 1 + id3 + iv] += wt1r;
 
 1312           w2[ic2 + id4 + iv]     += wt2i;
 
 1313           w2[ic2 + 1 + id4 + iv] += -wt2r;
 
 1323                                                                 double *vcp1, 
double *v1, 
int ieo)
 
 1325   int Nvc2  = 2 * m_Nvc;
 
 1326   int Nvcd  = m_Nvc * 
m_Nd;
 
 1327   int Nvcd2 = Nvcd / 2;
 
 1331   int id3 = m_Nvc * 2;
 
 1332   int id4 = m_Nvc * 3;
 
 1336   int isite    = m_arg[itask].isite;
 
 1337   int isite_cp = m_arg[itask].isite_cpt;
 
 1341     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1342   double *w1 = &v1[Nvcd * isite];
 
 1344   double bc2 = m_boundary2[idir];
 
 1346   if (m_arg[itask].kt0 == 1) {
 
 1347     int Nxy = m_Nx2 * m_Ny;
 
 1349     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1350       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1351         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1352         int is2 = ixy + Nxy * iz;
 
 1355         int ix1 = Nvc2 * is2;
 
 1356         int ix2 = ix1 + m_Nvc;
 
 1358         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1359           w2[2 * ic + ix1]     = 2.0 * bc2 * w1[2 * ic + id3 + in];
 
 1360           w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
 
 1361           w2[2 * ic + ix2]     = 2.0 * bc2 * w1[2 * ic + id4 + in];
 
 1362           w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
 
 1368   m_bw_send[idir]->start_thread(itask);
 
 1373                                                                 double *v2, 
double *vcp2, 
int ieo)
 
 1375   int Nvc2  = 2 * m_Nvc;
 
 1376   int Nvcd  = m_Nvc * 
m_Nd;
 
 1377   int Nvcd2 = Nvcd / 2;
 
 1381   int id3 = m_Nvc * 2;
 
 1382   int id4 = m_Nvc * 3;
 
 1386   double wt1r, wt1i, wt2r, wt2i;
 
 1388   int isite    = m_arg[itask].isite;
 
 1389   int isite_cp = m_arg[itask].isite_cpt;
 
 1391   double *w2 = &v2[Nvcd * isite];
 
 1394     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1395   double *u = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1396     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1398   m_bw_recv[idir]->wait_thread(itask);
 
 1400   if (m_arg[itask].kt1 == 1) {
 
 1401     int Nxy = m_Nx2 * m_Ny;
 
 1403     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1404       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1405         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1406         int is2 = ixy + Nxy * iz;
 
 1408         int ig  = m_Ndf * is;
 
 1409         int ix1 = Nvc2 * is2;
 
 1410         int ix2 = ix1 + m_Nvc;
 
 1412         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1413           int ic2 = ic * m_Nvc;
 
 1415           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1416           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1417           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1418           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1420           w2[2 * ic + id3 + iv]     += wt1r;
 
 1421           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1422           w2[2 * ic + id4 + iv]     += wt2r;
 
 1423           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1433                                                                 double *v2, 
double *v1, 
int ieo)
 
 1435   int Nvcd = m_Nvc * 
m_Nd;
 
 1439   int id3 = m_Nvc * 2;
 
 1440   int id4 = m_Nvc * 3;
 
 1444   double vt1[m_Nvc], vt2[m_Nvc];
 
 1445   double wt1r, wt1i, wt2r, wt2i;
 
 1447   int isite = m_arg[itask].isite;
 
 1449   double *w2 = &v2[Nvcd * isite];
 
 1450   double *w1 = &v1[Nvcd * isite];
 
 1451   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1452     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1454   int kt1  = m_arg[itask].kt1;
 
 1455   int Nxy  = m_Nx2 * m_Ny;
 
 1456   int Nxyz = Nxy * m_Nz;
 
 1458   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1459     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1460       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1461         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1463         int in = Nvcd * (is + Nxyz);
 
 1464         int ig = m_Ndf * is;
 
 1466         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1467           vt1[2 * ic]     = 2.0 * w1[2 * ic + id3 + in];
 
 1468           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
 
 1469           vt2[2 * ic]     = 2.0 * w1[2 * ic + id4 + in];
 
 1470           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
 
 1473         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1474           int ic2 = ic * m_Nvc;
 
 1476           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1477           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1478           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1479           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1481           w2[2 * ic + id3 + iv]     += wt1r;
 
 1482           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1483           w2[2 * ic + id4 + iv]     += wt2r;
 
 1484           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1494                                                                 double *vcp1, 
double *v1, 
int ieo)
 
 1496   int Nvc2  = 2 * m_Nvc;
 
 1497   int Nvcd  = m_Nvc * 
m_Nd;
 
 1498   int Nvcd2 = Nvcd / 2;
 
 1502   int id3 = m_Nvc * 2;
 
 1503   int id4 = m_Nvc * 3;
 
 1507   int isite    = m_arg[itask].isite;
 
 1508   int isite_cp = m_arg[itask].isite_cpt;
 
 1512     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1513   double *w1 = &v1[Nvcd * isite];
 
 1514   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1515     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * 
m_Nvol));
 
 1517   double vt1[m_Nvc], vt2[m_Nvc];
 
 1519   if (m_arg[itask].kt1 == 1) {
 
 1520     int Nxy = m_Nx2 * m_Ny;
 
 1522     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1523       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1524         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1525         int is2 = ixy + Nxy * iz;
 
 1527         int ig  = m_Ndf * is;
 
 1528         int ix1 = Nvc2 * is2;
 
 1529         int ix2 = ix1 + m_Nvc;
 
 1531         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1532           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1533           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1534           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1535           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1538         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1540           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1541           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1542           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1543           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1549   m_fw_send[idir]->start_thread(itask);
 
 1554                                                                 double *v2, 
double *vcp2, 
int ieo)
 
 1556   int Nvc2  = 2 * m_Nvc;
 
 1557   int Nvcd  = m_Nvc * 
m_Nd;
 
 1558   int Nvcd2 = Nvcd / 2;
 
 1562   int id3 = m_Nvc * 2;
 
 1563   int id4 = m_Nvc * 3;
 
 1566   double bc2  = m_boundary2[idir];
 
 1568   double wt1r, wt1i, wt2r, wt2i;
 
 1570   int isite    = m_arg[itask].isite;
 
 1571   int isite_cp = m_arg[itask].isite_cpt;
 
 1573   double *w2 = &v2[Nvcd * isite];
 
 1576     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1578   m_fw_recv[idir]->wait_thread(itask);
 
 1580   if (m_arg[itask].kt0 == 1) {
 
 1581     int Nxy = m_Nx2 * m_Ny;
 
 1583     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1584       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1585         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1586         int is2 = ixy + Nxy * iz;
 
 1588         int ix1 = Nvc2 * is2;
 
 1589         int ix2 = ix1 + m_Nvc;
 
 1591         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1593           int ici = 2 * ic + 1;
 
 1594           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1595           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1596           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1597           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1607                                                                 double *v2, 
double *v1, 
int ieo)
 
 1609   int Nvcd = m_Nvc * 
m_Nd;
 
 1613   int id3 = m_Nvc * 2;
 
 1614   int id4 = m_Nvc * 3;
 
 1618   double vt1[m_Nvc], vt2[m_Nvc];
 
 1619   double wt1r, wt1i, wt2r, wt2i;
 
 1621   int isite = m_arg[itask].isite;
 
 1623   double *w2 = &v2[Nvcd * isite];
 
 1624   double *w1 = &v1[Nvcd * isite];
 
 1625   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1626     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1628   int kt0  = m_arg[itask].kt0;
 
 1629   int Nxy  = m_Nx2 * m_Ny;
 
 1630   int Nxyz = Nxy * m_Nz;
 
 1632   for (
int it = kt0; it < m_Mt; ++it) {
 
 1633     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1634       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1635         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1637         int in = Nvcd * (is - Nxyz);
 
 1638         int ig = m_Ndf * (is - Nxyz);
 
 1640         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1641           vt1[2 * ic]     = 2.0 * w1[2 * ic + id1 + in];
 
 1642           vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
 
 1643           vt2[2 * ic]     = 2.0 * w1[2 * ic + id2 + in];
 
 1644           vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
 
 1647         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1649           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1650           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1651           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1652           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1654           w2[ic2 + id1 + iv]     += wt1r;
 
 1655           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 1656           w2[ic2 + id2 + iv]     += wt2r;
 
 1657           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 1667                                                                  double *vcp1, 
double *v1, 
int ieo)
 
 1669   int Nvc2  = 2 * m_Nvc;
 
 1670   int Nvcd  = m_Nvc * 
m_Nd;
 
 1671   int Nvcd2 = Nvcd / 2;
 
 1675   int id3 = m_Nvc * 2;
 
 1676   int id4 = m_Nvc * 3;
 
 1680   int isite    = m_arg[itask].isite;
 
 1681   int isite_cp = m_arg[itask].isite_cpt;
 
 1685     = (
double *)m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1686   double *w1 = &v1[Nvcd * isite];
 
 1688   double bc2 = m_boundary2[idir];
 
 1690   if (m_arg[itask].kt0 == 1) {
 
 1691     int Nxy = m_Nx2 * m_Ny;
 
 1693     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1694       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1695         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1696         int is2 = ixy + Nxy * iz;
 
 1699         int ix1 = Nvc2 * is2;
 
 1700         int ix2 = ix1 + m_Nvc;
 
 1702         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1703           w2[2 * ic + ix1]     = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
 
 1704           w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
 
 1705           w2[2 * ic + ix2]     = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
 
 1706           w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
 
 1712   m_bw_send[idir]->start_thread(itask);
 
 1717                                                                  double *v2, 
double *vcp2, 
int ieo)
 
 1719   int Nvc2  = 2 * m_Nvc;
 
 1720   int Nvcd  = m_Nvc * 
m_Nd;
 
 1721   int Nvcd2 = Nvcd / 2;
 
 1725   int id3 = m_Nvc * 2;
 
 1726   int id4 = m_Nvc * 3;
 
 1730   double wt1r, wt1i, wt2r, wt2i;
 
 1732   int isite    = m_arg[itask].isite;
 
 1733   int isite_cp = m_arg[itask].isite_cpt;
 
 1735   double *w2 = &v2[Nvcd * isite];
 
 1738     = (
double *)m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1739   double *u = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1740     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1742   m_bw_recv[idir]->wait_thread(itask);
 
 1744   if (m_arg[itask].kt1 == 1) {
 
 1745     int Nxy = m_Nx2 * m_Ny;
 
 1747     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1748       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1749         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1750         int is2 = ixy + Nxy * iz;
 
 1752         int ig  = m_Ndf * is;
 
 1753         int ix1 = Nvc2 * is2;
 
 1754         int ix2 = ix1 + m_Nvc;
 
 1756         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1757           int ic2 = ic * m_Nvc;
 
 1759           wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1760           wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
 
 1761           wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1762           wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
 
 1764           w2[2 * ic + id1 + iv]     += wt1r;
 
 1765           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1766           w2[2 * ic + id2 + iv]     += wt2r;
 
 1767           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1768           w2[2 * ic + id3 + iv]     += wt1r;
 
 1769           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1770           w2[2 * ic + id4 + iv]     += wt2r;
 
 1771           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1781                                                                  double *v2, 
double *v1, 
int ieo)
 
 1783   int Nvcd = m_Nvc * 
m_Nd;
 
 1787   int id3 = m_Nvc * 2;
 
 1788   int id4 = m_Nvc * 3;
 
 1792   double vt1[m_Nvc], vt2[m_Nvc];
 
 1793   double wt1r, wt1i, wt2r, wt2i;
 
 1795   int isite = m_arg[itask].isite;
 
 1797   double *w2 = &v2[Nvcd * isite];
 
 1798   double *w1 = &v1[Nvcd * isite];
 
 1799   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1800     m_Ndf * (isite + ieo * m_Nvol / 2 + idir * m_Nvol));
 
 1802   int kt1  = m_arg[itask].kt1;
 
 1803   int Nxy  = m_Nx2 * m_Ny;
 
 1804   int Nxyz = Nxy * m_Nz;
 
 1806   for (
int it = 0; it < m_Mt - kt1; ++it) {
 
 1807     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1808       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1809         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1811         int in = Nvcd * (is + Nxyz);
 
 1812         int ig = m_Ndf * is;
 
 1814         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1815           vt1[2 * ic]     = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
 
 1816           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
 
 1817           vt2[2 * ic]     = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
 
 1818           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
 
 1821         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1822           int ic2 = ic * m_Nvc;
 
 1824           wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
 
 1825           wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
 
 1826           wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
 
 1827           wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
 
 1829           w2[2 * ic + id1 + iv]     += wt1r;
 
 1830           w2[2 * ic + 1 + id1 + iv] += wt1i;
 
 1831           w2[2 * ic + id2 + iv]     += wt2r;
 
 1832           w2[2 * ic + 1 + id2 + iv] += wt2i;
 
 1833           w2[2 * ic + id3 + iv]     += wt1r;
 
 1834           w2[2 * ic + 1 + id3 + iv] += wt1i;
 
 1835           w2[2 * ic + id4 + iv]     += wt2r;
 
 1836           w2[2 * ic + 1 + id4 + iv] += wt2i;
 
 1846                                                                  double *vcp1, 
double *v1, 
int ieo)
 
 1848   int Nvc2  = 2 * m_Nvc;
 
 1849   int Nvcd  = m_Nvc * 
m_Nd;
 
 1850   int Nvcd2 = Nvcd / 2;
 
 1854   int id3 = m_Nvc * 2;
 
 1855   int id4 = m_Nvc * 3;
 
 1859   int isite    = m_arg[itask].isite;
 
 1860   int isite_cp = m_arg[itask].isite_cpt;
 
 1864     = (
double *)m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1865   double *w1 = &v1[Nvcd * isite];
 
 1866   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1867     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * 
m_Nvol));
 
 1869   double vt1[m_Nvc], vt2[m_Nvc];
 
 1871   if (m_arg[itask].kt1 == 1) {
 
 1872     int Nxy = m_Nx2 * m_Ny;
 
 1874     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1875       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1876         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1877         int is2 = ixy + Nxy * iz;
 
 1879         int ig  = m_Ndf * is;
 
 1880         int ix1 = Nvc2 * is2;
 
 1881         int ix2 = ix1 + m_Nvc;
 
 1883         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1884           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1885           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1886           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 1887           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 1890         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1892           w2[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
 
 1893           w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
 
 1894           w2[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
 
 1895           w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
 
 1901   m_fw_send[idir]->start_thread(itask);
 
 1906                                                                  double *v2, 
double *vcp2, 
int ieo)
 
 1908   int Nvc2  = 2 * m_Nvc;
 
 1909   int Nvcd  = m_Nvc * 
m_Nd;
 
 1910   int Nvcd2 = Nvcd / 2;
 
 1914   int id3 = m_Nvc * 2;
 
 1915   int id4 = m_Nvc * 3;
 
 1918   double bc2  = m_boundary2[idir];
 
 1920   double wt1r, wt1i, wt2r, wt2i;
 
 1922   int isite    = m_arg[itask].isite;
 
 1923   int isite_cp = m_arg[itask].isite_cpt;
 
 1925   double *w2 = &v2[Nvcd * isite];
 
 1928     = (
double *)m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
 
 1930   m_fw_recv[idir]->wait_thread(itask);
 
 1932   if (m_arg[itask].kt0 == 1) {
 
 1933     int Nxy = m_Nx2 * m_Ny;
 
 1935     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1936       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1937         int is  = ixy + Nxy * (iz + m_Nz * it);
 
 1938         int is2 = ixy + Nxy * iz;
 
 1940         int ix1 = Nvc2 * is2;
 
 1941         int ix2 = ix1 + m_Nvc;
 
 1943         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1945           int ici = 2 * ic + 1;
 
 1946           w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
 
 1947           w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
 
 1948           w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
 
 1949           w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
 
 1950           w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
 
 1951           w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
 
 1952           w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
 
 1953           w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
 
 1963                                                                  double *v2, 
double *v1, 
int ieo)
 
 1965   int Nvcd = m_Nvc * 
m_Nd;
 
 1969   int id3 = m_Nvc * 2;
 
 1970   int id4 = m_Nvc * 3;
 
 1974   double vt1[m_Nvc], vt2[m_Nvc];
 
 1975   double wt1r, wt1i, wt2r, wt2i;
 
 1977   int isite = m_arg[itask].isite;
 
 1979   double *w2 = &v2[Nvcd * isite];
 
 1980   double *w1 = &v1[Nvcd * isite];
 
 1981   double *u  = 
const_cast<Field_G *
>(m_U)->ptr(
 
 1982     m_Ndf * (isite + (1 - ieo) * m_Nvol / 2 + idir * m_Nvol));
 
 1984   int kt0  = m_arg[itask].kt0;
 
 1985   int Nxy  = m_Nx2 * m_Ny;
 
 1986   int Nxyz = Nxy * m_Nz;
 
 1988   for (
int it = kt0; it < m_Mt; ++it) {
 
 1989     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 1990       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 1991         int is = ixy + Nxy * (iz + m_Nz * it);
 
 1993         int in = Nvcd * (is - Nxyz);
 
 1994         int ig = m_Ndf * (is - Nxyz);
 
 1996         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 1997           vt1[2 * ic]     = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
 
 1998           vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
 
 1999           vt2[2 * ic]     = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
 
 2000           vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
 
 2003         for (
int ic = 0; ic < 
m_Nc; ++ic) {
 
 2005           wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
 
 2006           wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
 
 2007           wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
 
 2008           wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
 
 2010           w2[ic2 + id1 + iv]     += wt1r;
 
 2011           w2[ic2 + 1 + id1 + iv] += wt1i;
 
 2012           w2[ic2 + id2 + iv]     += wt2r;
 
 2013           w2[ic2 + 1 + id2 + iv] += wt2i;
 
 2014           w2[ic2 + id3 + iv]     -= wt1r;
 
 2015           w2[ic2 + 1 + id3 + iv] -= wt1i;
 
 2016           w2[ic2 + id4 + iv]     -= wt2r;
 
 2017           w2[ic2 + 1 + id4 + iv] -= wt2i;
 
 2027                                                            double *v2, 
double *v1)
 
 2029   int Nvcd = m_Nvc * 
m_Nd;
 
 2030   int Nxy  = m_Nx2 * m_Ny;
 
 2034   int id3 = m_Nvc * 2;
 
 2035   int id4 = m_Nvc * 3;
 
 2037   int    isite = m_arg[itask].isite;
 
 2038   double *w2   = &v2[Nvcd * isite];
 
 2039   double *w1   = &v1[Nvcd * isite];
 
 2041   for (
int it = 0; it < m_Mt; ++it) {
 
 2042     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2043       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2044         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2045         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2046           w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2047           w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2048           w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
 
 2049           w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
 
 2059                                                             double *v2, 
double *v1)
 
 2061   int Nvcd = m_Nvc * 
m_Nd;
 
 2062   int Nxy  = m_Nx2 * m_Ny;
 
 2066   int id3 = m_Nvc * 2;
 
 2067   int id4 = m_Nvc * 3;
 
 2069   int    isite = m_arg[itask].isite;
 
 2070   double *w2   = &v2[Nvcd * isite];
 
 2071   double *w1   = &v1[Nvcd * isite];
 
 2073   for (
int it = 0; it < m_Mt; ++it) {
 
 2074     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2075       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2076         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2077         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2078           w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
 
 2079           w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
 
 2080           w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2081           w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
 2093   int Nvcd = m_Nvc * 
m_Nd;
 
 2094   int Nxy  = m_Nx2 * m_Ny;
 
 2098   int id3 = m_Nvc * 2;
 
 2099   int id4 = m_Nvc * 3;
 
 2101   int    isite = m_arg[itask].isite;
 
 2102   double *w1   = &v1[Nvcd * isite];
 
 2104   for (
int it = 0; it < m_Mt; ++it) {
 
 2105     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2106       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2107         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2108         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2109           double wt1 = w1[ivc + id1 + iv];
 
 2110           double wt2 = w1[ivc + id2 + iv];
 
 2111           w1[ivc + id1 + iv] = w1[ivc + id3 + iv];
 
 2112           w1[ivc + id2 + iv] = w1[ivc + id4 + iv];
 
 2113           w1[ivc + id3 + iv] = wt1;
 
 2114           w1[ivc + id4 + iv] = wt2;
 
 2126   int Nvcd = m_Nvc * 
m_Nd;
 
 2127   int Nxy  = m_Nx2 * m_Ny;
 
 2131   int id3 = m_Nvc * 2;
 
 2132   int id4 = m_Nvc * 3;
 
 2134   int    isite = m_arg[itask].isite;
 
 2135   double *w1   = &v1[Nvcd * isite];
 
 2137   for (
int it = 0; it < m_Mt; ++it) {
 
 2138     for (
int iz = 0; iz < m_Mz; ++iz) {
 
 2139       for (
int ixy = 0; ixy < Nxy; ++ixy) {
 
 2140         int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
 
 2141         for (
int ivc = 0; ivc < m_Nvc; ++ivc) {
 
 2142           w1[ivc + id3 + iv] = -w1[ivc + id3 + iv];
 
 2143           w1[ivc + id4 + iv] = -w1[ivc + id4 + iv];
 
void mult_tm2_dirac_thread(int, double *, double *, int)
 
const Field_F Meo(const Field_F &, const int ieo)
 
void mult_tp1_dirac_thread(int, double *, double *, int)
 
void mult_ym1_thread(int, double *, double *, int)
 
void mult_xp1_thread(int, double *, double *, int)
 
void general(const char *format,...)
 
void mult_tm2_chiral_thread(int, double *, double *, int)
 
std::valarray< Channel * > m_bw_recv
 
void mult_ymb_thread(int, double *, double *, int)
 
void clear_thread(int, double *)
 
std::valarray< Channel * > m_fw_recv
 
valarray< mult_arg > m_arg
 
void mult_tmb_dirac_thread(int, double *, double *, int)
 
void mult_zp1_thread(int, double *, double *, int)
 
void gm5_dirac_thread(int, double *, double *)
 
void mult_xp2_thread(int, double *, double *, int)
 
void mult_ypb_thread(int, double *, double *, int)
 
void mult_tm1_dirac_thread(int, double *, double *, int)
 
void mult_zm2_thread(int, double *, double *, int)
 
std::valarray< Channel * > m_bw_send
 
void mult_tmb_chiral_thread(int, double *, double *, int)
 
void mult_xm2_thread(int, double *, double *, int)
 
void mult_ym2_thread(int, double *, double *, int)
 
static int get_num_threads_available()
returns number of threads (works outside of parallel region). 
 
void mult_xm1_thread(int, double *, double *, int)
 
void mult_tp1_chiral_thread(int, double *, double *, int)
 
std::valarray< Channel * > m_fw_send
 
void mult_tp2_dirac_thread(int, double *, double *, int)
 
void mult_tp2_chiral_thread(int, double *, double *, int)
 
void mult_tpb_chiral_thread(int, double *, double *, int)
 
void gm5_chiral_thread(int, double *, double *)
 
void mult_zp2_thread(int, double *, double *, int)
 
void mult_xmb_thread(int, double *, double *, int)
 
void mult_zmb_thread(int, double *, double *, int)
 
Bridge::VerboseLevel m_vl
 
void mult_yp1_thread(int, double *, double *, int)
 
void mult_zm1_thread(int, double *, double *, int)
 
void mult_zpb_thread(int, double *, double *, int)
 
void mult_yp2_thread(int, double *, double *, int)
 
void scal_thread(int, double *, double)
 
void mult_xpb_thread(int, double *, double *, int)
 
void mult_tm1_chiral_thread(int, double *, double *, int)
 
void mult_tpb_dirac_thread(int, double *, double *, int)