10 #ifndef MULT_DOMAINWALL_5DIN_QXS_INCLUDED 
   11 #define MULT_DOMAINWALL_5DIN_QXS_INCLUDED 
   20   int *Nsize, 
int *do_comm)
 
   26   int Nstv = Nxv * Nyv * Nz * Nt;
 
   27   int Nst  = Nstv * 
VLEN;
 
   32   int ith, nth, site0, site1;
 
   33   set_threadtask(ith, nth, site0, site1, Nstv);
 
   35   for (
int site = site0; site < site1; ++site) {
 
   36     real_t *vp2 = &vp[Nin5 * site];
 
   37     real_t *wp2 = &wp[Nin5 * site];
 
   38     real_t *yp2 = &yp[Nin5 * site];
 
   40     for (
int is = 0; is < Ns; ++is) {
 
   43       for (
int ic = 0; ic < 
NC; ++ic) {
 
   44         svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
 
   46         int    is_up = (is + 1) % Ns;
 
   48         if (is == Ns - 1) Fup = -0.5 * mq;
 
   49         set_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
   50                            vt3r, vt3i, vt4r, vt4i,
 
   53         int    is_dn = (is - 1 + Ns) % Ns;
 
   55         if (is == 0) Fdn = -0.5 * mq;
 
   56         add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
   57                            vt3r, vt3i, vt4r, vt4i,
 
   61         real_t   FF1    = b[is] * (4.0 - M0) + 1.0;
 
   62         real_t   FF2    = c[is] * (4.0 - M0) - 1.0;
 
   63         int      offset = 2 * 
ND * ic + 
NVCD * is;
 
   64         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt1r, 0 + offset);
 
   65         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt1i, 1 + offset);
 
   66         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt2r, 2 + offset);
 
   67         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt2i, 3 + offset);
 
   68         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt3r, 4 + offset);
 
   69         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt3i, 5 + offset);
 
   70         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt4r, 6 + offset);
 
   71         dw_5dir_axpy(pg, vp2, yp2, wp2, FF1, FF2, b[is], c[is], vt4i, 7 + offset);
 
   83   int *Nsize, 
int *do_comm)
 
   89   int Nstv = Nxv * Nyv * Nz * Nt;
 
   90   int Nst  = Nstv * 
VLEN;
 
   95   int ith, nth, site0, site1;
 
   96   set_threadtask(ith, nth, site0, site1, Nstv);
 
   98   for (
int site = site0; site < site1; ++site) {
 
   99     real_t *vp2 = &vp[Nin5 * site];
 
  100     real_t *wp2 = &wp[Nin5 * site];
 
  101     real_t *yp2 = &yp[Nin5 * site];
 
  107     for (
int is = 0; is < Ns; ++is) {
 
  110       for (
int ic = 0; ic < 
NC; ++ic) {
 
  111         svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
 
  112         FF1 = b[is] * (4.0 - M0) + 1.0;
 
  114         int    index = 2 * 
ND * ic + 
NVCD * is;
 
  118         dw_5dir_dag(pg, vt3r, vt3i, vt4r, vt4i, vt1r, vt1i, vt2r, vt2i,
 
  119                     wp2, yp2, -FF1, -a1, index);
 
  121         int is_up = (is + 1) % Ns;
 
  122         FF2 = c[is_up] * (4.0 - M0) - 1.0;
 
  123         real_t   aup = -0.5 * c[is_up];
 
  124         int      index_up = 2 * 
ND * ic + 
NVCD * is_up;
 
  125         svreal_t xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i;
 
  126         dw_5dir_dag(pg, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i,
 
  127                     wp2, yp2, FF2, aup, index_up);
 
  130         if (is == Ns - 1) Fup = -0.5 * mq;
 
  131         add_aPp5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i,
 
  132                            Fup, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i);
 
  134         int is_dn = (is - 1 + Ns) % Ns;
 
  135         FF2 = c[is_dn] * (4.0 - M0) - 1.0;
 
  136         real_t adn      = -0.5 * c[is_dn];
 
  137         int    index_dn = 2 * 
ND * ic + 
NVCD * is_dn;
 
  138         dw_5dir_dag(pg, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i,
 
  139                     wp2, yp2, FF2, adn, index_dn);
 
  142         if (is == 0) Fdn = -0.5 * mq;
 
  143         add_aPm5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i,
 
  144                            -Fdn, xt1r, xt1i, xt2r, xt2i, xt3r, xt3i, xt4r, xt4i);
 
  146         int offset = 2 * 
ND * ic + 
NVCD * is;
 
  147         save_vec(pg, &vp2[
VLEN * (0 + offset)], vt1r);
 
  148         save_vec(pg, &vp2[
VLEN * (1 + offset)], vt1i);
 
  149         save_vec(pg, &vp2[
VLEN * (2 + offset)], vt2r);
 
  150         save_vec(pg, &vp2[
VLEN * (3 + offset)], vt2i);
 
  151         save_vec(pg, &vp2[
VLEN * (4 + offset)], vt3r);
 
  152         save_vec(pg, &vp2[
VLEN * (5 + offset)], vt3i);
 
  153         save_vec(pg, &vp2[
VLEN * (6 + offset)], vt4r);
 
  154         save_vec(pg, &vp2[
VLEN * (7 + offset)], vt4i);
 
  169   int Nstv = Nxv * Nyv * Nz * Nt;
 
  172   int Nin5 = Nin4 * Ns;
 
  174   int ith, nth, site0, site1;
 
  175   set_threadtask(ith, nth, site0, site1, Nstv);
 
  177   for (
int site = site0; site < site1; ++site) {
 
  178     real_t *vp2 = &vp[Nin5 * site];
 
  179     real_t *wp2 = &wp[Nin5 * site];
 
  182     for (
int is = 0; is < Ns; ++is) {
 
  183       for (
int ic = 0; ic < 
NC; ++ic) {
 
  184         svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
 
  185         int      index = 2 * 
ND * ic + 
NVCD * is;
 
  186         load_vec(pg, vt1r, &wp2[
VLEN * (4 + index)]);
 
  187         load_vec(pg, vt1i, &wp2[
VLEN * (5 + index)]);
 
  188         load_vec(pg, vt2r, &wp2[
VLEN * (6 + index)]);
 
  189         load_vec(pg, vt2i, &wp2[
VLEN * (7 + index)]);
 
  190         load_vec(pg, vt3r, &wp2[
VLEN * (0 + index)]);
 
  191         load_vec(pg, vt3i, &wp2[
VLEN * (1 + index)]);
 
  192         load_vec(pg, vt4r, &wp2[
VLEN * (2 + index)]);
 
  193         load_vec(pg, vt4i, &wp2[
VLEN * (3 + index)]);
 
  203         save_vec(pg, &vp2[
VLEN * (0 + index)], vt1r);
 
  204         save_vec(pg, &vp2[
VLEN * (1 + index)], vt1i);
 
  205         save_vec(pg, &vp2[
VLEN * (2 + index)], vt2r);
 
  206         save_vec(pg, &vp2[
VLEN * (3 + index)], vt2i);
 
  207         save_vec(pg, &vp2[
VLEN * (4 + index)], vt3r);
 
  208         save_vec(pg, &vp2[
VLEN * (5 + index)], vt3i);
 
  209         save_vec(pg, &vp2[
VLEN * (6 + index)], vt4r);
 
  210         save_vec(pg, &vp2[
VLEN * (7 + index)], vt4i);
 
  224   int Nstv = Nxv * Nyv * Nz * Nt;
 
  227   int Nin5 = Nin4 * Ns;
 
  229   int ith, nth, site0, site1;
 
  230   set_threadtask(ith, nth, site0, site1, Nstv);
 
  235   for (
int site = site0; site < site1; ++site) {
 
  236     real_t *vp2 = &vp[Nin5 * site];
 
  238     for (
int is = 0; is < Ns; ++is) {
 
  239       save_vec(&vp2[Nin4 * is], yL, 
NVCD);
 
  250   int *Nsize, 
int *do_comm)
 
  256   int Nstv = Nxv * Nyv * Nz * Nt;
 
  257   int Nst  = Nstv * 
VLEN;
 
  260   int Nin5 = Nin4 * Ns;
 
  265   int Nxyz = Nxv * Nyv * Nz;
 
  267   svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
 
  268   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  269   set_predicate_xp(pg1_xp, pg2_xp);
 
  270   set_predicate_xm(pg1_xm, pg2_xm);
 
  271   set_predicate_yp(pg1_yp, pg2_yp);
 
  272   set_predicate_ym(pg1_ym, pg2_ym);
 
  274   int ith, nth, site0, site1;
 
  275   set_threadtask(ith, nth, site0, site1, Nstv);
 
  279   for (
int site = site0; site < site1; ++site) {
 
  281     int iyzt = site / Nxv;
 
  282     int ixy  = site % Nxy;
 
  284     int izt  = site / Nxy;
 
  287     int ixyz = site % Nxyz;
 
  291     real_t *wp2 = &wp[Nin5 * site];
 
  292     real_t *vp2 = &vp[Nin5 * site];
 
  298     load_vec(vL, vp2, 
NVCD * Ns);
 
  302     if ((ix < Nxv - 1) || (do_comm[idir] == 0)) {
 
  303       int ix2 = (ix + 1) % Nxv;
 
  304       nei = ix2 + Nxv * iyzt;
 
  305       real_t *wpn = &wp[Nin5 * nei];
 
  307       for (
int is = 0; is < Ns; ++is) {
 
  308         mult_wilson_xpb(pg1_xp, pg2_xp, &vL[
NVCD * is].v[0], u,
 
  309                         &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  313     if ((ix > 0) || (do_comm[idir] == 0)) {
 
  314       int ix2 = (ix - 1 + Nxv) % Nxv;
 
  315       nei = ix2 + Nxv * iyzt;
 
  316       real_t *wpn = &wp[Nin5 * nei];
 
  319       for (
int is = 0; is < Ns; ++is) {
 
  320         mult_wilson_xmb(pg1_xm, pg2_xm, &vL[
NVCD * is], ux, un,
 
  321                         &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  327     if ((iy < Nyv - 1) || (do_comm[idir] == 0)) {
 
  328       int iy2 = (iy + 1) % Nyv;
 
  329       nei = ix + Nxv * (iy2 + Nyv * izt);
 
  330       real_t *wpn = &wp[Nin5 * nei];
 
  332       for (
int is = 0; is < Ns; ++is) {
 
  333         mult_wilson_ypb(pg1_yp, pg2_yp, &vL[
NVCD * is].v[0], u,
 
  334                         &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  338     if ((iy > 0) || (do_comm[idir] == 0)) {
 
  339       int iy2 = (iy - 1 + Nyv) % Nyv;
 
  340       nei = ix + Nxv * (iy2 + Nyv * izt);
 
  341       real_t *wpn = &wp[Nin5 * nei];
 
  344       for (
int is = 0; is < Ns; ++is) {
 
  345         mult_wilson_ymb(pg1_ym, pg2_ym, &vL[
NVCD * is].v[0], ux, un,
 
  346                         &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  352     if ((iz < Nz - 1) || (do_comm[idir] == 0)) {
 
  353       int iz2 = (iz + 1) % Nz;
 
  354       nei = ixy + Nxy * (iz2 + Nz * it);
 
  355       real_t *wpn = &wp[Nin5 * nei];
 
  357       for (
int is = 0; is < Ns; ++is) {
 
  358         mult_wilson_zpb(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
 
  362     if ((iz > 0) || (do_comm[idir] == 0)) {
 
  363       int iz2 = (iz - 1 + Nz) % Nz;
 
  364       nei = ixy + Nxy * (iz2 + Nz * it);
 
  365       real_t *wpn = &wp[Nin5 * nei];
 
  367       for (
int is = 0; is < Ns; ++is) {
 
  368         mult_wilson_zmb(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
 
  374     if ((it < Nt - 1) || (do_comm[idir] == 0)) {
 
  375       int it2 = (it + 1) % Nt;
 
  376       nei = ixyz + Nxyz * it2;
 
  377       real_t *wpn = &wp[Nin5 * nei];
 
  379       for (
int is = 0; is < Ns; ++is) {
 
  380         mult_wilson_tpb_dirac(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
 
  384     if ((it > 0) || (do_comm[idir] == 0)) {
 
  385       int it2 = (it - 1 + Nt) % Nt;
 
  386       nei = ixyz + Nxyz * it2;
 
  387       real_t *wpn = &wp[Nin5 * nei];
 
  389       for (
int is = 0; is < Ns; ++is) {
 
  390         mult_wilson_tmb_dirac(&vL[
NVCD * is].v[0], u, &wpn[Nin4 * is]);
 
  394     save_vec(vp2, vL, 
NVCD * Ns);
 
  407   int *Nsize, 
int *do_comm)
 
  413   int Nstv = Nxv * Nyv * Nz * Nt;
 
  414   int Nst  = Nstv * 
VLEN;
 
  418   int Nin5  = Nin4 * Ns;
 
  420   int Nin5H = Nin4H * Ns;
 
  422   svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
 
  423   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  424   set_predicate_xp(pg1_xp, pg2_xp);
 
  425   set_predicate_xm(pg1_xm, pg2_xm);
 
  426   set_predicate_yp(pg1_yp, pg2_yp);
 
  427   set_predicate_ym(pg1_ym, pg2_ym);
 
  429   set_index_xp(svidx_xp);
 
  430   set_index_xm(svidx_xm);
 
  432   if (do_comm[0] == 1) {
 
  435     int Nyzt = Nyv * Nz * Nt;
 
  437     int ith, nth, site0, site1;
 
  438     set_threadtask(ith, nth, site0, site1, Nyzt);
 
  440     for (
int iyzt = site0; iyzt < site1; ++iyzt) {
 
  443         int    site = ix + Nxv * iyzt;
 
  444         real_t *wp2 = &wp[Nin5 * site];
 
  446         set_index_xm(svidx_xm);
 
  447         for (
int is = 0; is < Ns; ++is) {
 
  448           mult_wilson_xp1(pg2_xm, svidx_xm,
 
  449                           &buf1_xp[ibf + 
VLENY * 
NVC * 
ND2 * is], &wp2[Nin4 * is]);
 
  455         int    site = ix + Nxv * iyzt;
 
  456         real_t *wp2 = &wp[Nin5 * site];
 
  459         set_index_xp(svidx_xp);
 
  460         for (
int is = 0; is < Ns; ++is) {
 
  461           mult_wilson_xm1(pg2_xp, svidx_xp,
 
  462                           &buf1_xm[ibf + 
VLENY * 
NVC * 
ND2 * is], u2, &wp2[Nin4 * is]);
 
  468   if (do_comm[1] == 1) {
 
  470     int Nxzt = Nxv * Nz * Nt;
 
  472     int ith, nth, site0, site1;
 
  473     set_threadtask(ith, nth, site0, site1, Nxzt);
 
  475     for (
int ixzt = site0; ixzt < site1; ++ixzt) {
 
  477       int izt = ixzt / Nxv;
 
  481         int    site = ix + Nxv * (iy + Nyv * izt);
 
  482         real_t *wp2 = &wp[Nin5 * site];
 
  484         int ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nxv * izt);
 
  485         for (
int is = 0; is < Ns; ++is) {
 
  486           mult_wilson_yp1(pg2_ym,
 
  487                           &buf1_yp[ibf + 
VLENX * 
NVC * 
ND2 * is], &wp2[Nin4 * is]);
 
  493         int    site = ix + Nxv * (iy + Nyv * izt);
 
  494         real_t *wp2 = &wp[Nin5 * site];
 
  496         int    ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nxv * izt);
 
  498         for (
int is = 0; is < Ns; ++is) {
 
  499           mult_wilson_ym1(pg2_yp,
 
  500                           &buf1_ym[ibf + 
VLENX * 
NVC * 
ND2 * is], u2, &wp2[Nin4 * is]);
 
  507   if (do_comm[2] == 1) {
 
  510     int Nxyt = Nxv * Nyv * Nt;
 
  512     int ith, nth, site0, site1;
 
  513     set_threadtask(ith, nth, site0, site1, Nxyt);
 
  515     for (
int ixyt = site0; ixyt < site1; ++ixyt) {
 
  516       int ixy = ixyt % Nxy;
 
  521         int    site = ixy + Nxy * (iz + Nz * it);
 
  522         real_t *wp2 = &wp[Nin5 * site];
 
  523         int    ibf  = Nin5H * (ixy + Nxy * it);
 
  524         for (
int is = 0; is < Ns; ++is) {
 
  525           mult_wilson_zp1(&buf1_zp[ibf + Nin4H * is], &wp2[Nin4 * is]);
 
  530         int    site = ixy + Nxy * (iz + Nz * it);
 
  531         real_t *wp2 = &wp[Nin5 * site];
 
  532         int    ibf  = Nin5H * (ixy + Nxy * it);
 
  534         for (
int is = 0; is < Ns; ++is) {
 
  535           mult_wilson_zm1(&buf1_zm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
 
  541   if (do_comm[3] == 1) {
 
  543     int Nxyz = Nxv * Nyv * Nz;
 
  545     int ith, nth, site0, site1;
 
  546     set_threadtask(ith, nth, site0, site1, Nxyz);
 
  548     for (
int ixyz = site0; ixyz < site1; ++ixyz) {
 
  551         int    site = ixyz + Nxyz * it;
 
  552         real_t *wp2 = &wp[Nin5 * site];
 
  553         int    ibf  = Nin5H * ixyz;
 
  554         for (
int is = 0; is < Ns; ++is) {
 
  555           mult_wilson_tp1_dirac(&buf1_tp[ibf + Nin4H * is], &wp2[Nin4 * is]);
 
  561         int    site = ixyz + Nxyz * it;
 
  562         real_t *wp2 = &wp[Nin5 * site];
 
  563         int    ibf  = Nin5H * ixyz;
 
  565         for (
int is = 0; is < Ns; ++is) {
 
  566           mult_wilson_tm1_dirac(&buf1_tm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
 
  582   int *Nsize, 
int *do_comm)
 
  590   int Nstv = Nxv * Nyv * Nz * Nt;
 
  591   int Nst  = Nstv * 
VLEN;
 
  594   int Nin5  = Nin4 * Ns;
 
  596   int Nin5H = Nin4H * Ns;
 
  600   int Nxyz = Nxv * Nyv * Nz;
 
  602   svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
 
  603   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  604   set_predicate_xp(pg1_xp, pg2_xp);
 
  605   set_predicate_xm(pg1_xm, pg2_xm);
 
  606   set_predicate_yp(pg1_yp, pg2_yp);
 
  607   set_predicate_ym(pg1_ym, pg2_ym);
 
  609   set_index_xp(svidx_xp);
 
  610   set_index_xm(svidx_xm);
 
  612   int ith, nth, site0, site1;
 
  613   set_threadtask(ith, nth, site0, site1, Nstv);
 
  617   for (
int site = site0; site < site1; ++site) {
 
  619     int iyzt = site / Nxv;
 
  620     int ixy  = site % Nxy;
 
  622     int izt  = site / Nxy;
 
  625     int ixyz = site % Nxyz;
 
  629     real_t *wp2 = &wp[Nin5 * site];
 
  630     real_t *vp2 = &vp[Nin5 * site];
 
  636     clear_vec(vL, 
NVCD * Ns);
 
  641     if (do_comm[idir] == 1) {
 
  645         for (
int is = 0; is < Ns; ++is) {
 
  646           set_index_xp(svidx_xp);
 
  647           mult_wilson_xp2(pg1_xp, pg2_xp, svidx_xp,
 
  648                           &vL[
NVCD * is].v[0], u2,
 
  649                           &wp2[Nin4 * is], &buf2_xp[ibf + 
VLENY * 
NVC * 
ND2 * is]);
 
  657         set_index_xm(svidx_xm);
 
  658         for (
int is = 0; is < Ns; ++is) {
 
  659           mult_wilson_xm2(pg1_xm, pg2_xm, svidx_xm,
 
  660                           &vL[
NVCD * is].v[0], u2,
 
  661                           &wp2[Nin4 * is], &buf2_xm[ibf + 
VLENY * 
NVC * 
ND2 * is]);
 
  668     if (do_comm[idir] == 1) {
 
  670         int    ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nxv * izt);
 
  672         for (
int is = 0; is < Ns; ++is) {
 
  673           mult_wilson_yp2(pg1_yp, pg2_yp,
 
  674                           &vL[
NVCD * is].v[0], u2,
 
  675                           &wp2[Nin4 * is], &buf2_yp[ibf + 
VLENX * 
NVC * 
ND2 * is]);
 
  681         int    ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nxv * izt);
 
  683         for (
int is = 0; is < Ns; ++is) {
 
  684           mult_wilson_ym2(pg1_ym, pg2_ym,
 
  685                           &vL[
NVCD * is].v[0], u2,
 
  686                           &wp2[Nin4 * is], &buf2_ym[ibf + 
VLENX * 
NVC * 
ND2 * is]);
 
  693     if (do_comm[idir] == 1) {
 
  695         int    ibf = Nin5H * (ixy + Nxy * it);
 
  697         for (
int is = 0; is < Ns; ++is) {
 
  698           mult_wilson_zp2(&vL[
NVCD * is].v[0], u2, &buf2_zp[ibf + Nin4H * is]);
 
  704         int ibf = Nin5H * (ixy + Nxy * it);
 
  705         for (
int is = 0; is < Ns; ++is) {
 
  706           mult_wilson_zm2(&vL[
NVCD * is].v[0], &buf2_zm[ibf + Nin4H * is]);
 
  713     if (do_comm[idir] == 1) {
 
  715         int    ibf = Nin5H * ixyz;
 
  717         for (
int is = 0; is < Ns; ++is) {
 
  718           mult_wilson_tp2_dirac(&vL[
NVCD * is].v[0], u2, &buf2_tp[ibf + Nin4H * is]);
 
  724         int ibf = Nin5H * ixyz;
 
  725         for (
int is = 0; is < Ns; ++is) {
 
  726           mult_wilson_tm2_dirac(&vL[
NVCD * is].v[0], &buf2_tm[ibf + Nin4H * is]);
 
  736       for (
int ivs = 0; ivs < 
NVCD * Ns; ivs += 4) {
 
  739         load_vec(pg, yt1, &vp2[
VLEN * (ivs)]);
 
  740         load_vec(pg, yt2, &vp2[
VLEN * (ivs + 1)]);
 
  741         load_vec(pg, yt3, &vp2[
VLEN * (ivs + 2)]);
 
  742         load_vec(pg, yt4, &vp2[
VLEN * (ivs + 3)]);
 
  743         load_vec(pg, vt1, &(vL[ivs].v[0]));
 
  744         load_vec(pg, vt2, &(vL[ivs + 1].v[0]));
 
  745         load_vec(pg, vt3, &(vL[ivs + 2].v[0]));
 
  746         load_vec(pg, vt4, &(vL[ivs + 3].v[0]));
 
  748         add_vec(pg, yt1, vt1);
 
  749         add_vec(pg, yt2, vt2);
 
  750         add_vec(pg, yt3, vt3);
 
  751         add_vec(pg, yt4, vt4);
 
  752         save_vec(pg, &vp2[
VLEN * (ivs)], yt1);
 
  753         save_vec(pg, &vp2[
VLEN * (ivs + 1)], yt2);
 
  754         save_vec(pg, &vp2[
VLEN * (ivs + 2)], yt3);
 
  755         save_vec(pg, &vp2[
VLEN * (ivs + 3)], yt4);