18 #ifndef MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED 
   19 #define MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED 
   30   int *Nsize, 
int *do_comm)
 
   36   int Nst2v = Nx2v * Ny * Nz * Nt;
 
   41   int ith, nth, site0, site1;
 
   42   set_threadtask(ith, nth, site0, site1, Nst2v);
 
   44   for (
int site = site0; site < site1; ++site) {
 
   45     real_t *wp2 = &wp[Nin5 * site];
 
   46     real_t *yp2 = &yp[Nin5 * site];
 
   49     for (
int is = 0; is < Ns; ++is) {
 
   50       for (
int ic = 0; ic < 
NC; ++ic) {
 
   51         svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
 
   52         int      offset = 2 * 
ND * ic + 
NVCD * is;
 
   54         real_t   bb     = b[is] * factor;
 
   56         load_vec(pg, vt1r, &wp2[
VLEN * (offset + 0)]);
 
   57         load_vec(pg, vt1i, &wp2[
VLEN * (offset + 1)]);
 
   58         load_vec(pg, vt2r, &wp2[
VLEN * (offset + 2)]);
 
   59         load_vec(pg, vt2i, &wp2[
VLEN * (offset + 3)]);
 
   60         load_vec(pg, vt3r, &wp2[
VLEN * (offset + 4)]);
 
   61         load_vec(pg, vt3i, &wp2[
VLEN * (offset + 5)]);
 
   62         load_vec(pg, vt4r, &wp2[
VLEN * (offset + 6)]);
 
   63         load_vec(pg, vt4i, &wp2[
VLEN * (offset + 7)]);
 
   64         scal_vec(pg, vt1r, bb);
 
   65         scal_vec(pg, vt1i, bb);
 
   66         scal_vec(pg, vt2r, bb);
 
   67         scal_vec(pg, vt2i, bb);
 
   68         scal_vec(pg, vt3r, bb);
 
   69         scal_vec(pg, vt3i, bb);
 
   70         scal_vec(pg, vt4r, bb);
 
   71         scal_vec(pg, vt4i, bb);
 
   73         int    is_up = (is + 1) % Ns;
 
   74         real_t Fup   = 0.5 * c[is] * factor;
 
   75         if (is == Ns - 1) Fup *= -mq;
 
   76         add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
   77                            vt3r, vt3i, vt4r, vt4i,
 
   80         int    is_dn = (is - 1 + Ns) % Ns;
 
   81         real_t Fdn   = 0.5 * c[is] * factor;
 
   82         if (is == 0) Fdn *= -mq;
 
   83         add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
   84                            vt3r, vt3i, vt4r, vt4i,
 
   86         save_vec(pg, &yp2[
VLEN * (offset + 0)], vt1r);
 
   87         save_vec(pg, &yp2[
VLEN * (offset + 1)], vt1i);
 
   88         save_vec(pg, &yp2[
VLEN * (offset + 2)], vt2r);
 
   89         save_vec(pg, &yp2[
VLEN * (offset + 3)], vt2i);
 
   90         save_vec(pg, &yp2[
VLEN * (offset + 4)], vt3r);
 
   91         save_vec(pg, &yp2[
VLEN * (offset + 5)], vt3i);
 
   92         save_vec(pg, &yp2[
VLEN * (offset + 6)], vt4r);
 
   93         save_vec(pg, &yp2[
VLEN * (offset + 7)], vt4i);
 
  110   int Nstv = Nxv * Ny * Nz * Nt;
 
  113   int Nin5 = Nin4 * Ns;
 
  115   int ith, nth, site0, site1;
 
  116   set_threadtask(ith, nth, site0, site1, Nstv);
 
  118   for (
int site = site0; site < site1; ++site) {
 
  119     real_t   *vp2 = &vp[Nin5 * site];
 
  120     real_t   *wp2 = &wp[Nin5 * site];
 
  122     for (
int is = 0; is < Ns; ++is) {
 
  123       for (
int ic = 0; ic < 
NC; ++ic) {
 
  126         int      offset = 2 * 
ND * ic + 
NVCD * is;
 
  128         load_mult_gm5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i,
 
  129                                 vt3r, vt3i, vt4r, vt4i, &wp2[
VLEN * offset]);
 
  130         save_vec(pg, &vp2[
VLEN * (offset + 0)], vt1r);
 
  131         save_vec(pg, &vp2[
VLEN * (offset + 1)], vt1i);
 
  132         save_vec(pg, &vp2[
VLEN * (offset + 2)], vt2r);
 
  133         save_vec(pg, &vp2[
VLEN * (offset + 3)], vt2i);
 
  134         save_vec(pg, &vp2[
VLEN * (offset + 4)], vt3r);
 
  135         save_vec(pg, &vp2[
VLEN * (offset + 5)], vt3i);
 
  136         save_vec(pg, &vp2[
VLEN * (offset + 6)], vt4r);
 
  137         save_vec(pg, &vp2[
VLEN * (offset + 7)], vt4i);
 
  151   int Nstv = Nxv * Ny * Nz * Nt;
 
  154   int Nin5 = Nin4 * Ns;
 
  156   int ith, nth, site0, site1;
 
  157   set_threadtask(ith, nth, site0, site1, Nstv);
 
  161   pg = set_predicate();
 
  163   for (
int site = site0; site < site1; ++site) {
 
  164     real_t *vp2 = &vp[Nin5 * site];
 
  166     for (
int is = 0; is < Ns; ++is) {
 
  167       for (
int i = 0; i < 
NVCD; ++i) {
 
  168         save_vec(pg, &vp2[
VLEN * (is * 
NVCD + i)], y);
 
  181   int *Nsize, 
int *do_comm)
 
  187   int Nst2v = Nx2v * Ny * Nz * Nt;
 
  188   int Nst2  = Nst2v * 
VLEN;
 
  191   int Nin5 = Nin4 * Ns;
 
  193   int ith, nth, site0, site1;
 
  194   set_threadtask(ith, nth, site0, site1, Nst2v);
 
  196   for (
int site = site0; site < site1; ++site) {
 
  197     real_t *vp2 = &vp[Nin5 * site];
 
  198     real_t *yp2 = &yp[Nin5 * site];
 
  201     for (
int is = 0; is < Ns; ++is) {
 
  202       for (
int ic = 0; ic < 
NC; ++ic) {
 
  203         svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
 
  204         int      offset = 2 * 
ND * ic + 
NVCD * is;
 
  207         load_vec(pg, vt3r, &yp2[
VLEN * (offset + 0)]);
 
  208         load_vec(pg, vt3i, &yp2[
VLEN * (offset + 1)]);
 
  209         load_vec(pg, vt4r, &yp2[
VLEN * (offset + 2)]);
 
  210         load_vec(pg, vt4i, &yp2[
VLEN * (offset + 3)]);
 
  211         load_vec(pg, vt1r, &yp2[
VLEN * (offset + 4)]);
 
  212         load_vec(pg, vt1i, &yp2[
VLEN * (offset + 5)]);
 
  213         load_vec(pg, vt2r, &yp2[
VLEN * (offset + 6)]);
 
  214         load_vec(pg, vt2i, &yp2[
VLEN * (offset + 7)]);
 
  215         scal_vec(pg, vt1r, bb);
 
  216         scal_vec(pg, vt1i, bb);
 
  217         scal_vec(pg, vt2r, bb);
 
  218         scal_vec(pg, vt2i, bb);
 
  219         scal_vec(pg, vt3r, bb);
 
  220         scal_vec(pg, vt3i, bb);
 
  221         scal_vec(pg, vt4r, bb);
 
  222         scal_vec(pg, vt4i, bb);
 
  224         int    is_up = (is + 1) % Ns;
 
  225         real_t Fup   = 0.5 * (-0.5) * c[is_up];
 
  226         if (is == Ns - 1) Fup *= -mq;
 
  227         add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
  228                            vt3r, vt3i, vt4r, vt4i,
 
  229                            Fup, yp2, is_up, ic);
 
  231         int    is_dn = (is - 1 + Ns) % Ns;
 
  232         real_t Fdn   = -0.5 * (-0.5) * c[is_dn];
 
  233         if (is == 0) Fdn *= -mq;
 
  234         add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
 
  235                            vt3r, vt3i, vt4r, vt4i,
 
  236                            Fdn, yp2, is_dn, ic);
 
  237         save_vec(pg, &vp2[
VLEN * (offset + 0)], vt1r);
 
  238         save_vec(pg, &vp2[
VLEN * (offset + 1)], vt1i);
 
  239         save_vec(pg, &vp2[
VLEN * (offset + 2)], vt2r);
 
  240         save_vec(pg, &vp2[
VLEN * (offset + 3)], vt2i);
 
  241         save_vec(pg, &vp2[
VLEN * (offset + 4)], vt3r);
 
  242         save_vec(pg, &vp2[
VLEN * (offset + 5)], vt3i);
 
  243         save_vec(pg, &vp2[
VLEN * (offset + 6)], vt4r);
 
  244         save_vec(pg, &vp2[
VLEN * (offset + 7)], vt4i);
 
  256   int *Leo, 
int *Nsize, 
int *do_comm,
 
  263   int Nst2v = Nx2v * Ny * Nz * Nt;
 
  264   int Nst2  = Nst2v * 
VLEN;
 
  267   int Nin5 = Nin4 * Ns;
 
  269   int NvU2 = 
NDF * Nst2;
 
  271   int Nxy2  = Nx2v * Ny;
 
  272   int Nxyz2 = Nx2v * Ny * Nz;
 
  274   svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
 
  275   svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
 
  276   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  277   set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
 
  278   set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
 
  279   set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
 
  280   set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
 
  281   set_predicate_yp(pg1_yp, pg2_yp);
 
  282   set_predicate_ym(pg1_ym, pg2_ym);
 
  284   int ith, nth, site0, site1;
 
  285   set_threadtask(ith, nth, site0, site1, Nst2v);
 
  289   for (
int site = site0; site < site1; ++site) {
 
  290     int ix   = site % Nx2v;
 
  291     int iyzt = site / Nx2v;
 
  293     int izt  = site / Nxy2;
 
  296     int ixy  = ix + Nx2v * iy;
 
  297     int ixyz = ixy + Nxy2 * iz;
 
  298     int jeo  = (ieo + Leo[
VLENY * iyzt]) % 2;
 
  301     int ix_p1   = (site + 1) % Nx2v;
 
  302     int iyzt_p1 = (site + 1) / Nx2v;
 
  303     int iy_p1   = iyzt_p1 % Ny;
 
  304     int izt_p1  = (site + 1) / Nxy2;
 
  305     int iz_p1   = izt_p1 % Nz;
 
  306     int it_p1   = izt_p1 / Nz;
 
  307     int ixy_p1  = ix_p1 + Nx2v * iy_p1;
 
  308     int ixyz_p1 = ixy_p1 + Nxy2 * iz_p1;
 
  309     int jeo_p1  = (ieo + Leo[
VLENY * iyzt_p1]) % 2;
 
  313     real_t *wp2 = &wp[Nin5 * site];
 
  314     real_t *vp2 = &vp[Nin5 * site];
 
  325     if ((ix < Nx2v - 1) || (do_comm[0] == 0)) {
 
  326       int nei = ix + 1 + Nx2v * iyzt;
 
  328       nei_p1 = ix - 1 + Nx2v * iyzt;
 
  329       if (ix == Nx2v - 1) nei = 0 + Nx2v * iyzt;
 
  331       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
  332       real_t *wpn = &wp[Nin5 * nei];
 
  335         for (
int is = 0; is < Ns; ++is) {
 
  340           mult_wilson_eo_xpb(pg1e_xp, pg2e_xp, pg3e_xp,
 
  341                              &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  344         for (
int is = 0; is < Ns; ++is) {
 
  349           mult_wilson_eo_xpb(pg1o_xp, pg2o_xp, pg3o_xp,
 
  350                              &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  358     if ((ix > 0) || (do_comm[0] == 0)) {
 
  359       int nei = ix - 1 + Nx2v * iyzt;
 
  361       int iy2_p1 = (iy + 1) % Ny;
 
  362       nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
 
  364       if (ix == 0) nei = Nx2v - 1 + Nx2v * iyzt;
 
  366       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  367       real_t *un  = &up[
VLEN * 
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
 
  368       real_t *wpn = &wp[Nin5 * nei];
 
  370         for (
int is = 0; is < Ns; ++is) {
 
  375           mult_wilson_eo_xmb(pg1e_xm, pg2e_xm, pg3e_xm, &vp2[Nin4 * is],
 
  376                              u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  379         for (
int is = 0; is < Ns; ++is) {
 
  384           mult_wilson_eo_xmb(pg1o_xm, pg2o_xm, pg3o_xm, &vp2[Nin4 * is],
 
  385                              u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  395     if ((iy < Ny - 1) || (do_comm[idir] == 0)) {
 
  396       int iy2 = (iy + 1) % Ny;
 
  397       int nei = ix + Nx2v * (iy2 + Ny * izt);
 
  399       int iy2_p1 = (iy - 1 + Ny) % Ny;
 
  400       nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
 
  403       real_t *wpn = &wp[Nin5 * nei];
 
  404       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
  405       for (
int is = 0; is < Ns; ++is) {
 
  408         mult_wilson_ypb(pg1_yp, pg2_yp,
 
  409                         &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  415     if ((iy > 0) || (do_comm[idir] == 0)) {
 
  416       int iy2 = (iy - 1 + Ny) % Ny;
 
  417       int nei = ix + Nx2v * (iy2 + Ny * izt);
 
  419       int iz2_p1 = (iz + 1) % Nz;
 
  420       nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
 
  422       real_t *wpn = &wp[Nin5 * nei];
 
  423       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  424       real_t *un  = &up[
VLEN * 
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
 
  425       for (
int is = 0; is < Ns; ++is) {
 
  427         mult_wilson_ymb(pg1_ym, pg2_ym,
 
  428                         &vp2[Nin4 * is], u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
 
  436     if ((iz < Nz - 1) || (do_comm[idir] == 0)) {
 
  437       int iz2 = (iz + 1) % Nz;
 
  438       int nei = ixy + Nxy2 * (iz2 + Nz * it);
 
  440       int iz2_p1 = (iz - 1 + Nz) % Nz;
 
  441       nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
 
  443       real_t *wpn = &wp[Nin5 * nei];
 
  444       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
  445       for (
int is = 0; is < Ns; ++is) {
 
  448         mult_wilson_zpb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
 
  454     if ((iz > 0) || (do_comm[idir] == 0)) {
 
  455       int iz2 = (iz - 1 + Nz) % Nz;
 
  456       int nei = ixy + Nxy2 * (iz2 + Nz * it);
 
  458       int it2_p1 = (it + 1) % Nt;
 
  459       nei_p1 = ixyz + Nxyz2 * it2_p1;
 
  461       real_t *wpn = &wp[Nin5 * nei];
 
  462       real_t *u2  = &up[
VLEN * 
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
 
  463       for (
int is = 0; is < Ns; ++is) {
 
  466         mult_wilson_zmb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
 
  474     if ((it < Nt - 1) || (do_comm[idir] == 0)) {
 
  475       int it2 = (it + 1) % Nt;
 
  476       int nei = ixyz + Nxyz2 * it2;
 
  478       int it2_p1 = (it - 1 + Nt) % Nt;
 
  479       nei_p1 = ixyz + Nxyz2 * it2_p1;
 
  481       real_t *wpn = &wp[Nin5 * nei];
 
  482       real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
  483       for (
int is = 0; is < Ns; ++is) {
 
  486         mult_wilson_tpb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
 
  492     if ((it > 0) || (do_comm[idir] == 0)) {
 
  493       int it2 = (it - 1 + Nt) % Nt;
 
  494       int nei = ixyz + Nxyz2 * it2;
 
  496       nei_p1 = ix_p1 + 1 + Nx2v * iyzt_p1;
 
  498       real_t *wpn = &wp[Nin5 * nei];
 
  499       real_t *u2  = &up[
VLEN * 
NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
 
  500       for (
int is = 0; is < Ns; ++is) {
 
  503         mult_wilson_tmb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
 
  518   int *Leo, 
int *Nsize, 
int *do_comm,
 
  525   int Nst2v = Nx2v * Ny * Nz * Nt;
 
  526   int Nst2  = Nst2v * 
VLEN;
 
  529   int Nin5  = Nin4 * Ns;
 
  531   int Nin5H = Nin4H * Ns;
 
  532   int NvU2  = 
NDF * Nst2;
 
  534   int Nxy2  = Nx2v * Ny;
 
  535   int Nxyz2 = Nx2v * Ny * Nz;
 
  537   svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
 
  538   svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
 
  539   set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
 
  540   set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
 
  541   set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
 
  542   set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
 
  543   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  544   set_predicate_yp(pg1_yp, pg2_yp);
 
  545   set_predicate_ym(pg1_ym, pg2_ym);
 
  547   set_index_xp_eo(svidx_xp);
 
  548   set_index_xm_eo(svidx_xm);
 
  551   if (do_comm[idir] == 1) {
 
  552     int Nyzt = Ny * Nz * Nt;
 
  554     int ith, nth, site0, site1;
 
  555     set_threadtask(ith, nth, site0, site1, Nyzt);
 
  557     for (
int iyzt = site0; iyzt < site1; ++iyzt) {
 
  559       int iz = (iyzt / Ny) % Nz;
 
  560       int it = iyzt / (Ny * Nz);
 
  562       int jeo = (ieo + Leo[
VLENY * iyzt]) % 2;
 
  564       int Nskipx = (
VLENY + 1) / 2;
 
  568       int ibf_up_xp1   = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
  569       int ibf_dn_xp1   = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
  570       int ibf_dn_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
  571       int ibf_up_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
  573       int ibf_up_xp1   = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
  574       int ibf_dn_xp1   = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
  575       int ibf_dn_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
  576       int ibf_up_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
  583       int ibf_up = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
  584       int ibf_dn = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
  586       int ibf_up = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
  587       int ibf_dn = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
  592         int    site = ix + Nx2v * iyzt;
 
  593         real_t *wp2 = &wp[Nin5 * site];
 
  596         int site_xp1   = ix + Nx2v * (iyzt + 1);
 
  597         int site_xp1_2 = Nx2v - 1 + Nx2v * (iyzt + 1);
 
  599         set_index_xm_eo(svidx_xm);
 
  602           for (
int is = 0; is < Ns; ++is) {
 
  607             if ((!((it == 0) || (it == Nt - 1))) &&
 
  608                 (!((iz == 0) || (iz == Nz - 1))) &&
 
  609                 (!((iy == 0) || (iy == Ny - 1)))) {
 
  615             mult_wilson_eo_xp1(pg2o_xm, svidx_xm,
 
  616                                &buf1_xp[ibf_up + Nskipx * 
NVC * 
ND2 * is],
 
  621           for (
int is = 0; is < Ns; ++is) {
 
  626             if ((!((it == 0) || (it == Nt - 1))) &&
 
  627                 (!((iz == 0) || (iz == Nz - 1))) &&
 
  628                 (!((iy == 0) || (iy == Ny - 1)))) {
 
  634             mult_wilson_eo_xp1(pg2e_xm, svidx_xm,
 
  635                                &buf1_xp[ibf_up + Nskipx * 
NVC * 
ND2 * is],
 
  643         int    site = ix + Nx2v * iyzt;
 
  644         real_t *wp2 = &wp[Nin5 * site];
 
  645         real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  647         int site_xp1   = ix + Nx2v * (iyzt + 1);
 
  648         int site_xp1_2 = 0 + Nx2v * (iyzt + 1);
 
  653         set_index_xp_eo(svidx_xp);
 
  655           for (
int is = 0; is < Ns; ++is) {
 
  660             if ((!((it == 0) || (it == Nt - 1))) &&
 
  661                 (!((iz == 0) || (iz == Nz - 1))) &&
 
  662                 (!((iy == 0) || (iy == Ny - 1)))) {
 
  668             mult_wilson_eo_xm1(pg2o_xp, svidx_xp,
 
  669                                &buf1_xm[ibf_dn + Nskipx * 
NVC * 
ND2 * is],
 
  670                                u2, &wp2[Nin4 * is]);
 
  674           for (
int is = 0; is < Ns; ++is) {
 
  679             if ((!((it == 0) || (it == Nt - 1))) &&
 
  680                 (!((iz == 0) || (iz == Nz - 1))) &&
 
  681                 (!((iy == 0) || (iy == Ny - 1)))) {
 
  687             mult_wilson_eo_xm1(pg2e_xp, svidx_xp,
 
  688                                &buf1_xm[ibf_dn + Nskipx * 
NVC * 
ND2 * is],
 
  689                                u2, &wp2[Nin4 * is]);
 
  698   if (do_comm[idir] == 1) {
 
  699     int Nxzt2 = Nx2v * Nz * Nt;
 
  701     int ith, nth, site0, site1;
 
  702     set_threadtask(ith, nth, site0, site1, Nxzt2);
 
  704     for (
int ixzt = site0; ixzt < site1; ++ixzt) {
 
  705       int ix  = ixzt % Nx2v;
 
  706       int izt = ixzt / Nx2v;
 
  715                     ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
 
  717                     ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
 
  721         int    site = ix + Nx2v * (iy + Ny * izt);
 
  723         real_t *wp2 = &wp[Nin5 * site];
 
  725         int iy_p1    = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
 
  726         int izt_p1   = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
 
  727         int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
 
  728         int iy_p2    = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
 
  729         int izt_p2   = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
 
  730         int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
 
  732         for (
int is = 0; is < Ns; ++is) {
 
  736           if ((!((it == 0) || (it == Nt - 1))) &&
 
  737               (!((iz == 0) || (iz == Nz - 1)))) {
 
  742           mult_wilson_yp1(pg2_ym,
 
  743                           &buf1_yp[ibf + 
VLENX * 
NVC * 
ND2 * is], &wp2[Nin4 * is]);
 
  748         int    site = ix + Nx2v * (iy + Ny * izt);
 
  750         real_t *wp2 = &wp[Nin5 * site];
 
  751         real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  753         int iy_p1    = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
 
  754         int izt_p1   = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
 
  755         int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
 
  756         int iy_p2    = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
 
  757         int izt_p2   = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
 
  758         int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
 
  762         for (
int is = 0; is < Ns; ++is) {
 
  766           if ((!((it == 0) || (it == Nt - 1))) &&
 
  767               (!((iz == 0) || (iz == Nz - 1)))) {
 
  773           mult_wilson_ym1(pg2_yp,
 
  774                           &buf1_ym[ibf + 
VLENX * 
NVC * 
ND2 * is], u2, &wp2[Nin4 * is]);
 
  781   if (do_comm[idir] == 1) {
 
  782     int Nxyt2 = Nxy2 * Nt;
 
  784     int ith, nth, site0, site1;
 
  785     set_threadtask(ith, nth, site0, site1, Nxyt2);
 
  787     for (
int ixyt = site0; ixyt < site1; ++ixyt) {
 
  788       int ixy = ixyt % Nxy2;
 
  789       int it  = ixyt / Nxy2;
 
  793       int ibf_zp1 = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
 
  794       int ibf_zp2 = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
 
  796       int ibf = Nin5H * (ixy + Nxy2 * it);
 
  800         int    site = ixy + Nxy2 * (iz + Nz * it);
 
  801         real_t *wp2 = &wp[Nin5 * site];
 
  803         int iz_p1    = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
 
  804         int it_p1    = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
 
  805         int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
 
  806         int iz_p2    = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
 
  807         int it_p2    = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
 
  808         int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
 
  810         for (
int is = 0; is < Ns; ++is) {
 
  814           if (!((it == 0) || (it == Nt - 1))) {
 
  819           mult_wilson_zp1(&buf1_zp[ibf + Nin4H * is], &wp2[Nin4 * is]);
 
  825         int    site = ixy + Nxy2 * (iz + Nz * it);
 
  826         real_t *wp2 = &wp[Nin5 * site];
 
  827         real_t *u2  = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  829         int iz_p1    = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
 
  830         int it_p1    = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
 
  831         int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
 
  832         int iz_p2    = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
 
  833         int it_p2    = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
 
  834         int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
 
  838         for (
int is = 0; is < Ns; ++is) {
 
  842           if (!((it == 0) || (it == Nt - 1))) {
 
  847           mult_wilson_zm1(&buf1_zm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
 
  854   if (do_comm[idir] == 1) {
 
  855     int ith, nth, site0, site1;
 
  856     set_threadtask(ith, nth, site0, site1, Nxyz2);
 
  858     for (
int ixyz = site0; ixyz < site1; ++ixyz) {
 
  859       int ibf = Nin5H * ixyz;
 
  865       int ibf_tp1 = Nin5H * (ixyz + i_p1);
 
  866       int ibf_tp2 = Nin5H * (ixyz + i_p2);
 
  870         int    site = ixyz + Nxyz2 * it;
 
  871         real_t *wp2 = &wp[Nin5 * site];
 
  874         int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
 
  875         int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
 
  877         for (
int is = 0; is < Ns; ++is) {
 
  884           mult_wilson_tp1_dirac(&buf1_tp[ibf + Nin4H * is], &wp2[Nin4 * is]);
 
  890         int    site = ixyz + Nxyz2 * it;
 
  891         real_t *wp2 = &wp[Nin5 * site];
 
  894         int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
 
  895         int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
 
  899         real_t *u2 = &up[
VLEN * 
NDF * site + NvU2 * (1 - ieo + 2 * idir)];
 
  900         for (
int is = 0; is < Ns; ++is) {
 
  907           mult_wilson_tm1_dirac(&buf1_tm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
 
  923   int *Leo, 
int *Nsize, 
int *do_comm,
 
  930   int Nst2v = Nx2v * Ny * Nz * Nt;
 
  931   int Nst2  = Nst2v * 
VLEN;
 
  934   int Nin5  = Nin4 * Ns;
 
  936   int Nin5H = Nin4H * Ns;
 
  937   int NvU2  = 
NDF * Nst2;
 
  939   int Nxy2  = Nx2v * Ny;
 
  940   int Nxyz2 = Nx2v * Ny * Nz;
 
  942   svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
 
  943   svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
 
  944   set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
 
  945   set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
 
  946   set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
 
  947   set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
 
  948   svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
 
  949   set_predicate_yp(pg1_yp, pg2_yp);
 
  950   set_predicate_ym(pg1_ym, pg2_ym);
 
  952   set_index_xp_eo(svidx_xp);
 
  953   set_index_xm_eo(svidx_xm);
 
  955   int Nskipx = (
VLENY + 1) / 2;
 
  957   int ith, nth, site0, site1;
 
  958   set_threadtask(ith, nth, site0, site1, Nst2v);
 
  964   for (
int site = site0; site < site1; ++site) {
 
  965     int ix   = site % Nx2v;
 
  966     int iyzt = site / Nx2v;
 
  968     int izt  = site / Nxy2;
 
  971     int ixy  = ix + Nx2v * iy;
 
  972     int ixyz = ixy + Nxy2 * iz;
 
  973     int jeo  = (ieo + Leo[
VLENY * iyzt]) % 2;
 
  976     int ibf_up_xp1, ibf_dn_xp1, site_xp1, site_xp1_2, ibf_up_xp1_2, ibf_dn_xp1_2;
 
  978     ibf_up_xp1 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
  980       ibf_up_xp1 = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
  982     ibf_dn_xp1 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
  984       ibf_dn_xp1 = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
  986     site_xp1 = (iyzt + 1) * Nx2v + ix;
 
  989       site_xp1_2 = iyzt * Nx2v + Nx2v - 1;
 
  991       ibf_dn_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
  993         ibf_dn_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
  995     } 
else if (ix == Nx2v - 1) {
 
  996       site_xp1_2 = (iyzt + 1) * Nx2v + 0;
 
  998       ibf_up_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * (iyzt + 1);
 
 1000         ibf_up_xp1_2 = Nskipx * 
NVC * 
ND2 * Ns * ((iyzt + 1) / 2);
 
 1007     int ibf_yp1  = 
VLENX * 
NVC * 
ND2 * Ns * ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
 
 1008     int iy_p1    = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
 
 1009     int izt_p1   = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
 
 1010     int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
 
 1012     int ibf_yp2  = 
VLENX * 
NVC * 
ND2 * Ns * ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
 
 1013     int iy_p2    = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
 
 1014     int izt_p2   = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
 
 1015     int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
 
 1017     int ibf_zp1  = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
 
 1018     int iz_p1    = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
 
 1019     int it_p1    = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
 
 1020     int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
 
 1022     int ibf_zp2  = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
 
 1023     int iz_p2    = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
 
 1024     int it_p2    = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
 
 1025     int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
 
 1027     int ibf_tp1  = Nin5H * (ixyz + i_p1);
 
 1028     int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
 
 1030     int ibf_tp2  = Nin5H * (ixyz + i_p2);
 
 1031     int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
 
 1036     real_t *wp2 = &wp[Nin5 * site];
 
 1037     real_t *vp2 = &vp[Nin5 * site];
 
 1040     if (do_comm[idir] == 1) {
 
 1041       if (ix == Nx2v - 1) {
 
 1042         int ibf_up = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
 1044           ibf_up = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
 1049         real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 0)];
 
 1050         set_index_xp_eo(svidx_xp);
 
 1052           for (
int is = 0; is < Ns; ++is) {
 
 1058             if ((!((it == 0) || (it == Nt - 1))) &&
 
 1059                 (!((iz == 0) || (iz == Nz - 1))) &&
 
 1060                 (!((iy == 0) || (iy == Ny - 1)))) {
 
 1066             mult_wilson_eo_xp2(pg1e_xp, pg2e_xp, pg3e_xp, svidx_xp,
 
 1067                                &vp2[Nin4 * is], &u[
VLEN * 
NDF * site], &wp2[Nin4 * is],
 
 1068                                &buf2_xp[ibf_up + Nskipx * 
NVC * 
ND2 * is]);
 
 1071           for (
int is = 0; is < Ns; ++is) {
 
 1076             if ((!((it == 0) || (it == Nt - 1))) &&
 
 1077                 (!((iz == 0) || (iz == Nz - 1))) &&
 
 1078                 (!((iy == 0) || (iy == Ny - 1)))) {
 
 1084             mult_wilson_eo_xp2(pg1o_xp, pg2o_xp, pg3o_xp, svidx_xp,
 
 1085                                &vp2[Nin4 * is], &u[
VLEN * 
NDF * site], &wp2[Nin4 * is],
 
 1086                                &buf2_xp[ibf_up + Nskipx * 
NVC * 
ND2 * is]);
 
 1092         int ibf_dn = Nskipx * 
NVC * 
ND2 * Ns * iyzt;
 
 1094           ibf_dn = Nskipx * 
NVC * 
ND2 * Ns * (iyzt / 2);
 
 1099         real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 0)];
 
 1101         set_index_xm_eo(svidx_xm);
 
 1103           for (
int is = 0; is < Ns; ++is) {
 
 1108             if ((!((it == 0) || (it == Nt - 1))) &&
 
 1109                 (!((iz == 0) || (iz == Nz - 1))) &&
 
 1110                 (!((iy == 0) || (iy == Ny - 1)))) {
 
 1116             mult_wilson_eo_xm2(pg1e_xm, pg2e_xm, pg3e_xm, svidx_xm,
 
 1117                                &vp2[Nin4 * is], &u[
VLEN * 
NDF * site], &wp2[Nin4 * is],
 
 1118                                &buf2_xm[ibf_dn + Nskipx * 
NVC * 
ND2 * is]);
 
 1121           for (
int is = 0; is < Ns; ++is) {
 
 1126             if ((!((it == 0) || (it == Nt - 1))) &&
 
 1127                 (!((iz == 0) || (iz == Nz - 1))) &&
 
 1128                 (!((iy == 0) || (iy == Ny - 1)))) {
 
 1134             mult_wilson_eo_xm2(pg1o_xm, pg2o_xm, pg3o_xm, svidx_xm,
 
 1135                                &vp2[Nin4 * is], &u[
VLEN * 
NDF * site], &wp2[Nin4 * is],
 
 1136                                &buf2_xm[ibf_dn + Nskipx * 
NVC * 
ND2 * is]);
 
 1143     if (do_comm[idir] == 1) {
 
 1145         int ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nx2v * izt);
 
 1149         real_t *u = &up[
NDF * Nst2 * (ieo + 2 * 1)];
 
 1150         for (
int is = 0; is < Ns; ++is) {
 
 1155           if ((!((it == 0) || (it == Nt - 1))) &&
 
 1156               (!((iz == 0) || (iz == Nz - 1)))) {
 
 1162           mult_wilson_yp2(pg1_yp, pg2_yp,
 
 1163                           &vp2[Nin4 * is], &u[
VLEN * 
NDF * site],
 
 1164                           &wp2[Nin4 * is], &buf2_yp[ibf + 
VLENX * 
NVC * 
ND2 * is]);
 
 1169         int ibf = 
VLENX * 
NVC * 
ND2 * Ns * (ix + Nx2v * izt);
 
 1172         real_t *u = &up[
NDF * Nst2 * (1 - ieo + 2 * 1)];
 
 1173         for (
int is = 0; is < Ns; ++is) {
 
 1178           if ((!((it == 0) || (it == Nt - 1))) &&
 
 1179               (!((iz == 0) || (iz == Nz - 1)))) {
 
 1186           mult_wilson_ym2(pg1_ym, pg2_ym,
 
 1187                           &vp2[Nin4 * is], &u[
VLEN * 
NDF * site],
 
 1188                           &wp2[Nin4 * is], &buf2_ym[ibf + 
VLENX * 
NVC * 
ND2 * is]);
 
 1194     if (do_comm[idir] == 1) {
 
 1196         int ibf = Nin5H * (ixy + Nxy2 * it);
 
 1199         real_t *u2 = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
 1200         for (
int is = 0; is < Ns; ++is) {
 
 1204           if (!((it == 0) || (it == Nt - 1))) {
 
 1209           mult_wilson_zp2(&vp2[Nin4 * is], u2, &buf2_zp[ibf + Nin4H * is]);
 
 1214         int ibf = Nin5H * (ixy + Nxy2 * it);
 
 1216         for (
int is = 0; is < Ns; ++is) {
 
 1220           if (!((it == 0) || (it == Nt - 1))) {
 
 1225           mult_wilson_zm2(&vp2[Nin4 * is], &buf2_zm[ibf + Nin4H * is]);
 
 1231     if (do_comm[idir] == 1) {
 
 1233         int ibf = Nin5H * ixyz;
 
 1237         real_t *u2 = &up[
VLEN * 
NDF * site + NvU2 * (ieo + 2 * idir)];
 
 1238         for (
int is = 0; is < Ns; ++is) {
 
 1245           mult_wilson_tp2_dirac(&vp2[Nin4 * is], u2, &buf2_tp[ibf + Nin4H * is]);
 
 1250         int ibf = Nin5H * ixyz;
 
 1253         for (
int is = 0; is < Ns; ++is) {
 
 1260           mult_wilson_tm2_dirac(&vp2[Nin4 * is], &buf2_tm[ibf + Nin4H * is]);
 
 1274   int *Leo, 
int *Nsize, 
int *do_comm,
 
 1283                                      Leo, Nsize, do_comm, ieo);
 
 1294   int Nx2v  = Nsize[0];
 
 1298   int Nst2v = Nx2v * Ny * Nz * Nt;
 
 1299   int Nst2  = Nst2v * 
VLEN;
 
 1302   int Nin5 = Nin4 * Ns;
 
 1305   int ith, nth, site0, site1;
 
 1306   set_threadtask(ith, nth, site0, site1, Nst2v);
 
 1308   for (
int site = site0; site < site1; ++site) {
 
 1309     real_t   *vp2 = &vp[Nin5 * site];
 
 1310     real_t   *wp2 = &wp[Nin5 * site];
 
 1313     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1314       svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
 
 1315       svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
 
 1316       svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
 
 1317       int      offset0 = 2 * 
ND * ic;
 
 1322       load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
 
 1323       load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
 
 1324       load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
 
 1325       load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
 
 1326       load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
 
 1327       load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
 
 1328       load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
 
 1329       load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
 
 1330       save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
 
 1331       save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
 
 1332       save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
 
 1333       save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
 
 1334       save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
 
 1335       save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
 
 1336       save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
 
 1337       save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
 
 1339       scal_vec(pg, y1r, e[0]);
 
 1341       scal_vec(pg, y1i, e[0]);
 
 1343       scal_vec(pg, y2r, e[0]);
 
 1345       scal_vec(pg, y2i, e[0]);
 
 1347       scal_vec(pg, y3r, e[0]);
 
 1349       scal_vec(pg, y3i, e[0]);
 
 1351       scal_vec(pg, y4r, e[0]);
 
 1353       scal_vec(pg, y4i, e[0]);
 
 1355       for (
int is = 1; is < Ns - 1; ++is) {
 
 1365         int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1371         load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1372         load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1373         load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1374         load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1375         load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1376         load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1377         load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1378         load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1382         add_aPp5_dirac_vec(pg,
 
 1383                            v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1385                            x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1386         save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1387         save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1388         save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1389         save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1390         save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1391         save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1392         save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1393         save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
 
 1394         axpy_vec(pg, y1r, e[is], v1r);
 
 1395         axpy_vec(pg, y1i, e[is], v1i);
 
 1396         axpy_vec(pg, y2r, e[is], v2r);
 
 1397         axpy_vec(pg, y2i, e[is], v2i);
 
 1398         axpy_vec(pg, y3r, e[is], v3r);
 
 1399         axpy_vec(pg, y3i, e[is], v3i);
 
 1400         axpy_vec(pg, y4r, e[is], v4r);
 
 1401         axpy_vec(pg, y4i, e[is], v4i);
 
 1413       int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1414       load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1415       load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1416       load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1417       load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1418       load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1419       load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1420       load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1421       load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1423       add_aPp5_dirac_vec(pg,
 
 1424                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1426                          x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1428       add_aPm5_dirac_vec(pg,
 
 1429                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1431                          y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
 
 1432       save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1433       save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1434       save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1435       save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1436       save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1437       save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1438       save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1439       save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
 
 1454   int Nx2v  = Nsize[0];
 
 1458   int Nst2v = Nx2v * Ny * Nz * Nt;
 
 1459   int Nst2  = Nst2v * 
VLEN;
 
 1462   int Nin5 = Nin4 * Ns;
 
 1465   int ith, nth, site0, site1;
 
 1466   set_threadtask(ith, nth, site0, site1, Nst2v);
 
 1468   for (
int site = site0; site < site1; ++site) {
 
 1469     real_t   *vp2 = &vp[Nin5 * site];
 
 1470     real_t   *wp2 = &wp[Nin5 * site];
 
 1473     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1474       svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
 
 1475       svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
 
 1476       svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
 
 1478       int offset0 = 2 * 
ND * ic + 
NVCD * (Ns - 1);
 
 1483       load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
 
 1484       load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
 
 1485       load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
 
 1486       load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
 
 1487       load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
 
 1488       load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
 
 1489       load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
 
 1490       load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
 
 1492       real_t a = dpinv[Ns - 1];
 
 1494       scal_vec(pg, v1r, a);
 
 1495       scal_vec(pg, v1i, a);
 
 1496       scal_vec(pg, v2r, a);
 
 1497       scal_vec(pg, v2i, a);
 
 1498       scal_vec(pg, v3r, a);
 
 1499       scal_vec(pg, v3i, a);
 
 1500       scal_vec(pg, v4r, a);
 
 1501       scal_vec(pg, v4i, a);
 
 1503       save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
 
 1504       save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
 
 1505       save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
 
 1506       save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
 
 1507       save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
 
 1508       save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
 
 1509       save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
 
 1510       save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
 
 1512       set_aPp5_dirac_vec(pg,
 
 1513                          y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
 
 1515                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
 
 1516       for (
int is = Ns - 2; is >= 0; --is) {
 
 1526         int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1531         load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1532         load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1533         load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1534         load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1535         load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1536         load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1537         load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1538         load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1542         add_aPm5_dirac_vec(pg,
 
 1543                            v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1545                            x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1547         axpy_vec(pg, v1r, -f[is], y1r);
 
 1548         axpy_vec(pg, v1i, -f[is], y1i);
 
 1549         axpy_vec(pg, v2r, -f[is], y2r);
 
 1550         axpy_vec(pg, v2i, -f[is], y2i);
 
 1551         axpy_vec(pg, v3r, -f[is], y3r);
 
 1552         axpy_vec(pg, v3i, -f[is], y3i);
 
 1553         axpy_vec(pg, v4r, -f[is], y4r);
 
 1554         axpy_vec(pg, v4i, -f[is], y4i);
 
 1557         scal_vec(pg, v1r, aa);
 
 1558         scal_vec(pg, v1i, aa);
 
 1559         scal_vec(pg, v2r, aa);
 
 1560         scal_vec(pg, v2i, aa);
 
 1561         scal_vec(pg, v3r, aa);
 
 1562         scal_vec(pg, v3i, aa);
 
 1563         scal_vec(pg, v4r, aa);
 
 1564         scal_vec(pg, v4i, aa);
 
 1566         save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1567         save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1568         save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1569         save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1570         save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1571         save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1572         save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1573         save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
 
 1588   int Nx2v  = Nsize[0];
 
 1592   int Nst2v = Nx2v * Ny * Nz * Nt;
 
 1593   int Nst2  = Nst2v * 
VLEN;
 
 1596   int Nin5 = Nin4 * Ns;
 
 1599   int ith, nth, site0, site1;
 
 1600   set_threadtask(ith, nth, site0, site1, Nst2v);
 
 1602   for (
int site = site0; site < site1; ++site) {
 
 1603     real_t   *vp2 = &vp[Nin5 * site];
 
 1604     real_t   *wp2 = &wp[Nin5 * site];
 
 1607     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1608       svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
 
 1609       svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
 
 1610       svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
 
 1611       int      offset0 = 2 * 
ND * ic;
 
 1616       load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
 
 1617       load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
 
 1618       load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
 
 1619       load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
 
 1620       load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
 
 1621       load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
 
 1622       load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
 
 1623       load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
 
 1626       scal_vec(pg, v1r, a);
 
 1627       scal_vec(pg, v1i, a);
 
 1628       scal_vec(pg, v2r, a);
 
 1629       scal_vec(pg, v2i, a);
 
 1630       scal_vec(pg, v3r, a);
 
 1631       scal_vec(pg, v3i, a);
 
 1632       scal_vec(pg, v4r, a);
 
 1633       scal_vec(pg, v4i, a);
 
 1635       save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
 
 1636       save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
 
 1637       save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
 
 1638       save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
 
 1639       save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
 
 1640       save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
 
 1641       save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
 
 1642       save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
 
 1644       scal_vec(pg, y1r, f[0]);
 
 1646       scal_vec(pg, y1i, f[0]);
 
 1648       scal_vec(pg, y2r, f[0]);
 
 1650       scal_vec(pg, y2i, f[0]);
 
 1652       scal_vec(pg, y3r, f[0]);
 
 1654       scal_vec(pg, y3i, f[0]);
 
 1656       scal_vec(pg, y4r, f[0]);
 
 1658       scal_vec(pg, y4i, f[0]);
 
 1660       for (
int is = 1; is < Ns - 1; ++is) {
 
 1670         int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1675         load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1676         load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1677         load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1678         load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1679         load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1680         load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1681         load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1682         load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1686         add_aPm5_dirac_vec(pg,
 
 1687                            v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1689                            x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1692         scal_vec(pg, v1r, aa);
 
 1693         scal_vec(pg, v1i, aa);
 
 1694         scal_vec(pg, v2r, aa);
 
 1695         scal_vec(pg, v2i, aa);
 
 1696         scal_vec(pg, v3r, aa);
 
 1697         scal_vec(pg, v3i, aa);
 
 1698         scal_vec(pg, v4r, aa);
 
 1699         scal_vec(pg, v4i, aa);
 
 1701         save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1702         save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1703         save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1704         save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1705         save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1706         save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1707         save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1708         save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
 
 1709         axpy_vec(pg, y1r, f[is], v1r);
 
 1710         axpy_vec(pg, y1i, f[is], v1i);
 
 1711         axpy_vec(pg, y2r, f[is], v2r);
 
 1712         axpy_vec(pg, y2i, f[is], v2i);
 
 1713         axpy_vec(pg, y3r, f[is], v3r);
 
 1714         axpy_vec(pg, y3i, f[is], v3i);
 
 1715         axpy_vec(pg, y4r, f[is], v4r);
 
 1716         axpy_vec(pg, y4i, f[is], v4i);
 
 1728       int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1729       load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1730       load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1731       load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1732       load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1733       load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1734       load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1735       load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1736       load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1737       a = 
real_t(0.5) * dm[is - 1];
 
 1738       add_aPm5_dirac_vec(pg,
 
 1739                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1741                          x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1742       add_aPp5_dirac_vec(pg,
 
 1743                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1745                          y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
 
 1747       scal_vec(pg, v1r, aa);
 
 1748       scal_vec(pg, v1i, aa);
 
 1749       scal_vec(pg, v2r, aa);
 
 1750       scal_vec(pg, v2i, aa);
 
 1751       scal_vec(pg, v3r, aa);
 
 1752       scal_vec(pg, v3i, aa);
 
 1753       scal_vec(pg, v4r, aa);
 
 1754       scal_vec(pg, v4i, aa);
 
 1756       save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1757       save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1758       save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1759       save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1760       save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1761       save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1762       save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1763       save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);
 
 1777   int Nx2v  = Nsize[0];
 
 1781   int Nst2v = Nx2v * Ny * Nz * Nt;
 
 1782   int Nst2  = Nst2v * 
VLEN;
 
 1785   int Nin5 = Nin4 * Ns;
 
 1788   int ith, nth, site0, site1;
 
 1789   set_threadtask(ith, nth, site0, site1, Nst2v);
 
 1791   for (
int site = site0; site < site1; ++site) {
 
 1792     real_t   *vp2 = &vp[Nin5 * site];
 
 1793     real_t   *wp2 = &wp[Nin5 * site];
 
 1796     for (
int ic = 0; ic < 
NC; ++ic) {
 
 1797       svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
 
 1798       svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
 
 1799       svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
 
 1801       int offset0 = 2 * 
ND * ic + 
NVCD * (Ns - 1);
 
 1806       load_vec(pg, v1r, &wp2[
VLEN * (offset0 + 0)]);
 
 1807       load_vec(pg, v1i, &wp2[
VLEN * (offset0 + 1)]);
 
 1808       load_vec(pg, v2r, &wp2[
VLEN * (offset0 + 2)]);
 
 1809       load_vec(pg, v2i, &wp2[
VLEN * (offset0 + 3)]);
 
 1810       load_vec(pg, v3r, &wp2[
VLEN * (offset0 + 4)]);
 
 1811       load_vec(pg, v3i, &wp2[
VLEN * (offset0 + 5)]);
 
 1812       load_vec(pg, v4r, &wp2[
VLEN * (offset0 + 6)]);
 
 1813       load_vec(pg, v4i, &wp2[
VLEN * (offset0 + 7)]);
 
 1815       save_vec(pg, &vp2[
VLEN * (offset0 + 0)], v1r);
 
 1816       save_vec(pg, &vp2[
VLEN * (offset0 + 1)], v1i);
 
 1817       save_vec(pg, &vp2[
VLEN * (offset0 + 2)], v2r);
 
 1818       save_vec(pg, &vp2[
VLEN * (offset0 + 3)], v2i);
 
 1819       save_vec(pg, &vp2[
VLEN * (offset0 + 4)], v3r);
 
 1820       save_vec(pg, &vp2[
VLEN * (offset0 + 5)], v3i);
 
 1821       save_vec(pg, &vp2[
VLEN * (offset0 + 6)], v4r);
 
 1822       save_vec(pg, &vp2[
VLEN * (offset0 + 7)], v4i);
 
 1824       set_aPm5_dirac_vec(pg,
 
 1825                          y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
 
 1827                          v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
 
 1829       for (
int is = Ns - 2; is >= 0; --is) {
 
 1839         int offset = 2 * 
ND * ic + 
NVCD * is;
 
 1844         load_vec(pg, v1r, &wp2[
VLEN * (offset + 0)]);
 
 1845         load_vec(pg, v1i, &wp2[
VLEN * (offset + 1)]);
 
 1846         load_vec(pg, v2r, &wp2[
VLEN * (offset + 2)]);
 
 1847         load_vec(pg, v2i, &wp2[
VLEN * (offset + 3)]);
 
 1848         load_vec(pg, v3r, &wp2[
VLEN * (offset + 4)]);
 
 1849         load_vec(pg, v3i, &wp2[
VLEN * (offset + 5)]);
 
 1850         load_vec(pg, v4r, &wp2[
VLEN * (offset + 6)]);
 
 1851         load_vec(pg, v4i, &wp2[
VLEN * (offset + 7)]);
 
 1855         add_aPp5_dirac_vec(pg,
 
 1856                            v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
 
 1858                            x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
 
 1860         axpy_vec(pg, v1r, -e[is], y1r);
 
 1861         axpy_vec(pg, v1i, -e[is], y1i);
 
 1862         axpy_vec(pg, v2r, -e[is], y2r);
 
 1863         axpy_vec(pg, v2i, -e[is], y2i);
 
 1864         axpy_vec(pg, v3r, -e[is], y3r);
 
 1865         axpy_vec(pg, v3i, -e[is], y3i);
 
 1866         axpy_vec(pg, v4r, -e[is], y4r);
 
 1867         axpy_vec(pg, v4i, -e[is], y4i);
 
 1869         save_vec(pg, &vp2[
VLEN * (offset + 0)], v1r);
 
 1870         save_vec(pg, &vp2[
VLEN * (offset + 1)], v1i);
 
 1871         save_vec(pg, &vp2[
VLEN * (offset + 2)], v2r);
 
 1872         save_vec(pg, &vp2[
VLEN * (offset + 3)], v2i);
 
 1873         save_vec(pg, &vp2[
VLEN * (offset + 4)], v3r);
 
 1874         save_vec(pg, &vp2[
VLEN * (offset + 5)], v3i);
 
 1875         save_vec(pg, &vp2[
VLEN * (offset + 6)], v4r);
 
 1876         save_vec(pg, &vp2[
VLEN * (offset + 7)], v4i);