9 #ifndef MULT_WILSON_PARTS_QXS_H 
   10 #define MULT_WILSON_PARTS_QXS_H 
   26   template<
typename REALTYPE>
 
   27   inline void mult_wilson_xp1(REALTYPE *buf, REALTYPE *v1)
 
   31     load_vec1_x(vt, v1, 0, 
NVCD);
 
   37   template<
typename REALTYPE>
 
   38   inline void mult_wilson_xp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
 
   41     shift_vec1_xbw(vt1, &buf[0], 
NVC);
 
   48     for (
int ic = 0; ic < 
NC; ++ic) {
 
   49       int ic2 = 
ND * 2 * ic;
 
   50       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
   51       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
   52       set_sp4_xp(&v2[ic2], wt1, wt2);
 
   58   template<
typename REALTYPE>
 
   59   inline void mult_wilson_xpb(
Vsimd_t *v2,
 
   60                               REALTYPE *u, REALTYPE *v1)
 
   63     set_sp2_xp(vt1, vt2, v1);
 
   69     for (
int ic = 0; ic < 
NC; ++ic) {
 
   70       int ic2 = 
ND * 2 * ic;
 
   71       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
   72       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
   73       set_sp4_xp(&v2[ic2], wt1, wt2);
 
   79   template<
typename REALTYPE>
 
   80   inline void mult_wilson_xm1(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
 
   83     set_sp2_xm(vt1, vt2, v1);
 
   89     for (
int ic = 0; ic < 
NC; ++ic) {
 
   91       mult_udagv(&wt1[2 * ic], &ut[ic2], vt1, 
NC);
 
   92       mult_udagv(&wt2[2 * ic], &ut[ic2], vt2, 
NC);
 
   95     for (
int ic = 0; ic < 
NC; ++ic) {
 
  103   template<
typename REALTYPE>
 
  104   inline void mult_wilson_xm2(
Vsimd_t *v2, REALTYPE *buf)
 
  107     for (
int ic = 0; ic < 
NC; ++ic) {
 
  108       int ic2 = 
ND * 2 * ic;
 
  109       shift_vec1_xfw(wt1, &buf[
VLENY * (2 * ic)], 2);
 
  110       shift_vec1_xfw(wt2, &buf[
VLENY * (2 * ic + 
NVC)], 2);
 
  111       set_sp4_xm(&v2[ic2], wt1, wt2);
 
  117   template<
typename REALTYPE>
 
  118   inline void mult_wilson_xmb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  121     set_sp2_xm(vt1, vt2, v1);
 
  124     load_vec(ut, u, 
NDF);
 
  127     for (
int ic = 0; ic < 
NC; ++ic) {
 
  129       int ic3 = 
ND * 2 * ic;
 
  130       mult_udagv(wt1, &ut[ic2], vt1, 
NC);
 
  131       mult_udagv(wt2, &ut[ic2], vt2, 
NC);
 
  132       set_sp4_xm(&v2[ic3], wt1, wt2);
 
  138   template<
typename REALTYPE>
 
  139   inline void mult_wilson_yp1(REALTYPE *buf, REALTYPE *v1)
 
  143     load_vec1_y(vt, v1, 0, 
NVCD);
 
  144     set_sp2_yp1(buf, vt);
 
  149   template<
typename REALTYPE>
 
  150   inline void mult_wilson_yp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
 
  153     shift_vec1_ybw(vt1, &buf[0], 
NVC);
 
  157     load_vec(ut, u, 
NDF);
 
  160     for (
int ic = 0; ic < 
NC; ++ic) {
 
  161       int ic2 = 
ND * 2 * ic;
 
  162       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  163       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  164       set_sp4_yp(&v2[ic2], wt1, wt2);
 
  170   template<
typename REALTYPE>
 
  171   inline void mult_wilson_ypb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  174     set_sp2_yp(vt1, vt2, v1);
 
  177     load_vec(ut, u, 
NDF);
 
  180     for (
int ic = 0; ic < 
NC; ++ic) {
 
  181       int ic2 = 
ND * 2 * ic;
 
  182       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  183       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  184       set_sp4_yp(&v2[ic2], wt1, wt2);
 
  190   template<
typename REALTYPE>
 
  191   inline void mult_wilson_ym1(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
 
  194     set_sp2_ym(vt1, vt2, v1);
 
  197     load_vec(ut, u, 
NDF);
 
  200     for (
int ic = 0; ic < 
NC; ++ic) {
 
  202       mult_udagv(&wt1[2 * ic], &ut[ic2], vt1, 
NC);
 
  203       mult_udagv(&wt2[2 * ic], &ut[ic2], vt2, 
NC);
 
  206     for (
int ic = 0; ic < 
NC; ++ic) {
 
  214   template<
typename REALTYPE>
 
  215   inline void mult_wilson_ym2(
Vsimd_t *v2, REALTYPE *buf)
 
  218     for (
int ic = 0; ic < 
NC; ++ic) {
 
  219       int ic2 = 
ND * 2 * ic;
 
  220       shift_vec1_yfw(wt1, &buf[
VLENX * (2 * ic)], 2);
 
  221       shift_vec1_yfw(wt2, &buf[
VLENX * (2 * ic + 
NVC)], 2);
 
  222       set_sp4_ym(&v2[ic2], wt1, wt2);
 
  228   template<
typename REALTYPE>
 
  229   inline void mult_wilson_ymb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  232     set_sp2_ym(vt1, vt2, v1);
 
  235     load_vec(ut, u, 
NDF);
 
  238     for (
int ic = 0; ic < 
NC; ++ic) {
 
  240       int ic3 = 
ND * 2 * ic;
 
  241       mult_udagv(wt1, &ut[ic2], vt1, 
NC);
 
  242       mult_udagv(wt2, &ut[ic2], vt2, 
NC);
 
  243       set_sp4_ym(&v2[ic3], wt1, wt2);
 
  249   template<
typename REALTYPE>
 
  250   inline void mult_wilson_zp1(REALTYPE *buf, REALTYPE *v1)
 
  253     set_sp2_zp(vt1, vt2, v1);
 
  255     save_vec(&buf[0], vt1, 
NVC);
 
  261   template<
typename REALTYPE>
 
  262   inline void mult_wilson_zp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
 
  265     load_vec(vt1, &buf[0], 
NVC);
 
  269     load_vec(ut, u, 
NDF);
 
  272     for (
int ic = 0; ic < 
NC; ++ic) {
 
  273       int ic2 = 
ND * 2 * ic;
 
  274       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  275       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  276       set_sp4_zp(&v2[ic2], wt1, wt2);
 
  282   template<
typename REALTYPE>
 
  283   inline void mult_wilson_zpb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  286     set_sp2_zp(vt1, vt2, v1);
 
  289     load_vec(ut, u, 
NDF);
 
  292     for (
int ic = 0; ic < 
NC; ++ic) {
 
  293       int ic2 = 
ND * 2 * ic;
 
  294       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  295       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  296       set_sp4_zp(&v2[ic2], wt1, wt2);
 
  302   template<
typename REALTYPE>
 
  303   inline void mult_wilson_zm1(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
 
  306     set_sp2_zm(vt1, vt2, v1);
 
  309     load_vec(ut, u, 
NDF);
 
  312     for (
int ic = 0; ic < 
NC; ++ic) {
 
  314       mult_udagv(&wt1[2 * ic], &ut[ic2], vt1, 
NC);
 
  315       mult_udagv(&wt2[2 * ic], &ut[ic2], vt2, 
NC);
 
  318     save_vec(&buf[0], wt1, 
NVC);
 
  324   template<
typename REALTYPE>
 
  325   inline void mult_wilson_zm2(
Vsimd_t *v2, REALTYPE *buf)
 
  328     for (
int ic = 0; ic < 
NC; ++ic) {
 
  329       int ic2 = 
ND * 2 * ic;
 
  330       load_vec(wt1, &buf[
VLEN * 2 * ic], 2);
 
  331       load_vec(wt2, &buf[
VLEN * 2 * (ic + 
NC)], 2);
 
  332       set_sp4_zm(&v2[ic2], wt1, wt2);
 
  338   template<
typename REALTYPE>
 
  339   inline void mult_wilson_zmb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  342     set_sp2_zm(vt1, vt2, v1);
 
  345     load_vec(ut, u, 
NDF);
 
  348     for (
int ic = 0; ic < 
NC; ++ic) {
 
  350       int ic3 = 
ND * 2 * ic;
 
  351       mult_udagv(wt1, &ut[ic2], vt1, 
NC);
 
  352       mult_udagv(wt2, &ut[ic2], vt2, 
NC);
 
  353       set_sp4_zm(&v2[ic3], wt1, wt2);
 
  359   template<
typename REALTYPE>
 
  360   inline void mult_wilson_tp1_dirac(REALTYPE *buf, REALTYPE *v1)
 
  363     set_sp2_tp_dirac(vt1, vt2, v1);
 
  365     save_vec(&buf[0], vt1, 
NVC);
 
  371   template<
typename REALTYPE>
 
  372   inline void mult_wilson_tp2_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
 
  375     load_vec(vt1, &buf[0], 
NVC);
 
  379     load_vec(ut, u, 
NDF);
 
  382     for (
int ic = 0; ic < 
NC; ++ic) {
 
  383       int ic2 = 
ND * 2 * ic;
 
  384       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  385       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  386       set_sp4_tp_dirac(&v2[ic2], wt1, wt2);
 
  392   template<
typename REALTYPE>
 
  393   inline void mult_wilson_tpb_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  396     set_sp2_tp_dirac(vt1, vt2, v1);
 
  399     load_vec(ut, u, 
NDF);
 
  402     for (
int ic = 0; ic < 
NC; ++ic) {
 
  403       int ic2 = 
ND * 2 * ic;
 
  404       mult_uv(wt1, &ut[2 * ic], vt1, 
NC);
 
  405       mult_uv(wt2, &ut[2 * ic], vt2, 
NC);
 
  406       set_sp4_tp_dirac(&v2[ic2], wt1, wt2);
 
  412   template<
typename REALTYPE>
 
  413   inline void mult_wilson_tm1_dirac(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
 
  416     set_sp2_tm_dirac(vt1, vt2, v1);
 
  419     load_vec(ut, u, 
NDF);
 
  422     for (
int ic = 0; ic < 
NC; ++ic) {
 
  424       mult_udagv(&wt1[2 * ic], &ut[ic2], vt1, 
NC);
 
  425       mult_udagv(&wt2[2 * ic], &ut[ic2], vt2, 
NC);
 
  428     save_vec(&buf[0], wt1, 
NVC);
 
  434   template<
typename REALTYPE>
 
  435   inline void mult_wilson_tm2_dirac(
Vsimd_t *v2, REALTYPE *buf)
 
  438     for (
int ic = 0; ic < 
NC; ++ic) {
 
  439       int ic2 = 
ND * 2 * ic;
 
  440       load_vec(wt1, &buf[
VLEN * 2 * ic], 2);
 
  441       load_vec(wt2, &buf[
VLEN * 2 * (ic + 
NC)], 2);
 
  442       set_sp4_tm_dirac(&v2[ic2], wt1, wt2);
 
  448   template<
typename REALTYPE>
 
  449   inline void mult_wilson_tmb_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
 
  452     set_sp2_tm_dirac(vt1, vt2, v1);
 
  455     load_vec(ut, u, 
NDF);
 
  458     for (
int ic = 0; ic < 
NC; ++ic) {
 
  460       int ic3 = 
ND * 2 * ic;
 
  461       mult_udagv(wt1, &ut[ic2], vt1, 
NC);
 
  462       mult_udagv(wt2, &ut[ic2], vt2, 
NC);
 
  463       set_sp4_tm_dirac(&v2[ic3], wt1, wt2);