10 #ifndef MULT_WILSON_PARTS_QXS_2_H
11 #define MULT_WILSON_PARTS_QXS_2_H
// check_setup: takes no arguments and returns nothing.
// NOTE(review): the body is not visible here; presumably it validates
// compile-time/runtime configuration for the kernels below -- confirm.
15 inline void check_setup()
// mult_wilson_xp1: first stage of the xp1/xp2 pair.
//   buf : output buffer (its use is not visible in the lines shown here)
//   v1  : input data read through load_vec1
// NOTE(review): presumably packs the part of v1 needed by mult_wilson_xp2
// into buf -- confirm against the full body.
28 template<
typename REALTYPE>
29 inline void mult_wilson_xp1(REALTYPE *buf, REALTYPE *v1)
// Fill vt from v1 via load_vec1 with arguments 0 and NVCD
// (presumably an in-vector index and an element count -- confirm).
33 load_vec1(vt, v1, 0,
NVCD);
// mult_wilson_xp2: second stage of the xp1/xp2 pair.
//   v2  : destination, written per colour through set_sp4_xp
//   u   : link-field pointer (NOTE(review): the code that fills `ut` from
//         `u` is not visible here -- confirm)
//   buf : buffer read in two halves, at offsets 0 and NVC
39 template<
typename REALTYPE>
40 inline void mult_wilson_xp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
// Build vt1 / vt2 from the two halves of buf using the backward shift helper.
43 shift_vec1_bw(vt1, &buf[0],
NVC);
44 shift_vec1_bw(vt2, &buf[
NVC],
NVC);
// For each colour index ic: apply mult_uv to both vectors and hand the two
// results to set_sp4_xp.
50 for (
int ic = 0; ic <
NC; ++ic) {
// Offset into v2 for this colour: ND * 2 entries per colour index.
51 int ic2 =
ND * 2 * ic;
52 mult_uv(wt1, &ut[2 * ic], vt1,
NC);
53 mult_uv(wt2, &ut[2 * ic], vt2,
NC);
54 set_sp4_xp(&v2[ic2], wt1, wt2);
// mult_wilson_xpb: single-pass xp kernel; reads v1 directly instead of going
// through a buffer (NOTE(review): the "b" suffix presumably means "bulk",
// i.e. no boundary exchange -- confirm).
//   v2 : destination, written through set_sp4_xp
//   u  : link-field data consumed by load_u
//   v1 : source data consumed by set_sp2_xp
60 template<
typename REALTYPE>
61 inline void mult_wilson_xpb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
// Two sets of six vector registers, vt1* and vt2*, filled in pairs below.
65 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
66 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_xp call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
68 set_sp2_xp(pg, vt10, vt11, vt20, vt21, v1, 0);
69 set_sp2_xp(pg, vt12, vt13, vt22, vt23, v1, 1);
70 set_sp2_xp(pg, vt14, vt15, vt24, vt25, v1, 2);
72 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers, multiply both vt sets
// by them, and pass the results to set_sp4_xp for index ic.
75 for (
int ic = 0; ic <
NC; ++ic) {
76 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
// wt1r/wt1i and wt2r/wt2i: presumably real/imaginary parts -- confirm.
78 mult_uv(pg, wt1r, wt1i,
79 ut10, ut11, ut12, ut13, ut14, ut15,
80 vt10, vt11, vt12, vt13, vt14, vt15);
81 mult_uv(pg, wt2r, wt2i,
82 ut10, ut11, ut12, ut13, ut14, ut15,
83 vt20, vt21, vt22, vt23, vt24, vt25);
84 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_xm1: first stage of the xm1/xm2 pair.
//   buf : output buffer, written through save_vec1
//   u   : link-field pointer (NOTE(review): the code that fills `ut` from
//         `u` is not visible here -- confirm)
//   v1  : source data consumed by set_sp2_xm
90 template<
typename REALTYPE>
91 inline void mult_wilson_xm1(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
// Derive the two vectors vt1 / vt2 from v1.
94 set_sp2_xm(vt1, vt2, v1);
// For each colour index ic: mult_udagv writes into slots 2*ic of wt1 / wt2.
// NOTE(review): ic2 is defined on a line not visible here.
100 for (
int ic = 0; ic <
NC; ++ic) {
102 mult_udagv(&wt1[2 * ic], &ut[ic2], vt1,
NC);
103 mult_udagv(&wt2[2 * ic], &ut[ic2], vt2,
NC);
// Store the result into buf via save_vec1 with arguments VLEN - 1 and NVC
// (presumably the last in-vector lane and an element count -- confirm).
// NOTE(review): only the wt1 store is visible; a matching wt2 store is
// presumably on an omitted line.
106 for (
int ic = 0; ic <
NC; ++ic) {
107 save_vec1(&buf[0], wt1,
VLEN - 1,
NVC);
// mult_wilson_xm2: second stage of the xm1/xm2 pair.
//   v2  : destination, written per colour through set_sp4_xm
//   buf : buffer read at offsets 2*ic and 2*ic + NVC
114 template<
typename REALTYPE>
115 inline void mult_wilson_xm2(
Vsimd_t *v2, REALTYPE *buf)
// For each colour index ic: build wt1 / wt2 from buf with the forward shift
// helper, then pass them to set_sp4_xm.
118 for (
int ic = 0; ic <
NC; ++ic) {
// Offset into v2 for this colour: ND * 2 entries per colour index.
119 int ic2 =
ND * 2 * ic;
// Last argument 2: presumably the (real, imaginary) pair of one colour
// component -- confirm.
120 shift_vec1_fw(wt1, &buf[2 * ic], 2);
121 shift_vec1_fw(wt2, &buf[2 * ic +
NVC], 2);
122 set_sp4_xm(&v2[ic2], wt1, wt2);
// mult_wilson_xmb: single-pass xm kernel; reads v1 directly instead of going
// through a buffer (NOTE(review): the "b" suffix presumably means "bulk",
// i.e. no boundary exchange -- confirm).
//   v2 : destination, written through set_sp4_xm
//   u  : link-field data consumed by load_udag
//   v1 : source data consumed by set_sp2_xm
128 template<
typename REALTYPE>
129 inline void mult_wilson_xmb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
// Two sets of six vector registers, vt1* and vt2*, filled in pairs below.
133 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
134 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_xm call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
136 set_sp2_xm(pg, vt10, vt11, vt20, vt21, v1, 0);
137 set_sp2_xm(pg, vt12, vt13, vt22, vt23, v1, 1);
138 set_sp2_xm(pg, vt14, vt15, vt24, vt25, v1, 2);
140 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers with load_udag, multiply
// both vt sets with mult_udv, and pass the results to set_sp4_xm.
143 for (
int ic = 0; ic <
NC; ++ic) {
144 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
// wt1r/wt1i and wt2r/wt2i: presumably real/imaginary parts -- confirm.
146 mult_udv(pg, wt1r, wt1i,
147 ut10, ut11, ut12, ut13, ut14, ut15,
148 vt10, vt11, vt12, vt13, vt14, vt15);
149 mult_udv(pg, wt2r, wt2i,
150 ut10, ut11, ut12, ut13, ut14, ut15,
151 vt20, vt21, vt22, vt23, vt24, vt25);
152 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_yp1: first stage of the yp1/yp2 pair; stores the output of
// set_sp2_yp into buf for every colour index.
//   buf : output buffer, written in blocks of VLEN elements
//   v1  : source data consumed by set_sp2_yp
// buf layout (in units of VLEN): slots 2*ic and 2*ic+1 hold vt1r / vt1i;
// the same slots offset by NVC hold vt2r / vt2i.
158 template<
typename REALTYPE>
159 inline void mult_wilson_yp1(REALTYPE *buf, REALTYPE *v1)
163 for (
int ic = 0; ic <
NC; ++ic) {
// vt1r/vt1i and vt2r/vt2i: presumably real/imaginary parts -- confirm.
165 set_sp2_yp(pg, vt1r, vt1i, vt2r, vt2i, v1, ic);
// First vector at slots 2*ic and 2*ic+1.
166 save_vec(pg, &buf[
VLEN * (2 * ic)], vt1r);
167 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], vt1i);
// Second vector at the same slots, offset by NVC.
168 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], vt2r);
169 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], vt2i);
// mult_wilson_yp2: second stage of the yp1/yp2 pair; reloads two six-register
// sets from buf, multiplies them by the link data and writes to v2.
//   v2  : destination, written through set_sp4_yp
//   u   : link-field data, read at &u[VLEN * (2 * ic)]
//   buf : buffer read in blocks of VLEN elements; slots 0..5 and
//         NVC..NVC+5 match the layout mult_wilson_yp1 writes for three
//         colour indices
175 template<
typename REALTYPE>
176 inline void mult_wilson_yp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
180 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
181 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// First vector: slots 0..5 of buf.
183 load_vec(pg, vt10, &buf[
VLEN * 0]);
184 load_vec(pg, vt11, &buf[
VLEN * 1]);
185 load_vec(pg, vt12, &buf[
VLEN * 2]);
186 load_vec(pg, vt13, &buf[
VLEN * 3]);
187 load_vec(pg, vt14, &buf[
VLEN * 4]);
188 load_vec(pg, vt15, &buf[
VLEN * 5]);
// Second vector: the same six slots, offset by NVC.
190 load_vec(pg, vt20, &buf[
VLEN * (0 +
NVC)]);
191 load_vec(pg, vt21, &buf[
VLEN * (1 +
NVC)]);
192 load_vec(pg, vt22, &buf[
VLEN * (2 +
NVC)]);
193 load_vec(pg, vt23, &buf[
VLEN * (3 +
NVC)]);
194 load_vec(pg, vt24, &buf[
VLEN * (4 +
NVC)]);
195 load_vec(pg, vt25, &buf[
VLEN * (5 +
NVC)]);
197 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers starting at slot 2*ic of
// u, multiply both vt sets, and pass the results to set_sp4_yp.
200 for (
int ic = 0; ic <
NC; ++ic) {
201 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
202 &u[
VLEN * (2 * ic)]);
203 mult_uv(pg, wt1r, wt1i,
204 ut10, ut11, ut12, ut13, ut14, ut15,
205 vt10, vt11, vt12, vt13, vt14, vt15);
206 mult_uv(pg, wt2r, wt2i,
207 ut10, ut11, ut12, ut13, ut14, ut15,
208 vt20, vt21, vt22, vt23, vt24, vt25);
209 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_ypb (Vsimd_t* destination overload): single-pass yp kernel;
// reads v1 directly instead of going through a buffer.
//   v2 : destination, written through set_sp4_yp
//   u  : link-field data, read at &u[VLEN * (2 * ic)]
//   v1 : source data consumed by set_sp2_yp
215 template<
typename REALTYPE>
216 inline void mult_wilson_ypb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
220 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
221 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_yp call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
223 set_sp2_yp(pg, vt10, vt11, vt20, vt21, v1, 0);
224 set_sp2_yp(pg, vt12, vt13, vt22, vt23, v1, 1);
225 set_sp2_yp(pg, vt14, vt15, vt24, vt25, v1, 2);
227 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers starting at slot 2*ic of
// u, multiply both vt sets, and pass the results to set_sp4_yp.
230 for (
int ic = 0; ic <
NC; ++ic) {
231 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
232 &u[
VLEN * (2 * ic)]);
233 mult_uv(pg, wt1r, wt1i,
234 ut10, ut11, ut12, ut13, ut14, ut15,
235 vt10, vt11, vt12, vt13, vt14, vt15);
236 mult_uv(pg, wt2r, wt2i,
237 ut10, ut11, ut12, ut13, ut14, ut15,
238 vt20, vt21, vt22, vt23, vt24, vt25);
239 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_ypb (REALTYPE* destination overload): same visible call
// sequence as the Vsimd_t* overload; only the type of v2 differs.
//   v2 : destination, written through set_sp4_yp
//   u  : link-field data, read at &u[VLEN * (2 * ic)]
//   v1 : source data consumed by set_sp2_yp
245 template<
typename REALTYPE>
246 inline void mult_wilson_ypb(REALTYPE *v2, REALTYPE *u, REALTYPE *v1)
250 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
251 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_yp call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
253 set_sp2_yp(pg, vt10, vt11, vt20, vt21, v1, 0);
254 set_sp2_yp(pg, vt12, vt13, vt22, vt23, v1, 1);
255 set_sp2_yp(pg, vt14, vt15, vt24, vt25, v1, 2);
257 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers starting at slot 2*ic of
// u, multiply both vt sets, and pass the results to set_sp4_yp.
260 for (
int ic = 0; ic <
NC; ++ic) {
261 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
262 &u[
VLEN * (2 * ic)]);
263 mult_uv(pg, wt1r, wt1i,
264 ut10, ut11, ut12, ut13, ut14, ut15,
265 vt10, vt11, vt12, vt13, vt14, vt15);
266 mult_uv(pg, wt2r, wt2i,
267 ut10, ut11, ut12, ut13, ut14, ut15,
268 vt20, vt21, vt22, vt23, vt24, vt25);
269 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_ym1: first stage of the ym1/ym2 pair; multiplies the output of
// set_sp2_ym with mult_udv and stores the results into buf.
//   buf : output buffer, written in blocks of VLEN elements
//   u   : link-field data consumed by load_udag
//   v1  : source data consumed by set_sp2_ym
// buf layout (in units of VLEN): slots 2*ic and 2*ic+1 hold wt1r / wt1i;
// the same slots offset by NVC hold wt2r / wt2i.
275 template<
typename REALTYPE>
276 inline void mult_wilson_ym1(REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
280 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
281 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_ym call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
283 set_sp2_ym(pg, vt10, vt11, vt20, vt21, v1, 0);
284 set_sp2_ym(pg, vt12, vt13, vt22, vt23, v1, 1);
285 set_sp2_ym(pg, vt14, vt15, vt24, vt25, v1, 2);
287 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers with load_udag, multiply
// both vt sets with mult_udv, and save the four results to buf.
290 for (
int ic = 0; ic <
NC; ++ic) {
291 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
293 mult_udv(pg, wt1r, wt1i,
294 ut10, ut11, ut12, ut13, ut14, ut15,
295 vt10, vt11, vt12, vt13, vt14, vt15);
297 mult_udv(pg, wt2r, wt2i,
298 ut10, ut11, ut12, ut13, ut14, ut15,
299 vt20, vt21, vt22, vt23, vt24, vt25);
// First result at slots 2*ic and 2*ic+1.
301 save_vec(pg, &buf[
VLEN * (2 * ic)], wt1r);
302 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], wt1i);
// Second result at the same slots, offset by NVC.
304 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], wt2r);
305 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], wt2i);
// mult_wilson_ym2: second stage of the ym1/ym2 pair; reloads the four
// per-colour vectors from buf and passes them to set_sp4_ym.
//   v2  : destination, written through set_sp4_ym
//   buf : buffer read in blocks of VLEN elements, using the same slot layout
//         that mult_wilson_ym1 writes
311 template<
typename REALTYPE>
312 inline void mult_wilson_ym2(
Vsimd_t *v2, REALTYPE *buf)
316 for (
int ic = 0; ic <
NC; ++ic) {
// First vector from slots 2*ic and 2*ic+1.
318 load_vec(pg, wt1r, &buf[
VLEN * (2 * ic)]);
319 load_vec(pg, wt1i, &buf[
VLEN * (2 * ic + 1)]);
// Second vector from the same slots, offset by NVC.
320 load_vec(pg, wt2r, &buf[
VLEN * (2 * ic +
NVC)]);
321 load_vec(pg, wt2i, &buf[
VLEN * (2 * ic + 1 +
NVC)]);
322 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_ymb: single-pass ym kernel; reads v1 directly instead of going
// through a buffer (NOTE(review): the "b" suffix presumably means "bulk",
// i.e. no boundary exchange -- confirm).
//   v2 : destination, written through set_sp4_ym
//   u  : link-field data consumed by load_udag
//   v1 : source data consumed by set_sp2_ym
328 template<
typename REALTYPE>
329 inline void mult_wilson_ymb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
333 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
334 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One set_sp2_ym call per colour index 0, 1, 2. The explicit calls hard-code
// three colours, while the loop below is bounded by NC.
336 set_sp2_ym(pg, vt10, vt11, vt20, vt21, v1, 0);
337 set_sp2_ym(pg, vt12, vt13, vt22, vt23, v1, 1);
338 set_sp2_ym(pg, vt14, vt15, vt24, vt25, v1, 2);
340 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
// For each colour index ic: load six link registers with load_udag, multiply
// both vt sets with mult_udv, and pass the results to set_sp4_ym.
343 for (
int ic = 0; ic <
NC; ++ic) {
344 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
346 mult_udv(pg, wt1r, wt1i,
347 ut10, ut11, ut12, ut13, ut14, ut15,
348 vt10, vt11, vt12, vt13, vt14, vt15);
349 mult_udv(pg, wt2r, wt2i,
350 ut10, ut11, ut12, ut13, ut14, ut15,
351 vt20, vt21, vt22, vt23, vt24, vt25);
352 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);