10 #ifndef MULT_WILSON_PARTS_QXS_H
11 #define MULT_WILSON_PARTS_QXS_H
15 template<
typename REALTYPE>
17 REALTYPE *__restrict buf,
18 REALTYPE *__restrict v1)
20 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
23 for (
int ic = 0; ic <
NC; ++ic) {
24 int icr =
ND * 2 * ic;
25 int ici =
ND * 2 * ic + 1;
27 load_vec(pg2, w1r, &v1[
VLEN * (icr +
ID1)]);
28 load_vec(pg2, w1i, &v1[
VLEN * (ici +
ID1)]);
29 load_vec(pg2, w2r, &v1[
VLEN * (icr +
ID2)]);
30 load_vec(pg2, w2i, &v1[
VLEN * (ici +
ID2)]);
31 load_vec(pg2, w3r, &v1[
VLEN * (icr +
ID3)]);
32 load_vec(pg2, w3i, &v1[
VLEN * (ici +
ID3)]);
33 load_vec(pg2, w4r, &v1[
VLEN * (icr +
ID4)]);
34 load_vec(pg2, w4i, &v1[
VLEN * (ici +
ID4)]);
36 add_vec(pg2, v1r, w1r, w4i);
37 sub_vec(pg2, v1i, w1i, w4r);
38 add_vec(pg2, v2r, w2r, w3i);
39 sub_vec(pg2, v2i, w2i, w3r);
41 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic)], v1r, svidx);
42 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic + 1)], v1i, svidx);
43 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic +
NVC)], v2r, svidx);
44 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic + 1 +
NVC)], v2i, svidx);
50 template<
typename REALTYPE>
54 REALTYPE *v, REALTYPE *buf,
svint_t& index,
int ic)
56 int icr =
ND * 2 * ic;
57 int ici =
ND * 2 * ic + 1;
59 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
60 load_vec(pg1, w1r, &v[
VLEN * (icr +
ID1) + 1]);
61 load_vec(pg1, w1i, &v[
VLEN * (ici +
ID1) + 1]);
62 load_vec(pg1, w2r, &v[
VLEN * (icr +
ID2) + 1]);
63 load_vec(pg1, w2i, &v[
VLEN * (ici +
ID2) + 1]);
64 load_vec(pg1, w3r, &v[
VLEN * (icr +
ID3) + 1]);
65 load_vec(pg1, w3i, &v[
VLEN * (ici +
ID3) + 1]);
66 load_vec(pg1, w4r, &v[
VLEN * (icr +
ID4) + 1]);
67 load_vec(pg1, w4i, &v[
VLEN * (ici +
ID4) + 1]);
69 add_vec(pg1, vt1r, w1r, w4i);
70 sub_vec(pg1, vt1i, w1i, w4r);
71 add_vec(pg1, vt2r, w2r, w3i);
72 sub_vec(pg1, vt2i, w2i, w3r);
74 load_add_gather(pg2, vt1r, &buf[
VLENY * (2 * ic)], index);
75 load_add_gather(pg2, vt1i, &buf[
VLENY * (2 * ic + 1)], index);
76 load_add_gather(pg2, vt2r, &buf[
VLENY * (2 * ic +
NVC)], index);
77 load_add_gather(pg2, vt2i, &buf[
VLENY * (2 * ic + 1 +
NVC)], index);
82 template<
typename REALTYPE>
85 REALTYPE *v1, REALTYPE *buf)
89 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
90 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
92 set_sp2_xp2(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
93 set_sp2_xp2(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
94 set_sp2_xp2(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
96 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
99 for (
int ic = 0; ic <
NC; ++ic) {
100 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
101 &u[
VLEN * (2 * ic)]);
102 mult_uv(pg, wt1r, wt1i,
103 ut10, ut11, ut12, ut13, ut14, ut15,
104 vt10, vt11, vt12, vt13, vt14, vt15);
105 mult_uv(pg, wt2r, wt2i,
106 ut10, ut11, ut12, ut13, ut14, ut15,
107 vt20, vt21, vt22, vt23, vt24, vt25);
108 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
114 template<
typename REALTYPE>
116 REALTYPE *__restrict v2,
117 REALTYPE *__restrict u,
118 REALTYPE *__restrict v1,
119 REALTYPE *__restrict buf)
123 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
124 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
126 set_sp2_xp2(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
127 set_sp2_xp2(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
128 set_sp2_xp2(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
130 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
133 for (
int ic = 0; ic <
NC; ++ic) {
134 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
135 &u[
VLEN * (2 * ic)]);
136 mult_uv(pg, wt1r, wt1i,
137 ut10, ut11, ut12, ut13, ut14, ut15,
138 vt10, vt11, vt12, vt13, vt14, vt15);
139 mult_uv(pg, wt2r, wt2i,
140 ut10, ut11, ut12, ut13, ut14, ut15,
141 vt20, vt21, vt22, vt23, vt24, vt25);
142 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
148 template<
typename REALTYPE>
152 REALTYPE *v, REALTYPE *vn,
int ic)
154 int icr =
ND * 2 * ic;
155 int ici =
ND * 2 * ic + 1;
156 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
158 shift_vec_xbw(pg1, pg2, w1r, &v[
VLEN * (icr +
ID1)],
160 shift_vec_xbw(pg1, pg2, w1i, &v[
VLEN * (ici +
ID1)],
163 shift_vec_xbw(pg1, pg2, w2r, &v[
VLEN * (icr +
ID2)],
165 shift_vec_xbw(pg1, pg2, w2i, &v[
VLEN * (ici +
ID2)],
168 shift_vec_xbw(pg1, pg2, w3r, &v[
VLEN * (icr +
ID3)],
170 shift_vec_xbw(pg1, pg2, w3i, &v[
VLEN * (ici +
ID3)],
173 shift_vec_xbw(pg1, pg2, w4r, &v[
VLEN * (icr +
ID4)],
175 shift_vec_xbw(pg1, pg2, w4i, &v[
VLEN * (ici +
ID4)],
178 add_vec(pg, vt1r, w1r, w4i);
179 sub_vec(pg, vt1i, w1i, w4r);
180 add_vec(pg, vt2r, w2r, w3i);
181 sub_vec(pg, vt2i, w2i, w3r);
186 template<
typename REALTYPE>
188 REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
192 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
193 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
195 set_sp2_xp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
196 set_sp2_xp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
197 set_sp2_xp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
199 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
202 for (
int ic = 0; ic <
NC; ++ic) {
203 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[
VLEN * (2 * ic)]);
204 mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
205 vt10, vt11, vt12, vt13, vt14, vt15);
206 mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
207 vt20, vt21, vt22, vt23, vt24, vt25);
208 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
214 template<
typename REALTYPE>
216 REALTYPE *__restrict v2,
217 REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
221 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
222 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
224 set_sp2_xp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
225 set_sp2_xp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
226 set_sp2_xp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
229 for (
int ic = 0; ic <
NC; ++ic) {
230 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
232 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[
VLEN * (2 * ic)]);
233 mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
234 vt10, vt11, vt12, vt13, vt14, vt15);
235 mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
236 vt20, vt21, vt22, vt23, vt24, vt25);
237 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// set_sp2_xm1: reads the four components ID1..ID4 of colour index `ic` from
// `vx` and combines them pairwise into vt1r, vt1i, vt2r, vt2i.
//
//   pg : svbool_t mask forwarded unchanged to every load_vec / add_vec /
//        sub_vec call below (presumably the active-lane predicate).
//   vx : source array. Component k is read at vx[VLEN * (icr + IDk)] and
//        vx[VLEN * (ici + IDk)], where icr = ND * 2 * ic and ici = icr + 1.
//   ic : colour index selecting the ND * 2 * ic slot.
//
// The `r` / `i` suffixes presumably denote real and imaginary parts.
// NOTE(review): vt1r, vt1i, vt2r, vt2i are not declared in the lines visible
// here; they are presumably reference output parameters -- confirm against
// the full signature.
243 template<
typename REALTYPE>
244 inline void set_sp2_xm1(
svbool_t& pg,
247 REALTYPE *vx,
int ic)
// Slot indices of the first (icr) and second (ici) entry for this colour.
249 int icr =
ND * 2 * ic;
250 int ici =
ND * 2 * ic + 1;
251 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// Component ID1.
253 load_vec(pg, w1r, &vx[
VLEN * (icr +
ID1)]);
254 load_vec(pg, w1i, &vx[
VLEN * (ici +
ID1)]);
// Component ID2.
256 load_vec(pg, w2r, &vx[
VLEN * (icr +
ID2)]);
257 load_vec(pg, w2i, &vx[
VLEN * (ici +
ID2)]);
// Component ID3.
259 load_vec(pg, w3r, &vx[
VLEN * (icr +
ID3)]);
260 load_vec(pg, w3i, &vx[
VLEN * (ici +
ID3)]);
// Component ID4.
262 load_vec(pg, w4r, &vx[
VLEN * (icr +
ID4)]);
263 load_vec(pg, w4i, &vx[
VLEN * (ici +
ID4)]);
// Pair component 1 with 4 and component 2 with 3, crossing r and i.
// Assuming add_vec/sub_vec(pg, out, a, b) compute out = a +/- b (TODO confirm):
//   vt1r = w1r - w4i,  vt1i = w1i + w4r,
//   vt2r = w2r - w3i,  vt2i = w2i + w3r.
265 sub_vec(pg, vt1r, w1r, w4i);
266 add_vec(pg, vt1i, w1i, w4r);
267 sub_vec(pg, vt2r, w2r, w3i);
268 add_vec(pg, vt2i, w2i, w3r);
273 template<
typename REALTYPE>
275 REALTYPE *__restrict buf,
276 REALTYPE *__restrict u,
277 REALTYPE *__restrict v1)
281 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
282 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
284 set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
285 set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
286 set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
288 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
291 for (
int ic = 0; ic <
NC; ++ic) {
292 load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
295 mult_udv(pg2, wt1r, wt1i,
296 ut10, ut11, ut12, ut13, ut14, ut15,
297 vt10, vt11, vt12, vt13, vt14, vt15);
298 mult_udv(pg2, wt2r, wt2i,
299 ut10, ut11, ut12, ut13, ut14, ut15,
300 vt20, vt21, vt22, vt23, vt24, vt25);
302 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic)], wt1r, svidx);
303 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic + 1)], wt1i, svidx);
304 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic +
NVC)], wt2r, svidx);
305 save_vec_scatter(pg2, &buf[
VLENY * (2 * ic + 1 +
NVC)], wt2i, svidx);
// set_sp2_xm2: same pairing of components ID1..ID4 as set_sp2_xm1, but every
// load address is shifted back by one element (the trailing "- 1" in each
// index expression).
//
//   pg : svbool_t mask forwarded unchanged to every load_vec / add_vec /
//        sub_vec call below (presumably the active-lane predicate).
//   vx : source array. Component k is read at vx[VLEN * (icr + IDk) - 1] and
//        vx[VLEN * (ici + IDk) - 1], where icr = ND * 2 * ic, ici = icr + 1.
//   ic : colour index selecting the ND * 2 * ic slot.
//
// NOTE(review): the "- 1" makes the first load start one element before the
// addressed slot; which lanes are actually touched depends on `pg`, which is
// chosen by the caller -- verify the caller's mask keeps the access in bounds.
// NOTE(review): vt1r, vt1i, vt2r, vt2i are not declared in the lines visible
// here; they are presumably reference output parameters -- confirm.
311 template<
typename REALTYPE>
312 inline void set_sp2_xm2(
svbool_t& pg,
315 REALTYPE *vx,
int ic)
// Slot indices of the first (icr) and second (ici) entry for this colour.
317 int icr =
ND * 2 * ic;
318 int ici =
ND * 2 * ic + 1;
319 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// Eight loads, one per (component, r/i) pair, each shifted by -1 element.
321 load_vec(pg, w1r, &vx[
VLEN * (icr +
ID1) - 1]);
322 load_vec(pg, w1i, &vx[
VLEN * (ici +
ID1) - 1]);
323 load_vec(pg, w2r, &vx[
VLEN * (icr +
ID2) - 1]);
324 load_vec(pg, w2i, &vx[
VLEN * (ici +
ID2) - 1]);
325 load_vec(pg, w3r, &vx[
VLEN * (icr +
ID3) - 1]);
326 load_vec(pg, w3i, &vx[
VLEN * (ici +
ID3) - 1]);
327 load_vec(pg, w4r, &vx[
VLEN * (icr +
ID4) - 1]);
328 load_vec(pg, w4i, &vx[
VLEN * (ici +
ID4) - 1]);
// Pair component 1 with 4 and component 2 with 3, crossing r and i.
// Assuming add_vec/sub_vec(pg, out, a, b) compute out = a +/- b (TODO confirm):
//   vt1r = w1r - w4i,  vt1i = w1i + w4r,
//   vt2r = w2r - w3i,  vt2i = w2i + w3r.
330 sub_vec(pg, vt1r, w1r, w4i);
331 add_vec(pg, vt1i, w1i, w4r);
332 sub_vec(pg, vt2r, w2r, w3i);
333 add_vec(pg, vt2i, w2i, w3r);
338 template<
typename REALTYPE>
341 REALTYPE *v1, REALTYPE *buf)
345 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
346 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
348 set_sp2_xm2(pg1, vt10, vt11, vt20, vt21, v1, 0);
349 set_sp2_xm2(pg1, vt12, vt13, vt22, vt23, v1, 1);
350 set_sp2_xm2(pg1, vt14, vt15, vt24, vt25, v1, 2);
352 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
355 for (
int ic = 0; ic <
NC; ++ic) {
356 load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
358 mult_udv(pg1, wt1r, wt1i,
359 ut10, ut11, ut12, ut13, ut14, ut15,
360 vt10, vt11, vt12, vt13, vt14, vt15);
361 mult_udv(pg1, wt2r, wt2i,
362 ut10, ut11, ut12, ut13, ut14, ut15,
363 vt20, vt21, vt22, vt23, vt24, vt25);
365 load_add_gather(pg2, wt1r, &buf[
VLENY * (2 * ic)], svidx);
366 load_add_gather(pg2, wt1i, &buf[
VLENY * (2 * ic + 1)], svidx);
367 load_add_gather(pg2, wt2r, &buf[
VLENY * (2 * ic +
NVC)], svidx);
368 load_add_gather(pg2, wt2i, &buf[
VLENY * (2 * ic + 1 +
NVC)], svidx);
370 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
376 template<
typename REALTYPE>
378 REALTYPE *__restrict v2,
379 REALTYPE *__restrict u,
380 REALTYPE *__restrict v1,
381 REALTYPE *__restrict buf)
385 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
386 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
388 set_sp2_xm2(pg1, vt10, vt11, vt20, vt21, v1, 0);
389 set_sp2_xm2(pg1, vt12, vt13, vt22, vt23, v1, 1);
390 set_sp2_xm2(pg1, vt14, vt15, vt24, vt25, v1, 2);
392 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
395 for (
int ic = 0; ic <
NC; ++ic) {
396 load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
398 mult_udv(pg1, wt1r, wt1i,
399 ut10, ut11, ut12, ut13, ut14, ut15,
400 vt10, vt11, vt12, vt13, vt14, vt15);
401 mult_udv(pg1, wt2r, wt2i,
402 ut10, ut11, ut12, ut13, ut14, ut15,
403 vt20, vt21, vt22, vt23, vt24, vt25);
405 load_add_gather(pg2, wt1r, &buf[
VLENY * (2 * ic)], svidx);
406 load_add_gather(pg2, wt1i, &buf[
VLENY * (2 * ic + 1)], svidx);
407 load_add_gather(pg2, wt2r, &buf[
VLENY * (2 * ic +
NVC)], svidx);
408 load_add_gather(pg2, wt2i, &buf[
VLENY * (2 * ic + 1 +
NVC)], svidx);
410 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
416 template<
typename REALTYPE>
420 REALTYPE *vx, REALTYPE *vn,
int ic)
422 int icr =
ND * 2 * ic;
423 int ici =
ND * 2 * ic + 1;
424 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
426 shift_vec_xfw(pg1, pg2, w1r, &vx[
VLEN * (icr +
ID1)],
428 shift_vec_xfw(pg1, pg2, w1i, &vx[
VLEN * (ici +
ID1)],
431 shift_vec_xfw(pg1, pg2, w2r, &vx[
VLEN * (icr +
ID2)],
433 shift_vec_xfw(pg1, pg2, w2i, &vx[
VLEN * (ici +
ID2)],
436 shift_vec_xfw(pg1, pg2, w3r, &vx[
VLEN * (icr +
ID3)],
438 shift_vec_xfw(pg1, pg2, w3i, &vx[
VLEN * (ici +
ID3)],
441 shift_vec_xfw(pg1, pg2, w4r, &vx[
VLEN * (icr +
ID4)],
443 shift_vec_xfw(pg1, pg2, w4i, &vx[
VLEN * (ici +
ID4)],
446 sub_vec(pg, vt1r, w1r, w4i);
447 add_vec(pg, vt1i, w1i, w4r);
448 sub_vec(pg, vt2r, w2r, w3i);
449 add_vec(pg, vt2i, w2i, w3r);
454 template<
typename REALTYPE>
456 REALTYPE *u, REALTYPE *un,
457 REALTYPE *v1, REALTYPE *v1n)
461 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
462 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
464 set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
465 set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
466 set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
468 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
471 for (
int ic = 0; ic <
NC; ++ic) {
472 load_udag_xm(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
475 mult_udv(pg, wt1r, wt1i,
476 ut10, ut11, ut12, ut13, ut14, ut15,
477 vt10, vt11, vt12, vt13, vt14, vt15);
478 mult_udv(pg, wt2r, wt2i,
479 ut10, ut11, ut12, ut13, ut14, ut15,
480 vt20, vt21, vt22, vt23, vt24, vt25);
481 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
487 template<
typename REALTYPE>
489 REALTYPE *__restrict *v2,
490 REALTYPE *u, REALTYPE *un,
491 REALTYPE *v1, REALTYPE *v1n)
495 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
496 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
498 set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
499 set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
500 set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
502 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
505 for (
int ic = 0; ic <
NC; ++ic) {
506 load_udag_xm(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
509 mult_udv(pg, wt1r, wt1i,
510 ut10, ut11, ut12, ut13, ut14, ut15,
511 vt10, vt11, vt12, vt13, vt14, vt15);
512 mult_udv(pg, wt2r, wt2i,
513 ut10, ut11, ut12, ut13, ut14, ut15,
514 vt20, vt21, vt22, vt23, vt24, vt25);
515 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
521 template<
typename REALTYPE>
524 REALTYPE *v1, REALTYPE *v1n)
528 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
529 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
531 set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
532 set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
533 set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
535 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
538 for (
int ic = 0; ic <
NC; ++ic) {
539 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
542 mult_udv(pg, wt1r, wt1i,
543 ut10, ut11, ut12, ut13, ut14, ut15,
544 vt10, vt11, vt12, vt13, vt14, vt15);
545 mult_udv(pg, wt2r, wt2i,
546 ut10, ut11, ut12, ut13, ut14, ut15,
547 vt20, vt21, vt22, vt23, vt24, vt25);
548 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_yp1: for each colour index ic in [0, NC), calls set_sp2_yp on
// `v1` and stores the four resulting vectors into `buf`.
//
//   pg2 : svbool_t mask forwarded to set_sp2_yp and to every save_vec call
//         (presumably selects the lanes that are written).
//   buf : destination buffer (declared __restrict). For colour ic the four
//         vectors are stored at VLENX * (2*ic), VLENX * (2*ic + 1),
//         VLENX * (2*ic + NVC) and VLENX * (2*ic + 1 + NVC).
//   v1  : source field passed straight to set_sp2_yp (declared __restrict).
//
// NOTE(review): the declarations of vt1r, vt1i, vt2r, vt2i are not visible in
// this listing; presumably they are svreal_t locals declared inside the loop.
554 template<
typename REALTYPE>
555 inline void mult_wilson_yp1(
svbool_t& pg2,
556 REALTYPE *__restrict buf,
557 REALTYPE *__restrict v1)
561 for (
int ic = 0; ic <
NC; ++ic) {
// Fill vt1r/vt1i/vt2r/vt2i for this colour from v1.
563 set_sp2_yp(pg2, vt1r, vt1i, vt2r, vt2i, v1, ic);
// vt1* go to slots 2*ic and 2*ic+1; vt2* go to the same slots shifted by NVC.
565 save_vec(pg2, &buf[
VLENX * (2 * ic)], vt1r);
566 save_vec(pg2, &buf[
VLENX * (2 * ic + 1)], vt1i);
567 save_vec(pg2, &buf[
VLENX * (2 * ic +
NVC)], vt2r);
568 save_vec(pg2, &buf[
VLENX * (2 * ic + 1 +
NVC)], vt2i);
574 template<
typename REALTYPE>
577 REALTYPE *v1, REALTYPE *buf)
581 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
582 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
584 set_sp2_yp(pg1, vt10, vt11, vt20, vt21, &v1[
VLENX], 0);
585 set_sp2_yp(pg1, vt12, vt13, vt22, vt23, &v1[
VLENX], 1);
586 set_sp2_yp(pg1, vt14, vt15, vt24, vt25, &v1[
VLENX], 2);
590 load_add(pg2, vt10, &buf[offset +
VLENX * (2 * ic)]);
591 load_add(pg2, vt11, &buf[offset +
VLENX * (2 * ic + 1)]);
592 load_add(pg2, vt20, &buf[offset +
VLENX * (2 * ic +
NVC)]);
593 load_add(pg2, vt21, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
595 load_add(pg2, vt12, &buf[offset +
VLENX * (2 * ic)]);
596 load_add(pg2, vt13, &buf[offset +
VLENX * (2 * ic + 1)]);
597 load_add(pg2, vt22, &buf[offset +
VLENX * (2 * ic +
NVC)]);
598 load_add(pg2, vt23, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
600 load_add(pg2, vt14, &buf[offset +
VLENX * (2 * ic)]);
601 load_add(pg2, vt15, &buf[offset +
VLENX * (2 * ic + 1)]);
602 load_add(pg2, vt24, &buf[offset +
VLENX * (2 * ic +
NVC)]);
603 load_add(pg2, vt25, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
605 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
608 for (
int ic = 0; ic <
NC; ++ic) {
609 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
610 &u[
VLEN * (2 * ic)]);
611 mult_uv(pg, wt1r, wt1i,
612 ut10, ut11, ut12, ut13, ut14, ut15,
613 vt10, vt11, vt12, vt13, vt14, vt15);
614 mult_uv(pg, wt2r, wt2i,
615 ut10, ut11, ut12, ut13, ut14, ut15,
616 vt20, vt21, vt22, vt23, vt24, vt25);
617 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
623 template<
typename REALTYPE>
625 REALTYPE *__restrict v2,
626 REALTYPE *__restrict u,
627 REALTYPE *__restrict v1,
628 REALTYPE *__restrict buf)
632 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
633 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
635 set_sp2_yp(pg1, vt10, vt11, vt20, vt21, &v1[
VLENX], 0);
636 set_sp2_yp(pg1, vt12, vt13, vt22, vt23, &v1[
VLENX], 1);
637 set_sp2_yp(pg1, vt14, vt15, vt24, vt25, &v1[
VLENX], 2);
641 load_add(pg2, vt10, &buf[offset +
VLENX * (2 * ic)]);
642 load_add(pg2, vt11, &buf[offset +
VLENX * (2 * ic + 1)]);
643 load_add(pg2, vt20, &buf[offset +
VLENX * (2 * ic +
NVC)]);
644 load_add(pg2, vt21, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
646 load_add(pg2, vt12, &buf[offset +
VLENX * (2 * ic)]);
647 load_add(pg2, vt13, &buf[offset +
VLENX * (2 * ic + 1)]);
648 load_add(pg2, vt22, &buf[offset +
VLENX * (2 * ic +
NVC)]);
649 load_add(pg2, vt23, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
651 load_add(pg2, vt14, &buf[offset +
VLENX * (2 * ic)]);
652 load_add(pg2, vt15, &buf[offset +
VLENX * (2 * ic + 1)]);
653 load_add(pg2, vt24, &buf[offset +
VLENX * (2 * ic +
NVC)]);
654 load_add(pg2, vt25, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)]);
656 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
659 for (
int ic = 0; ic <
NC; ++ic) {
660 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
661 &u[
VLEN * (2 * ic)]);
662 mult_uv(pg, wt1r, wt1i,
663 ut10, ut11, ut12, ut13, ut14, ut15,
664 vt10, vt11, vt12, vt13, vt14, vt15);
665 mult_uv(pg, wt2r, wt2i,
666 ut10, ut11, ut12, ut13, ut14, ut15,
667 vt20, vt21, vt22, vt23, vt24, vt25);
668 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
674 template<
typename REALTYPE>
678 REALTYPE *v, REALTYPE *vn,
int ic)
680 int icr =
ND * 2 * ic;
681 int ici =
ND * 2 * ic + 1;
682 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
685 shift_vec_ybw(pg1, pg2, w1r, &v[
VLEN * (icr +
ID1)],
687 shift_vec_ybw(pg1, pg2, w1i, &v[
VLEN * (ici +
ID1)],
690 shift_vec_ybw(pg1, pg2, w2r, &v[
VLEN * (icr +
ID2)],
692 shift_vec_ybw(pg1, pg2, w2i, &v[
VLEN * (ici +
ID2)],
695 shift_vec_ybw(pg1, pg2, w3r, &v[
VLEN * (icr +
ID3)],
697 shift_vec_ybw(pg1, pg2, w3i, &v[
VLEN * (ici +
ID3)],
700 shift_vec_ybw(pg1, pg2, w4r, &v[
VLEN * (icr +
ID4)],
702 shift_vec_ybw(pg1, pg2, w4i, &v[
VLEN * (ici +
ID4)],
705 load_vec(pg, w1r, &vn[
VLEN * (icr +
ID1)]);
706 load_vec(pg, w1i, &vn[
VLEN * (ici +
ID1)]);
708 load_vec(pg, w2r, &vn[
VLEN * (icr +
ID2)]);
709 load_vec(pg, w2i, &vn[
VLEN * (ici +
ID2)]);
711 load_vec(pg, w3r, &vn[
VLEN * (icr +
ID3)]);
712 load_vec(pg, w3i, &vn[
VLEN * (ici +
ID3)]);
714 load_vec(pg, w4r, &vn[
VLEN * (icr +
ID4)]);
715 load_vec(pg, w4i, &vn[
VLEN * (ici +
ID4)]);
718 sub_vec(pg, vt1r, w1r, w4r);
719 sub_vec(pg, vt1i, w1i, w4i);
720 add_vec(pg, vt2r, w2r, w3r);
721 add_vec(pg, vt2i, w2i, w3i);
726 template<
typename REALTYPE>
730 REALTYPE *v, REALTYPE *vn,
int ic)
732 int icr =
ND * 2 * ic;
733 int ici =
ND * 2 * ic + 1;
734 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
760 shift_vec_ybw(w1r, &v[
VLEN * (icr +
ID1)],
762 shift_vec_ybw(w1i, &v[
VLEN * (ici +
ID1)],
765 shift_vec_ybw(w2r, &v[
VLEN * (icr +
ID2)],
767 shift_vec_ybw(w2i, &v[
VLEN * (ici +
ID2)],
770 shift_vec_ybw(w3r, &v[
VLEN * (icr +
ID3)],
772 shift_vec_ybw(w3i, &v[
VLEN * (ici +
ID3)],
775 shift_vec_ybw(w4r, &v[
VLEN * (icr +
ID4)],
777 shift_vec_ybw(w4i, &v[
VLEN * (ici +
ID4)],
780 load_vec(pg, w1r, &vn[
VLEN * (icr +
ID1)]);
781 load_vec(pg, w1i, &vn[
VLEN * (ici +
ID1)]);
783 load_vec(pg, w2r, &vn[
VLEN * (icr +
ID2)]);
784 load_vec(pg, w2i, &vn[
VLEN * (ici +
ID2)]);
786 load_vec(pg, w3r, &vn[
VLEN * (icr +
ID3)]);
787 load_vec(pg, w3i, &vn[
VLEN * (ici +
ID3)]);
789 load_vec(pg, w4r, &vn[
VLEN * (icr +
ID4)]);
790 load_vec(pg, w4i, &vn[
VLEN * (ici +
ID4)]);
793 sub_vec(pg, vt1r, w1r, w4r);
794 sub_vec(pg, vt1i, w1i, w4i);
795 add_vec(pg, vt2r, w2r, w3r);
796 add_vec(pg, vt2i, w2i, w3i);
801 template<
typename REALTYPE>
804 REALTYPE *v1, REALTYPE *v1n)
808 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
809 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
811 set_sp2_yp(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
812 set_sp2_yp(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
813 set_sp2_yp(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
815 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
818 for (
int ic = 0; ic <
NC; ++ic) {
819 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
820 &u[
VLEN * (2 * ic)]);
821 mult_uv(pg, wt1r, wt1i,
822 ut10, ut11, ut12, ut13, ut14, ut15,
823 vt10, vt11, vt12, vt13, vt14, vt15);
824 mult_uv(pg, wt2r, wt2i,
825 ut10, ut11, ut12, ut13, ut14, ut15,
826 vt20, vt21, vt22, vt23, vt24, vt25);
827 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
833 template<
typename REALTYPE>
836 REALTYPE *v1, REALTYPE *v1n)
840 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
841 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
843 set_sp2_yp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
844 set_sp2_yp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
845 set_sp2_yp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
847 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
850 for (
int ic = 0; ic <
NC; ++ic) {
851 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
852 &u[
VLEN * (2 * ic)]);
853 mult_uv(pg, wt1r, wt1i,
854 ut10, ut11, ut12, ut13, ut14, ut15,
855 vt10, vt11, vt12, vt13, vt14, vt15);
856 mult_uv(pg, wt2r, wt2i,
857 ut10, ut11, ut12, ut13, ut14, ut15,
858 vt20, vt21, vt22, vt23, vt24, vt25);
859 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
865 template<
typename REALTYPE>
867 REALTYPE *__restrict v2,
868 REALTYPE *__restrict u,
869 REALTYPE *v1, REALTYPE *v1n)
874 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
875 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
877 set_sp2_yp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
878 set_sp2_yp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
879 set_sp2_yp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
882 for (
int ic = 0; ic <
NC; ++ic) {
883 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
886 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
887 &u[
VLEN * (2 * ic)]);
888 mult_uv(pg, wt1r, wt1i,
889 ut10, ut11, ut12, ut13, ut14, ut15,
890 vt10, vt11, vt12, vt13, vt14, vt15);
891 mult_uv(pg, wt2r, wt2i,
892 ut10, ut11, ut12, ut13, ut14, ut15,
893 vt20, vt21, vt22, vt23, vt24, vt25);
894 set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_ym1: calls set_sp2_ym on `v1` for colour indices 0, 1 and 2,
// then for each ic in [0, NC) loads six vectors with load_udag, applies
// mult_udv to both sets of six vt vectors, and stores the four results into
// `buf`.
//
//   pg2 : svbool_t mask forwarded to every helper called here.
//   buf : destination buffer (declared __restrict); written at
//         offset + VLENX * (2*ic), (2*ic + 1), (2*ic + NVC), (2*ic + 1 + NVC).
//   u   : declared __restrict. NOTE(review): the trailing arguments of the
//         load_udag call are missing from this listing, so how `u` is indexed
//         cannot be confirmed from here.
//   v1  : source field passed to set_sp2_ym (declared __restrict).
//
// NOTE(review): `offset`, wt1r, wt1i, wt2r and wt2i are used below but their
// declarations are not visible in this listing -- confirm against the full
// file.
900 template<
typename REALTYPE>
901 inline void mult_wilson_ym1(
svbool_t& pg2,
902 REALTYPE *__restrict buf,
903 REALTYPE *__restrict u,
904 REALTYPE *__restrict v1)
// vt1x holds the first result pair and vt2x the second result pair of
// set_sp2_ym; indices 0/1, 2/3, 4/5 belong to colours 0, 1, 2.
908 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
909 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
910 set_sp2_ym(pg2, vt10, vt11, vt20, vt21, v1, 0);
911 set_sp2_ym(pg2, vt12, vt13, vt22, vt23, v1, 1);
912 set_sp2_ym(pg2, vt14, vt15, vt24, vt25, v1, 2);
// Six vectors refilled by load_udag on every loop iteration.
914 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
917 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut vectors are applied to both vt sets.
918 load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
920 mult_udv(pg2, wt1r, wt1i,
921 ut10, ut11, ut12, ut13, ut14, ut15,
922 vt10, vt11, vt12, vt13, vt14, vt15);
923 mult_udv(pg2, wt2r, wt2i,
924 ut10, ut11, ut12, ut13, ut14, ut15,
925 vt20, vt21, vt22, vt23, vt24, vt25);
// wt1* go to slots 2*ic and 2*ic+1; wt2* go to the same slots shifted by NVC.
929 save_vec(pg2, &buf[offset +
VLENX * (2 * ic)], wt1r);
930 save_vec(pg2, &buf[offset +
VLENX * (2 * ic + 1)], wt1i);
931 save_vec(pg2, &buf[offset +
VLENX * (2 * ic +
NVC)], wt2r);
932 save_vec(pg2, &buf[offset +
VLENX * (2 * ic + 1 +
NVC)], wt2i);
938 template<
typename REALTYPE>
941 REALTYPE *v1, REALTYPE *buf)
945 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
946 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
948 set_sp2_ym(pg1, vt10, vt11, vt20, vt21, &v1[-
VLENX], 0);
949 set_sp2_ym(pg1, vt12, vt13, vt22, vt23, &v1[-
VLENX], 1);
950 set_sp2_ym(pg1, vt14, vt15, vt24, vt25, &v1[-
VLENX], 2);
952 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
955 for (
int ic = 0; ic <
NC; ++ic) {
956 load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
958 mult_udv(pg1, wt1r, wt1i,
959 ut10, ut11, ut12, ut13, ut14, ut15,
960 vt10, vt11, vt12, vt13, vt14, vt15);
961 mult_udv(pg1, wt2r, wt2i,
962 ut10, ut11, ut12, ut13, ut14, ut15,
963 vt20, vt21, vt22, vt23, vt24, vt25);
965 load_add(pg2, wt1r, &buf[
VLENX * (2 * ic)]);
966 load_add(pg2, wt1i, &buf[
VLENX * (2 * ic + 1)]);
967 load_add(pg2, wt2r, &buf[
VLENX * (2 * ic +
NVC)]);
968 load_add(pg2, wt2i, &buf[
VLENX * (2 * ic + 1 +
NVC)]);
970 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
976 template<
typename REALTYPE>
978 REALTYPE *__restrict v2,
979 REALTYPE *__restrict u,
980 REALTYPE *__restrict v1,
981 REALTYPE *__restrict buf)
985 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
986 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
988 set_sp2_ym(pg1, vt10, vt11, vt20, vt21, &v1[-
VLENX], 0);
989 set_sp2_ym(pg1, vt12, vt13, vt22, vt23, &v1[-
VLENX], 1);
990 set_sp2_ym(pg1, vt14, vt15, vt24, vt25, &v1[-
VLENX], 2);
992 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
995 for (
int ic = 0; ic <
NC; ++ic) {
996 load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
998 mult_udv(pg1, wt1r, wt1i,
999 ut10, ut11, ut12, ut13, ut14, ut15,
1000 vt10, vt11, vt12, vt13, vt14, vt15);
1001 mult_udv(pg1, wt2r, wt2i,
1002 ut10, ut11, ut12, ut13, ut14, ut15,
1003 vt20, vt21, vt22, vt23, vt24, vt25);
1005 load_add(pg2, wt1r, &buf[
VLENX * (2 * ic)]);
1006 load_add(pg2, wt1i, &buf[
VLENX * (2 * ic + 1)]);
1007 load_add(pg2, wt2r, &buf[
VLENX * (2 * ic +
NVC)]);
1008 load_add(pg2, wt2i, &buf[
VLENX * (2 * ic + 1 +
NVC)]);
1010 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1016 template<
typename REALTYPE>
1020 REALTYPE *vx, REALTYPE *vn,
int ic)
1022 int icr =
ND * 2 * ic;
1023 int ici =
ND * 2 * ic + 1;
1024 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
1027 shift_vec_yfw(pg1, pg2, w1r, &vx[
VLEN * (icr +
ID1)],
1029 shift_vec_yfw(pg1, pg2, w1i, &vx[
VLEN * (ici +
ID1)],
1032 shift_vec_yfw(pg1, pg2, w2r, &vx[
VLEN * (icr +
ID2)],
1034 shift_vec_yfw(pg1, pg2, w2i, &vx[
VLEN * (ici +
ID2)],
1037 shift_vec_yfw(pg1, pg2, w3r, &vx[
VLEN * (icr +
ID3)],
1039 shift_vec_yfw(pg1, pg2, w3i, &vx[
VLEN * (ici +
ID3)],
1042 shift_vec_yfw(pg1, pg2, w4r, &vx[
VLEN * (icr +
ID4)],
1044 shift_vec_yfw(pg1, pg2, w4i, &vx[
VLEN * (ici +
ID4)],
1047 load_vec(pg, w1r, &vn[
VLEN * (icr +
ID1)]);
1048 load_vec(pg, w1i, &vn[
VLEN * (ici +
ID1)]);
1050 load_vec(pg, w2r, &vn[
VLEN * (icr +
ID2)]);
1051 load_vec(pg, w2i, &vn[
VLEN * (ici +
ID2)]);
1053 load_vec(pg, w3r, &vn[
VLEN * (icr +
ID3)]);
1054 load_vec(pg, w3i, &vn[
VLEN * (ici +
ID3)]);
1056 load_vec(pg, w4r, &vn[
VLEN * (icr +
ID4)]);
1057 load_vec(pg, w4i, &vn[
VLEN * (ici +
ID4)]);
1059 add_vec(pg, vt1r, w1r, w4r);
1060 add_vec(pg, vt1i, w1i, w4i);
1061 sub_vec(pg, vt2r, w2r, w3r);
1062 sub_vec(pg, vt2i, w2i, w3i);
1067 template<
typename REALTYPE>
1069 Vsimd_t *v2, REALTYPE *u, REALTYPE *un,
1070 REALTYPE *v1, REALTYPE *v1n)
1074 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1075 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1077 set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1078 set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1079 set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1081 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1084 for (
int ic = 0; ic <
NC; ++ic) {
1085 load_udag_ym(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
1087 mult_udv(pg, wt1r, wt1i,
1088 ut10, ut11, ut12, ut13, ut14, ut15,
1089 vt10, vt11, vt12, vt13, vt14, vt15);
1090 mult_udv(pg, wt2r, wt2i,
1091 ut10, ut11, ut12, ut13, ut14, ut15,
1092 vt20, vt21, vt22, vt23, vt24, vt25);
1093 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1099 template<
typename REALTYPE>
1103 REALTYPE *vx, REALTYPE *vn,
int ic)
1105 int icr =
ND * 2 * ic;
1106 int ici =
ND * 2 * ic + 1;
1107 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
1110 shift_vec_yfw(w1r, &vx[
VLEN * (icr +
ID1)],
1112 shift_vec_yfw(w1i, &vx[
VLEN * (ici +
ID1)],
1115 shift_vec_yfw(w2r, &vx[
VLEN * (icr +
ID2)],
1117 shift_vec_yfw(w2i, &vx[
VLEN * (ici +
ID2)],
1120 shift_vec_yfw(w3r, &vx[
VLEN * (icr +
ID3)],
1122 shift_vec_yfw(w3i, &vx[
VLEN * (ici +
ID3)],
1125 shift_vec_yfw(w4r, &vx[
VLEN * (icr +
ID4)],
1127 shift_vec_yfw(w4i, &vx[
VLEN * (ici +
ID4)],
1152 load_vec(pg, w1r, &vn[
VLEN * (icr +
ID1)]);
1153 load_vec(pg, w1i, &vn[
VLEN * (ici +
ID1)]);
1155 load_vec(pg, w2r, &vn[
VLEN * (icr +
ID2)]);
1156 load_vec(pg, w2i, &vn[
VLEN * (ici +
ID2)]);
1158 load_vec(pg, w3r, &vn[
VLEN * (icr +
ID3)]);
1159 load_vec(pg, w3i, &vn[
VLEN * (ici +
ID3)]);
1161 load_vec(pg, w4r, &vn[
VLEN * (icr +
ID4)]);
1162 load_vec(pg, w4i, &vn[
VLEN * (ici +
ID4)]);
1164 add_vec(pg, vt1r, w1r, w4r);
1165 add_vec(pg, vt1i, w1i, w4i);
1166 sub_vec(pg, vt2r, w2r, w3r);
1167 sub_vec(pg, vt2i, w2i, w3i);
1172 template<
typename REALTYPE>
1174 Vsimd_t *v2, REALTYPE *u, REALTYPE *un,
1175 REALTYPE *v1, REALTYPE *v1n)
1179 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1180 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1182 set_sp2_ym(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
1183 set_sp2_ym(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
1184 set_sp2_ym(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
1186 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1189 for (
int ic = 0; ic <
NC; ++ic) {
1190 load_udag_ym(ut10, ut11, ut12, ut13, ut14, ut15,
1192 mult_udv(pg, wt1r, wt1i,
1193 ut10, ut11, ut12, ut13, ut14, ut15,
1194 vt10, vt11, vt12, vt13, vt14, vt15);
1195 mult_udv(pg, wt2r, wt2i,
1196 ut10, ut11, ut12, ut13, ut14, ut15,
1197 vt20, vt21, vt22, vt23, vt24, vt25);
1198 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1204 template<
typename REALTYPE>
1206 REALTYPE *__restrict v2,
1207 REALTYPE *u, REALTYPE *un,
1208 REALTYPE *v1, REALTYPE *v1n)
1212 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1213 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1215 set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1216 set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1217 set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1219 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1222 for (
int ic = 0; ic <
NC; ++ic) {
1223 load_udag_ym(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
1225 mult_udv(pg, wt1r, wt1i,
1226 ut10, ut11, ut12, ut13, ut14, ut15,
1227 vt10, vt11, vt12, vt13, vt14, vt15);
1228 mult_udv(pg, wt2r, wt2i,
1229 ut10, ut11, ut12, ut13, ut14, ut15,
1230 vt20, vt21, vt22, vt23, vt24, vt25);
1231 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1237 template<
typename REALTYPE>
1240 REALTYPE *v1, REALTYPE *v1n)
1244 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1245 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1247 set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1248 set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1249 set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1251 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1254 for (
int ic = 0; ic <
NC; ++ic) {
1255 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1257 mult_udv(pg, wt1r, wt1i,
1258 ut10, ut11, ut12, ut13, ut14, ut15,
1259 vt10, vt11, vt12, vt13, vt14, vt15);
1260 mult_udv(pg, wt2r, wt2i,
1261 ut10, ut11, ut12, ut13, ut14, ut15,
1262 vt20, vt21, vt22, vt23, vt24, vt25);
1263 set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zp1: for each colour index ic in [0, NC), calls set_sp2_zp on
// `v1` and stores the four resulting vectors into `buf`.
//
//   buf : destination buffer (declared __restrict). For colour ic the four
//         vectors are stored at VLEN * (2*ic), VLEN * (2*ic + 1),
//         VLEN * (2*ic + NVC) and VLEN * (2*ic + 1 + NVC).
//   v1  : source field passed straight to set_sp2_zp (declared __restrict).
//
// NOTE(review): `pg`, vt1r, vt1i, vt2r and vt2i are used below but their
// declarations are not visible in this listing; `pg` is not a parameter of
// this function, so it is presumably a local mask -- confirm.
1269 template<
typename REALTYPE>
1270 inline void mult_wilson_zp1(REALTYPE *__restrict buf,
1271 REALTYPE *__restrict v1)
1275 for (
int ic = 0; ic <
NC; ++ic) {
// Fill vt1r/vt1i/vt2r/vt2i for this colour from v1.
1277 set_sp2_zp(pg, vt1r, vt1i, vt2r, vt2i, v1, ic);
// vt1* go to slots 2*ic and 2*ic+1; vt2* go to the same slots shifted by NVC.
1278 save_vec(pg, &buf[
VLEN * (2 * ic)], vt1r);
1279 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], vt1i);
1280 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], vt2r);
1281 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], vt2i);
// mult_wilson_zp2 (Vsimd_t* output overload): reads twelve vectors from
// `buf`, then for each colour index ic in [0, NC) loads six vectors from `u`,
// applies mult_uv to both groups of six, and passes the four results to
// set_sp4_zp together with `v2` and ic.
//
//   v2  : output handed to set_sp4_zp; how it is updated is defined by that
//         helper, which is not visible here.
//   u   : read via load_u starting at u[VLEN * (2 * ic)].
//   buf : input buffer; vt10..vt15 come from slots 0..5 and vt20..vt25 from
//         slots NVC..NVC+5, each slot being VLEN elements wide.
//
// NOTE(review): `pg`, wt1r, wt1i, wt2r and wt2i are used below but their
// declarations are not visible in this listing -- confirm against the full
// file.
1287 template<
typename REALTYPE>
1288 inline void mult_wilson_zp2(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
1292 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1293 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// First group: buf slots 0..5.
1295 load_vec(pg, vt10, &buf[
VLEN * 0]);
1296 load_vec(pg, vt11, &buf[
VLEN * 1]);
1297 load_vec(pg, vt12, &buf[
VLEN * 2]);
1298 load_vec(pg, vt13, &buf[
VLEN * 3]);
1299 load_vec(pg, vt14, &buf[
VLEN * 4]);
1300 load_vec(pg, vt15, &buf[
VLEN * 5]);
// Second group: the same slots shifted by NVC.
1302 load_vec(pg, vt20, &buf[
VLEN * (0 +
NVC)]);
1303 load_vec(pg, vt21, &buf[
VLEN * (1 +
NVC)]);
1304 load_vec(pg, vt22, &buf[
VLEN * (2 +
NVC)]);
1305 load_vec(pg, vt23, &buf[
VLEN * (3 +
NVC)]);
1306 load_vec(pg, vt24, &buf[
VLEN * (4 +
NVC)]);
1307 load_vec(pg, vt25, &buf[
VLEN * (5 +
NVC)]);
// Six vectors refilled from `u` on every loop iteration.
1309 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1312 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut vectors are applied to both vt groups.
1313 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1314 &u[
VLEN * (2 * ic)]);
1315 mult_uv(pg, wt1r, wt1i,
1316 ut10, ut11, ut12, ut13, ut14, ut15,
1317 vt10, vt11, vt12, vt13, vt14, vt15);
1318 mult_uv(pg, wt2r, wt2i,
1319 ut10, ut11, ut12, ut13, ut14, ut15,
1320 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for colour ic to set_sp4_zp, which receives v2.
1321 set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zp2 (overload taking v2 as REALTYPE* __restrict).
//
// Loads two groups of six vectors from buf (VLEN-sized slots 0..5 into
// vt10..vt15 and slots NVC..NVC+5 into vt20..vt25). Then, for every ic in
// [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic), calls mult_uv
// once per group to produce (wt1r, wt1i) and (wt2r, wt2i), and passes those
// four vectors with v2 and ic to set_sp4_zp.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_zp
//   u    source of the six vectors read by load_u
//   buf  source of the twelve vectors read before the loop
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably mult_uv is a link-matrix times vector product and
// set_sp4_zp accumulates into v2 — confirm against their definitions.
1327 template<
typename REALTYPE>
1328 inline void mult_wilson_zp2(REALTYPE *__restrict v2,
1329 REALTYPE *__restrict u,
1330 REALTYPE *__restrict buf)
1334 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1335 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// First group: consecutive slots 0..5 of buf.
1337 load_vec(pg, vt10, &buf[
VLEN * 0]);
1338 load_vec(pg, vt11, &buf[
VLEN * 1]);
1339 load_vec(pg, vt12, &buf[
VLEN * 2]);
1340 load_vec(pg, vt13, &buf[
VLEN * 3]);
1341 load_vec(pg, vt14, &buf[
VLEN * 4]);
1342 load_vec(pg, vt15, &buf[
VLEN * 5]);
// Second group: the same six slots shifted by NVC.
1344 load_vec(pg, vt20, &buf[
VLEN * (0 +
NVC)]);
1345 load_vec(pg, vt21, &buf[
VLEN * (1 +
NVC)]);
1346 load_vec(pg, vt22, &buf[
VLEN * (2 +
NVC)]);
1347 load_vec(pg, vt23, &buf[
VLEN * (3 +
NVC)]);
1348 load_vec(pg, vt24, &buf[
VLEN * (4 +
NVC)]);
1349 load_vec(pg, vt25, &buf[
VLEN * (5 +
NVC)]);
1351 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1354 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1355 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1356 &u[
VLEN * (2 * ic)]);
1357 mult_uv(pg, wt1r, wt1i,
1358 ut10, ut11, ut12, ut13, ut14, ut15,
1359 vt10, vt11, vt12, vt13, vt14, vt15);
1360 mult_uv(pg, wt2r, wt2i,
1361 ut10, ut11, ut12, ut13, ut14, ut15,
1362 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_zp together with v2.
1363 set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zpb (overload taking v2 as Vsimd_t*).
//
// Fills twelve vectors directly from v1 with three set_sp2_zp calls (last
// argument 0, 1, 2), instead of reading them from a buffer. Then, for every
// ic in [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic), calls
// mult_uv on each group of six to produce (wt1r, wt1i) and (wt2r, wt2i), and
// passes them with v2 and ic to set_sp4_zp.
//
// Parameters:
//   v2   destination handed to set_sp4_zp
//   u    source of the six vectors read by load_u
//   v1   input handed to set_sp2_zp
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the +z term — confirm against the caller.
1369 template<
typename REALTYPE>
1370 inline void mult_wilson_zpb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1374 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1375 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1377 set_sp2_zp(pg, vt10, vt11, vt20, vt21, v1, 0);
1378 set_sp2_zp(pg, vt12, vt13, vt22, vt23, v1, 1);
1379 set_sp2_zp(pg, vt14, vt15, vt24, vt25, v1, 2);
1381 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1384 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1385 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1386 &u[
VLEN * (2 * ic)]);
1387 mult_uv(pg, wt1r, wt1i,
1388 ut10, ut11, ut12, ut13, ut14, ut15,
1389 vt10, vt11, vt12, vt13, vt14, vt15);
1390 mult_uv(pg, wt2r, wt2i,
1391 ut10, ut11, ut12, ut13, ut14, ut15,
1392 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_zp together with v2.
1393 set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zpb (overload taking v2 as REALTYPE* __restrict).
//
// Fills twelve vectors directly from v1 with three set_sp2_zp calls (last
// argument 0, 1, 2), instead of reading them from a buffer. Then, for every
// ic in [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic), calls
// mult_uv on each group of six to produce (wt1r, wt1i) and (wt2r, wt2i), and
// passes them with v2 and ic to set_sp4_zp.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_zp
//   u    source of the six vectors read by load_u
//   v1   input handed to set_sp2_zp
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the +z term — confirm against the caller.
1399 template<
typename REALTYPE>
1400 inline void mult_wilson_zpb(REALTYPE *__restrict v2,
1401 REALTYPE *__restrict u,
1402 REALTYPE *__restrict v1)
1406 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1407 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1409 set_sp2_zp(pg, vt10, vt11, vt20, vt21, v1, 0);
1410 set_sp2_zp(pg, vt12, vt13, vt22, vt23, v1, 1);
1411 set_sp2_zp(pg, vt14, vt15, vt24, vt25, v1, 2);
1413 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1416 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1417 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1418 &u[
VLEN * (2 * ic)]);
1419 mult_uv(pg, wt1r, wt1i,
1420 ut10, ut11, ut12, ut13, ut14, ut15,
1421 vt10, vt11, vt12, vt13, vt14, vt15);
1422 mult_uv(pg, wt2r, wt2i,
1423 ut10, ut11, ut12, ut13, ut14, ut15,
1424 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_zp together with v2.
1425 set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zm1: fills twelve vectors from v1 with three set_sp2_zm calls
// (last argument 0, 1, 2). Then, for every ic in [0, NC): calls load_udag to
// fill ut10..ut15, calls mult_udv on each group of six to produce
// (wt1r, wt1i) and (wt2r, wt2i), and stores the four results into buf.
//
// Slots written in buf, in units of VLEN elements, for each ic:
//   2*ic           <- wt1r        2*ic + 1         <- wt1i
//   2*ic + NVC     <- wt2r        2*ic + 1 + NVC   <- wt2i
//
// Parameters (all declared __restrict):
//   buf  destination of the save_vec calls
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_zm
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably load_udag/mult_udv apply the Hermitian conjugate of
// the link variable — confirm against their definitions.
1431 template<
typename REALTYPE>
1432 inline void mult_wilson_zm1(REALTYPE *__restrict buf,
1433 REALTYPE *__restrict u,
1434 REALTYPE *__restrict v1)
1438 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1439 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1441 set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1442 set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1443 set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1445 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1448 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1449 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1451 mult_udv(pg, wt1r, wt1i,
1452 ut10, ut11, ut12, ut13, ut14, ut15,
1453 vt10, vt11, vt12, vt13, vt14, vt15);
1455 mult_udv(pg, wt2r, wt2i,
1456 ut10, ut11, ut12, ut13, ut14, ut15,
1457 vt20, vt21, vt22, vt23, vt24, vt25);
1459 save_vec(pg, &buf[
VLEN * (2 * ic)], wt1r);
1460 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], wt1i);
1462 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], wt2r);
1463 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], wt2i);
// mult_wilson_zm2 (overload taking v2 as Vsimd_t*).
//
// For every ic in [0, NC): loads four vectors from buf and passes them with
// v2 and ic to set_sp4_zm. No multiplication is performed here.
//
// Slots read from buf, in units of VLEN elements, for each ic:
//   2*ic           -> wt1r        2*ic + 1         -> wt1i
//   2*ic + NVC     -> wt2r        2*ic + 1 + NVC   -> wt2i
//
// Parameters:
//   v2   destination handed to set_sp4_zm
//   buf  source of the load_vec calls
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably buf already holds products computed on the sending
// side — confirm against the caller.
1469 template<
typename REALTYPE>
1470 inline void mult_wilson_zm2(
Vsimd_t *v2, REALTYPE *buf)
1474 for (
int ic = 0; ic <
NC; ++ic) {
1476 load_vec(pg, wt1r, &buf[
VLEN * (2 * ic)]);
1477 load_vec(pg, wt1i, &buf[
VLEN * (2 * ic + 1)]);
1478 load_vec(pg, wt2r, &buf[
VLEN * (2 * ic +
NVC)]);
1479 load_vec(pg, wt2i, &buf[
VLEN * (2 * ic + 1 +
NVC)]);
// Hand the four loaded vectors for this ic to set_sp4_zm together with v2.
1480 set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zm2 (overload taking v2 as REALTYPE* __restrict).
//
// For every ic in [0, NC): loads four vectors from buf and passes them with
// v2 and ic to set_sp4_zm. No multiplication is performed here.
//
// Slots read from buf, in units of VLEN elements, for each ic:
//   2*ic           -> wt1r        2*ic + 1         -> wt1i
//   2*ic + NVC     -> wt2r        2*ic + 1 + NVC   -> wt2i
//
// Parameters (both declared __restrict):
//   v2   destination handed to set_sp4_zm
//   buf  source of the load_vec calls
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably buf already holds products computed on the sending
// side — confirm against the caller.
1486 template<
typename REALTYPE>
1487 inline void mult_wilson_zm2(REALTYPE *__restrict v2,
1488 REALTYPE *__restrict buf)
1492 for (
int ic = 0; ic <
NC; ++ic) {
1494 load_vec(pg, wt1r, &buf[
VLEN * (2 * ic)]);
1495 load_vec(pg, wt1i, &buf[
VLEN * (2 * ic + 1)]);
1496 load_vec(pg, wt2r, &buf[
VLEN * (2 * ic +
NVC)]);
1497 load_vec(pg, wt2i, &buf[
VLEN * (2 * ic + 1 +
NVC)]);
// Hand the four loaded vectors for this ic to set_sp4_zm together with v2.
1498 set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zmb (overload taking v2 as Vsimd_t*).
//
// Fills twelve vectors from v1 with three set_sp2_zm calls (last argument
// 0, 1, 2). Then, for every ic in [0, NC): calls load_udag to fill
// ut10..ut15, calls mult_udv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_zm.
//
// Parameters:
//   v2   destination handed to set_sp4_zm
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_zm
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the -z term — confirm against the caller.
1504 template<
typename REALTYPE>
1505 inline void mult_wilson_zmb(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1509 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1510 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1512 set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1513 set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1514 set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1516 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1519 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1520 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1522 mult_udv(pg, wt1r, wt1i,
1523 ut10, ut11, ut12, ut13, ut14, ut15,
1524 vt10, vt11, vt12, vt13, vt14, vt15);
1525 mult_udv(pg, wt2r, wt2i,
1526 ut10, ut11, ut12, ut13, ut14, ut15,
1527 vt20, vt21, vt22, vt23, vt24, vt25);
1528 set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_zmb (overload taking v2 as REALTYPE* __restrict).
//
// Fills twelve vectors from v1 with three set_sp2_zm calls (last argument
// 0, 1, 2). Then, for every ic in [0, NC): calls load_udag to fill
// ut10..ut15, calls mult_udv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_zm.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_zm
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_zm
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the -z term — confirm against the caller.
1534 template<
typename REALTYPE>
1535 inline void mult_wilson_zmb(REALTYPE *__restrict v2,
1536 REALTYPE *__restrict u,
1537 REALTYPE *__restrict v1)
1541 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1542 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1544 set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1545 set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1546 set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1548 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1551 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1552 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1554 mult_udv(pg, wt1r, wt1i,
1555 ut10, ut11, ut12, ut13, ut14, ut15,
1556 vt10, vt11, vt12, vt13, vt14, vt15);
1557 mult_udv(pg, wt2r, wt2i,
1558 ut10, ut11, ut12, ut13, ut14, ut15,
1559 vt20, vt21, vt22, vt23, vt24, vt25);
1560 set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tp1_dirac: for every index ic in [0, NC), calls
// set_sp2_tp_dirac on v1 to obtain four vectors (vt1r, vt1i, vt2r, vt2i) and
// stores them into buf.
//
// Slots written in buf, in units of VLEN elements, for each ic:
//   2*ic           <- vt1r        2*ic + 1         <- vt1i
//   2*ic + NVC     <- vt2r        2*ic + 1 + NVC   <- vt2i
//
// Parameters (both declared __restrict):
//   buf  destination of the save_vec calls
//   v1   input handed to set_sp2_tp_dirac
//
// NOTE(review): pg and vt1r/vt1i/vt2r/vt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): by its name this presumably handles the +t direction in the
// Dirac gamma-matrix representation — confirm against set_sp2_tp_dirac.
1566 template<
typename REALTYPE>
1567 inline void mult_wilson_tp1_dirac(REALTYPE *__restrict buf,
1568 REALTYPE *__restrict v1)
1572 for (
int ic = 0; ic <
NC; ++ic) {
// Build the four vectors for this ic from v1.
1574 set_sp2_tp_dirac(pg, vt1r, vt1i, vt2r, vt2i, v1, ic);
// First pair goes to slots 2*ic and 2*ic + 1.
1575 save_vec(pg, &buf[
VLEN * (2 * ic)], vt1r);
1576 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], vt1i);
// Second pair uses the same slots shifted by NVC.
1577 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], vt2r);
1578 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], vt2i);
// mult_wilson_tp2_dirac (overload taking v2 as Vsimd_t*).
//
// Loads two groups of six vectors from buf (VLEN-sized slots 0..5 into
// vt10..vt15 and slots NVC..NVC+5 into vt20..vt25). Then, for every ic in
// [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic), calls mult_uv
// once per group to produce (wt1r, wt1i) and (wt2r, wt2i), and passes those
// four vectors with v2 and ic to set_sp4_tp_dirac.
//
// Parameters:
//   v2   destination handed to set_sp4_tp_dirac
//   u    source of the six vectors read by load_u
//   buf  source of the twelve vectors read before the loop
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably mult_uv is a link-matrix times vector product and
// set_sp4_tp_dirac accumulates into v2 — confirm against their definitions.
1584 template<
typename REALTYPE>
1585 inline void mult_wilson_tp2_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
1589 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1590 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// First group: consecutive slots 0..5 of buf.
1592 load_vec(pg, vt10, &buf[
VLEN * 0]);
1593 load_vec(pg, vt11, &buf[
VLEN * 1]);
1594 load_vec(pg, vt12, &buf[
VLEN * 2]);
1595 load_vec(pg, vt13, &buf[
VLEN * 3]);
1596 load_vec(pg, vt14, &buf[
VLEN * 4]);
1597 load_vec(pg, vt15, &buf[
VLEN * 5]);
// Second group: the same six slots shifted by NVC.
1599 load_vec(pg, vt20, &buf[
VLEN * (0 +
NVC)]);
1600 load_vec(pg, vt21, &buf[
VLEN * (1 +
NVC)]);
1601 load_vec(pg, vt22, &buf[
VLEN * (2 +
NVC)]);
1602 load_vec(pg, vt23, &buf[
VLEN * (3 +
NVC)]);
1603 load_vec(pg, vt24, &buf[
VLEN * (4 +
NVC)]);
1604 load_vec(pg, vt25, &buf[
VLEN * (5 +
NVC)]);
1606 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1609 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1610 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1611 &u[
VLEN * (2 * ic)]);
1612 mult_uv(pg, wt1r, wt1i,
1613 ut10, ut11, ut12, ut13, ut14, ut15,
1614 vt10, vt11, vt12, vt13, vt14, vt15);
1615 mult_uv(pg, wt2r, wt2i,
1616 ut10, ut11, ut12, ut13, ut14, ut15,
1617 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_tp_dirac together with v2.
1618 set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tp2_dirac (overload taking v2 as REALTYPE* __restrict).
//
// Loads two groups of six vectors from buf (VLEN-sized slots 0..5 into
// vt10..vt15 and slots NVC..NVC+5 into vt20..vt25). Then, for every ic in
// [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic), calls mult_uv
// once per group to produce (wt1r, wt1i) and (wt2r, wt2i), and passes those
// four vectors with v2 and ic to set_sp4_tp_dirac.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_tp_dirac
//   u    source of the six vectors read by load_u
//   buf  source of the twelve vectors read before the loop
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably mult_uv is a link-matrix times vector product and
// set_sp4_tp_dirac accumulates into v2 — confirm against their definitions.
1624 template<
typename REALTYPE>
1625 inline void mult_wilson_tp2_dirac(REALTYPE *__restrict v2,
1626 REALTYPE *__restrict u,
1627 REALTYPE *__restrict buf)
1631 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1632 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// First group: consecutive slots 0..5 of buf.
1634 load_vec(pg, vt10, &buf[
VLEN * 0]);
1635 load_vec(pg, vt11, &buf[
VLEN * 1]);
1636 load_vec(pg, vt12, &buf[
VLEN * 2]);
1637 load_vec(pg, vt13, &buf[
VLEN * 3]);
1638 load_vec(pg, vt14, &buf[
VLEN * 4]);
1639 load_vec(pg, vt15, &buf[
VLEN * 5]);
// Second group: the same six slots shifted by NVC.
1641 load_vec(pg, vt20, &buf[
VLEN * (0 +
NVC)]);
1642 load_vec(pg, vt21, &buf[
VLEN * (1 +
NVC)]);
1643 load_vec(pg, vt22, &buf[
VLEN * (2 +
NVC)]);
1644 load_vec(pg, vt23, &buf[
VLEN * (3 +
NVC)]);
1645 load_vec(pg, vt24, &buf[
VLEN * (4 +
NVC)]);
1646 load_vec(pg, vt25, &buf[
VLEN * (5 +
NVC)]);
1648 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1651 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1652 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1653 &u[
VLEN * (2 * ic)]);
1654 mult_uv(pg, wt1r, wt1i,
1655 ut10, ut11, ut12, ut13, ut14, ut15,
1656 vt10, vt11, vt12, vt13, vt14, vt15);
1657 mult_uv(pg, wt2r, wt2i,
1658 ut10, ut11, ut12, ut13, ut14, ut15,
1659 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_tp_dirac together with v2.
1660 set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tpb_dirac (overload taking v2 as Vsimd_t*).
//
// Fills twelve vectors directly from v1 with three set_sp2_tp_dirac calls
// (last argument 0, 1, 2), instead of reading them from a buffer. Then, for
// every ic in [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic),
// calls mult_uv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_tp_dirac.
//
// Parameters:
//   v2   destination handed to set_sp4_tp_dirac
//   u    source of the six vectors read by load_u
//   v1   input handed to set_sp2_tp_dirac
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the +t term — confirm against the caller.
1666 template<
typename REALTYPE>
1667 inline void mult_wilson_tpb_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1671 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1672 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1674 set_sp2_tp_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1675 set_sp2_tp_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1676 set_sp2_tp_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1678 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1681 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1682 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1683 &u[
VLEN * (2 * ic)]);
1684 mult_uv(pg, wt1r, wt1i,
1685 ut10, ut11, ut12, ut13, ut14, ut15,
1686 vt10, vt11, vt12, vt13, vt14, vt15);
1687 mult_uv(pg, wt2r, wt2i,
1688 ut10, ut11, ut12, ut13, ut14, ut15,
1689 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_tp_dirac together with v2.
1690 set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tpb_dirac (overload taking v2 as REALTYPE* __restrict).
//
// Fills twelve vectors directly from v1 with three set_sp2_tp_dirac calls
// (last argument 0, 1, 2), instead of reading them from a buffer. Then, for
// every ic in [0, NC): loads ut10..ut15 from u at offset VLEN * (2 * ic),
// calls mult_uv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_tp_dirac.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_tp_dirac
//   u    source of the six vectors read by load_u
//   v1   input handed to set_sp2_tp_dirac
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the +t term — confirm against the caller.
1696 template<
typename REALTYPE>
1697 inline void mult_wilson_tpb_dirac(REALTYPE *__restrict v2,
1698 REALTYPE *__restrict u,
1699 REALTYPE *__restrict v1)
1703 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1704 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1706 set_sp2_tp_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1707 set_sp2_tp_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1708 set_sp2_tp_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1710 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1713 for (
int ic = 0; ic <
NC; ++ic) {
// The same ut10..ut15 are reused for both groups within one iteration.
1714 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1715 &u[
VLEN * (2 * ic)]);
1716 mult_uv(pg, wt1r, wt1i,
1717 ut10, ut11, ut12, ut13, ut14, ut15,
1718 vt10, vt11, vt12, vt13, vt14, vt15);
1719 mult_uv(pg, wt2r, wt2i,
1720 ut10, ut11, ut12, ut13, ut14, ut15,
1721 vt20, vt21, vt22, vt23, vt24, vt25);
// Hand both results for this ic to set_sp4_tp_dirac together with v2.
1722 set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tm1_dirac: fills twelve vectors from v1 with three
// set_sp2_tm_dirac calls (last argument 0, 1, 2). Then, for every ic in
// [0, NC): calls load_udag to fill ut10..ut15, calls mult_udv on each group
// of six to produce (wt1r, wt1i) and (wt2r, wt2i), and stores the four
// results into buf.
//
// Slots written in buf, in units of VLEN elements, for each ic:
//   2*ic           <- wt1r        2*ic + 1         <- wt1i
//   2*ic + NVC     <- wt2r        2*ic + 1 + NVC   <- wt2i
//
// Parameters (all declared __restrict):
//   buf  destination of the save_vec calls
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_tm_dirac
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably load_udag/mult_udv apply the Hermitian conjugate of
// the link variable — confirm against their definitions.
1728 template<
typename REALTYPE>
1729 inline void mult_wilson_tm1_dirac(REALTYPE *__restrict buf,
1730 REALTYPE *__restrict u,
1731 REALTYPE *__restrict v1)
1735 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1736 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1738 set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1739 set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1740 set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1742 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1745 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1746 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1748 mult_udv(pg, wt1r, wt1i,
1749 ut10, ut11, ut12, ut13, ut14, ut15,
1750 vt10, vt11, vt12, vt13, vt14, vt15);
1752 mult_udv(pg, wt2r, wt2i,
1753 ut10, ut11, ut12, ut13, ut14, ut15,
1754 vt20, vt21, vt22, vt23, vt24, vt25);
1756 save_vec(pg, &buf[
VLEN * (2 * ic)], wt1r);
1757 save_vec(pg, &buf[
VLEN * (2 * ic + 1)], wt1i);
1759 save_vec(pg, &buf[
VLEN * (2 * ic +
NVC)], wt2r);
1760 save_vec(pg, &buf[
VLEN * (2 * ic + 1 +
NVC)], wt2i);
// mult_wilson_tm2_dirac (overload taking v2 as Vsimd_t*).
//
// For every ic in [0, NC): loads four vectors from buf and passes them with
// v2 and ic to set_sp4_tm_dirac. No multiplication is performed here.
//
// Slots read from buf, in units of VLEN elements, for each ic:
//   2*ic           -> wt1r        2*ic + 1         -> wt1i
//   2*ic + NVC     -> wt2r        2*ic + 1 + NVC   -> wt2i
//
// Parameters:
//   v2   destination handed to set_sp4_tm_dirac
//   buf  source of the load_vec calls
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably buf already holds products computed on the sending
// side — confirm against the caller.
1766 template<
typename REALTYPE>
1767 inline void mult_wilson_tm2_dirac(
Vsimd_t *v2, REALTYPE *buf)
1771 for (
int ic = 0; ic <
NC; ++ic) {
1773 load_vec(pg, wt1r, &buf[
VLEN * (2 * ic)]);
1774 load_vec(pg, wt1i, &buf[
VLEN * (2 * ic + 1)]);
1775 load_vec(pg, wt2r, &buf[
VLEN * (2 * ic +
NVC)]);
1776 load_vec(pg, wt2i, &buf[
VLEN * (2 * ic + 1 +
NVC)]);
// Hand the four loaded vectors for this ic to set_sp4_tm_dirac with v2.
1777 set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tm2_dirac (overload taking v2 as REALTYPE* __restrict).
//
// For every ic in [0, NC): loads four vectors from buf and passes them with
// v2 and ic to set_sp4_tm_dirac. No multiplication is performed here.
//
// Slots read from buf, in units of VLEN elements, for each ic:
//   2*ic           -> wt1r        2*ic + 1         -> wt1i
//   2*ic + NVC     -> wt2r        2*ic + 1 + NVC   -> wt2i
//
// Parameters (both declared __restrict):
//   v2   destination handed to set_sp4_tm_dirac
//   buf  source of the load_vec calls
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): presumably buf already holds products computed on the sending
// side — confirm against the caller.
1783 template<
typename REALTYPE>
1784 inline void mult_wilson_tm2_dirac(REALTYPE *__restrict v2,
1785 REALTYPE *__restrict buf)
1789 for (
int ic = 0; ic <
NC; ++ic) {
1791 load_vec(pg, wt1r, &buf[
VLEN * (2 * ic)]);
1792 load_vec(pg, wt1i, &buf[
VLEN * (2 * ic + 1)]);
1793 load_vec(pg, wt2r, &buf[
VLEN * (2 * ic +
NVC)]);
1794 load_vec(pg, wt2i, &buf[
VLEN * (2 * ic + 1 +
NVC)]);
// Hand the four loaded vectors for this ic to set_sp4_tm_dirac with v2.
1795 set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tmb_dirac (overload taking v2 as Vsimd_t*).
//
// Fills twelve vectors from v1 with three set_sp2_tm_dirac calls (last
// argument 0, 1, 2). Then, for every ic in [0, NC): calls load_udag to fill
// ut10..ut15, calls mult_udv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_tm_dirac.
//
// Parameters:
//   v2   destination handed to set_sp4_tm_dirac
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_tm_dirac
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the -t term — confirm against the caller.
1801 template<
typename REALTYPE>
1802 inline void mult_wilson_tmb_dirac(
Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1806 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1807 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1809 set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1810 set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1811 set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1813 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1816 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1817 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1819 mult_udv(pg, wt1r, wt1i,
1820 ut10, ut11, ut12, ut13, ut14, ut15,
1821 vt10, vt11, vt12, vt13, vt14, vt15);
1822 mult_udv(pg, wt2r, wt2i,
1823 ut10, ut11, ut12, ut13, ut14, ut15,
1824 vt20, vt21, vt22, vt23, vt24, vt25);
1825 set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_tmb_dirac (overload taking v2 as REALTYPE* __restrict).
//
// Fills twelve vectors from v1 with three set_sp2_tm_dirac calls (last
// argument 0, 1, 2). Then, for every ic in [0, NC): calls load_udag to fill
// ut10..ut15, calls mult_udv on each group of six to produce (wt1r, wt1i) and
// (wt2r, wt2i), and passes them with v2 and ic to set_sp4_tm_dirac.
//
// Parameters (all declared __restrict):
//   v2   destination handed to set_sp4_tm_dirac
//   u    not referenced on any visible line; presumably the source pointer of
//        the load_udag call, whose final argument is missing from this listing
//   v1   input handed to set_sp2_tm_dirac
//
// NOTE(review): pg and wt1r/wt1i/wt2r/wt2i are used below but their
// declarations (and the function braces) are not among the visible lines.
// NOTE(review): the "b" suffix presumably denotes the bulk (no-communication)
// variant of the -t term — confirm against the caller.
1831 template<
typename REALTYPE>
1832 inline void mult_wilson_tmb_dirac(REALTYPE *__restrict v2,
1833 REALTYPE *__restrict u,
1834 REALTYPE *__restrict v1)
1838 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1839 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// Each call fills one pair of vt1* and one pair of vt2* for index 0, 1, 2.
1841 set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1842 set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1843 set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1845 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1848 for (
int ic = 0; ic <
NC; ++ic) {
// NOTE(review): the closing argument of this load_udag call is absent from
// the listing; the call as shown ends on a trailing comma.
1849 load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1851 mult_udv(pg, wt1r, wt1i,
1852 ut10, ut11, ut12, ut13, ut14, ut15,
1853 vt10, vt11, vt12, vt13, vt14, vt15);
1854 mult_udv(pg, wt2r, wt2i,
1855 ut10, ut11, ut12, ut13, ut14, ut15,
1856 vt20, vt21, vt22, vt23, vt24, vt25);
1857 set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_aypx_save: for every i in [0, NVCD), loads one vector from v1
// (offset VLEN * i) and one from v2v[i].v, combines them with
// axpy_vec(pg, v1F, a, v2F), and stores v1F into v2 at offset VLEN * i.
//
// Parameters:
//   v2   destination of the save_vec calls
//   a    scalar forwarded to axpy_vec
//   v2v  array of Vsimd_t; element i supplies v2F via its member v
//   v1   source of v1F
//
// NOTE(review): pg, v1F and v2F are used below but their declarations (and
// the function braces) are not among the visible lines.
// NOTE(review): presumably axpy_vec updates its first vector argument as
// v1F += a * v2F, which would make the result v2 = a * v2v + v1 — confirm
// against axpy_vec's definition.
1863 template<
typename REALTYPE>
1864 inline void mult_wilson_aypx_save(REALTYPE *__restrict v2, REALTYPE a,
1865 Vsimd_t *__restrict v2v, REALTYPE *__restrict v1)
1870 for (
int i = 0; i <
NVCD; ++i) {
1871 load_vec(pg, v1F, &v1[
VLEN * i]);
1872 load_vec(pg, v2F, &v2v[i].v[0]);
// v1F is the vector that gets stored, so it is the one axpy_vec must update.
1875 axpy_vec(pg, v1F, a, v2F);
1876 save_vec(pg, &v2[
VLEN * i], v1F);