10 #ifndef MULT_WILSON_EO_PARTS_QXS_H
11 #define MULT_WILSON_EO_PARTS_QXS_H
// Function template whose name and leading parameters are not visible in this
// view; only the `buf` and `v1` pointer parameters are shown.
// For each ic in [0, NC) it loads eight vectors from v1 under predicate pg2,
// combines them pairwise with add_vec/sub_vec, and writes the four results to
// buf through save_vec_scatter with the index vector svidx.
// NOTE(review): the declarations of pg2, svidx and v1r/v1i/v2r/v2i are on lines
// not shown here -- confirm against the full file.
15 template<
typename REALTYPE>
17 REALTYPE *__restrict buf,
18 REALTYPE *__restrict v1)
20 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
23 for (
int ic = 0; ic <
NC; ++ic) {
// icr is the offset used for the loads into w?r, ici for the loads into w?i.
24 int icr =
ND * 2 * ic;
25 int ici =
ND * 2 * ic + 1;
// Load the ID1..ID4 entries for this ic, each at element offset VLEN * (offset + IDn).
27 load_vec(pg2, w1r, &v1[
VLEN * (icr +
ID1)]);
28 load_vec(pg2, w1i, &v1[
VLEN * (ici +
ID1)]);
29 load_vec(pg2, w2r, &v1[
VLEN * (icr +
ID2)]);
30 load_vec(pg2, w2i, &v1[
VLEN * (ici +
ID2)]);
31 load_vec(pg2, w3r, &v1[
VLEN * (icr +
ID3)]);
32 load_vec(pg2, w3i, &v1[
VLEN * (ici +
ID3)]);
33 load_vec(pg2, w4r, &v1[
VLEN * (icr +
ID4)]);
34 load_vec(pg2, w4i, &v1[
VLEN * (ici +
ID4)]);
// Pair entry 1 with entry 4 and entry 2 with entry 3.
// Presumably add_vec/sub_vec compute out = a + b / out = a - b under pg2 -- confirm.
36 add_vec(pg2, v1r, w1r, w4i);
37 sub_vec(pg2, v1i, w1i, w4r);
38 add_vec(pg2, v2r, w2r, w3i);
39 sub_vec(pg2, v2i, w2i, w3r);
// Each result occupies a slot of `skip` elements in buf; the v2 slots start
// NVC slots after the v1 slots.
41 int skip = (
VLENY + 1) / 2;
42 save_vec_scatter(pg2, &buf[skip * (2 * ic)], v1r, svidx);
43 save_vec_scatter(pg2, &buf[skip * (2 * ic + 1)], v1i, svidx);
44 save_vec_scatter(pg2, &buf[skip * (2 * ic +
NVC)], v2r, svidx);
45 save_vec_scatter(pg2, &buf[skip * (2 * ic + 1 +
NVC)], v2i, svidx);
// mult_wilson_eo_xp1(pg2, buf, v1): for each ic in [0, NC) loads eight vectors
// from v1 under predicate pg2, combines them pairwise with add_vec/sub_vec,
// passes each result through compact_vec, and stores it to buf with save_vec
// under a predicate built from `skip`.
// NOTE(review): the declaration of v1r/v1i/v2r/v2i and the braces are on lines
// not shown in this view.
51 template<
typename REALTYPE>
52 inline void mult_wilson_eo_xp1(
svbool_t& pg2,
53 REALTYPE *__restrict buf,
54 REALTYPE *__restrict v1)
56 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
59 for (
int ic = 0; ic <
NC; ++ic) {
// icr is the offset used for the loads into w?r, ici for the loads into w?i.
60 int icr =
ND * 2 * ic;
61 int ici =
ND * 2 * ic + 1;
// Load the ID1..ID4 entries for this ic, each at element offset VLEN * (offset + IDn).
63 load_vec(pg2, w1r, &v1[
VLEN * (icr +
ID1)]);
64 load_vec(pg2, w1i, &v1[
VLEN * (ici +
ID1)]);
65 load_vec(pg2, w2r, &v1[
VLEN * (icr +
ID2)]);
66 load_vec(pg2, w2i, &v1[
VLEN * (ici +
ID2)]);
67 load_vec(pg2, w3r, &v1[
VLEN * (icr +
ID3)]);
68 load_vec(pg2, w3i, &v1[
VLEN * (ici +
ID3)]);
69 load_vec(pg2, w4r, &v1[
VLEN * (icr +
ID4)]);
70 load_vec(pg2, w4i, &v1[
VLEN * (ici +
ID4)]);
// Pair entry 1 with entry 4 and entry 2 with entry 3.
// Presumably add_vec/sub_vec compute out = a + b / out = a - b under pg2 -- confirm.
72 add_vec(pg2, v1r, w1r, w4i);
73 sub_vec(pg2, v1i, w1i, w4r);
74 add_vec(pg2, v2r, w2r, w3i);
75 sub_vec(pg2, v2i, w2i, w3r);
// NOTE(review): compact_vec presumably packs the pg2-active lanes to the front
// of the vector (cf. SVE svcompact) so they can be stored contiguously -- confirm.
77 v1r = compact_vec(pg2, v1r);
78 v1i = compact_vec(pg2, v1i);
79 v2r = compact_vec(pg2, v2r);
80 v2i = compact_vec(pg2, v2i);
// Each result occupies a slot of `skip` elements in buf; the v2 slots start
// NVC slots after the v1 slots.
82 int skip = (
VLENY + 1) / 2;
// Store predicate derived from skip; presumably selects the first `skip` lanes -- confirm.
83 svbool_t pg1 = set_predicate_whilelt(skip);
84 save_vec(pg1, &buf[skip * (2 * ic)], v1r);
85 save_vec(pg1, &buf[skip * (2 * ic + 1)], v1i);
86 save_vec(pg1, &buf[skip * (2 * ic +
NVC)], v2r);
87 save_vec(pg1, &buf[skip * (2 * ic + 1 +
NVC)], v2i);
// Function template; only the `v` and `buf` pointer parameters of its signature
// are visible in this view. Its use of pg, pg1, pg2, pg3, vt1r..vt2i and ic
// matches the set_sp2_xp2(...) calls elsewhere in this file, so this is
// presumably that helper -- confirm against the full file.
// Visible work: load w1..w4 from v under pg3, load_add a second contribution
// under pg1 from the address one element further on, combine the pairs with
// add_vec/sub_vec under pg13, then load_add_gather values from buf under pg2.
93 template<
typename REALTYPE>
98 REALTYPE *__restrict v,
99 REALTYPE *__restrict buf,
// icr is the offset used for the loads into w?r, ici for the loads into w?i.
102 int icr =
ND * 2 * ic;
103 int ici =
ND * 2 * ic + 1;
105 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// First pass for entries 1 and 2: loads under pg3 at VLEN * (offset + IDn).
106 load_vec(pg3, w1r, &v[
VLEN * (icr +
ID1)]);
107 load_vec(pg3, w1i, &v[
VLEN * (ici +
ID1)]);
108 load_vec(pg3, w2r, &v[
VLEN * (icr +
ID2)]);
109 load_vec(pg3, w2i, &v[
VLEN * (ici +
ID2)]);
// Second pass for entries 1 and 2: load_add under pg1 from the same base + 1 element.
111 load_add(pg1, w1r, &v[
VLEN * (icr +
ID1) + 1]);
112 load_add(pg1, w1i, &v[
VLEN * (ici +
ID1) + 1]);
113 load_add(pg1, w2r, &v[
VLEN * (icr +
ID2) + 1]);
114 load_add(pg1, w2i, &v[
VLEN * (ici +
ID2) + 1]);
// First pass for entries 3 and 4.
116 load_vec(pg3, w3r, &v[
VLEN * (icr +
ID3)]);
117 load_vec(pg3, w3i, &v[
VLEN * (ici +
ID3)]);
118 load_vec(pg3, w4r, &v[
VLEN * (icr +
ID4)]);
119 load_vec(pg3, w4i, &v[
VLEN * (ici +
ID4)]);
// pg13: exclusive-OR of pg1 and pg3, governed by pg (SVE sveor_z, zeroing form).
121 svbool_t pg13 = sveor_z(pg, pg1, pg3);
// Second pass for entries 3 and 4.
123 load_add(pg1, w3r, &v[
VLEN * (icr +
ID3) + 1]);
124 load_add(pg1, w3i, &v[
VLEN * (ici +
ID3) + 1]);
125 load_add(pg1, w4r, &v[
VLEN * (icr +
ID4) + 1]);
126 load_add(pg1, w4i, &v[
VLEN * (ici +
ID4) + 1]);
// Pair entry 1 with entry 4 and entry 2 with entry 3 under pg13.
128 add_vec(pg13, vt1r, w1r, w4i);
129 sub_vec(pg13, vt1i, w1i, w4r);
130 add_vec(pg13, vt2r, w2r, w3i);
131 sub_vec(pg13, vt2i, w2i, w3r);
// buf is addressed in slots of `skip` elements; the vt2 slots start NVC slots
// after the vt1 slots. The pg2 lanes are filled via load_add_gather using `index`.
133 int skip = (
VLENY + 1) / 2;
134 load_add_gather(pg2, vt1r, &buf[skip * (2 * ic)], index);
135 load_add_gather(pg2, vt1i, &buf[skip * (2 * ic + 1)], index);
136 load_add_gather(pg2, vt2r, &buf[skip * (2 * ic +
NVC)], index);
137 load_add_gather(pg2, vt2i, &buf[skip * (2 * ic + 1 +
NVC)], index);
// Function template; only the `v` and `buf` pointer parameters of its signature
// are visible in this view. The body mirrors the preceding block in this file
// except that load_add_gather is called with (idx, skip) instead of one index
// argument -- presumably an overload of the same helper; confirm.
// Visible work: load w1..w4 from v under pg3, load_add a second contribution
// under pg1 from the address one element further on, combine the pairs with
// add_vec/sub_vec under pg13, then load_add_gather values from buf under pg2.
142 template<
typename REALTYPE>
147 REALTYPE *__restrict v,
148 REALTYPE *__restrict buf,
// icr is the offset used for the loads into w?r, ici for the loads into w?i.
151 int icr =
ND * 2 * ic;
152 int ici =
ND * 2 * ic + 1;
154 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// First pass for entries 1 and 2: loads under pg3 at VLEN * (offset + IDn).
155 load_vec(pg3, w1r, &v[
VLEN * (icr +
ID1)]);
156 load_vec(pg3, w1i, &v[
VLEN * (ici +
ID1)]);
157 load_vec(pg3, w2r, &v[
VLEN * (icr +
ID2)]);
158 load_vec(pg3, w2i, &v[
VLEN * (ici +
ID2)]);
// Second pass for entries 1 and 2: load_add under pg1 from the same base + 1 element.
160 load_add(pg1, w1r, &v[
VLEN * (icr +
ID1) + 1]);
161 load_add(pg1, w1i, &v[
VLEN * (ici +
ID1) + 1]);
162 load_add(pg1, w2r, &v[
VLEN * (icr +
ID2) + 1]);
163 load_add(pg1, w2i, &v[
VLEN * (ici +
ID2) + 1]);
// First pass for entries 3 and 4.
165 load_vec(pg3, w3r, &v[
VLEN * (icr +
ID3)]);
166 load_vec(pg3, w3i, &v[
VLEN * (ici +
ID3)]);
167 load_vec(pg3, w4r, &v[
VLEN * (icr +
ID4)]);
168 load_vec(pg3, w4i, &v[
VLEN * (ici +
ID4)]);
// pg13: exclusive-OR of pg1 and pg3, governed by pg (SVE sveor_z, zeroing form).
170 svbool_t pg13 = sveor_z(pg, pg1, pg3);
// Second pass for entries 3 and 4.
172 load_add(pg1, w3r, &v[
VLEN * (icr +
ID3) + 1]);
173 load_add(pg1, w3i, &v[
VLEN * (ici +
ID3) + 1]);
174 load_add(pg1, w4r, &v[
VLEN * (icr +
ID4) + 1]);
175 load_add(pg1, w4i, &v[
VLEN * (ici +
ID4) + 1]);
// Pair entry 1 with entry 4 and entry 2 with entry 3 under pg13.
177 add_vec(pg13, vt1r, w1r, w4i);
178 sub_vec(pg13, vt1i, w1i, w4r);
179 add_vec(pg13, vt2r, w2r, w3i);
180 sub_vec(pg13, vt2i, w2i, w3r);
// buf is addressed in slots of `skip` elements; the vt2 slots start NVC slots
// after the vt1 slots. This variant passes both idx and skip to load_add_gather.
182 int skip = (
VLENY + 1) / 2;
183 load_add_gather(pg2, vt1r, &buf[skip * (2 * ic)],
idx, skip);
184 load_add_gather(pg2, vt1i, &buf[skip * (2 * ic + 1)],
idx, skip);
185 load_add_gather(pg2, vt2r, &buf[skip * (2 * ic +
NVC)],
idx, skip);
186 load_add_gather(pg2, vt2i, &buf[skip * (2 * ic + 1 +
NVC)],
idx, skip);
// Function template; only the trailing `v1` and `buf` parameters are visible in
// this view (the name and the declarations of pg, pg1..pg3, svidx, v2, u and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xp2 calls (ic = 0, 1, 2), then
// for each ic in [0, NC) loads ut10..ut15 from u, applies mult_uv to both vt
// sets, and hands the results to set_sp4_xp together with v2.
191 template<
typename REALTYPE>
195 REALTYPE *v1, REALTYPE *buf)
199 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
200 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
202 set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
203 set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
204 set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
206 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
209 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
210 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
211 &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
212 mult_uv(pg, wt1r, wt1i,
213 ut10, ut11, ut12, ut13, ut14, ut15,
214 vt10, vt11, vt12, vt13, vt14, vt15);
215 mult_uv(pg, wt2r, wt2i,
216 ut10, ut11, ut12, ut13, ut14, ut15,
217 vt20, vt21, vt22, vt23, vt24, vt25);
218 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing `v1` and `buf` parameters are visible in
// this view (the name and the declarations of pg, pg1..pg3, svidx, v2, u and
// wt1r..wt2i are on lines not shown).
// NOTE(review): the visible lines are identical to the preceding block in this
// file; any difference must lie in the hidden signature/setup lines.
// Builds the vt1*/vt2* vectors with three set_sp2_xp2 calls (ic = 0, 1, 2), then
// for each ic in [0, NC) loads ut10..ut15 from u, applies mult_uv to both vt
// sets, and hands the results to set_sp4_xp together with v2.
224 template<
typename REALTYPE>
228 REALTYPE *v1, REALTYPE *buf)
232 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
233 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
235 set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
236 set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
237 set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
239 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
242 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
243 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
244 &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
245 mult_uv(pg, wt1r, wt1i,
246 ut10, ut11, ut12, ut13, ut14, ut15,
247 vt10, vt11, vt12, vt13, vt14, vt15);
248 mult_uv(pg, wt2r, wt2i,
249 ut10, ut11, ut12, ut13, ut14, ut15,
250 vt20, vt21, vt22, vt23, vt24, vt25);
251 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template taking __restrict pointers v2, u, v1 and buf; its name and
// any parameters before v2 are on lines not shown in this view, as are the
// declarations of pg, pg1..pg3, svidx and wt1r..wt2i.
// Builds the vt1*/vt2* vectors with three set_sp2_xp2 calls (ic = 0, 1, 2), then
// for each ic in [0, NC) loads ut10..ut15 from u, applies mult_uv to both vt
// sets, and hands the results to set_sp4_xp together with v2.
257 template<
typename REALTYPE>
260 REALTYPE *__restrict v2,
261 REALTYPE *__restrict u,
262 REALTYPE *__restrict v1,
263 REALTYPE *__restrict buf)
267 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
268 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
270 set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
271 set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
272 set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
274 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
277 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
278 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
279 &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
280 mult_uv(pg, wt1r, wt1i,
281 ut10, ut11, ut12, ut13, ut14, ut15,
282 vt10, vt11, vt12, vt13, vt14, vt15);
283 mult_uv(pg, wt2r, wt2i,
284 ut10, ut11, ut12, ut13, ut14, ut15,
285 vt20, vt21, vt22, vt23, vt24, vt25);
286 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing parameters `vx`, `vn` and `ic` are
// visible in this view. Its use of pg, pg1..pg3 and vt1r..vt2i matches the
// four-predicate set_sp2_xp(...) calls elsewhere in this file, so this is
// presumably that helper -- confirm against the full file.
// Visible work: fill w1..w4 via shift_vec_xbw from vx, then combine the pairs
// with add_vec/sub_vec under pg into vt1r, vt1i, vt2r, vt2i.
// NOTE(review): the tail of every shift_vec_xbw call is on a line not shown
// here; presumably it supplies the matching vn address -- confirm.
292 template<
typename REALTYPE>
297 REALTYPE *vx, REALTYPE *vn,
int ic)
// icr is the offset used for the w?r operands, ici for the w?i operands.
299 int icr =
ND * 2 * ic;
300 int ici =
ND * 2 * ic + 1;
301 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
303 shift_vec_xbw(pg1, pg2, pg3, w1r, &vx[
VLEN * (icr +
ID1)],
305 shift_vec_xbw(pg1, pg2, pg3, w1i, &vx[
VLEN * (ici +
ID1)],
308 shift_vec_xbw(pg1, pg2, pg3, w2r, &vx[
VLEN * (icr +
ID2)],
310 shift_vec_xbw(pg1, pg2, pg3, w2i, &vx[
VLEN * (ici +
ID2)],
313 shift_vec_xbw(pg1, pg2, pg3, w3r, &vx[
VLEN * (icr +
ID3)],
315 shift_vec_xbw(pg1, pg2, pg3, w3i, &vx[
VLEN * (ici +
ID3)],
318 shift_vec_xbw(pg1, pg2, pg3, w4r, &vx[
VLEN * (icr +
ID4)],
320 shift_vec_xbw(pg1, pg2, pg3, w4i, &vx[
VLEN * (ici +
ID4)],
// Pair entry 1 with entry 4 and entry 2 with entry 3.
322 add_vec(pg, vt1r, w1r, w4i);
323 sub_vec(pg, vt1i, w1i, w4r);
324 add_vec(pg, vt2r, w2r, w3i);
325 sub_vec(pg, vt2i, w2i, w3r);
// Function template; only the trailing parameters `u`, `v1` and `v1n` are
// visible in this view (the name and the declarations of pg, pg1..pg3, v2 and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xp calls (ic = 0, 1, 2), then
// for each ic in [0, NC) loads ut10..ut15 from u, applies mult_uv to both vt
// sets, and hands the results to set_sp4_xp together with v2.
330 template<
typename REALTYPE>
333 REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
337 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
338 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
340 set_sp2_xp(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
341 set_sp2_xp(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
342 set_sp2_xp(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
344 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
347 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
348 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
349 mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
350 vt10, vt11, vt12, vt13, vt14, vt15);
351 mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
352 vt20, vt21, vt22, vt23, vt24, vt25);
353 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing parameters `vx`, `vn` and `ic` are
// visible in this view. Its use of pg, pg1, idx and vt1r..vt2i matches the
// set_sp2_xp(pg, pg1, idx, ...) calls elsewhere in this file, so this is
// presumably that overload -- confirm against the full file.
// Visible work: fill w1..w4 via shift_vec(pg1, idx, ...) from vx, then combine
// the pairs with add_vec/sub_vec under pg into vt1r, vt1i, vt2r, vt2i.
// NOTE(review): the tail of every shift_vec call is on a line not shown here;
// presumably it supplies the matching vn address -- confirm.
359 template<
typename REALTYPE>
364 REALTYPE *vx, REALTYPE *vn,
int ic)
// icr is the offset used for the w?r operands, ici for the w?i operands.
366 int icr =
ND * 2 * ic;
367 int ici =
ND * 2 * ic + 1;
368 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
370 shift_vec(pg1,
idx, w1r, &vx[
VLEN * (icr +
ID1)],
372 shift_vec(pg1,
idx, w1i, &vx[
VLEN * (ici +
ID1)],
375 shift_vec(pg1,
idx, w2r, &vx[
VLEN * (icr +
ID2)],
377 shift_vec(pg1,
idx, w2i, &vx[
VLEN * (ici +
ID2)],
380 shift_vec(pg1,
idx, w3r, &vx[
VLEN * (icr +
ID3)],
382 shift_vec(pg1,
idx, w3i, &vx[
VLEN * (ici +
ID3)],
385 shift_vec(pg1,
idx, w4r, &vx[
VLEN * (icr +
ID4)],
387 shift_vec(pg1,
idx, w4i, &vx[
VLEN * (ici +
ID4)],
// Pair entry 1 with entry 4 and entry 2 with entry 3.
390 add_vec(pg, vt1r, w1r, w4i);
391 sub_vec(pg, vt1i, w1i, w4r);
392 add_vec(pg, vt2r, w2r, w3i);
393 sub_vec(pg, vt2i, w2i, w3r);
// Function template; only the trailing parameters `v1` and `v1n` are visible in
// this view (the name and the declarations of pg, pg1, idx, v2, u and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xp(pg, pg1, idx, ...) calls
// (ic = 0, 1, 2), then for each ic in [0, NC) loads ut10..ut15 from u, applies
// mult_uv to both vt sets, and hands the results to set_sp4_xp together with v2.
398 template<
typename REALTYPE>
401 REALTYPE *v1, REALTYPE *v1n)
405 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
406 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
408 set_sp2_xp(pg, pg1,
idx, vt10, vt11, vt20, vt21, v1, v1n, 0);
409 set_sp2_xp(pg, pg1,
idx, vt12, vt13, vt22, vt23, v1, v1n, 1);
410 set_sp2_xp(pg, pg1,
idx, vt14, vt15, vt24, vt25, v1, v1n, 2);
412 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
415 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
416 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
417 mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
418 vt10, vt11, vt12, vt13, vt14, vt15);
419 mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
420 vt20, vt21, vt22, vt23, vt24, vt25);
421 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template taking __restrict pointers v2 and u plus plain pointers v1
// and v1n; its name and any parameters before v2 are on lines not shown in this
// view, as are the declarations of pg, pg1..pg3 and wt1r..wt2i.
// Builds the vt1*/vt2* vectors with three set_sp2_xp calls (ic = 0, 1, 2), then
// for each ic in [0, NC) loads ut10..ut15 from u, applies mult_uv to both vt
// sets, and hands the results to set_sp4_xp together with v2.
427 template<
typename REALTYPE>
430 REALTYPE *__restrict v2,
431 REALTYPE *__restrict u,
432 REALTYPE *v1, REALTYPE *v1n)
436 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
437 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
439 set_sp2_xp(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
440 set_sp2_xp(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
441 set_sp2_xp(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
443 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
446 for (
int ic = 0; ic <
NC; ++ic) {
// Six u vectors are read starting at element offset VLEN * (2 * ic).
447 load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[
VLEN * (2 * ic)]);
// Same ut operands applied to the vt1 set and to the vt2 set.
448 mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
449 vt10, vt11, vt12, vt13, vt14, vt15);
450 mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
451 vt20, vt21, vt22, vt23, vt24, vt25);
452 set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// mult_wilson_eo_xm1(pg2, buf, u, v1): builds the vt1*/vt2* vectors with three
// set_sp2_xm1 calls (ic = 0, 1, 2); then for each ic in [0, NC) loads
// ut10..ut15 with load_udag, applies mult_udv to both vt sets, passes the four
// results through compact_vec, and stores them to buf with save_vec under a
// predicate built from `skip`.
// NOTE(review): the declaration of wt1r..wt2i, the tail of the load_udag call
// and the braces are on lines not shown in this view.
458 template<
typename REALTYPE>
459 inline void mult_wilson_eo_xm1(
svbool_t& pg2,
460 REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
464 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
465 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
467 set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
468 set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
469 set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
471 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
474 for (
int ic = 0; ic <
NC; ++ic) {
// The source address argument of load_udag is on a line not shown here.
475 load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
478 mult_udv(pg2, wt1r, wt1i,
479 ut10, ut11, ut12, ut13, ut14, ut15,
480 vt10, vt11, vt12, vt13, vt14, vt15);
481 mult_udv(pg2, wt2r, wt2i,
482 ut10, ut11, ut12, ut13, ut14, ut15,
483 vt20, vt21, vt22, vt23, vt24, vt25);
// NOTE(review): compact_vec presumably packs the pg2-active lanes to the front
// of the vector (cf. SVE svcompact) so they can be stored contiguously -- confirm.
485 wt1r = compact_vec(pg2, wt1r);
486 wt1i = compact_vec(pg2, wt1i);
487 wt2r = compact_vec(pg2, wt2r);
488 wt2i = compact_vec(pg2, wt2i);
// Each result occupies a slot of `skip` elements in buf; the wt2 slots start
// NVC slots after the wt1 slots.
490 int skip = (
VLENY + 1) / 2;
// Store predicate derived from skip; presumably selects the first `skip` lanes -- confirm.
491 svbool_t pg1 = set_predicate_whilelt(skip);
492 save_vec(pg1, &buf[skip * (2 * ic)], wt1r);
493 save_vec(pg1, &buf[skip * (2 * ic + 1)], wt1i);
494 save_vec(pg1, &buf[skip * (2 * ic +
NVC)], wt2r);
495 save_vec(pg1, &buf[skip * (2 * ic + 1 +
NVC)], wt2i);
// Function template; only the trailing parameters `buf`, `u` and `v1` are
// visible in this view (the name and the declarations of pg2, svidx and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xm1 calls (ic = 0, 1, 2);
// then for each ic in [0, NC) loads ut10..ut15 with load_udag, applies mult_udv
// to both vt sets, and writes the four results to buf through save_vec_scatter
// with the index vector svidx.
501 template<
typename REALTYPE>
503 REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
507 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
508 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
510 set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
511 set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
512 set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
514 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
517 for (
int ic = 0; ic <
NC; ++ic) {
// The source address argument of load_udag is on a line not shown here.
518 load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
521 mult_udv(pg2, wt1r, wt1i,
522 ut10, ut11, ut12, ut13, ut14, ut15,
523 vt10, vt11, vt12, vt13, vt14, vt15);
524 mult_udv(pg2, wt2r, wt2i,
525 ut10, ut11, ut12, ut13, ut14, ut15,
526 vt20, vt21, vt22, vt23, vt24, vt25);
// Each result occupies a slot of `skip` elements in buf; the wt2 slots start
// NVC slots after the wt1 slots.
528 int skip = (
VLENY + 1) / 2;
529 save_vec_scatter(pg2, &buf[skip * (2 * ic)], wt1r, svidx);
530 save_vec_scatter(pg2, &buf[skip * (2 * ic + 1)], wt1i, svidx);
531 save_vec_scatter(pg2, &buf[skip * (2 * ic +
NVC)], wt2r, svidx);
532 save_vec_scatter(pg2, &buf[skip * (2 * ic + 1 +
NVC)], wt2i, svidx);
// Function template; only the trailing parameters `vx` and `ic` are visible in
// this view. Its use of pg1, pg3 and vt1r..vt2i matches the set_sp2_xm2(...)
// calls elsewhere in this file, so this is presumably that helper -- confirm.
// Visible work: for each of w1..w4 (real and imaginary), load_vec under pg3 and
// then load_add under pg1 from the address one element earlier; finally combine
// the pairs with sub_vec/add_vec under pg13 into vt1r, vt1i, vt2r, vt2i.
// NOTE(review): the declaration of pg0 is on a line not shown here.
538 template<
typename REALTYPE>
542 REALTYPE *vx,
int ic)
// icr is the offset used for the loads into w?r, ici for the loads into w?i.
544 int icr =
ND * 2 * ic;
545 int ici =
ND * 2 * ic + 1;
546 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// Each pair below: base load under pg3, then load_add under pg1 from base - 1 element.
548 load_vec(pg3, w1r, &vx[
VLEN * (icr +
ID1)]);
549 load_add(pg1, w1r, &vx[
VLEN * (icr +
ID1) - 1]);
551 load_vec(pg3, w1i, &vx[
VLEN * (ici +
ID1)]);
552 load_add(pg1, w1i, &vx[
VLEN * (ici +
ID1) - 1]);
554 load_vec(pg3, w2r, &vx[
VLEN * (icr +
ID2)]);
555 load_add(pg1, w2r, &vx[
VLEN * (icr +
ID2) - 1]);
557 load_vec(pg3, w2i, &vx[
VLEN * (ici +
ID2)]);
558 load_add(pg1, w2i, &vx[
VLEN * (ici +
ID2) - 1]);
560 load_vec(pg3, w3r, &vx[
VLEN * (icr +
ID3)]);
561 load_add(pg1, w3r, &vx[
VLEN * (icr +
ID3) - 1]);
563 load_vec(pg3, w3i, &vx[
VLEN * (ici +
ID3)]);
564 load_add(pg1, w3i, &vx[
VLEN * (ici +
ID3) - 1]);
566 load_vec(pg3, w4r, &vx[
VLEN * (icr +
ID4)]);
567 load_add(pg1, w4r, &vx[
VLEN * (icr +
ID4) - 1]);
569 load_vec(pg3, w4i, &vx[
VLEN * (ici +
ID4)]);
570 load_add(pg1, w4i, &vx[
VLEN * (ici +
ID4) - 1]);
// pg13: exclusive-OR of pg1 and pg3, governed by pg0 (SVE sveor_z, zeroing form).
573 svbool_t pg13 = sveor_z(pg0, pg1, pg3);
// Same pairing as the xp helpers in this file, with sub_vec and add_vec swapped.
574 sub_vec(pg13, vt1r, w1r, w4i);
575 add_vec(pg13, vt1i, w1i, w4r);
576 sub_vec(pg13, vt2r, w2r, w3i);
577 add_vec(pg13, vt2i, w2i, w3r);
// Function template; only the trailing parameters `v1` and `buf` are visible in
// this view (the name and the declarations of pg, pg1..pg3, svidx, v2 and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xm2 calls (ic = 0, 1, 2);
// then for each ic in [0, NC) loads ut10..ut15 with load_udag_xm2_eo, applies
// mult_udv to both vt sets under pg13, adds values gathered from buf under pg2,
// and hands the results to set_sp4_xm together with v2.
582 template<
typename REALTYPE>
586 REALTYPE *v1, REALTYPE *buf)
590 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
591 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
593 set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
594 set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
595 set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
597 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
600 for (
int ic = 0; ic <
NC; ++ic) {
// The source address argument of load_udag_xm2_eo is on a line not shown here.
601 load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
// pg13: exclusive-OR of pg1 and pg3, governed by pg (SVE sveor_z, zeroing form).
603 svbool_t pg13 = sveor_z(pg, pg1, pg3);
604 mult_udv(pg13, wt1r, wt1i,
605 ut10, ut11, ut12, ut13, ut14, ut15,
606 vt10, vt11, vt12, vt13, vt14, vt15);
607 mult_udv(pg13, wt2r, wt2i,
608 ut10, ut11, ut12, ut13, ut14, ut15,
609 vt20, vt21, vt22, vt23, vt24, vt25);
// buf is addressed in slots of `skip` elements; the wt2 slots start NVC slots
// after the wt1 slots.
611 int skip = (
VLENY + 1) / 2;
612 load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)], svidx);
613 load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)], svidx);
614 load_add_gather(pg2, wt2r, &buf[skip * (2 * ic +
NVC)], svidx);
615 load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 +
NVC)], svidx);
617 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing parameters `v1` and `buf` are visible in
// this view (the name and the declarations of pg, pg1..pg3, idx, v2 and
// wt1r..wt2i are on lines not shown).
// Same visible flow as the preceding block in this file, except that
// load_add_gather is called with (idx, skip) instead of svidx.
// Builds the vt1*/vt2* vectors with three set_sp2_xm2 calls (ic = 0, 1, 2);
// then for each ic in [0, NC) loads ut10..ut15 with load_udag_xm2_eo, applies
// mult_udv to both vt sets under pg13, adds values gathered from buf under pg2,
// and hands the results to set_sp4_xm together with v2.
623 template<
typename REALTYPE>
627 REALTYPE *v1, REALTYPE *buf)
631 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
632 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
634 set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
635 set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
636 set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
638 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
641 for (
int ic = 0; ic <
NC; ++ic) {
// The source address argument of load_udag_xm2_eo is on a line not shown here.
642 load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
// pg13: exclusive-OR of pg1 and pg3, governed by pg (SVE sveor_z, zeroing form).
644 svbool_t pg13 = sveor_z(pg, pg1, pg3);
645 mult_udv(pg13, wt1r, wt1i,
646 ut10, ut11, ut12, ut13, ut14, ut15,
647 vt10, vt11, vt12, vt13, vt14, vt15);
648 mult_udv(pg13, wt2r, wt2i,
649 ut10, ut11, ut12, ut13, ut14, ut15,
650 vt20, vt21, vt22, vt23, vt24, vt25);
// buf is addressed in slots of `skip` elements; the wt2 slots start NVC slots
// after the wt1 slots.
652 int skip = (
VLENY + 1) / 2;
653 load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)],
idx, skip);
654 load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)],
idx, skip);
655 load_add_gather(pg2, wt2r, &buf[skip * (2 * ic +
NVC)],
idx, skip);
656 load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 +
NVC)],
idx, skip);
658 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template taking __restrict pointers v2, u, v1 and buf; its name and
// any parameters before v2 are on lines not shown in this view, as are the
// declarations of pg, pg1..pg3, svidx and wt1r..wt2i.
// Builds the vt1*/vt2* vectors with three set_sp2_xm2 calls (ic = 0, 1, 2);
// then for each ic in [0, NC) loads ut10..ut15 with load_udag_xm2_eo, applies
// mult_udv to both vt sets under pg13, adds values gathered from buf under pg2,
// and hands the results to set_sp4_xm together with v2.
664 template<
typename REALTYPE>
667 REALTYPE *__restrict v2,
668 REALTYPE *__restrict u,
669 REALTYPE *__restrict v1,
670 REALTYPE *__restrict buf)
674 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
675 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
677 set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
678 set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
679 set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
681 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
684 for (
int ic = 0; ic <
NC; ++ic) {
// The source address argument of load_udag_xm2_eo is on a line not shown here.
685 load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
// pg13: exclusive-OR of pg1 and pg3, governed by pg (SVE sveor_z, zeroing form).
687 svbool_t pg13 = sveor_z(pg, pg1, pg3);
688 mult_udv(pg13, wt1r, wt1i,
689 ut10, ut11, ut12, ut13, ut14, ut15,
690 vt10, vt11, vt12, vt13, vt14, vt15);
691 mult_udv(pg13, wt2r, wt2i,
692 ut10, ut11, ut12, ut13, ut14, ut15,
693 vt20, vt21, vt22, vt23, vt24, vt25);
// buf is addressed in slots of `skip` elements; the wt2 slots start NVC slots
// after the wt1 slots.
695 int skip = (
VLENY + 1) / 2;
696 load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)], svidx);
697 load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)], svidx);
698 load_add_gather(pg2, wt2r, &buf[skip * (2 * ic +
NVC)], svidx);
699 load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 +
NVC)], svidx);
701 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing parameters `vx`, `vn` and `ic` are
// visible in this view. Its use of pg, pg1..pg3 and vt1r..vt2i matches the
// four-predicate set_sp2_xm(...) calls elsewhere in this file, so this is
// presumably that helper -- confirm against the full file.
// Visible work: fill w1..w4 via shift_vec_xfw from vx, then combine the pairs
// with sub_vec/add_vec under pg into vt1r, vt1i, vt2r, vt2i.
// NOTE(review): the tail of every shift_vec_xfw call is on a line not shown
// here; presumably it supplies the matching vn address -- confirm.
707 template<
typename REALTYPE>
712 REALTYPE *vx, REALTYPE *vn,
int ic)
// icr is the offset used for the w?r operands, ici for the w?i operands.
714 int icr =
ND * 2 * ic;
715 int ici =
ND * 2 * ic + 1;
716 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
718 shift_vec_xfw(pg1, pg2, pg3, w1r, &vx[
VLEN * (icr +
ID1)],
720 shift_vec_xfw(pg1, pg2, pg3, w1i, &vx[
VLEN * (ici +
ID1)],
723 shift_vec_xfw(pg1, pg2, pg3, w2r, &vx[
VLEN * (icr +
ID2)],
725 shift_vec_xfw(pg1, pg2, pg3, w2i, &vx[
VLEN * (ici +
ID2)],
728 shift_vec_xfw(pg1, pg2, pg3, w3r, &vx[
VLEN * (icr +
ID3)],
730 shift_vec_xfw(pg1, pg2, pg3, w3i, &vx[
VLEN * (ici +
ID3)],
733 shift_vec_xfw(pg1, pg2, pg3, w4r, &vx[
VLEN * (icr +
ID4)],
735 shift_vec_xfw(pg1, pg2, pg3, w4i, &vx[
VLEN * (ici +
ID4)],
// Same pairing as the xp helpers in this file, with sub_vec and add_vec swapped.
738 sub_vec(pg, vt1r, w1r, w4i);
739 add_vec(pg, vt1i, w1i, w4r);
740 sub_vec(pg, vt2r, w2r, w3i);
741 add_vec(pg, vt2i, w2i, w3r);
// Function template; only the trailing parameters `u`, `un`, `v1` and `v1n` are
// visible in this view (the name and the declarations of pg, pg1..pg3, v2 and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xm calls (ic = 0, 1, 2); then
// for each ic in [0, NC) loads ut10..ut15 with load_udag_xm_eo, applies
// mult_udv to both vt sets, and hands the results to set_sp4_xm together with v2.
746 template<
typename REALTYPE>
749 REALTYPE *u, REALTYPE *un,
750 REALTYPE *v1, REALTYPE *v1n)
754 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
755 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
757 set_sp2_xm(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
758 set_sp2_xm(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
759 set_sp2_xm(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
761 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
764 for (
int ic = 0; ic <
NC; ++ic) {
// The source address arguments of load_udag_xm_eo are on lines not shown here
// (presumably derived from u and un -- confirm).
765 load_udag_xm_eo(pg1, pg2, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
768 mult_udv(pg, wt1r, wt1i,
769 ut10, ut11, ut12, ut13, ut14, ut15,
770 vt10, vt11, vt12, vt13, vt14, vt15);
771 mult_udv(pg, wt2r, wt2i,
772 ut10, ut11, ut12, ut13, ut14, ut15,
773 vt20, vt21, vt22, vt23, vt24, vt25);
774 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template; only the trailing parameters `vx`, `vn` and `ic` are
// visible in this view. Its use of pg, pg1, idx1 and vt1r..vt2i matches the
// set_sp2_xm(pg, pg1, idx1, ...) calls elsewhere in this file, so this is
// presumably that overload -- confirm against the full file.
// Visible work: fill w1..w4 via shift_vec(pg1, idx1, ...) from vx, then combine
// the pairs with sub_vec/add_vec under pg into vt1r, vt1i, vt2r, vt2i.
// NOTE(review): the tail of every shift_vec call is on a line not shown here;
// presumably it supplies the matching vn address -- confirm.
780 template<
typename REALTYPE>
784 REALTYPE *vx, REALTYPE *vn,
int ic)
// icr is the offset used for the w?r operands, ici for the w?i operands.
786 int icr =
ND * 2 * ic;
787 int ici =
ND * 2 * ic + 1;
788 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
790 shift_vec(pg1, idx1, w1r, &vx[
VLEN * (icr +
ID1)],
792 shift_vec(pg1, idx1, w1i, &vx[
VLEN * (ici +
ID1)],
795 shift_vec(pg1, idx1, w2r, &vx[
VLEN * (icr +
ID2)],
797 shift_vec(pg1, idx1, w2i, &vx[
VLEN * (ici +
ID2)],
800 shift_vec(pg1, idx1, w3r, &vx[
VLEN * (icr +
ID3)],
802 shift_vec(pg1, idx1, w3i, &vx[
VLEN * (ici +
ID3)],
805 shift_vec(pg1, idx1, w4r, &vx[
VLEN * (icr +
ID4)],
807 shift_vec(pg1, idx1, w4i, &vx[
VLEN * (ici +
ID4)],
// Same pairing as the xp helpers in this file, with sub_vec and add_vec swapped.
810 sub_vec(pg, vt1r, w1r, w4i);
811 add_vec(pg, vt1i, w1i, w4r);
812 sub_vec(pg, vt2r, w2r, w3i);
813 add_vec(pg, vt2i, w2i, w3r);
// Function template; only the trailing parameters `u`, `un`, `v1` and `v1n` are
// visible in this view (the name and the declarations of pg, pg1, idx1, v2 and
// wt1r..wt2i are on lines not shown).
// Builds the vt1*/vt2* vectors with three set_sp2_xm(pg, pg1, idx1, ...) calls
// (ic = 0, 1, 2); then for each ic in [0, NC) loads ut10..ut15 with
// load_udag(pg1, idx1, ...), applies mult_udv to both vt sets, and hands the
// results to set_sp4_xm together with v2.
818 template<
typename REALTYPE>
821 REALTYPE *u, REALTYPE *un,
822 REALTYPE *v1, REALTYPE *v1n)
826 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
827 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
829 set_sp2_xm(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
830 set_sp2_xm(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
831 set_sp2_xm(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
833 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
836 for (
int ic = 0; ic <
NC; ++ic) {
// The source address arguments of load_udag are on lines not shown here
// (presumably derived from u and un -- confirm).
837 load_udag(pg1, idx1, ut10, ut11, ut12, ut13, ut14, ut15,
840 mult_udv(pg, wt1r, wt1i,
841 ut10, ut11, ut12, ut13, ut14, ut15,
842 vt10, vt11, vt12, vt13, vt14, vt15);
843 mult_udv(pg, wt2r, wt2i,
844 ut10, ut11, ut12, ut13, ut14, ut15,
845 vt20, vt21, vt22, vt23, vt24, vt25);
846 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
// Function template taking a __restrict pointer v2 plus plain pointers u, un,
// v1 and v1n; its name and any parameters before v2 are on lines not shown in
// this view, as are the declarations of pg, pg1..pg3 and wt1r..wt2i.
// Builds the vt1*/vt2* vectors with three set_sp2_xm calls (ic = 0, 1, 2); then
// for each ic in [0, NC) loads ut10..ut15 with load_udag_xm_eo, applies
// mult_udv to both vt sets, and hands the results to set_sp4_xm together with v2.
852 template<
typename REALTYPE>
855 REALTYPE *__restrict v2,
856 REALTYPE *u, REALTYPE *un,
857 REALTYPE *v1, REALTYPE *v1n)
861 svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
862 svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
// One call per ic; each fills one pair of vt1 and one pair of vt2.
864 set_sp2_xm(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
865 set_sp2_xm(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
866 set_sp2_xm(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
868 svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
871 for (
int ic = 0; ic <
NC; ++ic) {
// The source address arguments of load_udag_xm_eo are on lines not shown here
// (presumably derived from u and un -- confirm).
872 load_udag_xm_eo(pg1, pg2, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
875 mult_udv(pg, wt1r, wt1i,
876 ut10, ut11, ut12, ut13, ut14, ut15,
877 vt10, vt11, vt12, vt13, vt14, vt15);
878 mult_udv(pg, wt2r, wt2i,
879 ut10, ut11, ut12, ut13, ut14, ut15,
880 vt20, vt21, vt22, vt23, vt24, vt25);
881 set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);