#ifndef MULT_CLOVER_COARSE_PARTS_QXS_H
#define MULT_CLOVER_COARSE_PARTS_QXS_H
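
// A scalar reference for the vector primitives used throughout this
// header: a minimal sketch assuming axpy_vec(pg, out, a, b) performs
// out += a * b and ymax_vec(pg, out, a, b) performs out -= a * b on
// the lanes selected by pg. Four such calls form one complex
// multiply-accumulate out += u * in; the _udag kernels conjugate u by
// flipping the signs of the u_i terms. (accum_cmult_ref is
// illustrative only, not part of the original kernel set.)
template<typename RealT>
inline void accum_cmult_ref(RealT& out_r, RealT& out_i,
                            RealT u_r, RealT u_i,
                            RealT in_r, RealT in_i)
{
  out_r += u_r * in_r;  // axpy_vec(pg, out_r, u_r, in_r)
  out_r -= u_i * in_i;  // ymax_vec(pg, out_r, u_i, in_i)
  out_i += u_r * in_i;  // axpy_vec(pg, out_i, u_r, in_i)
  out_i += u_i * in_r;  // axpy_vec(pg, out_i, u_i, in_r)
}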

// Accumulate out_e{1,2} += u(row pair i) * in, one column pair j at a
// time, keeping the accumulators in registers. (The pg/out_e*
// parameters are reconstructed by analogy with accum_mult_udag_i
// below.)
inline void accum_mult_u_i(svbool_t pg,
                           svreal_t& out_e1r, svreal_t& out_e1i,
                           svreal_t& out_e2r, svreal_t& out_e2i,
                           const real_t *__restrict__ in,
                           const real_t *__restrict__ u0,
                           int i,
                           const int ncol)
{
  const int nh = ncol / 2;

  for (int j = 0; j < nh; j++) {
    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    // offsets assume the row-major storage sketched after this function
    const real_t *uij = u0 + VLEN * (4 * ncol * i + 4 * j);
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, u_e1r, uij);
    load_vec(pg, u_e1i, uij + VLEN);
    load_vec(pg, u_e2r, uij + 2 * VLEN);
    load_vec(pg, u_e2i, uij + 3 * VLEN);
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    // out_e1 += u(2i, 2j) * in(2j)
    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    ymax_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    axpy_vec(pg, out_e1i, u_e1i, in_e1r);

    // out_e1 += u(2i, 2j+1) * in(2j+1)
    axpy_vec(pg, out_e1r, u_e2r, in_e2r);
    ymax_vec(pg, out_e1r, u_e2i, in_e2i);
    axpy_vec(pg, out_e1i, u_e2r, in_e2i);
    axpy_vec(pg, out_e1i, u_e2i, in_e2r);

    // second row of the pair: u(2i+1, 2j..2j+1)
    svreal_t uu_e1r, uu_e1i, uu_e2r, uu_e2i;
    const real_t *uuij = uij + VLEN * 2 * ncol;  // next row (assumed layout)
    load_vec(pg, uu_e1r, uuij);
    load_vec(pg, uu_e1i, uuij + VLEN);
    load_vec(pg, uu_e2r, uuij + 2 * VLEN);
    load_vec(pg, uu_e2i, uuij + 3 * VLEN);

    // out_e2 += u(2i+1, 2j) * in(2j)
    axpy_vec(pg, out_e2r, uu_e1r, in_e1r);
    ymax_vec(pg, out_e2r, uu_e1i, in_e1i);
    axpy_vec(pg, out_e2i, uu_e1r, in_e1i);
    axpy_vec(pg, out_e2i, uu_e1i, in_e1r);

    // out_e2 += u(2i+1, 2j+1) * in(2j+1)
    axpy_vec(pg, out_e2r, uu_e2r, in_e2r);
    ymax_vec(pg, out_e2r, uu_e2i, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2r, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2i, in_e2r);
  }
}
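
// Assumed storage layout (inferred from the offsets above and from the
// uuji line in accum_mult_udag_i below; not stated explicitly in the
// source): an ncol x ncol complex matrix is stored row-major with
// 2*ncol reals per row and VLEN lanes per real,
//
//   u0[VLEN * (2*ncol*r + 4*c) + {0,1,2,3} * VLEN]
//     = { Re u(r,2c), Im u(r,2c), Re u(r,2c+1), Im u(r,2c+1) },
//
// so the partner row of a row pair sits VLEN * 2 * ncol reals further
// on, which is exactly the uuij/uuji offset these kernels use.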

// Memory-accumulator variant: out (4*VLEN reals per row pair) is
// loaded under pgin, accumulated into, and stored back. Passing an
// all-false pgin (see set_mult_u below) skips the initial load, so the
// result overwrites out instead of accumulating, assuming load_vec
// zero-fills inactive lanes.
inline void accum_mult_u_i(real_t *__restrict__ out,
                           const real_t *__restrict__ in,
                           const real_t *__restrict__ u0,
                           int i,
                           const int ncol,
                           const svbool_t pgin)
{
  const int nh = ncol / 2;

  svreal_t out_e1r, out_e1i, out_e2r, out_e2i;
  load_vec(pgin, out_e1r, out);
  load_vec(pgin, out_e1i, out + VLEN);
  load_vec(pgin, out_e2r, out + 2 * VLEN);
  load_vec(pgin, out_e2i, out + 3 * VLEN);

  svbool_t pg = set_predicate();
  for (int j = 0; j < nh; j++) {
    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    const real_t *uij = u0 + VLEN * (4 * ncol * i + 4 * j);  // assumed layout
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, u_e1r, uij);
    load_vec(pg, u_e1i, uij + VLEN);
    load_vec(pg, u_e2r, uij + 2 * VLEN);
    load_vec(pg, u_e2i, uij + 3 * VLEN);
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    ymax_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    axpy_vec(pg, out_e1i, u_e1i, in_e1r);

    axpy_vec(pg, out_e1r, u_e2r, in_e2r);
    ymax_vec(pg, out_e1r, u_e2i, in_e2i);
    axpy_vec(pg, out_e1i, u_e2r, in_e2i);
    axpy_vec(pg, out_e1i, u_e2i, in_e2r);

    svreal_t uu_e1r, uu_e1i, uu_e2r, uu_e2i;
    const real_t *uuij = uij + VLEN * 2 * ncol;  // next row (assumed layout)
    load_vec(pg, uu_e1r, uuij);
    load_vec(pg, uu_e1i, uuij + VLEN);
    load_vec(pg, uu_e2r, uuij + 2 * VLEN);
    load_vec(pg, uu_e2i, uuij + 3 * VLEN);

    axpy_vec(pg, out_e2r, uu_e1r, in_e1r);
    ymax_vec(pg, out_e2r, uu_e1i, in_e1i);
    axpy_vec(pg, out_e2i, uu_e1r, in_e1i);
    axpy_vec(pg, out_e2i, uu_e1i, in_e1r);

    axpy_vec(pg, out_e2r, uu_e2r, in_e2r);
    ymax_vec(pg, out_e2r, uu_e2i, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2r, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2i, in_e2r);
  }

  save_vec(pg, out, out_e1r);
  save_vec(pg, out + VLEN, out_e1i);
  save_vec(pg, out + 2 * VLEN, out_e2r);
  save_vec(pg, out + 3 * VLEN, out_e2i);
}

// out += u0 * in over all row pairs. (signature reconstructed from
// the call sites below)
inline void accum_mult_u(real_t *__restrict__ out,
                         const real_t *__restrict__ in,
                         const real_t *__restrict__ u0,
                         const int ncol)
{
  const int nh = ncol / 2;

  for (int i = 0; i < nh; i++) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_u_i(outi, in, u0, i, ncol, set_predicate());
  }
}
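
// Usage sketch (hypothetical buffer names): one vectorized coarse site
// carries VLEN * 2 * ncol reals, so
//
//   std::vector<real_t> out(VLEN * 2 * ncol, 0.0);
//   accum_mult_u(out.data(), in_site, u_site, ncol);
//
// accumulates out += U * in for that site, where in_site and u_site
// are assumed to point at one site's input vector and link matrix.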

// Dagger kernel: accumulate one column pair i of out += u0^dagger * in
// with the accumulators kept in registers; the callers below use it to
// pack boundary buffers. (The out_e* parameters are reconstructed from
// those call sites.)
inline void accum_mult_udag_i(svbool_t pg,
                              svreal_t& out_e1r, svreal_t& out_e1i,
                              svreal_t& out_e2r, svreal_t& out_e2i,
                              const real_t *__restrict__ in,
                              const real_t *__restrict__ u0,
                              const int i,
                              const int ncol)
{
  const int dof2 = ncol * ncol;
  const int nh = ncol / 2;

  for (int j = 0; j < nh; j++) {
    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    const real_t *u = u0 + VLEN * 4 * ncol * j;  // row pair j (assumed layout)
    const real_t *uji = u + VLEN * 4 * i;        // block (2j, 2i..2i+1)
    load_vec(pg, u_e1r, uji);
    load_vec(pg, u_e1i, uji + VLEN);
    load_vec(pg, u_e2r, uji + 2 * VLEN);
    load_vec(pg, u_e2i, uji + 3 * VLEN);

    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    // out_e1 += conj(u(2j, 2i)) * in(2j)
    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    axpy_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    ymax_vec(pg, out_e1i, u_e1i, in_e1r);

    // out_e2 -= conj(u(2j, 2i+1)) * in(2j)
    ymax_vec(pg, out_e2r, u_e2r, in_e1r);
    ymax_vec(pg, out_e2r, u_e2i, in_e1i);
    ymax_vec(pg, out_e2i, u_e2r, in_e1i);
    axpy_vec(pg, out_e2i, u_e2i, in_e1r);

    svreal_t uu_e1r, uu_e1i, uu_e2r, uu_e2i;
    const real_t *uuji = u + VLEN * (2 * ncol + 4 * i);
    load_vec(pg, uu_e1r, uuji);
    load_vec(pg, uu_e1i, uuji + VLEN);
    load_vec(pg, uu_e2r, uuji + 2 * VLEN);
    load_vec(pg, uu_e2i, uuji + 3 * VLEN);

    // out_e1 -= conj(u(2j+1, 2i)) * in(2j+1)
    ymax_vec(pg, out_e1r, uu_e1r, in_e2r);
    ymax_vec(pg, out_e1r, uu_e1i, in_e2i);
    ymax_vec(pg, out_e1i, uu_e1r, in_e2i);
    axpy_vec(pg, out_e1i, uu_e1i, in_e2r);

    // out_e2 += conj(u(2j+1, 2i+1)) * in(2j+1)
    axpy_vec(pg, out_e2r, uu_e2r, in_e2r);
    axpy_vec(pg, out_e2r, uu_e2i, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2r, in_e2i);
    ymax_vec(pg, out_e2i, uu_e2i, in_e2r);
  }
}
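
// In 2x2 chirality-block form, with U = [[a, b], [c, d]] per (row
// pair, column pair) block (a, b from the u_e* row and c, d from the
// uu_e* row), accum_mult_u_i applies U itself while the kernel above
// applies
//
//   sigma3 U^dagger sigma3 = [[ conj(a), -conj(c)],
//                             [-conj(b),  conj(d)]],
//
// which is precisely the sign pattern of its axpy/ymax calls: the
// chirality off-diagonal blocks enter the dagger with a minus sign.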

// Memory-accumulator variant of the dagger kernel; pgin plays the same
// role as in accum_mult_u_i above.
inline void accum_mult_udag_i(real_t *__restrict__ out,
                              const real_t *__restrict__ in,
                              const real_t *__restrict__ u0,
                              int i,
                              const int ncol,
                              const svbool_t pgin)
{
  const int dof2 = ncol * ncol;
  const int nh = ncol / 2;

  svreal_t out_e1r, out_e1i, out_e2r, out_e2i;
  load_vec(pgin, out_e1r, out);
  load_vec(pgin, out_e1i, out + VLEN);
  load_vec(pgin, out_e2r, out + 2 * VLEN);
  load_vec(pgin, out_e2i, out + 3 * VLEN);

  svbool_t pg = set_predicate();
  for (int j = 0; j < nh; j++) {
    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    const real_t *u = u0 + VLEN * 4 * ncol * j;  // row pair j (assumed layout)
    const real_t *uji = u + VLEN * 4 * i;
    load_vec(pg, u_e1r, uji);
    load_vec(pg, u_e1i, uji + VLEN);
    load_vec(pg, u_e2r, uji + 2 * VLEN);
    load_vec(pg, u_e2i, uji + 3 * VLEN);

    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    axpy_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    ymax_vec(pg, out_e1i, u_e1i, in_e1r);

    ymax_vec(pg, out_e2r, u_e2r, in_e1r);
    ymax_vec(pg, out_e2r, u_e2i, in_e1i);
    ymax_vec(pg, out_e2i, u_e2r, in_e1i);
    axpy_vec(pg, out_e2i, u_e2i, in_e1r);

    svreal_t uu_e1r, uu_e1i, uu_e2r, uu_e2i;
    const real_t *uuji = u + VLEN * (2 * ncol + 4 * i);
    load_vec(pg, uu_e1r, uuji);
    load_vec(pg, uu_e1i, uuji + VLEN);
    load_vec(pg, uu_e2r, uuji + 2 * VLEN);
    load_vec(pg, uu_e2i, uuji + 3 * VLEN);

    ymax_vec(pg, out_e1r, uu_e1r, in_e2r);
    ymax_vec(pg, out_e1r, uu_e1i, in_e2i);
    ymax_vec(pg, out_e1i, uu_e1r, in_e2i);
    axpy_vec(pg, out_e1i, uu_e1i, in_e2r);

    axpy_vec(pg, out_e2r, uu_e2r, in_e2r);
    axpy_vec(pg, out_e2r, uu_e2i, in_e2i);
    axpy_vec(pg, out_e2i, uu_e2r, in_e2i);
    ymax_vec(pg, out_e2i, uu_e2i, in_e2r);
  }

  save_vec(pg, out, out_e1r);
  save_vec(pg, out + VLEN, out_e1i);
  save_vec(pg, out + 2 * VLEN, out_e2r);
  save_vec(pg, out + 3 * VLEN, out_e2i);
}

// Convenience overload with a full predicate (always accumulate).
inline void accum_mult_udag_i(real_t *__restrict__ out,
                              const real_t *__restrict__ in,
                              const real_t *__restrict__ u0,
                              const int i,
                              const int ncol)
{
  accum_mult_udag_i(out, in, u0, i, ncol, set_predicate());
}

// Dagger kernel for the x-minus bulk: the link is assembled lane-wise
// from the local block (u1) and the x-neighbor block (u2) with
// shift_vec_xfw under pg1/pg2. (signature reconstructed from the call
// in mult_coarse_xmb below)
inline void accum_mult_udag_xm_i(svbool_t pg1, svbool_t pg2,
                                 real_t *__restrict__ out,
                                 const real_t *__restrict__ in,
                                 const real_t *__restrict__ u1,
                                 const real_t *__restrict__ u2,
                                 int i,
                                 const int ncol)
{
  const int nh = ncol / 2;

  svbool_t pg = set_predicate();
  svreal_t out_e1r, out_e1i, out_e2r, out_e2i;
  load_vec(pg, out_e1r, out);
  load_vec(pg, out_e1i, out + VLEN);
  load_vec(pg, out_e2r, out + 2 * VLEN);
  load_vec(pg, out_e2i, out + 3 * VLEN);

  for (int j = 0; j < nh; j++) {
    const real_t *uu1 = u1 + VLEN * 4 * ncol * j;  // row pair j (assumed layout)
    const real_t *uu2 = u2 + VLEN * 4 * ncol * j;

    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    shift_vec_xfw(pg1, pg2, u_e1r, &uu1[VLEN * (4 * i)],
                  &uu2[VLEN * (4 * i)]);
    shift_vec_xfw(pg1, pg2, u_e1i, &uu1[VLEN * (4 * i + 1)],
                  &uu2[VLEN * (4 * i + 1)]);
    shift_vec_xfw(pg1, pg2, u_e2r, &uu1[VLEN * (4 * i + 2)],
                  &uu2[VLEN * (4 * i + 2)]);
    shift_vec_xfw(pg1, pg2, u_e2i, &uu1[VLEN * (4 * i + 3)],
                  &uu2[VLEN * (4 * i + 3)]);

    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    axpy_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    ymax_vec(pg, out_e1i, u_e1i, in_e1r);

    ymax_vec(pg, out_e2r, u_e2r, in_e1r);
    ymax_vec(pg, out_e2r, u_e2i, in_e1i);
    ymax_vec(pg, out_e2i, u_e2r, in_e1i);
    axpy_vec(pg, out_e2i, u_e2i, in_e1r);

    // second row of the pair, reusing the u_e* registers
    shift_vec_xfw(pg1, pg2, u_e1r, &uu1[VLEN * (2 * ncol + 4 * i)],
                  &uu2[VLEN * (2 * ncol + 4 * i)]);
    shift_vec_xfw(pg1, pg2, u_e1i, &uu1[VLEN * (2 * ncol + 4 * i + 1)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 1)]);
    shift_vec_xfw(pg1, pg2, u_e2r, &uu1[VLEN * (2 * ncol + 4 * i + 2)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 2)]);
    shift_vec_xfw(pg1, pg2, u_e2i, &uu1[VLEN * (2 * ncol + 4 * i + 3)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 3)]);

    ymax_vec(pg, out_e1r, u_e1r, in_e2r);
    ymax_vec(pg, out_e1r, u_e1i, in_e2i);
    ymax_vec(pg, out_e1i, u_e1r, in_e2i);
    axpy_vec(pg, out_e1i, u_e1i, in_e2r);

    axpy_vec(pg, out_e2r, u_e2r, in_e2r);
    axpy_vec(pg, out_e2r, u_e2i, in_e2i);
    axpy_vec(pg, out_e2i, u_e2r, in_e2i);
    ymax_vec(pg, out_e2i, u_e2i, in_e2r);
  }

  save_vec(pg, out, out_e1r);
  save_vec(pg, out + VLEN, out_e1i);
  save_vec(pg, out + 2 * VLEN, out_e2r);
  save_vec(pg, out + 3 * VLEN, out_e2i);
}
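
// A sketch of the assumed shift semantics: shift_vec_xfw(pg1, pg2, v,
// a, b) assembles in v, lane by lane, the x-forward-shifted field,
// taking values from the local block a on the interior lanes (pg1) and
// from the adjacent block b on the lanes that wrap across the block
// boundary (pg2). accum_mult_udag_ym_i below is the same kernel with
// the y-direction shift shift_vec_yfw.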

// y-direction twin of accum_mult_udag_xm_i, built on shift_vec_yfw.
// (signature reconstructed from the call in mult_coarse_ymb below)
inline void accum_mult_udag_ym_i(svbool_t pg1, svbool_t pg2,
                                 real_t *__restrict__ out,
                                 const real_t *__restrict__ in,
                                 const real_t *__restrict__ u1,
                                 const real_t *__restrict__ u2,
                                 int i,
                                 const int ncol)
{
  const int nh = ncol / 2;

  svbool_t pg = set_predicate();
  svreal_t out_e1r, out_e1i, out_e2r, out_e2i;
  load_vec(pg, out_e1r, out);
  load_vec(pg, out_e1i, out + VLEN);
  load_vec(pg, out_e2r, out + 2 * VLEN);
  load_vec(pg, out_e2i, out + 3 * VLEN);

  for (int j = 0; j < nh; j++) {
    const real_t *uu1 = u1 + VLEN * 4 * ncol * j;  // row pair j (assumed layout)
    const real_t *uu2 = u2 + VLEN * 4 * ncol * j;

    svreal_t u_e1r, u_e1i, u_e2r, u_e2i;
    shift_vec_yfw(pg1, pg2, u_e1r, &uu1[VLEN * (4 * i)],
                  &uu2[VLEN * (4 * i)]);
    shift_vec_yfw(pg1, pg2, u_e1i, &uu1[VLEN * (4 * i + 1)],
                  &uu2[VLEN * (4 * i + 1)]);
    shift_vec_yfw(pg1, pg2, u_e2r, &uu1[VLEN * (4 * i + 2)],
                  &uu2[VLEN * (4 * i + 2)]);
    shift_vec_yfw(pg1, pg2, u_e2i, &uu1[VLEN * (4 * i + 3)],
                  &uu2[VLEN * (4 * i + 3)]);

    svreal_t in_e1r, in_e1i, in_e2r, in_e2i;
    const real_t *inj = in + VLEN * 4 * j;
    load_vec(pg, in_e1r, inj);
    load_vec(pg, in_e1i, inj + VLEN);
    load_vec(pg, in_e2r, inj + 2 * VLEN);
    load_vec(pg, in_e2i, inj + 3 * VLEN);

    axpy_vec(pg, out_e1r, u_e1r, in_e1r);
    axpy_vec(pg, out_e1r, u_e1i, in_e1i);
    axpy_vec(pg, out_e1i, u_e1r, in_e1i);
    ymax_vec(pg, out_e1i, u_e1i, in_e1r);

    ymax_vec(pg, out_e2r, u_e2r, in_e1r);
    ymax_vec(pg, out_e2r, u_e2i, in_e1i);
    ymax_vec(pg, out_e2i, u_e2r, in_e1i);
    axpy_vec(pg, out_e2i, u_e2i, in_e1r);

    // second row of the pair, reusing the u_e* registers
    shift_vec_yfw(pg1, pg2, u_e1r, &uu1[VLEN * (2 * ncol + 4 * i)],
                  &uu2[VLEN * (2 * ncol + 4 * i)]);
    shift_vec_yfw(pg1, pg2, u_e1i, &uu1[VLEN * (2 * ncol + 4 * i + 1)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 1)]);
    shift_vec_yfw(pg1, pg2, u_e2r, &uu1[VLEN * (2 * ncol + 4 * i + 2)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 2)]);
    shift_vec_yfw(pg1, pg2, u_e2i, &uu1[VLEN * (2 * ncol + 4 * i + 3)],
                  &uu2[VLEN * (2 * ncol + 4 * i + 3)]);

    ymax_vec(pg, out_e1r, u_e1r, in_e2r);
    ymax_vec(pg, out_e1r, u_e1i, in_e2i);
    ymax_vec(pg, out_e1i, u_e1r, in_e2i);
    axpy_vec(pg, out_e1i, u_e1i, in_e2r);

    axpy_vec(pg, out_e2r, u_e2r, in_e2r);
    axpy_vec(pg, out_e2r, u_e2i, in_e2i);
    axpy_vec(pg, out_e2i, u_e2r, in_e2i);
    ymax_vec(pg, out_e2i, u_e2i, in_e2r);
  }

  save_vec(pg, out, out_e1r);
  save_vec(pg, out + VLEN, out_e1i);
  save_vec(pg, out + 2 * VLEN, out_e2r);
  save_vec(pg, out + 3 * VLEN, out_e2i);
}

// out += u0^dagger * in over all column pairs. (signature
// reconstructed from the call sites below)
inline void accum_mult_udag(real_t *__restrict__ out,
                            const real_t *__restrict__ in,
                            const real_t *__restrict__ u0,
                            const int ncol)
{
  const int nh = ncol / 2;
  for (int i = 0; i < nh; ++i) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_udag_i(outi, in, u0, i, ncol);
  }
}

// out = u0 * in: the all-false predicate suppresses the initial load
// of out inside accum_mult_u_i, so the first pass overwrites it.
inline void set_mult_u(real_t *out,
                       const real_t *in,
                       const real_t *u0,
                       const int ncol)
{
  const int nh = ncol / 2;

  svbool_t pgfalse = set_predicate_false();

  for (int i = 0; i < nh; i++) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_u_i(outi, in, u0, i, ncol, pgfalse);
  }
}

// out = u^dagger * in, by the same all-false-predicate trick.
inline void set_mult_udag(real_t *out,
                          const real_t *in,
                          const real_t *u,
                          const int ncol)
{
  const int nh = ncol / 2;

  svbool_t pgfalse = set_predicate_false();

  for (int i = 0; i < nh; i++) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_udag_i(outi, in, u, i, ncol, pgfalse);
  }
}

// out += in over the 2*ncol VLEN-blocks of one site (the receive-side
// accumulation for the z/t halos).
inline void accum_buf(real_t *__restrict__ out,
                      real_t *__restrict__ in,
                      const int ncol)
{
  svbool_t pg = set_predicate();
  for (int i = 0; i < 2 * ncol; i++) {
    svreal_t vin, vout;
    load_vec(pg, vin, &in[VLEN * i]);
    load_vec(pg, vout, &out[VLEN * i]);
    add_vec(pg, vout, vin);
    save_vec(pg, &out[VLEN * i], vout);
  }
}

// out = in, a straight VLEN-block copy.
inline void copy_buf(real_t *__restrict__ out,
                     real_t *__restrict__ in,
                     const int ncol)
{
  svbool_t pg = set_predicate();
  svreal_t vinout;
  for (int i = 0; i < VLEN * 2 * ncol; i += VLEN) {
    load_vec(pg, vinout, &in[i]);
    save_vec(pg, &out[i], vinout);
  }
}
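
// Naming scheme of the halo kernels below (inferred from their
// bodies): for each direction mu in {x, y, z, t},
//   mult_coarse_{mu}p1  packs the send buffer for the plus direction,
//   mult_coarse_{mu}p2  adds the received halo and multiplies by u,
//   mult_coarse_{mu}pb  handles the bulk sites,
// and the {mu}m* variants do the same for the minus direction,
// applying u^dagger on the sending side so the receiver only
// accumulates. In x and y the site vector itself straddles the node
// boundary, so these kernels also shuffle lanes (scatter/gather in x,
// strided copies in y); in z and t whole planes move, so packing
// reduces to a plain copy.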

// Pack the x-plus send buffer: pick the boundary lanes of each row
// pair under pg2 and scatter them VLENY-strided into buf. (head of
// the signature reconstructed; the svuint_t index type is assumed)
inline void mult_coarse_xp1(svbool_t& pg2, svuint_t& svidx,
                            real_t *__restrict__ buf,
                            real_t *__restrict__ in,
                            const int ncol)
{
  const int nh = ncol / 2;
  for (int i = 0; i < nh; i++) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg2, vt1, &in[VLEN * (4 * i)]);
    load_vec(pg2, vt2, &in[VLEN * (4 * i + 1)]);
    load_vec(pg2, vt3, &in[VLEN * (4 * i + 2)]);
    load_vec(pg2, vt4, &in[VLEN * (4 * i + 3)]);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i)], vt1, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 1)], vt2, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 2)], vt3, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 3)], vt4, svidx);
  }
}

// Unpack the x-plus halo into work (bulk lanes from in0, boundary
// lanes gathered from buf), then multiply by u0. (head of the
// signature reconstructed from the body; svidx type assumed)
inline void mult_coarse_xp2(svbool_t& pg1, svbool_t& pg2, svuint_t& svidx,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in0,
                            real_t *__restrict__ u0,
                            real_t *__restrict__ buf,
                            const int ncol,
                            real_t *__restrict__ work)
{
  const int nh = ncol / 2;
  svbool_t pg = set_predicate();

  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg1, vt1, &in0[VLEN * (4 * i) + 1]);
    load_vec(pg1, vt2, &in0[VLEN * (4 * i + 1) + 1]);
    load_vec(pg1, vt3, &in0[VLEN * (4 * i + 2) + 1]);
    load_vec(pg1, vt4, &in0[VLEN * (4 * i + 3) + 1]);
    load_add_gather(pg2, vt1, &buf[VLENY * (4 * i)], svidx);
    load_add_gather(pg2, vt2, &buf[VLENY * (4 * i + 1)], svidx);
    load_add_gather(pg2, vt3, &buf[VLENY * (4 * i + 2)], svidx);
    load_add_gather(pg2, vt4, &buf[VLENY * (4 * i + 3)], svidx);
    save_vec(pg, &work[VLEN * (4 * i)], vt1);
    save_vec(pg, &work[VLEN * (4 * i + 1)], vt2);
    save_vec(pg, &work[VLEN * (4 * i + 2)], vt3);
    save_vec(pg, &work[VLEN * (4 * i + 3)], vt4);
  }

  accum_mult_u(out, work, u0, ncol);
}
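
// The "+ 1" element offset in mult_coarse_xp2 reads each lane's
// x-forward neighbor within the same vector (active under pg1), while
// pg2 selects the lanes whose neighbor lives off-node and is gathered
// from buf; mult_coarse_xm2 below uses the mirrored "-1" offset for
// the backward hop.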

// Bulk kernel for the x-plus hop: assemble the x-shifted input in work
// (in1 = this block, in2 = the x-forward neighbor), then multiply.
// (name and signature head reconstructed from the ordering of the z/t
// groups below)
inline void mult_coarse_xpb(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in1,
                            real_t *__restrict__ in2,
                            real_t *__restrict__ u,
                            const int ncol,
                            real_t *__restrict__ work)
{
  real_t *in = work;  // the shifted input is staged in work (reconstructed)
  int ncol2 = 2 * ncol;
  svbool_t pg = set_predicate();

  for (int i = 0; i < ncol2; ++i) {
    svreal_t wt;
    shift_vec_xbw(pg1, pg2, wt, &in1[VLEN * i], &in2[VLEN * i]);
    save_vec(pg, &in[VLEN * i], wt);
  }

  accum_mult_u(out, in, u, ncol);
}

// Pack the x-minus send buffer: apply u^dagger on the sending side and
// scatter the boundary lanes. (signature head and the accumulator
// zero-init reconstructed; clear_vec is an assumed helper)
inline void mult_coarse_xm1(svbool_t& pg2, svuint_t& svidx,
                            real_t *__restrict__ buf,
                            real_t *__restrict__ in,
                            real_t *__restrict__ u,
                            const int ncol)
{
  const int nh = ncol / 2;
  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    clear_vec(pg2, vt1);
    clear_vec(pg2, vt2);
    clear_vec(pg2, vt3);
    clear_vec(pg2, vt4);
    accum_mult_udag_i(pg2, vt1, vt2, vt3, vt4, in, u, i, ncol);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i)], vt1, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 1)], vt2, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 2)], vt3, svidx);
    save_vec_scatter(pg2, &buf[VLENY * (4 * i + 3)], vt4, svidx);
  }
}

// Finish the x-minus hop: on-node u^dagger multiply with the
// backward-shifted in0/u0 (the [-1] element offset moves every lane to
// its x-backward neighbor under pg1), plus the gathered halo from buf
// under pg2. (signature head reconstructed from the body; svidx type
// assumed)
inline void mult_coarse_xm2(svbool_t& pg1, svbool_t& pg2, svuint_t& svidx,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in0,
                            real_t *__restrict__ u0,
                            real_t *buf,
                            const int ncol)
{
  const int nh = ncol / 2;
  svbool_t pg = set_predicate();

  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg, vt1, &out[VLEN * (4 * i)]);
    load_vec(pg, vt2, &out[VLEN * (4 * i + 1)]);
    load_vec(pg, vt3, &out[VLEN * (4 * i + 2)]);
    load_vec(pg, vt4, &out[VLEN * (4 * i + 3)]);
    accum_mult_udag_i(pg1, vt1, vt2, vt3, vt4,
                      &in0[-1], &u0[-1], i, ncol);

    svreal_t wt1, wt2, wt3, wt4;
    load_vec_gather(pg2, wt1, &buf[VLENY * (4 * i)], svidx);
    load_vec_gather(pg2, wt2, &buf[VLENY * (4 * i + 1)], svidx);
    load_vec_gather(pg2, wt3, &buf[VLENY * (4 * i + 2)], svidx);
    load_vec_gather(pg2, wt4, &buf[VLENY * (4 * i + 3)], svidx);
    add_vec(pg2, vt1, wt1);
    add_vec(pg2, vt2, wt2);
    add_vec(pg2, vt3, wt3);
    add_vec(pg2, vt4, wt4);

    save_vec(pg, &out[VLEN * (4 * i)], vt1);
    save_vec(pg, &out[VLEN * (4 * i + 1)], vt2);
    save_vec(pg, &out[VLEN * (4 * i + 2)], vt3);
    save_vec(pg, &out[VLEN * (4 * i + 3)], vt4);
  }
}

// Bulk kernel for the x-minus hop: assemble the x-forward-shifted
// input in work, then apply u^dagger assembled on the fly by
// accum_mult_udag_xm_i. (name and signature head reconstructed)
inline void mult_coarse_xmb(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in1,
                            real_t *__restrict__ in2,
                            real_t *__restrict__ u1,
                            real_t *__restrict__ u2,
                            const int ncol,
                            real_t *__restrict__ work)
{
  real_t *in = work;  // the shifted input is staged in work (reconstructed)
  int ncol2 = 2 * ncol;
  svbool_t pg = set_predicate();

  for (int i = 0; i < ncol2; ++i) {
    svreal_t wt;
    shift_vec_xfw(pg1, pg2, wt, &in1[VLEN * i], &in2[VLEN * i]);
    save_vec(pg, &in[VLEN * i], wt);
  }

  const int nh = ncol / 2;
  for (int i = 0; i < nh; i++) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_udag_xm_i(pg1, pg2, outi, in, u1, u2, i, ncol);
  }
}

// Pack the y-plus send buffer; the y-boundary lanes are contiguous, so
// plain VLENX-strided saves suffice. (signature completed from the
// body)
inline void mult_coarse_yp1(svbool_t& pg2,
                            real_t *__restrict__ buf,
                            real_t *__restrict__ in,
                            const int ncol)
{
  const int nh = ncol / 2;
  for (int i = 0; i < nh; i++) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg2, vt1, &in[VLEN * (4 * i)]);
    load_vec(pg2, vt2, &in[VLEN * (4 * i + 1)]);
    load_vec(pg2, vt3, &in[VLEN * (4 * i + 2)]);
    load_vec(pg2, vt4, &in[VLEN * (4 * i + 3)]);
    save_vec(pg2, &buf[VLENX * (4 * i)], vt1);
    save_vec(pg2, &buf[VLENX * (4 * i + 1)], vt2);
    save_vec(pg2, &buf[VLENX * (4 * i + 2)], vt3);
    save_vec(pg2, &buf[VLENX * (4 * i + 3)], vt4);
  }
}

// Unpack the y-plus halo into work, then multiply by u0. (head of the
// signature, including the buffer offset parameter, reconstructed from
// the body)
inline void mult_coarse_yp2(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in0,
                            real_t *__restrict__ u0,
                            real_t *__restrict__ buf,
                            const int offset,
                            const int ncol,
                            real_t *work)
{
  const int nh = ncol / 2;
  svbool_t pg = set_predicate();

  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg1, vt1, &in0[VLEN * (4 * i) + VLENX]);
    load_vec(pg1, vt2, &in0[VLEN * (4 * i + 1) + VLENX]);
    load_vec(pg1, vt3, &in0[VLEN * (4 * i + 2) + VLENX]);
    load_vec(pg1, vt4, &in0[VLEN * (4 * i + 3) + VLENX]);
    load_add(pg2, vt1, &buf[offset + VLENY * (4 * i)]);
    load_add(pg2, vt2, &buf[offset + VLENY * (4 * i + 1)]);
    load_add(pg2, vt3, &buf[offset + VLENY * (4 * i + 2)]);
    load_add(pg2, vt4, &buf[offset + VLENY * (4 * i + 3)]);
    save_vec(pg, &work[VLEN * (4 * i)], vt1);
    save_vec(pg, &work[VLEN * (4 * i + 1)], vt2);
    save_vec(pg, &work[VLEN * (4 * i + 2)], vt3);
    save_vec(pg, &work[VLEN * (4 * i + 3)], vt4);
  }

  accum_mult_u(out, work, u0, ncol);
}
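
// Unlike the x direction, the y-boundary lanes are contiguous, so
// mult_coarse_yp2 steps one y-row with a "+ VLENX" element offset and
// accumulates the received rows with plain load_add instead of a
// gather.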

// Bulk kernel for the y-plus hop: assemble the y-shifted input in
// work, then multiply row pair by row pair. (name and signature head
// reconstructed)
inline void mult_coarse_ypb(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in1,
                            real_t *__restrict__ in2,
                            real_t *__restrict__ u,
                            const int ncol,
                            real_t *work)
{
  real_t *in = work;  // the shifted input is staged in work (reconstructed)
  int ncol2 = 2 * ncol;
  svbool_t pg = set_predicate();

  for (int i = 0; i < ncol2; ++i) {
    svreal_t wt;
    shift_vec_ybw(pg1, pg2, wt, &in1[VLEN * i], &in2[VLEN * i]);
    save_vec(pg, &in[VLEN * i], wt);
  }

  const int nh = ncol / 2;
  for (int i = 0; i < nh; ++i) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_u_i(outi, in, u, i, ncol, set_predicate());
  }
}

// Pack the y-minus send buffer: apply u^dagger on the sending side and
// store the boundary rows at the given buffer offset. (parameter list
// and the accumulator zero-init reconstructed; clear_vec is an assumed
// helper)
inline void mult_coarse_ym1(svbool_t& pg2,
                            real_t *__restrict__ buf,
                            real_t *__restrict__ in,
                            real_t *__restrict__ u,
                            const int offset,
                            const int ncol)
{
  const int nh = ncol / 2;

  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    clear_vec(pg2, vt1);
    clear_vec(pg2, vt2);
    clear_vec(pg2, vt3);
    clear_vec(pg2, vt4);
    accum_mult_udag_i(pg2, vt1, vt2, vt3, vt4, in, u, i, ncol);
    save_vec(pg2, &buf[offset + VLENX * (4 * i)], vt1);
    save_vec(pg2, &buf[offset + VLENX * (4 * i + 1)], vt2);
    save_vec(pg2, &buf[offset + VLENX * (4 * i + 2)], vt3);
    save_vec(pg2, &buf[offset + VLENX * (4 * i + 3)], vt4);
  }
}

// Finish the y-minus hop: on-node u^dagger multiply with the
// y-backward-shifted in0/u0, plus the received boundary rows from buf.
// (signature head reconstructed; the -VLENX shift mirrors the -1 used
// in mult_coarse_xm2 and is an assumption)
inline void mult_coarse_ym2(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in0,
                            real_t *__restrict__ u0,
                            real_t *buf,
                            const int ncol)
{
  const int nh = ncol / 2;
  svbool_t pg = set_predicate();

  for (int i = 0; i < nh; ++i) {
    svreal_t vt1, vt2, vt3, vt4;
    load_vec(pg, vt1, &out[VLEN * (4 * i)]);
    load_vec(pg, vt2, &out[VLEN * (4 * i + 1)]);
    load_vec(pg, vt3, &out[VLEN * (4 * i + 2)]);
    load_vec(pg, vt4, &out[VLEN * (4 * i + 3)]);
    accum_mult_udag_i(pg1, vt1, vt2, vt3, vt4,
                      &in0[-VLENX], &u0[-VLENX], i, ncol);

    svreal_t wt1, wt2, wt3, wt4;
    load_vec(pg2, wt1, &buf[VLENX * (4 * i)]);
    load_vec(pg2, wt2, &buf[VLENX * (4 * i + 1)]);
    load_vec(pg2, wt3, &buf[VLENX * (4 * i + 2)]);
    load_vec(pg2, wt4, &buf[VLENX * (4 * i + 3)]);
    add_vec(pg2, vt1, wt1);
    add_vec(pg2, vt2, wt2);
    add_vec(pg2, vt3, wt3);
    add_vec(pg2, vt4, wt4);

    save_vec(pg, &out[VLEN * (4 * i)], vt1);
    save_vec(pg, &out[VLEN * (4 * i + 1)], vt2);
    save_vec(pg, &out[VLEN * (4 * i + 2)], vt3);
    save_vec(pg, &out[VLEN * (4 * i + 3)], vt4);
  }
}

// Bulk kernel for the y-minus hop, the y twin of mult_coarse_xmb.
// (name and signature head reconstructed)
inline void mult_coarse_ymb(svbool_t& pg1, svbool_t& pg2,
                            real_t *__restrict__ out,
                            real_t *__restrict__ in1,
                            real_t *__restrict__ in2,
                            real_t *__restrict__ u1,
                            real_t *__restrict__ u2,
                            const int ncol,
                            real_t *__restrict__ work)
{
  real_t *in = work;  // the shifted input is staged in work (reconstructed)
  int ncol2 = 2 * ncol;
  svbool_t pg = set_predicate();

  for (int i = 0; i < ncol2; ++i) {
    svreal_t wt;
    shift_vec_yfw(pg1, pg2, wt, &in1[VLEN * i], &in2[VLEN * i]);
    save_vec(pg, &in[VLEN * i], wt);
  }

  const int nh = ncol / 2;
  for (int i = 0; i < nh; i++) {
    real_t *outi = out + VLEN * 4 * i;
    accum_mult_udag_ym_i(pg1, pg2, outi, in, u1, u2, i, ncol);
  }
}

// z direction: whole xy-planes move between nodes, so packing is a
// plain copy and no lane shuffling is needed. (the one-line signatures
// below are completed from their bodies)
inline void mult_coarse_zp1(real_t *out,
                            real_t *in,
                            const int ncol)
{
  copy_buf(out, in, ncol);
}


inline void mult_coarse_zp2(real_t *out, real_t *buf, real_t *u, const int ncol)
{
  accum_mult_u(out, buf, u, ncol);
}


inline void mult_coarse_zpb(real_t *out, real_t *in, real_t *u, const int ncol)
{
  accum_mult_u(out, in, u, ncol);
}


inline void mult_coarse_zm1(real_t *out, real_t *buf, real_t *u, const int ncol)
{
  set_mult_udag(out, buf, u, ncol);
}

inline void mult_coarse_zm2(real_t *out,
                            real_t *buf,
                            const int ncol)
{
  accum_buf(out, buf, ncol);
}


inline void mult_coarse_zmb(real_t *out, real_t *in, real_t *u, const int ncol)
{
  accum_mult_udag(out, in, u, ncol);
}

// t direction: same pattern as z. (signatures completed from their
// bodies)
inline void mult_coarse_tp1(real_t *out,
                            real_t *in,
                            const int ncol)
{
  copy_buf(out, in, ncol);
}


inline void mult_coarse_tp2(real_t *out, real_t *buf, real_t *u, const int ncol)
{
  accum_mult_u(out, buf, u, ncol);
}


inline void mult_coarse_tpb(real_t *out, real_t *in, real_t *u, const int ncol)
{
  accum_mult_u(out, in, u, ncol);
}


inline void mult_coarse_tm1(real_t *out, real_t *buf, real_t *u, const int ncol)
{
  set_mult_udag(out, buf, u, ncol);
}

inline void mult_coarse_tm2(real_t *out,
                            real_t *buf,
                            const int ncol)
{
  accum_buf(out, buf, ncol);
}


inline void mult_coarse_tmb(real_t *out, real_t *in, real_t *u, const int ncol)
{
  accum_mult_udag(out, in, u, ncol);
}

#endif