9 #ifndef QXS_VSIMD_COMMON_INCLUDED
10 #define QXS_VSIMD_COMMON_INCLUDED
13 template<
typename REALTYPE>
14 inline void load_vec(
Vsimd_t *vt, REALTYPE *vp,
int Nin)
16 for (
int in = 0; in < Nin; ++in) {
17 for (
int k = 0; k <
VLEN; ++k) {
18 vt[in].
v[k] = vp[k +
VLEN * in];
24 template<
typename REALTYPE>
25 inline void load_vec1_x(REALTYPE *vt, REALTYPE *v,
int kx,
int Nin)
27 for (
int in = 0; in < Nin; ++in) {
28 for (
int ky = 0; ky <
VLENY; ++ky) {
35 template<
typename REALTYPE>
36 inline void load_vec1_y(REALTYPE *vt, REALTYPE *v,
int ky,
int Nin)
38 for (
int in = 0; in < Nin; ++in) {
39 for (
int kx = 0; kx <
VLENX; ++kx) {
46 template<
typename REALTYPE>
47 inline void save_vec(REALTYPE *x,
Vsimd_t *vt,
int Nin)
49 for (
int in = 0; in < Nin; ++in) {
50 for (
int k = 0; k <
VLEN; ++k) {
51 x[k +
VLEN * in] = vt[in].
v[k];
57 template<
typename REALTYPE>
64 template<
typename REALTYPE>
65 inline void save_vec_scatter(
svbool_t pg, REALTYPE *vp,
68 svst1_scatter_index(pg, vp, index, vt);
72 template<
typename REALTYPE>
73 inline void save_vec1_x(REALTYPE *x,
Vsimd_t *vt,
int kx,
int Nin)
75 for (
int in = 0; in < Nin; ++in) {
76 for (
int ky = 0; ky <
VLENY; ++ky) {
83 template<
typename REALTYPE>
84 inline void save_vec1_y(REALTYPE *x,
Vsimd_t *vt,
int ky,
int Nin)
86 for (
int in = 0; in < Nin; ++in) {
87 for (
int kx = 0; kx <
VLENX; ++kx) {
94 inline void clear_vec(
Vsimd_t *vt,
int Nin)
96 for (
int in = 0; in < Nin; ++in) {
97 for (
int k = 0; k <
VLEN; ++k) {
104 template<
typename REALTYPE>
105 inline void add_vec(REALTYPE *x,
Vsimd_t *vt,
int Nin)
107 for (
int in = 0; in < Nin; ++in) {
108 for (
int k = 0; k <
VLEN; ++k) {
109 x[k +
VLEN * in] += vt[in].
v[k];
117 for (
int in = 0; in < Nin; ++in) {
118 for (
int k = 0; k <
VLEN; ++k) {
119 x[in].
v[k] += y[in].
v[k];
127 x = svadd_m(pg, x, y);
133 z = svadd_m(pg, x, y);
139 x = svsub_m(pg, x, y);
145 z = svsub_m(pg, x, y);
151 for (
int in = 0; in < Nin; ++in) {
152 for (
int k = 0; k <
VLEN; ++k) {
153 x[in].
v[k] = y[in].
v[k];
159 template<
typename REALTYPE>
162 for (
int in = 0; in < Nin; ++in) {
163 for (
int k = 0; k <
VLEN; ++k) {
164 x[in].
v[k] = a * y[in].
v[k];
172 x = svmul_m(pg, y, a);
176 template<
typename REALTYPE>
179 for (
int in = 0; in < Nin; ++in) {
180 for (
int k = 0; k <
VLEN; ++k) {
181 y[in].
v[k] += a * x[in].
v[k];
189 y = svmla_m(pg, y, x, a);
195 y = svmla_m(pg, y, x, a);
201 y = svmls_m(pg, y, x, a);
205 template<
typename REALTYPE>
208 for (
int in = 0; in < Nin; ++in) {
209 for (
int k = 0; k <
VLEN; ++k) {
210 x[in].
v[k] = a * x[in].
v[k] + y[in].
v[k];
218 y = svmla_m(pg, x, y, a);
222 template<
typename REALTYPE>
223 inline void scal_vec(
Vsimd_t *x, REALTYPE a,
int Nin)
225 for (
int in = 0; in < Nin; ++in) {
226 for (
int k = 0; k <
VLEN; ++k) {
235 x = svmul_m(pg, y, w);
241 x = svmul_m(pg, x, a);
247 x = svmul_m(pg, x, a);
251 template<
typename REALTYPE>
255 for (
int in = 0; in < Nin; ++in) {
256 for (
int k = 0; k <
VLEN; ++k) {
257 a += x[in].
v[k] * y[in].
v[k];
263 template<
typename REALTYPE>
264 inline void norm2_vec(REALTYPE& a,
Vsimd_t *x,
int Nin)
267 for (
int in = 0; in < Nin; ++in) {
268 for (
int k = 0; k <
VLEN; ++k) {
269 a += x[in].
v[k] * x[in].
v[k];
275 template<
typename REALTYPE>
276 inline void reduce_vec(REALTYPE& a,
Vsimd_t *x,
int Nin)
279 for (
int in = 0; in < Nin; ++in) {
280 for (
int k = 0; k <
VLEN; ++k) {
287 template<
typename REALTYPE>
296 y = svmla_m(pg, y, x, x);
302 for (
int in = 0; in < Nin; ++in) {
303 for (
int k = 0; k <
VLEN; ++k) {
304 y[in].
v[k] += x[in].
v[k] * x[in].
v[k];
312 y = svmla_m(pg, y, x, w);
318 for (
int in = 0; in < Nin; ++in) {
319 for (
int k = 0; k <
VLEN; ++k) {
320 y[in].
v[k] += x[in].
v[k] * w[in].
v[k];
328 y = svmls_m(pg, y, x, w);
334 for (
int in = 0; in < Nin; ++in) {
335 for (
int k = 0; k <
VLEN; ++k) {
336 y[in].
v[k] -= x[in].
v[k] * w[in].
v[k];
343 inline void set_index_xp(
svint_t& svindex_xp)
346 for (
int iy = 0; iy <
VLENY; ++iy) {
348 for (
int ix = 0; ix <
VLENX - 1; ++ix) {
349 index[ix +
VLENX * iy] = 0;
353 load_svint(pg, svindex_xp, index);
357 inline void set_index_xm(
svint_t& svindex_xm)
360 for (
int iy = 0; iy <
VLENY; ++iy) {
361 index[
VLENX * iy] = iy;
362 for (
int ix = 1; ix <
VLENX; ++ix) {
363 index[ix +
VLENX * iy] = 0;
367 load_svint(pg, svindex_xm, index);
371 inline void set_index_xp_eo(
svint_t& svindex_xp)
374 for (
int iy = 0; iy <
VLENY; ++iy) {
376 for (
int ix = 0; ix <
VLENX - 1; ++ix) {
377 index[ix +
VLENX * iy] = 0;
381 load_svint(pg, svindex_xp, index);
385 inline void set_index_xm_eo(
svint_t& svindex_xm)
388 for (
int iy = 0; iy <
VLENY; ++iy) {
389 index[
VLENX * iy] = iy / 2;
390 for (
int ix = 1; ix <
VLENX; ++ix) {
391 index[ix +
VLENX * iy] = 0;
395 load_svint(pg, svindex_xm, index);
399 inline void set_index_xp_eo(
svuint_t& svindex_xp)
402 for (
int iy = 0; iy <
VLENY; ++iy) {
404 for (
int ix = 0; ix <
VLENX - 1; ++ix) {
405 index[ix +
VLENX * iy] = 0;
409 load_svuint(pg, svindex_xp, index);
413 inline void set_index_xm_eo(
svuint_t& svindex_xm)
416 for (
int iy = 0; iy <
VLENY; ++iy) {
417 index[
VLENX * iy] = iy / 2;
418 for (
int ix = 1; ix <
VLENX; ++ix) {
419 index[ix +
VLENX * iy] = 0;
423 load_svuint(pg, svindex_xm, index);
427 inline void set_index_yp(
svint_t& svindex_yp)
430 for (
int ix = 0; ix <
VLENX; ++ix) {
431 for (
int iy = 0; iy <
VLENY - 1; ++iy) {
432 index[ix +
VLENX * iy] = 0;
437 load_svint(pg, svindex_yp, index);
441 inline void set_index_ym(
svint_t& svindex_ym)
444 for (
int ix = 0; ix <
VLENX; ++ix) {
446 for (
int iy = 1; iy <
VLENY; ++iy) {
447 index[ix +
VLENX * iy] = 0;
451 load_svint(pg, svindex_ym, index);
455 template<
typename REALTYPE>
458 const REALTYPE *__restrict xc,
459 const REALTYPE *__restrict xn)
463 load_vec(pg0, vc, xc);
464 load_vec(pg0, vn, xn);
470 template<
typename REALTYPE>
472 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
474 load_vec(pg1, v, &wx[1]);
475 load_add(pg2, v, &wn[-
VLENX + 1]);
479 template<
typename REALTYPE>
481 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
483 load_vec(pg3, v, &wx[0]);
484 load_add(pg1, v, &wx[1]);
485 load_add(pg2, v, &wn[-
VLENX + 1]);
489 template<
typename REALTYPE>
491 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
493 load_vec(pg1, v, &wx[-1]);
494 load_add(pg2, v, &wn[
VLENX - 1]);
498 template<
typename REALTYPE>
500 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
502 load_vec(pg3, v, &wx[0]);
503 load_add(pg1, v, &wx[-1]);
504 load_add(pg2, v, &wn[
VLENX - 1]);
508 template<
typename REALTYPE>
510 Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
514 for (
int in = 0; in < Nin; ++in) {
516 load_vec(pg1, vt, &wx[
VLEN * in - 1]);
517 load_add(pg2, vt, &wn[
VLEN * in +
VLENX - 1]);
518 svst1(pg, &x[in].v[0], vt);
523 template<
typename REALTYPE>
525 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
527 load_vec(pg1, v, &wx[
VLENX]);
532 template<
typename REALTYPE>
534 svreal_t& v, REALTYPE *wx, REALTYPE *wn)
536 load_vec(pg1, v, &wx[-
VLENX]);
541 template<
typename REALTYPE>
542 inline void shift_vec_ybw(
svreal_t& v, REALTYPE *wx, REALTYPE *wn)
546 load_vec(pg, v1, &wx[0]);
547 load_vec(pg, v2, &wn[0]);
548 v = svext(v1, v2,
VLENX);
552 template<
typename REALTYPE>
553 inline void shift_vec_yfw(
svreal_t& v, REALTYPE *wx, REALTYPE *wn)
557 load_vec(pg, v1, &wx[0]);
558 load_vec(pg, v2, &wn[0]);
563 template<
typename REALTYPE>
565 Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
569 for (
int in = 0; in < Nin; ++in) {
571 load_vec(pg1, vt, &wx[
VLEN * in -
VLENX]);
573 svst1(pg, &x[in].v[0], vt);
578 template<
typename REALTYPE>
579 inline void shift_vec0_xbw(REALTYPE *v, REALTYPE *w,
int Nin)
581 for (
int in = 0; in < Nin; ++in) {
582 for (
int kx = 0; kx <
VLENX - 1; ++kx) {
583 for (
int ky = 0; ky <
VLENY; ++ky) {
588 for (
int ky = 0; ky <
VLENY; ++ky) {
595 template<
typename REALTYPE>
596 inline void shift_vec0_xfw(REALTYPE *v, REALTYPE *w,
int Nin)
598 for (
int in = 0; in < Nin; ++in) {
599 for (
int kx = 1; kx <
VLENX; ++kx) {
600 for (
int ky = 0; ky <
VLENY; ++ky) {
604 for (
int ky = 0; ky <
VLENY; ++ky) {
611 template<
typename REALTYPE>
612 inline void shift_vec0_ybw(REALTYPE *v, REALTYPE *w,
int Nin)
614 for (
int in = 0; in < Nin; ++in) {
615 for (
int kx = 0; kx <
VLENX; ++kx) {
616 for (
int ky = 0; ky <
VLENY - 1; ++ky) {
621 for (
int kx = 0; kx <
VLENX; ++kx) {
628 template<
typename REALTYPE>
629 inline void shift_vec0_yfw(REALTYPE *v, REALTYPE *w,
int Nin)
631 for (
int in = 0; in < Nin; ++in) {
632 for (
int kx = 0; kx <
VLENX; ++kx) {
633 for (
int ky = 1; ky <
VLENY; ++ky) {
638 for (
int kx = 0; kx <
VLENX; ++kx) {
645 template<
typename REALTYPE>
646 inline void shift_vec1_xbw(
Vsimd_t *x, REALTYPE *buf,
int Nin)
648 for (
int in = 0; in < Nin; ++in) {
649 for (
int kx = 0; kx <
VLENX - 1; ++kx) {
650 for (
int ky = 0; ky <
VLENY; ++ky) {
651 x[in].
v[kx +
VLENX * ky] = 0.0;
655 for (
int ky = 0; ky <
VLENY; ++ky) {
662 template<
typename REALTYPE>
663 inline void shift_vec1_xfw(
Vsimd_t *x, REALTYPE *buf,
int Nin)
665 for (
int in = 0; in < Nin; ++in) {
666 for (
int kx = 1; kx <
VLENX; ++kx) {
667 for (
int ky = 0; ky <
VLENY; ++ky) {
668 x[in].
v[kx +
VLENX * ky] = 0.0;
671 for (
int ky = 0; ky <
VLENY; ++ky) {
678 template<
typename REALTYPE>
679 inline void shift_vec1_ybw(
Vsimd_t *v, REALTYPE *buf,
int Nin)
681 for (
int in = 0; in < Nin; ++in) {
682 for (
int kx = 0; kx <
VLENX; ++kx) {
683 for (
int ky = 0; ky <
VLENY - 1; ++ky) {
684 v[in].
v[kx +
VLENX * ky] = 0.0;
688 for (
int kx = 0; kx <
VLENX; ++kx) {
695 template<
typename REALTYPE>
696 inline void shift_vec1_yfw(
Vsimd_t *v, REALTYPE *buf,
int Nin)
698 for (
int in = 0; in < Nin; ++in) {
699 for (
int kx = 0; kx <
VLENX; ++kx) {
700 for (
int ky = 1; ky <
VLENY; ++ky) {
701 v[in].
v[kx +
VLENX * ky] = 0.0;
705 for (
int kx = 0; kx <
VLENX; ++kx) {
712 template<
typename REALTYPE>
713 inline void shift_vec2_xbw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
715 for (
int in = 0; in < Nin; ++in) {
716 for (
int kx = 0; kx <
VLENX - 1; ++kx) {
717 for (
int ky = 0; ky <
VLENY; ++ky) {
722 for (
int ky = 0; ky <
VLENY; ++ky) {
729 template<
typename REALTYPE>
730 inline void shift_vec2_xfw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
732 for (
int in = 0; in < Nin; ++in) {
733 for (
int kx = 1; kx <
VLENX; ++kx) {
734 for (
int ky = 0; ky <
VLENY; ++ky) {
738 for (
int ky = 0; ky <
VLENY; ++ky) {
745 template<
typename REALTYPE>
746 inline void shift_vec2_xbw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
748 for (
int in = 0; in < Nin; ++in) {
749 for (
int kx = 0; kx <
VLENX - 1; ++kx) {
750 for (
int ky = 0; ky <
VLENY; ++ky) {
755 for (
int ky = 0; ky <
VLENY; ++ky) {
762 template<
typename REALTYPE>
763 inline void shift_vec2_xbw_eo(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
766 for (
int in = 0; in < Nin; ++in) {
767 for (
int ky = 0; ky <
VLENY; ++ky) {
768 if ((ky % 2) == ieo) {
769 for (
int kx = 0; kx <
VLENX; ++kx) {
773 for (
int kx = 0; kx <
VLENX - 1; ++kx) {
783 template<
typename REALTYPE>
784 inline void shift_vec2_xfw_eo(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
787 for (
int in = 0; in < Nin; ++in) {
788 for (
int ky = 0; ky <
VLENY; ++ky) {
789 if ((ky % 2) == ieo) {
790 for (
int kx = 1; kx <
VLENX; ++kx) {
795 for (
int kx = 0; kx <
VLENX; ++kx) {
804 template<
typename REALTYPE>
805 inline void shift_vec2_xfw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
807 for (
int in = 0; in < Nin; ++in) {
808 for (
int kx = 1; kx <
VLENX; ++kx) {
809 for (
int ky = 0; ky <
VLENY; ++ky) {
813 for (
int ky = 0; ky <
VLENY; ++ky) {
820 template<
typename REALTYPE>
821 inline void shift_vec2_ybw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
823 for (
int in = 0; in < Nin; ++in) {
824 for (
int kx = 0; kx <
VLENX; ++kx) {
825 for (
int ky = 0; ky <
VLENY - 1; ++ky) {
830 for (
int kx = 0; kx <
VLENX; ++kx) {
837 template<
typename REALTYPE>
838 inline void shift_vec2_yfw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
840 for (
int in = 0; in < Nin; ++in) {
841 for (
int kx = 0; kx <
VLENX; ++kx) {
842 for (
int ky = 1; ky <
VLENY; ++ky) {
847 for (
int kx = 0; kx <
VLENX; ++kx) {
854 template<
typename REALTYPE>
855 inline void shift_vec2_ybw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
857 for (
int in = 0; in < Nin; ++in) {
858 for (
int kx = 0; kx <
VLENX; ++kx) {
859 for (
int ky = 0; ky <
VLENY - 1; ++ky) {
864 for (
int kx = 0; kx <
VLENX; ++kx) {
871 template<
typename REALTYPE>
872 inline void shift_vec2_yfw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
874 for (
int in = 0; in < Nin; ++in) {
875 for (
int kx = 0; kx <
VLENX; ++kx) {
876 for (
int ky = 1; ky <
VLENY; ++ky) {
881 for (
int kx = 0; kx <
VLENX; ++kx) {
890 template<
typename REALTYPE>
891 inline void shift_vec0_bw(REALTYPE *v, REALTYPE *w,
int Nin)
893 for (
int in = 0; in < Nin; ++in) {
894 for (
int k = 0; k <
VLEN - 1; ++k) {
895 v[k +
VLEN * in] = w[k + 1 +
VLEN * in];
902 template<
typename REALTYPE>
903 inline void shift_vec0_fw(REALTYPE *v, REALTYPE *w,
int Nin)
905 for (
int in = 0; in < Nin; ++in) {
906 for (
int k = 1; k <
VLEN; ++k) {
907 v[k +
VLEN * in] = w[k - 1 +
VLEN * in];
909 v[0 +
VLEN * in] = 0.0;
914 template<
typename REALTYPE>
915 inline void shift_vec1_bw(
Vsimd_t *x, REALTYPE *buf,
int Nin)
917 for (
int in = 0; in < Nin; ++in) {
918 for (
int k = 0; k <
VLEN - 1; ++k) {
921 x[in].
v[
VLEN - 1] = buf[in];
926 template<
typename REALTYPE>
927 inline void shift_vec1_fw(
Vsimd_t *x, REALTYPE *buf,
int Nin)
929 for (
int in = 0; in < Nin; ++in) {
930 for (
int k = 1; k <
VLEN; ++k) {
933 x[in].
v[0] = buf[in];
938 template<
typename REALTYPE>
939 inline void shift_vec1_bw(REALTYPE *v, REALTYPE *w, REALTYPE *buf,
int Nin)
941 for (
int in = 0; in < Nin; ++in) {
942 for (
int k = 0; k <
VLEN - 1; ++k) {
943 v[k +
VLEN * in] = w[k + 1 +
VLEN * in];
944 v[k +
VLEN * in] = w[k + 1 +
VLEN * in];
951 template<
typename REALTYPE>
952 inline void shift_vec1_fw(REALTYPE *v, REALTYPE *w, REALTYPE *buf,
int Nin)
954 for (
int in = 0; in < Nin; ++in) {
955 for (
int k = 1; k <
VLEN; ++k) {
956 v[k +
VLEN * in] = w[k - 1 +
VLEN * in];
958 v[0 +
VLEN * in] = buf[in];
963 template<
typename REALTYPE>
964 inline void shift_vec2_bw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
966 for (
int in = 0; in < Nin; ++in) {
967 for (
int k = 0; k <
VLEN - 1; ++k) {
968 v[k +
VLEN * in] = w[k + 1 +
VLEN * in];
975 template<
typename REALTYPE>
976 inline void shift_vec2_fw(REALTYPE *v, REALTYPE *w, REALTYPE *y,
int Nin)
978 for (
int in = 0; in < Nin; ++in) {
979 for (
int k = 1; k <
VLEN; ++k) {
980 v[k +
VLEN * in] = w[k - 1 +
VLEN * in];
987 template<
typename REALTYPE>
988 inline void shift_vec2_bw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
990 for (
int in = 0; in < Nin; ++in) {
991 for (
int k = 0; k <
VLEN - 1; ++k) {
992 v[in].
v[k] = w[k + 1 +
VLEN * in];
999 template<
typename REALTYPE>
1000 inline void shift_vec2_fw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
int Nin)
1002 for (
int in = 0; in < Nin; ++in) {
1003 for (
int k = 1; k <
VLEN; ++k) {
1004 v[in].
v[k] = w[k - 1 +
VLEN * in];
1011 template<
typename REALTYPE>
1012 inline void load_vec1(REALTYPE *vt, REALTYPE *v,
int k,
int Nin)
1014 for (
int in = 0; in < Nin; ++in) {
1015 vt[in] = v[k +
VLEN * in];
1020 template<
typename REALTYPE>
1021 inline void save_vec1(REALTYPE *x,
Vsimd_t *vt,
int k,
int Nin)
1023 for (
int in = 0; in < Nin; ++in) {
1024 x[in] = vt[in].
v[k];
1031 return svcompact(pg, yt);
1035 template<
typename REALTYPE>
1039 svbool_t pg1 = set_predicate_whilelt(skip);
1041 load_vec(pg1, v1, v);
1042 v2 = svtbl(v1, index);
1043 vt = svsel(pg2, v2, vt);