9 #ifndef QXS_VSIMD_COMMON_INCLUDED 
   10 #define QXS_VSIMD_COMMON_INCLUDED 
   13   template<
typename REALTYPE>
 
   14   inline void load_vec(
Vsimd_t *vt, REALTYPE *vp, 
int Nin)
 
   16     for (
int in = 0; in < Nin; ++in) {
 
   17       for (
int k = 0; k < 
VLEN; ++k) {
 
   18         vt[in].
v[k] = vp[k + 
VLEN * in];
 
   24   template<
typename REALTYPE>
 
   25   inline void load_vec1_x(REALTYPE *vt, REALTYPE *v, 
int kx, 
int Nin)
 
   27     for (
int in = 0; in < Nin; ++in) {
 
   28       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
   35   template<
typename REALTYPE>
 
   36   inline void load_vec1_y(REALTYPE *vt, REALTYPE *v, 
int ky, 
int Nin)
 
   38     for (
int in = 0; in < Nin; ++in) {
 
   39       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
   46   template<
typename REALTYPE>
 
   47   inline void save_vec(REALTYPE *x, 
Vsimd_t *vt, 
int Nin)
 
   49     for (
int in = 0; in < Nin; ++in) {
 
   50       for (
int k = 0; k < 
VLEN; ++k) {
 
   51         x[k + 
VLEN * in] = vt[in].
v[k];
 
   57   template<
typename REALTYPE>
 
   64   template<
typename REALTYPE>
 
   65   inline void save_vec_scatter(
svbool_t pg, REALTYPE *vp,
 
   68     svst1_scatter_index(pg, vp, index, vt);
 
   72   template<
typename REALTYPE>
 
   73   inline void save_vec1_x(REALTYPE *x, 
Vsimd_t *vt, 
int kx, 
int Nin)
 
   75     for (
int in = 0; in < Nin; ++in) {
 
   76       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
   83   template<
typename REALTYPE>
 
   84   inline void save_vec1_y(REALTYPE *x, 
Vsimd_t *vt, 
int ky, 
int Nin)
 
   86     for (
int in = 0; in < Nin; ++in) {
 
   87       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
   94   inline void clear_vec(
Vsimd_t *vt, 
int Nin)
 
   96     for (
int in = 0; in < Nin; ++in) {
 
   97       for (
int k = 0; k < 
VLEN; ++k) {
 
  104   template<
typename REALTYPE>
 
  105   inline void add_vec(REALTYPE *x, 
Vsimd_t *vt, 
int Nin)
 
  107     for (
int in = 0; in < Nin; ++in) {
 
  108       for (
int k = 0; k < 
VLEN; ++k) {
 
  109         x[k + 
VLEN * in] += vt[in].
v[k];
 
  117     for (
int in = 0; in < Nin; ++in) {
 
  118       for (
int k = 0; k < 
VLEN; ++k) {
 
  119         x[in].
v[k] += y[in].
v[k];
 
  127     x = svadd_m(pg, x, y);
 
  133     z = svadd_m(pg, x, y);
 
  139     x = svsub_m(pg, x, y);
 
  145     z = svsub_m(pg, x, y);
 
  151     for (
int in = 0; in < Nin; ++in) {
 
  152       for (
int k = 0; k < 
VLEN; ++k) {
 
  153         x[in].
v[k] = y[in].
v[k];
 
  159   template<
typename REALTYPE>
 
  162     for (
int in = 0; in < Nin; ++in) {
 
  163       for (
int k = 0; k < 
VLEN; ++k) {
 
  164         x[in].
v[k] = a * y[in].
v[k];
 
  172     x = svmul_m(pg, y, a);
 
  176   template<
typename REALTYPE>
 
  179     for (
int in = 0; in < Nin; ++in) {
 
  180       for (
int k = 0; k < 
VLEN; ++k) {
 
  181         y[in].
v[k] += a * x[in].
v[k];
 
  189     y = svmla_m(pg, y, x, a);
 
  195     y = svmla_m(pg, y, x, a);
 
  201     y = svmls_m(pg, y, x, a);
 
  205   template<
typename REALTYPE>
 
  208     for (
int in = 0; in < Nin; ++in) {
 
  209       for (
int k = 0; k < 
VLEN; ++k) {
 
  210         x[in].
v[k] = a * x[in].
v[k] + y[in].
v[k];
 
  218     y = svmla_m(pg, x, y, a);
 
  222   template<
typename REALTYPE>
 
  223   inline void scal_vec(
Vsimd_t *x, REALTYPE a, 
int Nin)
 
  225     for (
int in = 0; in < Nin; ++in) {
 
  226       for (
int k = 0; k < 
VLEN; ++k) {
 
  235     x = svmul_m(pg, y, w);
 
  241     x = svmul_m(pg, x, a);
 
  247     x = svmul_m(pg, x, a);
 
  251   template<
typename REALTYPE>
 
  255     for (
int in = 0; in < Nin; ++in) {
 
  256       for (
int k = 0; k < 
VLEN; ++k) {
 
  257         a += x[in].
v[k] * y[in].
v[k];
 
  263   template<
typename REALTYPE>
 
  264   inline void norm2_vec(REALTYPE& a, 
Vsimd_t *x, 
int Nin)
 
  267     for (
int in = 0; in < Nin; ++in) {
 
  268       for (
int k = 0; k < 
VLEN; ++k) {
 
  269         a += x[in].
v[k] * x[in].
v[k];
 
  275   template<
typename REALTYPE>
 
  276   inline void reduce_vec(REALTYPE& a, 
Vsimd_t *x, 
int Nin)
 
  279     for (
int in = 0; in < Nin; ++in) {
 
  280       for (
int k = 0; k < 
VLEN; ++k) {
 
  287   template<
typename REALTYPE>
 
  296     y = svmla_m(pg, y, x, x);
 
  302     for (
int in = 0; in < Nin; ++in) {
 
  303       for (
int k = 0; k < 
VLEN; ++k) {
 
  304         y[in].
v[k] += x[in].
v[k] * x[in].
v[k];
 
  312     y = svmla_m(pg, y, x, w);
 
  318     for (
int in = 0; in < Nin; ++in) {
 
  319       for (
int k = 0; k < 
VLEN; ++k) {
 
  320         y[in].
v[k] += x[in].
v[k] * w[in].
v[k];
 
  328     y = svmls_m(pg, y, x, w);
 
  334     for (
int in = 0; in < Nin; ++in) {
 
  335       for (
int k = 0; k < 
VLEN; ++k) {
 
  336         y[in].
v[k] -= x[in].
v[k] * w[in].
v[k];
 
  343   inline void set_index_xp(
svint_t& svindex_xp)
 
  346     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  348       for (
int ix = 0; ix < 
VLENX - 1; ++ix) {
 
  349         index[ix + 
VLENX * iy] = 0;
 
  353     load_svint(pg, svindex_xp, index);
 
  357   inline void set_index_xm(
svint_t& svindex_xm)
 
  360     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  361       index[
VLENX * iy] = iy;
 
  362       for (
int ix = 1; ix < 
VLENX; ++ix) {
 
  363         index[ix + 
VLENX * iy] = 0;
 
  367     load_svint(pg, svindex_xm, index);
 
  371   inline void set_index_xp_eo(
svint_t& svindex_xp)
 
  374     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  376       for (
int ix = 0; ix < 
VLENX - 1; ++ix) {
 
  377         index[ix + 
VLENX * iy] = 0;
 
  381     load_svint(pg, svindex_xp, index);
 
  385   inline void set_index_xm_eo(
svint_t& svindex_xm)
 
  388     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  389       index[
VLENX * iy] = iy / 2;
 
  390       for (
int ix = 1; ix < 
VLENX; ++ix) {
 
  391         index[ix + 
VLENX * iy] = 0;
 
  395     load_svint(pg, svindex_xm, index);
 
  399   inline void set_index_xp_eo(
svuint_t& svindex_xp)
 
  402     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  404       for (
int ix = 0; ix < 
VLENX - 1; ++ix) {
 
  405         index[ix + 
VLENX * iy] = 0;
 
  409     load_svuint(pg, svindex_xp, index);
 
  413   inline void set_index_xm_eo(
svuint_t& svindex_xm)
 
  416     for (
int iy = 0; iy < 
VLENY; ++iy) {
 
  417       index[
VLENX * iy] = iy / 2;
 
  418       for (
int ix = 1; ix < 
VLENX; ++ix) {
 
  419         index[ix + 
VLENX * iy] = 0;
 
  423     load_svuint(pg, svindex_xm, index);
 
  427   inline void set_index_yp(
svint_t& svindex_yp)
 
  430     for (
int ix = 0; ix < 
VLENX; ++ix) {
 
  431       for (
int iy = 0; iy < 
VLENY - 1; ++iy) {
 
  432         index[ix + 
VLENX * iy] = 0;
 
  437     load_svint(pg, svindex_yp, index);
 
  441   inline void set_index_ym(
svint_t& svindex_ym)
 
  444     for (
int ix = 0; ix < 
VLENX; ++ix) {
 
  446       for (
int iy = 1; iy < 
VLENY; ++iy) {
 
  447         index[ix + 
VLENX * iy] = 0;
 
  451     load_svint(pg, svindex_ym, index);
 
  455   template<
typename REALTYPE>
 
  458                         const REALTYPE *__restrict xc,
 
  459                         const REALTYPE *__restrict xn)
 
  463     load_vec(pg0, vc, xc);
 
  464     load_vec(pg0, vn, xn);
 
  470   template<
typename REALTYPE>
 
  472                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  474     load_vec(pg1, v, &wx[1]);
 
  475     load_add(pg2, v, &wn[-
VLENX + 1]);
 
  479   template<
typename REALTYPE>
 
  481                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  483     load_vec(pg3, v, &wx[0]);
 
  484     load_add(pg1, v, &wx[1]);
 
  485     load_add(pg2, v, &wn[-
VLENX + 1]);
 
  489   template<
typename REALTYPE>
 
  491                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  493     load_vec(pg1, v, &wx[-1]);
 
  494     load_add(pg2, v, &wn[
VLENX - 1]);
 
  498   template<
typename REALTYPE>
 
  500                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  502     load_vec(pg3, v, &wx[0]);
 
  503     load_add(pg1, v, &wx[-1]);
 
  504     load_add(pg2, v, &wn[
VLENX - 1]);
 
  508   template<
typename REALTYPE>
 
  510                             Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
 
  514     for (
int in = 0; in < Nin; ++in) {
 
  516       load_vec(pg1, vt, &wx[
VLEN * in - 1]);
 
  517       load_add(pg2, vt, &wn[
VLEN * in + 
VLENX - 1]);
 
  518       svst1(pg, &x[in].v[0], vt);
 
  523   template<
typename REALTYPE>
 
  525                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  527     load_vec(pg1, v, &wx[
VLENX]);
 
  532   template<
typename REALTYPE>
 
  534                             svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  536     load_vec(pg1, v, &wx[-
VLENX]);
 
  541   template<
typename REALTYPE>
 
  542   inline void shift_vec_ybw(
svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  546     load_vec(pg, v1, &wx[0]);
 
  547     load_vec(pg, v2, &wn[0]);
 
  548     v = svext(v1, v2, 
VLENX);
 
  552   template<
typename REALTYPE>
 
  553   inline void shift_vec_yfw(
svreal_t& v, REALTYPE *wx, REALTYPE *wn)
 
  557     load_vec(pg, v1, &wx[0]);
 
  558     load_vec(pg, v2, &wn[0]);
 
  563   template<
typename REALTYPE>
 
  565                             Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
 
  569     for (
int in = 0; in < Nin; ++in) {
 
  571       load_vec(pg1, vt, &wx[
VLEN * in - 
VLENX]);
 
  573       svst1(pg, &x[in].v[0], vt);
 
  578   template<
typename REALTYPE>
 
  579   inline void shift_vec0_xbw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  581     for (
int in = 0; in < Nin; ++in) {
 
  582       for (
int kx = 0; kx < 
VLENX - 1; ++kx) {
 
  583         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  588       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  595   template<
typename REALTYPE>
 
  596   inline void shift_vec0_xfw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  598     for (
int in = 0; in < Nin; ++in) {
 
  599       for (
int kx = 1; kx < 
VLENX; ++kx) {
 
  600         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  604       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  611   template<
typename REALTYPE>
 
  612   inline void shift_vec0_ybw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  614     for (
int in = 0; in < Nin; ++in) {
 
  615       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  616         for (
int ky = 0; ky < 
VLENY - 1; ++ky) {
 
  621       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  628   template<
typename REALTYPE>
 
  629   inline void shift_vec0_yfw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  631     for (
int in = 0; in < Nin; ++in) {
 
  632       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  633         for (
int ky = 1; ky < 
VLENY; ++ky) {
 
  638       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  645   template<
typename REALTYPE>
 
  646   inline void shift_vec1_xbw(
Vsimd_t *x, REALTYPE *buf, 
int Nin)
 
  648     for (
int in = 0; in < Nin; ++in) {
 
  649       for (
int kx = 0; kx < 
VLENX - 1; ++kx) {
 
  650         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  651           x[in].
v[kx + 
VLENX * ky] = 0.0;
 
  655       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  662   template<
typename REALTYPE>
 
  663   inline void shift_vec1_xfw(
Vsimd_t *x, REALTYPE *buf, 
int Nin)
 
  665     for (
int in = 0; in < Nin; ++in) {
 
  666       for (
int kx = 1; kx < 
VLENX; ++kx) {
 
  667         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  668           x[in].
v[kx + 
VLENX * ky] = 0.0;
 
  671       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  678   template<
typename REALTYPE>
 
  679   inline void shift_vec1_ybw(
Vsimd_t *v, REALTYPE *buf, 
int Nin)
 
  681     for (
int in = 0; in < Nin; ++in) {
 
  682       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  683         for (
int ky = 0; ky < 
VLENY - 1; ++ky) {
 
  684           v[in].
v[kx + 
VLENX * ky] = 0.0;
 
  688       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  695   template<
typename REALTYPE>
 
  696   inline void shift_vec1_yfw(
Vsimd_t *v, REALTYPE *buf, 
int Nin)
 
  698     for (
int in = 0; in < Nin; ++in) {
 
  699       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  700         for (
int ky = 1; ky < 
VLENY; ++ky) {
 
  701           v[in].
v[kx + 
VLENX * ky] = 0.0;
 
  705       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  712   template<
typename REALTYPE>
 
  713   inline void shift_vec2_xbw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  715     for (
int in = 0; in < Nin; ++in) {
 
  716       for (
int kx = 0; kx < 
VLENX - 1; ++kx) {
 
  717         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  722       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  729   template<
typename REALTYPE>
 
  730   inline void shift_vec2_xfw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  732     for (
int in = 0; in < Nin; ++in) {
 
  733       for (
int kx = 1; kx < 
VLENX; ++kx) {
 
  734         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  738       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  745   template<
typename REALTYPE>
 
  746   inline void shift_vec2_xbw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  748     for (
int in = 0; in < Nin; ++in) {
 
  749       for (
int kx = 0; kx < 
VLENX - 1; ++kx) {
 
  750         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  755       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  762   template<
typename REALTYPE>
 
  763   inline void shift_vec2_xbw_eo(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
 
  766     for (
int in = 0; in < Nin; ++in) {
 
  767       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  768         if ((ky % 2) == ieo) {
 
  769           for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  773           for (
int kx = 0; kx < 
VLENX - 1; ++kx) {
 
  783   template<
typename REALTYPE>
 
  784   inline void shift_vec2_xfw_eo(
Vsimd_t *v, REALTYPE *w, REALTYPE *y,
 
  787     for (
int in = 0; in < Nin; ++in) {
 
  788       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  789         if ((ky % 2) == ieo) {
 
  790           for (
int kx = 1; kx < 
VLENX; ++kx) {
 
  795           for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  804   template<
typename REALTYPE>
 
  805   inline void shift_vec2_xfw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  807     for (
int in = 0; in < Nin; ++in) {
 
  808       for (
int kx = 1; kx < 
VLENX; ++kx) {
 
  809         for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  813       for (
int ky = 0; ky < 
VLENY; ++ky) {
 
  820   template<
typename REALTYPE>
 
  821   inline void shift_vec2_ybw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  823     for (
int in = 0; in < Nin; ++in) {
 
  824       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  825         for (
int ky = 0; ky < 
VLENY - 1; ++ky) {
 
  830       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  837   template<
typename REALTYPE>
 
  838   inline void shift_vec2_yfw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  840     for (
int in = 0; in < Nin; ++in) {
 
  841       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  842         for (
int ky = 1; ky < 
VLENY; ++ky) {
 
  847       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  854   template<
typename REALTYPE>
 
  855   inline void shift_vec2_ybw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  857     for (
int in = 0; in < Nin; ++in) {
 
  858       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  859         for (
int ky = 0; ky < 
VLENY - 1; ++ky) {
 
  864       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  871   template<
typename REALTYPE>
 
  872   inline void shift_vec2_yfw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  874     for (
int in = 0; in < Nin; ++in) {
 
  875       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  876         for (
int ky = 1; ky < 
VLENY; ++ky) {
 
  881       for (
int kx = 0; kx < 
VLENX; ++kx) {
 
  890   template<
typename REALTYPE>
 
  891   inline void shift_vec0_bw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  893     for (
int in = 0; in < Nin; ++in) {
 
  894       for (
int k = 0; k < 
VLEN - 1; ++k) {
 
  895         v[k + 
VLEN * in] = w[k + 1 + 
VLEN * in];
 
  902   template<
typename REALTYPE>
 
  903   inline void shift_vec0_fw(REALTYPE *v, REALTYPE *w, 
int Nin)
 
  905     for (
int in = 0; in < Nin; ++in) {
 
  906       for (
int k = 1; k < 
VLEN; ++k) {
 
  907         v[k + 
VLEN * in] = w[k - 1 + 
VLEN * in];
 
  909       v[0 + 
VLEN * in] = 0.0;
 
  914   template<
typename REALTYPE>
 
  915   inline void shift_vec1_bw(
Vsimd_t *x, REALTYPE *buf, 
int Nin)
 
  917     for (
int in = 0; in < Nin; ++in) {
 
  918       for (
int k = 0; k < 
VLEN - 1; ++k) {
 
  921       x[in].
v[
VLEN - 1] = buf[in];
 
  926   template<
typename REALTYPE>
 
  927   inline void shift_vec1_fw(
Vsimd_t *x, REALTYPE *buf, 
int Nin)
 
  929     for (
int in = 0; in < Nin; ++in) {
 
  930       for (
int k = 1; k < 
VLEN; ++k) {
 
  933       x[in].
v[0] = buf[in];
 
  938   template<
typename REALTYPE>
 
  939   inline void shift_vec1_bw(REALTYPE *v, REALTYPE *w, REALTYPE *buf, 
int Nin)
 
  941     for (
int in = 0; in < Nin; ++in) {
 
  942       for (
int k = 0; k < 
VLEN - 1; ++k) {
 
  943         v[k + 
VLEN * in] = w[k + 1 + 
VLEN * in];
 
  944         v[k + 
VLEN * in] = w[k + 1 + 
VLEN * in];
 
  951   template<
typename REALTYPE>
 
  952   inline void shift_vec1_fw(REALTYPE *v, REALTYPE *w, REALTYPE *buf, 
int Nin)
 
  954     for (
int in = 0; in < Nin; ++in) {
 
  955       for (
int k = 1; k < 
VLEN; ++k) {
 
  956         v[k + 
VLEN * in] = w[k - 1 + 
VLEN * in];
 
  958       v[0 + 
VLEN * in] = buf[in];
 
  963   template<
typename REALTYPE>
 
  964   inline void shift_vec2_bw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  966     for (
int in = 0; in < Nin; ++in) {
 
  967       for (
int k = 0; k < 
VLEN - 1; ++k) {
 
  968         v[k + 
VLEN * in] = w[k + 1 + 
VLEN * in];
 
  975   template<
typename REALTYPE>
 
  976   inline void shift_vec2_fw(REALTYPE *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  978     for (
int in = 0; in < Nin; ++in) {
 
  979       for (
int k = 1; k < 
VLEN; ++k) {
 
  980         v[k + 
VLEN * in] = w[k - 1 + 
VLEN * in];
 
  987   template<
typename REALTYPE>
 
  988   inline void shift_vec2_bw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
  990     for (
int in = 0; in < Nin; ++in) {
 
  991       for (
int k = 0; k < 
VLEN - 1; ++k) {
 
  992         v[in].
v[k] = w[k + 1 + 
VLEN * in];
 
  999   template<
typename REALTYPE>
 
 1000   inline void shift_vec2_fw(
Vsimd_t *v, REALTYPE *w, REALTYPE *y, 
int Nin)
 
 1002     for (
int in = 0; in < Nin; ++in) {
 
 1003       for (
int k = 1; k < 
VLEN; ++k) {
 
 1004         v[in].
v[k] = w[k - 1 + 
VLEN * in];
 
 1011   template<
typename REALTYPE>
 
 1012   inline void load_vec1(REALTYPE *vt, REALTYPE *v, 
int k, 
int Nin)
 
 1014     for (
int in = 0; in < Nin; ++in) {
 
 1015       vt[in] = v[k + 
VLEN * in];
 
 1020   template<
typename REALTYPE>
 
 1021   inline void save_vec1(REALTYPE *x, 
Vsimd_t *vt, 
int k, 
int Nin)
 
 1023     for (
int in = 0; in < Nin; ++in) {
 
 1024       x[in] = vt[in].
v[k];
 
 1031     return svcompact(pg, yt);
 
 1035   template<
typename REALTYPE>
 
 1039     svbool_t pg1 = set_predicate_whilelt(skip);
 
 1041     load_vec(pg1, v1, v);
 
 1042     v2 = svtbl(v1, index);
 
 1043     vt = svsel(pg2, v2, vt);