Bridge++  Ver. 2.0.2
vsimd_common_double-inc.h
Go to the documentation of this file.
1 
9 #ifndef QXS_VSIMD_COMMON_INCLUDED
10 #define QXS_VSIMD_COMMON_INCLUDED
11 
12 namespace {
13  template<typename REALTYPE>
14  inline void load_vec(Vsimd_t *vt, REALTYPE *vp, int Nin)
15  {
16  for (int in = 0; in < Nin; ++in) {
17  for (int k = 0; k < VLEN; ++k) {
18  vt[in].v[k] = vp[k + VLEN * in];
19  }
20  }
21  }
22 
23 
24  template<typename REALTYPE>
25  inline void load_vec1_x(REALTYPE *vt, REALTYPE *v, int kx, int Nin)
26  {
27  for (int in = 0; in < Nin; ++in) {
28  for (int ky = 0; ky < VLENY; ++ky) {
29  vt[ky + VLENY * in] = v[kx + VLENX * ky + VLEN * in];
30  }
31  }
32  }
33 
34 
35  template<typename REALTYPE>
36  inline void load_vec1_y(REALTYPE *vt, REALTYPE *v, int ky, int Nin)
37  {
38  for (int in = 0; in < Nin; ++in) {
39  for (int kx = 0; kx < VLENX; ++kx) {
40  vt[kx + VLENX * in] = v[kx + VLENX * ky + VLEN * in];
41  }
42  }
43  }
44 
45 
46  template<typename REALTYPE>
47  inline void save_vec(REALTYPE *x, Vsimd_t *vt, int Nin)
48  {
49  for (int in = 0; in < Nin; ++in) {
50  for (int k = 0; k < VLEN; ++k) {
51  x[k + VLEN * in] = vt[in].v[k];
52  }
53  }
54  }
55 
56 
57  template<typename REALTYPE>
58  inline void save_vec(svbool_t pg, REALTYPE *x, svreal_t& vt)
59  {
60  svst1(pg, x, vt);
61  }
62 
63 
64  template<typename REALTYPE>
65  inline void save_vec_scatter(svbool_t pg, REALTYPE *vp,
66  svreal_t& vt, svint_t& index)
67  {
68  svst1_scatter_index(pg, vp, index, vt);
69  }
70 
71 
72  template<typename REALTYPE>
73  inline void save_vec1_x(REALTYPE *x, Vsimd_t *vt, int kx, int Nin)
74  {
75  for (int in = 0; in < Nin; ++in) {
76  for (int ky = 0; ky < VLENY; ++ky) {
77  x[ky + VLENY * in] = vt[in].v[kx + VLENX * ky];
78  }
79  }
80  }
81 
82 
83  template<typename REALTYPE>
84  inline void save_vec1_y(REALTYPE *x, Vsimd_t *vt, int ky, int Nin)
85  {
86  for (int in = 0; in < Nin; ++in) {
87  for (int kx = 0; kx < VLENX; ++kx) {
88  x[kx + VLENX * in] = vt[in].v[kx + VLENX * ky];
89  }
90  }
91  }
92 
93 
94  inline void clear_vec(Vsimd_t *vt, int Nin)
95  {
96  for (int in = 0; in < Nin; ++in) {
97  for (int k = 0; k < VLEN; ++k) {
98  vt[in].v[k] = 0.0;
99  }
100  }
101  }
102 
103 
104  template<typename REALTYPE>
105  inline void add_vec(REALTYPE *x, Vsimd_t *vt, int Nin)
106  {
107  for (int in = 0; in < Nin; ++in) {
108  for (int k = 0; k < VLEN; ++k) {
109  x[k + VLEN * in] += vt[in].v[k];
110  }
111  }
112  }
113 
114 
115  inline void add_vec(Vsimd_t *x, Vsimd_t *y, int Nin)
116  {
117  for (int in = 0; in < Nin; ++in) {
118  for (int k = 0; k < VLEN; ++k) {
119  x[in].v[k] += y[in].v[k];
120  }
121  }
122  }
123 
124 
125  inline void add_vec(svbool_t pg, svreal_t& x, svreal_t& y)
126  {
127  x = svadd_m(pg, x, y);
128  }
129 
130 
131  inline void add_vec(svbool_t pg, svreal_t& z, svreal_t& x, svreal_t& y)
132  {
133  z = svadd_m(pg, x, y);
134  }
135 
136 
137  inline void sub_vec(svbool_t pg, svreal_t& x, svreal_t& y)
138  {
139  x = svsub_m(pg, x, y);
140  }
141 
142 
143  inline void sub_vec(svbool_t pg, svreal_t& z, svreal_t& x, svreal_t& y)
144  {
145  z = svsub_m(pg, x, y);
146  }
147 
148 
149  inline void copy_vec(Vsimd_t *x, Vsimd_t *y, int Nin)
150  {
151  for (int in = 0; in < Nin; ++in) {
152  for (int k = 0; k < VLEN; ++k) {
153  x[in].v[k] = y[in].v[k];
154  }
155  }
156  }
157 
158 
159  template<typename REALTYPE>
160  inline void set_vec(Vsimd_t *x, REALTYPE a, Vsimd_t *y, int Nin)
161  {
162  for (int in = 0; in < Nin; ++in) {
163  for (int k = 0; k < VLEN; ++k) {
164  x[in].v[k] = a * y[in].v[k];
165  }
166  }
167  }
168 
169 
170  inline void set_vec(svbool_t pg, svreal_t& x, real_t a, svreal_t y)
171  {
172  x = svmul_m(pg, y, a);
173  }
174 
175 
176  template<typename REALTYPE>
177  inline void axpy_vec(Vsimd_t *y, REALTYPE a, Vsimd_t *x, int Nin)
178  {
179  for (int in = 0; in < Nin; ++in) {
180  for (int k = 0; k < VLEN; ++k) {
181  y[in].v[k] += a * x[in].v[k];
182  }
183  }
184  }
185 
186 
187  inline void axpy_vec(svbool_t pg, svreal_t& y, real_t a, svreal_t x)
188  {
189  y = svmla_m(pg, y, x, a);
190  }
191 
192 
193  inline void axpy_vec(svbool_t pg, svreal_t& y, svreal_t a, svreal_t x)
194  {
195  y = svmla_m(pg, y, x, a);
196  }
197 
198 
199  inline void ymax_vec(svbool_t pg, svreal_t& y, svreal_t a, svreal_t x)
200  {
201  y = svmls_m(pg, y, x, a);
202  }
203 
204 
205  template<typename REALTYPE>
206  inline void aypx_vec(REALTYPE a, Vsimd_t *x, Vsimd_t *y, int Nin)
207  {
208  for (int in = 0; in < Nin; ++in) {
209  for (int k = 0; k < VLEN; ++k) {
210  x[in].v[k] = a * x[in].v[k] + y[in].v[k];
211  }
212  }
213  }
214 
215 
216  inline void aypx_vec(svbool_t pg, real_t a, svreal_t& y, svreal_t x)
217  {
218  y = svmla_m(pg, x, y, a);
219  }
220 
221 
222  template<typename REALTYPE>
223  inline void scal_vec(Vsimd_t *x, REALTYPE a, int Nin)
224  {
225  for (int in = 0; in < Nin; ++in) {
226  for (int k = 0; k < VLEN; ++k) {
227  x[in].v[k] *= a;
228  }
229  }
230  }
231 
232 
233  inline void mul_vec(svbool_t pg, svreal_t& x, svreal_t y, svreal_t w)
234  {
235  x = svmul_m(pg, y, w);
236  }
237 
238 
239  inline void scal_vec(svbool_t pg, svreal_t& x, svreal_t a)
240  {
241  x = svmul_m(pg, x, a);
242  }
243 
244 
245  inline void scal_vec(svbool_t pg, svreal_t& x, real_t a)
246  {
247  x = svmul_m(pg, x, a);
248  }
249 
250 
251  template<typename REALTYPE>
252  inline void dot_vec(REALTYPE& a, Vsimd_t *x, Vsimd_t *y, int Nin)
253  {
254  a = REALTYPE(0.0);
255  for (int in = 0; in < Nin; ++in) {
256  for (int k = 0; k < VLEN; ++k) {
257  a += x[in].v[k] * y[in].v[k];
258  }
259  }
260  }
261 
262 
263  template<typename REALTYPE>
264  inline void norm2_vec(REALTYPE& a, Vsimd_t *x, int Nin)
265  {
266  a = REALTYPE(0.0);
267  for (int in = 0; in < Nin; ++in) {
268  for (int k = 0; k < VLEN; ++k) {
269  a += x[in].v[k] * x[in].v[k];
270  }
271  }
272  }
273 
274 
275  template<typename REALTYPE>
276  inline void reduce_vec(REALTYPE& a, Vsimd_t *x, int Nin)
277  {
278  a = REALTYPE(0.0);
279  for (int in = 0; in < Nin; ++in) {
280  for (int k = 0; k < VLEN; ++k) {
281  a += x[in].v[k];
282  }
283  }
284  }
285 
286 
287  template<typename REALTYPE>
288  inline void reduce_vec(svbool_t pg, REALTYPE& a, svreal_t& x)
289  {
290  a = svaddv(pg, x);
291  }
292 
293 
294  inline void add_norm2_vec(svbool_t pg, svreal_t& y, svreal_t& x)
295  {
296  y = svmla_m(pg, y, x, x);
297  }
298 
299 
300  inline void add_norm2_vec(Vsimd_t *y, Vsimd_t *x, int Nin)
301  {
302  for (int in = 0; in < Nin; ++in) {
303  for (int k = 0; k < VLEN; ++k) {
304  y[in].v[k] += x[in].v[k] * x[in].v[k];
305  }
306  }
307  }
308 
309 
310  inline void add_dot_vec(svbool_t pg, svreal_t& y, svreal_t& x, svreal_t& w)
311  {
312  y = svmla_m(pg, y, x, w);
313  }
314 
315 
316  inline void add_dot_vec(Vsimd_t *y, Vsimd_t *x, Vsimd_t *w, int Nin)
317  {
318  for (int in = 0; in < Nin; ++in) {
319  for (int k = 0; k < VLEN; ++k) {
320  y[in].v[k] += x[in].v[k] * w[in].v[k];
321  }
322  }
323  }
324 
325 
326  inline void sub_dot_vec(svbool_t pg, svreal_t& y, svreal_t& x, svreal_t& w)
327  {
328  y = svmls_m(pg, y, x, w);
329  }
330 
331 
332  inline void sub_dot_vec(Vsimd_t *y, Vsimd_t *x, Vsimd_t *w, int Nin)
333  {
334  for (int in = 0; in < Nin; ++in) {
335  for (int k = 0; k < VLEN; ++k) {
336  y[in].v[k] -= x[in].v[k] * w[in].v[k];
337  }
338  }
339  }
340 
341 
342  // setup of index for gather load / scatter store
343  inline void set_index_xp(svint_t& svindex_xp)
344  {
345  int_t index[VLEN];
346  for (int iy = 0; iy < VLENY; ++iy) {
347  index[VLENX - 1 + VLENX * iy] = iy;
348  for (int ix = 0; ix < VLENX - 1; ++ix) {
349  index[ix + VLENX * iy] = 0;
350  }
351  }
352  svbool_t pg = set_predicate();
353  load_svint(pg, svindex_xp, index);
354  }
355 
356 
357  inline void set_index_xm(svint_t& svindex_xm)
358  {
359  int_t index[VLEN];
360  for (int iy = 0; iy < VLENY; ++iy) {
361  index[VLENX * iy] = iy;
362  for (int ix = 1; ix < VLENX; ++ix) {
363  index[ix + VLENX * iy] = 0;
364  }
365  }
366  svbool_t pg = set_predicate();
367  load_svint(pg, svindex_xm, index);
368  }
369 
370 
371  inline void set_index_xp_eo(svint_t& svindex_xp)
372  {
373  int_t index[VLEN];
374  for (int iy = 0; iy < VLENY; ++iy) {
375  index[VLENX - 1 + VLENX * iy] = iy / 2;
376  for (int ix = 0; ix < VLENX - 1; ++ix) {
377  index[ix + VLENX * iy] = 0;
378  }
379  }
380  svbool_t pg = set_predicate();
381  load_svint(pg, svindex_xp, index);
382  }
383 
384 
385  inline void set_index_xm_eo(svint_t& svindex_xm)
386  {
387  int_t index[VLEN];
388  for (int iy = 0; iy < VLENY; ++iy) {
389  index[VLENX * iy] = iy / 2;
390  for (int ix = 1; ix < VLENX; ++ix) {
391  index[ix + VLENX * iy] = 0;
392  }
393  }
394  svbool_t pg = set_predicate();
395  load_svint(pg, svindex_xm, index);
396  }
397 
398 
399  inline void set_index_xp_eo(svuint_t& svindex_xp)
400  {
401  uint_t index[VLEN];
402  for (int iy = 0; iy < VLENY; ++iy) {
403  index[VLENX - 1 + VLENX * iy] = iy / 2;
404  for (int ix = 0; ix < VLENX - 1; ++ix) {
405  index[ix + VLENX * iy] = 0;
406  }
407  }
408  svbool_t pg = set_predicate();
409  load_svuint(pg, svindex_xp, index);
410  }
411 
412 
413  inline void set_index_xm_eo(svuint_t& svindex_xm)
414  {
415  uint_t index[VLEN];
416  for (int iy = 0; iy < VLENY; ++iy) {
417  index[VLENX * iy] = iy / 2;
418  for (int ix = 1; ix < VLENX; ++ix) {
419  index[ix + VLENX * iy] = 0;
420  }
421  }
422  svbool_t pg = set_predicate();
423  load_svuint(pg, svindex_xm, index);
424  }
425 
426 
427  inline void set_index_yp(svint_t& svindex_yp)
428  {
429  int_t index[VLEN];
430  for (int ix = 0; ix < VLENX; ++ix) {
431  for (int iy = 0; iy < VLENY - 1; ++iy) {
432  index[ix + VLENX * iy] = 0;
433  }
434  index[ix + VLENX * (VLENY - 1)] = ix;
435  }
436  svbool_t pg = set_predicate();
437  load_svint(pg, svindex_yp, index);
438  }
439 
440 
441  inline void set_index_ym(svint_t& svindex_ym)
442  {
443  int_t index[VLEN];
444  for (int ix = 0; ix < VLENX; ++ix) {
445  index[ix] = ix;
446  for (int iy = 1; iy < VLENY; ++iy) {
447  index[ix + VLENX * iy] = 0;
448  }
449  }
450  svbool_t pg = set_predicate();
451  load_svint(pg, svindex_ym, index);
452  }
453 
454 
455  template<typename REALTYPE>
456  inline void shift_vec(svbool_t pg, svuint_t idx,
457  svreal_t& v,
458  const REALTYPE *__restrict xc,
459  const REALTYPE *__restrict xn)
460  {
461  svbool_t pg0 = set_predicate();
462  svreal_t vc, vn;
463  load_vec(pg0, vc, xc);
464  load_vec(pg0, vn, xn);
465  svreal_t vv = svsel(pg, vn, vc);
466  v = svtbl(vv, idx);
467  }
468 
469 
470  template<typename REALTYPE>
471  inline void shift_vec_xbw(svbool_t& pg1, svbool_t& pg2,
472  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
473  {
474  load_vec(pg1, v, &wx[1]);
475  load_add(pg2, v, &wn[-VLENX + 1]);
476  }
477 
478 
479  template<typename REALTYPE>
480  inline void shift_vec_xbw(svbool_t& pg1, svbool_t& pg2, svbool_t& pg3,
481  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
482  {
483  load_vec(pg3, v, &wx[0]);
484  load_add(pg1, v, &wx[1]);
485  load_add(pg2, v, &wn[-VLENX + 1]);
486  }
487 
488 
489  template<typename REALTYPE>
490  inline void shift_vec_xfw(svbool_t& pg1, svbool_t& pg2,
491  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
492  {
493  load_vec(pg1, v, &wx[-1]);
494  load_add(pg2, v, &wn[VLENX - 1]);
495  }
496 
497 
498  template<typename REALTYPE>
499  inline void shift_vec_xfw(svbool_t& pg1, svbool_t& pg2, svbool_t& pg3,
500  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
501  {
502  load_vec(pg3, v, &wx[0]);
503  load_add(pg1, v, &wx[-1]);
504  load_add(pg2, v, &wn[VLENX - 1]);
505  }
506 
507 
508  template<typename REALTYPE>
509  inline void shift_vec_xfw(svbool_t& pg1, svbool_t& pg2,
510  Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
511  int Nin)
512  {
513  svbool_t pg = set_predicate();
514  for (int in = 0; in < Nin; ++in) {
515  svreal_t vt;
516  load_vec(pg1, vt, &wx[VLEN * in - 1]);
517  load_add(pg2, vt, &wn[VLEN * in + VLENX - 1]);
518  svst1(pg, &x[in].v[0], vt);
519  }
520  }
521 
522 
523  template<typename REALTYPE>
524  inline void shift_vec_ybw(svbool_t& pg1, svbool_t& pg2,
525  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
526  {
527  load_vec(pg1, v, &wx[VLENX]);
528  load_add(pg2, v, &wn[-VLENX * (VLENY - 1)]);
529  }
530 
531 
532  template<typename REALTYPE>
533  inline void shift_vec_yfw(svbool_t& pg1, svbool_t& pg2,
534  svreal_t& v, REALTYPE *wx, REALTYPE *wn)
535  {
536  load_vec(pg1, v, &wx[-VLENX]);
537  load_add(pg2, v, &wn[VLENX * (VLENY - 1)]);
538  }
539 
540 
541  template<typename REALTYPE>
542  inline void shift_vec_ybw(svreal_t& v, REALTYPE *wx, REALTYPE *wn)
543  {
544  svbool_t pg = set_predicate();
545  svreal_t v1, v2;
546  load_vec(pg, v1, &wx[0]);
547  load_vec(pg, v2, &wn[0]);
548  v = svext(v1, v2, VLENX);
549  }
550 
551 
552  template<typename REALTYPE>
553  inline void shift_vec_yfw(svreal_t& v, REALTYPE *wx, REALTYPE *wn)
554  {
555  svbool_t pg = set_predicate();
556  svreal_t v1, v2;
557  load_vec(pg, v1, &wx[0]);
558  load_vec(pg, v2, &wn[0]);
559  v = svext(v2, v1, VLENX * (VLENY - 1));
560  }
561 
562 
563  template<typename REALTYPE>
564  inline void shift_vec_yfw(svbool_t& pg1, svbool_t& pg2,
565  Vsimd_t *x, REALTYPE *wx, REALTYPE *wn,
566  int Nin)
567  {
568  svbool_t pg = set_predicate();
569  for (int in = 0; in < Nin; ++in) {
570  svreal_t vt;
571  load_vec(pg1, vt, &wx[VLEN * in - VLENX]);
572  load_add(pg2, vt, &wn[VLEN * in + VLENX * (VLENY - 1)]);
573  svst1(pg, &x[in].v[0], vt);
574  }
575  }
576 
577 
578  template<typename REALTYPE>
579  inline void shift_vec0_xbw(REALTYPE *v, REALTYPE *w, int Nin)
580  {
581  for (int in = 0; in < Nin; ++in) {
582  for (int kx = 0; kx < VLENX - 1; ++kx) {
583  for (int ky = 0; ky < VLENY; ++ky) {
584  v[kx + VLENX * ky + VLEN * in] = w[kx + 1 + VLENX * ky + VLEN * in];
585  }
586  }
587  int kx = VLENX - 1;
588  for (int ky = 0; ky < VLENY; ++ky) {
589  v[kx + VLENX * ky + VLEN * in] = 0.0;
590  }
591  }
592  }
593 
594 
595  template<typename REALTYPE>
596  inline void shift_vec0_xfw(REALTYPE *v, REALTYPE *w, int Nin)
597  {
598  for (int in = 0; in < Nin; ++in) {
599  for (int kx = 1; kx < VLENX; ++kx) {
600  for (int ky = 0; ky < VLENY; ++ky) {
601  v[kx + VLENX * ky + VLEN * in] = w[kx - 1 + VLENX * ky + VLEN * in];
602  }
603  }
604  for (int ky = 0; ky < VLENY; ++ky) {
605  v[0 + VLENX * ky + VLEN * in] = 0.0;
606  }
607  }
608  }
609 
610 
611  template<typename REALTYPE>
612  inline void shift_vec0_ybw(REALTYPE *v, REALTYPE *w, int Nin)
613  {
614  for (int in = 0; in < Nin; ++in) {
615  for (int kx = 0; kx < VLENX; ++kx) {
616  for (int ky = 0; ky < VLENY - 1; ++ky) {
617  v[kx + VLENX * ky + VLEN * in] = w[kx + VLENX * (ky + 1) + VLEN * in];
618  }
619  }
620  int ky = VLENY - 1;
621  for (int kx = 0; kx < VLENX; ++kx) {
622  v[kx + VLENX * ky + VLEN * in] = 0.0;
623  }
624  }
625  }
626 
627 
628  template<typename REALTYPE>
629  inline void shift_vec0_yfw(REALTYPE *v, REALTYPE *w, int Nin)
630  {
631  for (int in = 0; in < Nin; ++in) {
632  for (int kx = 0; kx < VLENX; ++kx) {
633  for (int ky = 1; ky < VLENY; ++ky) {
634  v[kx + VLENX * ky + VLEN * in] = w[kx + VLENX * (ky - 1) + VLEN * in];
635  }
636  }
637  int ky = 0;
638  for (int kx = 0; kx < VLENX; ++kx) {
639  v[kx + VLENX * ky + VLEN * in] = 0.0;
640  }
641  }
642  }
643 
644 
645  template<typename REALTYPE>
646  inline void shift_vec1_xbw(Vsimd_t *x, REALTYPE *buf, int Nin)
647  {
648  for (int in = 0; in < Nin; ++in) {
649  for (int kx = 0; kx < VLENX - 1; ++kx) {
650  for (int ky = 0; ky < VLENY; ++ky) {
651  x[in].v[kx + VLENX * ky] = 0.0;
652  }
653  }
654  int kx = VLENX - 1;
655  for (int ky = 0; ky < VLENY; ++ky) {
656  x[in].v[kx + VLENX * ky] = buf[ky + VLENY * in];
657  }
658  }
659  }
660 
661 
662  template<typename REALTYPE>
663  inline void shift_vec1_xfw(Vsimd_t *x, REALTYPE *buf, int Nin)
664  {
665  for (int in = 0; in < Nin; ++in) {
666  for (int kx = 1; kx < VLENX; ++kx) {
667  for (int ky = 0; ky < VLENY; ++ky) {
668  x[in].v[kx + VLENX * ky] = 0.0;
669  }
670  }
671  for (int ky = 0; ky < VLENY; ++ky) {
672  x[in].v[0 + VLENX * ky] = buf[ky + VLENY * in];
673  }
674  }
675  }
676 
677 
678  template<typename REALTYPE>
679  inline void shift_vec1_ybw(Vsimd_t *v, REALTYPE *buf, int Nin)
680  {
681  for (int in = 0; in < Nin; ++in) {
682  for (int kx = 0; kx < VLENX; ++kx) {
683  for (int ky = 0; ky < VLENY - 1; ++ky) {
684  v[in].v[kx + VLENX * ky] = 0.0;
685  }
686  }
687  int ky = VLENY - 1;
688  for (int kx = 0; kx < VLENX; ++kx) {
689  v[in].v[kx + VLENX * ky] = buf[kx + VLENX * in];
690  }
691  }
692  }
693 
694 
695  template<typename REALTYPE>
696  inline void shift_vec1_yfw(Vsimd_t *v, REALTYPE *buf, int Nin)
697  {
698  for (int in = 0; in < Nin; ++in) {
699  for (int kx = 0; kx < VLENX; ++kx) {
700  for (int ky = 1; ky < VLENY; ++ky) {
701  v[in].v[kx + VLENX * ky] = 0.0;
702  }
703  }
704  int ky = 0;
705  for (int kx = 0; kx < VLENX; ++kx) {
706  v[in].v[kx + VLENX * ky] = buf[kx + VLENX * in];
707  }
708  }
709  }
710 
711 
712  template<typename REALTYPE>
713  inline void shift_vec2_xbw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
714  {
715  for (int in = 0; in < Nin; ++in) {
716  for (int kx = 0; kx < VLENX - 1; ++kx) {
717  for (int ky = 0; ky < VLENY; ++ky) {
718  v[kx + VLENX * ky + VLEN * in] = w[kx + 1 + VLENX * ky + VLEN * in];
719  }
720  }
721  int kx = VLENX - 1;
722  for (int ky = 0; ky < VLENY; ++ky) {
723  v[kx + VLENX * ky + VLEN * in] = y[0 + VLENX * ky + VLEN * in];
724  }
725  }
726  }
727 
728 
729  template<typename REALTYPE>
730  inline void shift_vec2_xfw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
731  {
732  for (int in = 0; in < Nin; ++in) {
733  for (int kx = 1; kx < VLENX; ++kx) {
734  for (int ky = 0; ky < VLENY; ++ky) {
735  v[kx + VLENX * ky + VLEN * in] = w[kx - 1 + VLENX * ky + VLEN * in];
736  }
737  }
738  for (int ky = 0; ky < VLENY; ++ky) {
739  v[0 + VLENX * ky + VLEN * in] = y[VLENX - 1 + VLENX * ky + VLEN * in];
740  }
741  }
742  }
743 
744 
745  template<typename REALTYPE>
746  inline void shift_vec2_xbw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
747  {
748  for (int in = 0; in < Nin; ++in) {
749  for (int kx = 0; kx < VLENX - 1; ++kx) {
750  for (int ky = 0; ky < VLENY; ++ky) {
751  v[in].v[kx + VLENX * ky] = w[kx + 1 + VLENX * ky + VLEN * in];
752  }
753  }
754  int kx = VLENX - 1;
755  for (int ky = 0; ky < VLENY; ++ky) {
756  v[in].v[kx + VLENX * ky] = y[0 + VLENX * ky + VLEN * in];
757  }
758  }
759  }
760 
761 
762  template<typename REALTYPE>
763  inline void shift_vec2_xbw_eo(Vsimd_t *v, REALTYPE *w, REALTYPE *y,
764  int ieo, int Nin)
765  {
766  for (int in = 0; in < Nin; ++in) {
767  for (int ky = 0; ky < VLENY; ++ky) {
768  if ((ky % 2) == ieo) {
769  for (int kx = 0; kx < VLENX; ++kx) {
770  v[in].v[kx + VLENX * ky] = w[kx + VLENX * ky + VLEN * in];
771  }
772  } else {
773  for (int kx = 0; kx < VLENX - 1; ++kx) {
774  v[in].v[kx + VLENX * ky] = w[kx + 1 + VLENX * ky + VLEN * in];
775  }
776  v[in].v[VLENX - 1 + VLENX * ky] = y[0 + VLENX * ky + VLEN * in];
777  }
778  }
779  }
780  }
781 
782 
783  template<typename REALTYPE>
784  inline void shift_vec2_xfw_eo(Vsimd_t *v, REALTYPE *w, REALTYPE *y,
785  int ieo, int Nin)
786  {
787  for (int in = 0; in < Nin; ++in) {
788  for (int ky = 0; ky < VLENY; ++ky) {
789  if ((ky % 2) == ieo) {
790  for (int kx = 1; kx < VLENX; ++kx) {
791  v[in].v[kx + VLENX * ky] = w[kx - 1 + VLENX * ky + VLEN * in];
792  }
793  v[in].v[0 + VLENX * ky] = y[VLENX - 1 + VLENX * ky + VLEN * in];
794  } else {
795  for (int kx = 0; kx < VLENX; ++kx) {
796  v[in].v[kx + VLENX * ky] = w[kx + VLENX * ky + VLEN * in];
797  }
798  }
799  }
800  }
801  }
802 
803 
804  template<typename REALTYPE>
805  inline void shift_vec2_xfw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
806  {
807  for (int in = 0; in < Nin; ++in) {
808  for (int kx = 1; kx < VLENX; ++kx) {
809  for (int ky = 0; ky < VLENY; ++ky) {
810  v[in].v[kx + VLENX * ky] = w[kx - 1 + VLENX * ky + VLEN * in];
811  }
812  }
813  for (int ky = 0; ky < VLENY; ++ky) {
814  v[in].v[0 + VLENX * ky] = y[VLENX - 1 + VLENX * ky + VLEN * in];
815  }
816  }
817  }
818 
819 
820  template<typename REALTYPE>
821  inline void shift_vec2_ybw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
822  {
823  for (int in = 0; in < Nin; ++in) {
824  for (int kx = 0; kx < VLENX; ++kx) {
825  for (int ky = 0; ky < VLENY - 1; ++ky) {
826  v[kx + VLENX * ky + VLEN * in] = w[kx + VLENX * (ky + 1) + VLEN * in];
827  }
828  }
829  int ky = VLENY - 1;
830  for (int kx = 0; kx < VLENX; ++kx) {
831  v[kx + VLENX * ky + VLEN * in] = y[kx + VLENX * 0 + VLEN * in];
832  }
833  }
834  }
835 
836 
837  template<typename REALTYPE>
838  inline void shift_vec2_yfw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
839  {
840  for (int in = 0; in < Nin; ++in) {
841  for (int kx = 0; kx < VLENX; ++kx) {
842  for (int ky = 1; ky < VLENY; ++ky) {
843  v[kx + VLENX * ky + VLEN * in] = w[kx + VLENX * (ky - 1) + VLEN * in];
844  }
845  }
846  int ky = 0;
847  for (int kx = 0; kx < VLENX; ++kx) {
848  v[kx + VLENX * ky + VLEN * in] = y[kx + VLENX * (VLENY - 1) + VLEN * in];
849  }
850  }
851  }
852 
853 
854  template<typename REALTYPE>
855  inline void shift_vec2_ybw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
856  {
857  for (int in = 0; in < Nin; ++in) {
858  for (int kx = 0; kx < VLENX; ++kx) {
859  for (int ky = 0; ky < VLENY - 1; ++ky) {
860  v[in].v[kx + VLENX * ky] = w[kx + VLENX * (ky + 1) + VLEN * in];
861  }
862  }
863  int ky = VLENY - 1;
864  for (int kx = 0; kx < VLENX; ++kx) {
865  v[in].v[kx + VLENX * ky] = y[kx + VLENX * 0 + VLEN * in];
866  }
867  }
868  }
869 
870 
871  template<typename REALTYPE>
872  inline void shift_vec2_yfw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
873  {
874  for (int in = 0; in < Nin; ++in) {
875  for (int kx = 0; kx < VLENX; ++kx) {
876  for (int ky = 1; ky < VLENY; ++ky) {
877  v[in].v[kx + VLENX * ky] = w[kx + VLENX * (ky - 1) + VLEN * in];
878  }
879  }
880  int ky = 0;
881  for (int kx = 0; kx < VLENX; ++kx) {
882  v[in].v[kx + VLENX * ky] = y[kx + VLENX * (VLENY - 1) + VLEN * in];
883  }
884  }
885  }
886 
887 
// The following definitions are obsolete and are to be discarded.
889 
890  template<typename REALTYPE>
891  inline void shift_vec0_bw(REALTYPE *v, REALTYPE *w, int Nin)
892  {
893  for (int in = 0; in < Nin; ++in) {
894  for (int k = 0; k < VLEN - 1; ++k) {
895  v[k + VLEN * in] = w[k + 1 + VLEN * in];
896  }
897  v[VLEN - 1 + VLEN * in] = 0.0;
898  }
899  }
900 
901 
902  template<typename REALTYPE>
903  inline void shift_vec0_fw(REALTYPE *v, REALTYPE *w, int Nin)
904  {
905  for (int in = 0; in < Nin; ++in) {
906  for (int k = 1; k < VLEN; ++k) {
907  v[k + VLEN * in] = w[k - 1 + VLEN * in];
908  }
909  v[0 + VLEN * in] = 0.0;
910  }
911  }
912 
913 
914  template<typename REALTYPE>
915  inline void shift_vec1_bw(Vsimd_t *x, REALTYPE *buf, int Nin)
916  {
917  for (int in = 0; in < Nin; ++in) {
918  for (int k = 0; k < VLEN - 1; ++k) {
919  x[in].v[k] = 0.0;
920  }
921  x[in].v[VLEN - 1] = buf[in];
922  }
923  }
924 
925 
926  template<typename REALTYPE>
927  inline void shift_vec1_fw(Vsimd_t *x, REALTYPE *buf, int Nin)
928  {
929  for (int in = 0; in < Nin; ++in) {
930  for (int k = 1; k < VLEN; ++k) {
931  x[in].v[k] = 0.0;
932  }
933  x[in].v[0] = buf[in];
934  }
935  }
936 
937 
938  template<typename REALTYPE>
939  inline void shift_vec1_bw(REALTYPE *v, REALTYPE *w, REALTYPE *buf, int Nin)
940  {
941  for (int in = 0; in < Nin; ++in) {
942  for (int k = 0; k < VLEN - 1; ++k) {
943  v[k + VLEN * in] = w[k + 1 + VLEN * in];
944  v[k + VLEN * in] = w[k + 1 + VLEN * in];
945  }
946  v[VLEN - 1 + VLEN * in] = buf[in];
947  }
948  }
949 
950 
951  template<typename REALTYPE>
952  inline void shift_vec1_fw(REALTYPE *v, REALTYPE *w, REALTYPE *buf, int Nin)
953  {
954  for (int in = 0; in < Nin; ++in) {
955  for (int k = 1; k < VLEN; ++k) {
956  v[k + VLEN * in] = w[k - 1 + VLEN * in];
957  }
958  v[0 + VLEN * in] = buf[in];
959  }
960  }
961 
962 
963  template<typename REALTYPE>
964  inline void shift_vec2_bw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
965  {
966  for (int in = 0; in < Nin; ++in) {
967  for (int k = 0; k < VLEN - 1; ++k) {
968  v[k + VLEN * in] = w[k + 1 + VLEN * in];
969  }
970  v[VLEN - 1 + VLEN * in] = y[0 + VLEN * in];
971  }
972  }
973 
974 
975  template<typename REALTYPE>
976  inline void shift_vec2_fw(REALTYPE *v, REALTYPE *w, REALTYPE *y, int Nin)
977  {
978  for (int in = 0; in < Nin; ++in) {
979  for (int k = 1; k < VLEN; ++k) {
980  v[k + VLEN * in] = w[k - 1 + VLEN * in];
981  }
982  v[0 + VLEN * in] = y[VLEN - 1 + VLEN * in];
983  }
984  }
985 
986 
987  template<typename REALTYPE>
988  inline void shift_vec2_bw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
989  {
990  for (int in = 0; in < Nin; ++in) {
991  for (int k = 0; k < VLEN - 1; ++k) {
992  v[in].v[k] = w[k + 1 + VLEN * in];
993  }
994  v[in].v[VLEN - 1] = y[0 + VLEN * in];
995  }
996  }
997 
998 
999  template<typename REALTYPE>
1000  inline void shift_vec2_fw(Vsimd_t *v, REALTYPE *w, REALTYPE *y, int Nin)
1001  {
1002  for (int in = 0; in < Nin; ++in) {
1003  for (int k = 1; k < VLEN; ++k) {
1004  v[in].v[k] = w[k - 1 + VLEN * in];
1005  }
1006  v[in].v[0] = y[VLEN - 1 + VLEN * in];
1007  }
1008  }
1009 
1010 
1011  template<typename REALTYPE>
1012  inline void load_vec1(REALTYPE *vt, REALTYPE *v, int k, int Nin)
1013  {
1014  for (int in = 0; in < Nin; ++in) {
1015  vt[in] = v[k + VLEN * in];
1016  }
1017  }
1018 
1019 
1020  template<typename REALTYPE>
1021  inline void save_vec1(REALTYPE *x, Vsimd_t *vt, int k, int Nin)
1022  {
1023  for (int in = 0; in < Nin; ++in) {
1024  x[in] = vt[in].v[k];
1025  }
1026  }
1027 
1028 
1029  inline svreal_t compact_vec(svbool_t pg, svreal_t& yt)
1030  {
1031  return svcompact(pg, yt);
1032  }
1033 
1034 
1035  template<typename REALTYPE>
1036  inline void load_add_gather(svbool_t pg2, svreal_t& vt, REALTYPE *v,
1037  svuint_t& index, int skip)
1038  {
1039  svbool_t pg1 = set_predicate_whilelt(skip);
1040  svreal_t v1, v2;
1041  load_vec(pg1, v1, v);
1042  v2 = svtbl(v1, index);
1043  vt = svsel(pg2, v2, vt);
1044  }
1045 } // end of nameless namespace
1046 
1047 #endif
1048 //============================================================END=====
VLEN
#define VLEN
Definition: bridgeQXS_Clover_coarse_double.cpp:12
uint_t
unsigned int uint_t
Definition: vsimd_double-inc.h:13
Vsimd_t
Definition: vsimd_double-inc.h:13
Isimd_t
Definition: vsimd_double-inc.h:20
int_t
int int_t
Definition: vsimd_double-inc.h:41
real_t
double real_t
Definition: bridgeQXS_Clover_coarse_double.cpp:16
Vsimd_t::v
double v[VLEND]
Definition: vsimd_double-inc.h:15
AIndex_eo_qxs::idx
int idx(const int in, const int Nin, const int ist, const int Nx2, const int Ny, const int leo, const int Nvol2, const int ex)
Definition: aindex_eo.h:27
Usimd_t
Definition: vsimd_double-inc.h:25
VLENY
#define VLENY
Definition: bridgeQXS_Clover_coarse_double.cpp:14
svbool_t
Definition: vsimd_double-inc.h:30
VLENX
#define VLENX
Definition: bridgeQXS_Clover_coarse_double.cpp:13