Bridge++  Ver. 2.0.2
mult_Wilson_eo_parts_qxs-inc.h
Go to the documentation of this file.
1 
10 #ifndef MULT_WILSON_EO_PARTS_QXS_H
11 #define MULT_WILSON_EO_PARTS_QXS_H
12 
13 namespace {
14 //====================================================================
15  template<typename REALTYPE>
16  inline void mult_wilson_eo_xp1(svbool_t& pg2, svint_t& svidx,
17  REALTYPE *__restrict buf,
18  REALTYPE *__restrict v1)
19  {
20  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
21  svreal_t v1r, v1i, v2r, v2i;
22 
23  for (int ic = 0; ic < NC; ++ic) {
24  int icr = ND * 2 * ic;
25  int ici = ND * 2 * ic + 1;
26 
27  load_vec(pg2, w1r, &v1[VLEN * (icr + ID1)]);
28  load_vec(pg2, w1i, &v1[VLEN * (ici + ID1)]);
29  load_vec(pg2, w2r, &v1[VLEN * (icr + ID2)]);
30  load_vec(pg2, w2i, &v1[VLEN * (ici + ID2)]);
31  load_vec(pg2, w3r, &v1[VLEN * (icr + ID3)]);
32  load_vec(pg2, w3i, &v1[VLEN * (ici + ID3)]);
33  load_vec(pg2, w4r, &v1[VLEN * (icr + ID4)]);
34  load_vec(pg2, w4i, &v1[VLEN * (ici + ID4)]);
35 
36  add_vec(pg2, v1r, w1r, w4i);
37  sub_vec(pg2, v1i, w1i, w4r);
38  add_vec(pg2, v2r, w2r, w3i);
39  sub_vec(pg2, v2i, w2i, w3r);
40 
41  int skip = (VLENY + 1) / 2;
42  save_vec_scatter(pg2, &buf[skip * (2 * ic)], v1r, svidx);
43  save_vec_scatter(pg2, &buf[skip * (2 * ic + 1)], v1i, svidx);
44  save_vec_scatter(pg2, &buf[skip * (2 * ic + NVC)], v2r, svidx);
45  save_vec_scatter(pg2, &buf[skip * (2 * ic + 1 + NVC)], v2i, svidx);
46  }
47  }
48 
49 
50 //====================================================================
51  template<typename REALTYPE>
52  inline void mult_wilson_eo_xp1(svbool_t& pg2,
53  REALTYPE *__restrict buf,
54  REALTYPE *__restrict v1)
55  {
56  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
57  svreal_t v1r, v1i, v2r, v2i;
58 
59  for (int ic = 0; ic < NC; ++ic) {
60  int icr = ND * 2 * ic;
61  int ici = ND * 2 * ic + 1;
62 
63  load_vec(pg2, w1r, &v1[VLEN * (icr + ID1)]);
64  load_vec(pg2, w1i, &v1[VLEN * (ici + ID1)]);
65  load_vec(pg2, w2r, &v1[VLEN * (icr + ID2)]);
66  load_vec(pg2, w2i, &v1[VLEN * (ici + ID2)]);
67  load_vec(pg2, w3r, &v1[VLEN * (icr + ID3)]);
68  load_vec(pg2, w3i, &v1[VLEN * (ici + ID3)]);
69  load_vec(pg2, w4r, &v1[VLEN * (icr + ID4)]);
70  load_vec(pg2, w4i, &v1[VLEN * (ici + ID4)]);
71 
72  add_vec(pg2, v1r, w1r, w4i);
73  sub_vec(pg2, v1i, w1i, w4r);
74  add_vec(pg2, v2r, w2r, w3i);
75  sub_vec(pg2, v2i, w2i, w3r);
76 
77  v1r = compact_vec(pg2, v1r);
78  v1i = compact_vec(pg2, v1i);
79  v2r = compact_vec(pg2, v2r);
80  v2i = compact_vec(pg2, v2i);
81 
82  int skip = (VLENY + 1) / 2;
83  svbool_t pg1 = set_predicate_whilelt(skip);
84  save_vec(pg1, &buf[skip * (2 * ic)], v1r);
85  save_vec(pg1, &buf[skip * (2 * ic + 1)], v1i);
86  save_vec(pg1, &buf[skip * (2 * ic + NVC)], v2r);
87  save_vec(pg1, &buf[skip * (2 * ic + 1 + NVC)], v2i);
88  }
89  }
90 
91 
92 //====================================================================
93  template<typename REALTYPE>
94  inline void set_sp2_xp2(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
95  svbool_t& pg3,
96  svreal_t& vt1r, svreal_t& vt1i,
97  svreal_t& vt2r, svreal_t& vt2i,
98  REALTYPE *__restrict v,
99  REALTYPE *__restrict buf,
100  svint_t& index, int ic)
101  {
102  int icr = ND * 2 * ic;
103  int ici = ND * 2 * ic + 1;
104 
105  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
106  load_vec(pg3, w1r, &v[VLEN * (icr + ID1)]);
107  load_vec(pg3, w1i, &v[VLEN * (ici + ID1)]);
108  load_vec(pg3, w2r, &v[VLEN * (icr + ID2)]);
109  load_vec(pg3, w2i, &v[VLEN * (ici + ID2)]);
110 
111  load_add(pg1, w1r, &v[VLEN * (icr + ID1) + 1]);
112  load_add(pg1, w1i, &v[VLEN * (ici + ID1) + 1]);
113  load_add(pg1, w2r, &v[VLEN * (icr + ID2) + 1]);
114  load_add(pg1, w2i, &v[VLEN * (ici + ID2) + 1]);
115 
116  load_vec(pg3, w3r, &v[VLEN * (icr + ID3)]);
117  load_vec(pg3, w3i, &v[VLEN * (ici + ID3)]);
118  load_vec(pg3, w4r, &v[VLEN * (icr + ID4)]);
119  load_vec(pg3, w4i, &v[VLEN * (ici + ID4)]);
120 
121  svbool_t pg13 = sveor_z(pg, pg1, pg3);
122 
123  load_add(pg1, w3r, &v[VLEN * (icr + ID3) + 1]);
124  load_add(pg1, w3i, &v[VLEN * (ici + ID3) + 1]);
125  load_add(pg1, w4r, &v[VLEN * (icr + ID4) + 1]);
126  load_add(pg1, w4i, &v[VLEN * (ici + ID4) + 1]);
127 
128  add_vec(pg13, vt1r, w1r, w4i);
129  sub_vec(pg13, vt1i, w1i, w4r);
130  add_vec(pg13, vt2r, w2r, w3i);
131  sub_vec(pg13, vt2i, w2i, w3r);
132 
133  int skip = (VLENY + 1) / 2;
134  load_add_gather(pg2, vt1r, &buf[skip * (2 * ic)], index);
135  load_add_gather(pg2, vt1i, &buf[skip * (2 * ic + 1)], index);
136  load_add_gather(pg2, vt2r, &buf[skip * (2 * ic + NVC)], index);
137  load_add_gather(pg2, vt2i, &buf[skip * (2 * ic + 1 + NVC)], index);
138  }
139 
140 
141 //====================================================================
142  template<typename REALTYPE>
143  inline void set_sp2_xp2(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
144  svbool_t& pg3,
145  svreal_t& vt1r, svreal_t& vt1i,
146  svreal_t& vt2r, svreal_t& vt2i,
147  REALTYPE *__restrict v,
148  REALTYPE *__restrict buf,
149  svuint_t& idx, int ic)
150  {
151  int icr = ND * 2 * ic;
152  int ici = ND * 2 * ic + 1;
153 
154  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
155  load_vec(pg3, w1r, &v[VLEN * (icr + ID1)]);
156  load_vec(pg3, w1i, &v[VLEN * (ici + ID1)]);
157  load_vec(pg3, w2r, &v[VLEN * (icr + ID2)]);
158  load_vec(pg3, w2i, &v[VLEN * (ici + ID2)]);
159 
160  load_add(pg1, w1r, &v[VLEN * (icr + ID1) + 1]);
161  load_add(pg1, w1i, &v[VLEN * (ici + ID1) + 1]);
162  load_add(pg1, w2r, &v[VLEN * (icr + ID2) + 1]);
163  load_add(pg1, w2i, &v[VLEN * (ici + ID2) + 1]);
164 
165  load_vec(pg3, w3r, &v[VLEN * (icr + ID3)]);
166  load_vec(pg3, w3i, &v[VLEN * (ici + ID3)]);
167  load_vec(pg3, w4r, &v[VLEN * (icr + ID4)]);
168  load_vec(pg3, w4i, &v[VLEN * (ici + ID4)]);
169 
170  svbool_t pg13 = sveor_z(pg, pg1, pg3);
171 
172  load_add(pg1, w3r, &v[VLEN * (icr + ID3) + 1]);
173  load_add(pg1, w3i, &v[VLEN * (ici + ID3) + 1]);
174  load_add(pg1, w4r, &v[VLEN * (icr + ID4) + 1]);
175  load_add(pg1, w4i, &v[VLEN * (ici + ID4) + 1]);
176 
177  add_vec(pg13, vt1r, w1r, w4i);
178  sub_vec(pg13, vt1i, w1i, w4r);
179  add_vec(pg13, vt2r, w2r, w3i);
180  sub_vec(pg13, vt2i, w2i, w3r);
181 
182  int skip = (VLENY + 1) / 2;
183  load_add_gather(pg2, vt1r, &buf[skip * (2 * ic)], idx, skip);
184  load_add_gather(pg2, vt1i, &buf[skip * (2 * ic + 1)], idx, skip);
185  load_add_gather(pg2, vt2r, &buf[skip * (2 * ic + NVC)], idx, skip);
186  load_add_gather(pg2, vt2i, &buf[skip * (2 * ic + 1 + NVC)], idx, skip);
187  }
188 
189 
190 //====================================================================
191  template<typename REALTYPE>
192  inline void mult_wilson_eo_xp2(svbool_t& pg1, svbool_t& pg2,
193  svbool_t& pg3, svuint_t& svidx,
194  Vsimd_t *v2, REALTYPE *u,
195  REALTYPE *v1, REALTYPE *buf)
196  {
197  svbool_t pg = set_predicate();
198 
199  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
200  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
201 
202  set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
203  set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
204  set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
205 
206  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
207  svreal_t wt1r, wt1i, wt2r, wt2i;
208 
209  for (int ic = 0; ic < NC; ++ic) {
210  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
211  &u[VLEN * (2 * ic)]);
212  mult_uv(pg, wt1r, wt1i,
213  ut10, ut11, ut12, ut13, ut14, ut15,
214  vt10, vt11, vt12, vt13, vt14, vt15);
215  mult_uv(pg, wt2r, wt2i,
216  ut10, ut11, ut12, ut13, ut14, ut15,
217  vt20, vt21, vt22, vt23, vt24, vt25);
218  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
219  }
220  }
221 
222 
223 //====================================================================
224  template<typename REALTYPE>
225  inline void mult_wilson_eo_xp2(svbool_t& pg1, svbool_t& pg2,
226  svbool_t& pg3, svint_t& svidx,
227  Vsimd_t *v2, REALTYPE *u,
228  REALTYPE *v1, REALTYPE *buf)
229  {
230  svbool_t pg = set_predicate();
231 
232  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
233  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
234 
235  set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
236  set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
237  set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
238 
239  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
240  svreal_t wt1r, wt1i, wt2r, wt2i;
241 
242  for (int ic = 0; ic < NC; ++ic) {
243  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
244  &u[VLEN * (2 * ic)]);
245  mult_uv(pg, wt1r, wt1i,
246  ut10, ut11, ut12, ut13, ut14, ut15,
247  vt10, vt11, vt12, vt13, vt14, vt15);
248  mult_uv(pg, wt2r, wt2i,
249  ut10, ut11, ut12, ut13, ut14, ut15,
250  vt20, vt21, vt22, vt23, vt24, vt25);
251  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
252  }
253  }
254 
255 
256 //====================================================================
257  template<typename REALTYPE>
258  inline void mult_wilson_eo_xp2(svbool_t& pg1, svbool_t& pg2,
259  svbool_t& pg3, svint_t& svidx,
260  REALTYPE *__restrict v2,
261  REALTYPE *__restrict u,
262  REALTYPE *__restrict v1,
263  REALTYPE *__restrict buf)
264  {
265  svbool_t pg = set_predicate();
266 
267  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
268  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
269 
270  set_sp2_xp2(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
271  set_sp2_xp2(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
272  set_sp2_xp2(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
273 
274  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
275  svreal_t wt1r, wt1i, wt2r, wt2i;
276 
277  for (int ic = 0; ic < NC; ++ic) {
278  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
279  &u[VLEN * (2 * ic)]);
280  mult_uv(pg, wt1r, wt1i,
281  ut10, ut11, ut12, ut13, ut14, ut15,
282  vt10, vt11, vt12, vt13, vt14, vt15);
283  mult_uv(pg, wt2r, wt2i,
284  ut10, ut11, ut12, ut13, ut14, ut15,
285  vt20, vt21, vt22, vt23, vt24, vt25);
286  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
287  }
288  }
289 
290 
291 //====================================================================
292  template<typename REALTYPE>
293  inline void set_sp2_xp(svbool_t& pg, svbool_t& pg1,
294  svbool_t& pg2, svbool_t& pg3,
295  svreal_t& vt1r, svreal_t& vt1i,
296  svreal_t& vt2r, svreal_t& vt2i,
297  REALTYPE *vx, REALTYPE *vn, int ic)
298  {
299  int icr = ND * 2 * ic;
300  int ici = ND * 2 * ic + 1;
301  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
302 
303  shift_vec_xbw(pg1, pg2, pg3, w1r, &vx[VLEN * (icr + ID1)],
304  &vn[VLEN * (icr + ID1)]);
305  shift_vec_xbw(pg1, pg2, pg3, w1i, &vx[VLEN * (ici + ID1)],
306  &vn[VLEN * (ici + ID1)]);
307 
308  shift_vec_xbw(pg1, pg2, pg3, w2r, &vx[VLEN * (icr + ID2)],
309  &vn[VLEN * (icr + ID2)]);
310  shift_vec_xbw(pg1, pg2, pg3, w2i, &vx[VLEN * (ici + ID2)],
311  &vn[VLEN * (ici + ID2)]);
312 
313  shift_vec_xbw(pg1, pg2, pg3, w3r, &vx[VLEN * (icr + ID3)],
314  &vn[VLEN * (icr + ID3)]);
315  shift_vec_xbw(pg1, pg2, pg3, w3i, &vx[VLEN * (ici + ID3)],
316  &vn[VLEN * (ici + ID3)]);
317 
318  shift_vec_xbw(pg1, pg2, pg3, w4r, &vx[VLEN * (icr + ID4)],
319  &vn[VLEN * (icr + ID4)]);
320  shift_vec_xbw(pg1, pg2, pg3, w4i, &vx[VLEN * (ici + ID4)],
321  &vn[VLEN * (ici + ID4)]);
322  add_vec(pg, vt1r, w1r, w4i);
323  sub_vec(pg, vt1i, w1i, w4r);
324  add_vec(pg, vt2r, w2r, w3i);
325  sub_vec(pg, vt2i, w2i, w3r);
326  }
327 
328 
329 //====================================================================
330  template<typename REALTYPE>
331  inline void mult_wilson_eo_xpb(svbool_t& pg1, svbool_t& pg2,
332  svbool_t& pg3, Vsimd_t *v2,
333  REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
334  {
335  svbool_t pg = set_predicate();
336 
337  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
338  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
339 
340  set_sp2_xp(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
341  set_sp2_xp(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
342  set_sp2_xp(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
343 
344  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
345  svreal_t wt1r, wt1i, wt2r, wt2i;
346 
347  for (int ic = 0; ic < NC; ++ic) {
348  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[VLEN * (2 * ic)]);
349  mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
350  vt10, vt11, vt12, vt13, vt14, vt15);
351  mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
352  vt20, vt21, vt22, vt23, vt24, vt25);
353  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
354  }
355  }
356 
357 
358 //====================================================================
359  template<typename REALTYPE>
360  inline void set_sp2_xp(svbool_t& pg, svbool_t& pg1,
361  svuint_t& idx,
362  svreal_t& vt1r, svreal_t& vt1i,
363  svreal_t& vt2r, svreal_t& vt2i,
364  REALTYPE *vx, REALTYPE *vn, int ic)
365  {
366  int icr = ND * 2 * ic;
367  int ici = ND * 2 * ic + 1;
368  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
369 
370  shift_vec(pg1, idx, w1r, &vx[VLEN * (icr + ID1)],
371  &vn[VLEN * (icr + ID1)]);
372  shift_vec(pg1, idx, w1i, &vx[VLEN * (ici + ID1)],
373  &vn[VLEN * (ici + ID1)]);
374 
375  shift_vec(pg1, idx, w2r, &vx[VLEN * (icr + ID2)],
376  &vn[VLEN * (icr + ID2)]);
377  shift_vec(pg1, idx, w2i, &vx[VLEN * (ici + ID2)],
378  &vn[VLEN * (ici + ID2)]);
379 
380  shift_vec(pg1, idx, w3r, &vx[VLEN * (icr + ID3)],
381  &vn[VLEN * (icr + ID3)]);
382  shift_vec(pg1, idx, w3i, &vx[VLEN * (ici + ID3)],
383  &vn[VLEN * (ici + ID3)]);
384 
385  shift_vec(pg1, idx, w4r, &vx[VLEN * (icr + ID4)],
386  &vn[VLEN * (icr + ID4)]);
387  shift_vec(pg1, idx, w4i, &vx[VLEN * (ici + ID4)],
388  &vn[VLEN * (ici + ID4)]);
389 
390  add_vec(pg, vt1r, w1r, w4i);
391  sub_vec(pg, vt1i, w1i, w4r);
392  add_vec(pg, vt2r, w2r, w3i);
393  sub_vec(pg, vt2i, w2i, w3r);
394  }
395 
396 
397 //====================================================================
398  template<typename REALTYPE>
399  inline void mult_wilson_eo_xpb(svbool_t& pg1, svuint_t& idx,
400  Vsimd_t *v2, REALTYPE *u,
401  REALTYPE *v1, REALTYPE *v1n)
402  {
403  svbool_t pg = set_predicate();
404 
405  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
406  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
407 
408  set_sp2_xp(pg, pg1, idx, vt10, vt11, vt20, vt21, v1, v1n, 0);
409  set_sp2_xp(pg, pg1, idx, vt12, vt13, vt22, vt23, v1, v1n, 1);
410  set_sp2_xp(pg, pg1, idx, vt14, vt15, vt24, vt25, v1, v1n, 2);
411 
412  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
413  svreal_t wt1r, wt1i, wt2r, wt2i;
414 
415  for (int ic = 0; ic < NC; ++ic) {
416  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[VLEN * (2 * ic)]);
417  mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
418  vt10, vt11, vt12, vt13, vt14, vt15);
419  mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
420  vt20, vt21, vt22, vt23, vt24, vt25);
421  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
422  }
423  }
424 
425 
426 //====================================================================
427  template<typename REALTYPE>
428  inline void mult_wilson_eo_xpb(svbool_t& pg1, svbool_t& pg2,
429  svbool_t& pg3,
430  REALTYPE *__restrict v2,
431  REALTYPE *__restrict u,
432  REALTYPE *v1, REALTYPE *v1n)
433  {
434  svbool_t pg = set_predicate();
435 
436  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
437  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
438 
439  set_sp2_xp(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
440  set_sp2_xp(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
441  set_sp2_xp(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
442 
443  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
444  svreal_t wt1r, wt1i, wt2r, wt2i;
445 
446  for (int ic = 0; ic < NC; ++ic) {
447  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[VLEN * (2 * ic)]);
448  mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
449  vt10, vt11, vt12, vt13, vt14, vt15);
450  mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
451  vt20, vt21, vt22, vt23, vt24, vt25);
452  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
453  }
454  }
455 
456 
457 //====================================================================
458  template<typename REALTYPE>
459  inline void mult_wilson_eo_xm1(svbool_t& pg2,
460  REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
461  {
462  svbool_t pg = set_predicate();
463 
464  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
465  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
466 
467  set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
468  set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
469  set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
470 
471  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
472  svreal_t wt1r, wt1i, wt2r, wt2i;
473 
474  for (int ic = 0; ic < NC; ++ic) {
475  load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
476  &u[VLEN * NVC * ic]);
477 
478  mult_udv(pg2, wt1r, wt1i,
479  ut10, ut11, ut12, ut13, ut14, ut15,
480  vt10, vt11, vt12, vt13, vt14, vt15);
481  mult_udv(pg2, wt2r, wt2i,
482  ut10, ut11, ut12, ut13, ut14, ut15,
483  vt20, vt21, vt22, vt23, vt24, vt25);
484 
485  wt1r = compact_vec(pg2, wt1r);
486  wt1i = compact_vec(pg2, wt1i);
487  wt2r = compact_vec(pg2, wt2r);
488  wt2i = compact_vec(pg2, wt2i);
489 
490  int skip = (VLENY + 1) / 2;
491  svbool_t pg1 = set_predicate_whilelt(skip);
492  save_vec(pg1, &buf[skip * (2 * ic)], wt1r);
493  save_vec(pg1, &buf[skip * (2 * ic + 1)], wt1i);
494  save_vec(pg1, &buf[skip * (2 * ic + NVC)], wt2r);
495  save_vec(pg1, &buf[skip * (2 * ic + 1 + NVC)], wt2i);
496  }
497  }
498 
499 
500 //====================================================================
501  template<typename REALTYPE>
502  inline void mult_wilson_eo_xm1(svbool_t& pg2, svint_t& svidx,
503  REALTYPE *buf, REALTYPE *u, REALTYPE *v1)
504  {
505  svbool_t pg = set_predicate();
506 
507  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
508  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
509 
510  set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
511  set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
512  set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
513 
514  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
515  svreal_t wt1r, wt1i, wt2r, wt2i;
516 
517  for (int ic = 0; ic < NC; ++ic) {
518  load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
519  &u[VLEN * NVC * ic]);
520 
521  mult_udv(pg2, wt1r, wt1i,
522  ut10, ut11, ut12, ut13, ut14, ut15,
523  vt10, vt11, vt12, vt13, vt14, vt15);
524  mult_udv(pg2, wt2r, wt2i,
525  ut10, ut11, ut12, ut13, ut14, ut15,
526  vt20, vt21, vt22, vt23, vt24, vt25);
527 
528  int skip = (VLENY + 1) / 2;
529  save_vec_scatter(pg2, &buf[skip * (2 * ic)], wt1r, svidx);
530  save_vec_scatter(pg2, &buf[skip * (2 * ic + 1)], wt1i, svidx);
531  save_vec_scatter(pg2, &buf[skip * (2 * ic + NVC)], wt2r, svidx);
532  save_vec_scatter(pg2, &buf[skip * (2 * ic + 1 + NVC)], wt2i, svidx);
533  }
534  }
535 
536 
537 //====================================================================
538  template<typename REALTYPE>
539  inline void set_sp2_xm2(svbool_t& pg1, svbool_t& pg3,
540  svreal_t& vt1r, svreal_t& vt1i,
541  svreal_t& vt2r, svreal_t& vt2i,
542  REALTYPE *vx, int ic)
543  {
544  int icr = ND * 2 * ic;
545  int ici = ND * 2 * ic + 1;
546  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
547 
548  load_vec(pg3, w1r, &vx[VLEN * (icr + ID1)]);
549  load_add(pg1, w1r, &vx[VLEN * (icr + ID1) - 1]);
550 
551  load_vec(pg3, w1i, &vx[VLEN * (ici + ID1)]);
552  load_add(pg1, w1i, &vx[VLEN * (ici + ID1) - 1]);
553 
554  load_vec(pg3, w2r, &vx[VLEN * (icr + ID2)]);
555  load_add(pg1, w2r, &vx[VLEN * (icr + ID2) - 1]);
556 
557  load_vec(pg3, w2i, &vx[VLEN * (ici + ID2)]);
558  load_add(pg1, w2i, &vx[VLEN * (ici + ID2) - 1]);
559 
560  load_vec(pg3, w3r, &vx[VLEN * (icr + ID3)]);
561  load_add(pg1, w3r, &vx[VLEN * (icr + ID3) - 1]);
562 
563  load_vec(pg3, w3i, &vx[VLEN * (ici + ID3)]);
564  load_add(pg1, w3i, &vx[VLEN * (ici + ID3) - 1]);
565 
566  load_vec(pg3, w4r, &vx[VLEN * (icr + ID4)]);
567  load_add(pg1, w4r, &vx[VLEN * (icr + ID4) - 1]);
568 
569  load_vec(pg3, w4i, &vx[VLEN * (ici + ID4)]);
570  load_add(pg1, w4i, &vx[VLEN * (ici + ID4) - 1]);
571 
572  svbool_t pg0 = set_predicate();
573  svbool_t pg13 = sveor_z(pg0, pg1, pg3);
574  sub_vec(pg13, vt1r, w1r, w4i);
575  add_vec(pg13, vt1i, w1i, w4r);
576  sub_vec(pg13, vt2r, w2r, w3i);
577  add_vec(pg13, vt2i, w2i, w3r);
578  }
579 
580 
581 //====================================================================
582  template<typename REALTYPE>
583  inline void mult_wilson_eo_xm2(svbool_t& pg1, svbool_t& pg2,
584  svbool_t& pg3, svint_t& svidx,
585  Vsimd_t *v2, REALTYPE *u,
586  REALTYPE *v1, REALTYPE *buf)
587  {
588  svbool_t pg = set_predicate();
589 
590  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
591  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
592 
593  set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
594  set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
595  set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
596 
597  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
598  svreal_t wt1r, wt1i, wt2r, wt2i;
599 
600  for (int ic = 0; ic < NC; ++ic) {
601  load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
602  &u[VLEN * NVC * ic]);
603  svbool_t pg13 = sveor_z(pg, pg1, pg3);
604  mult_udv(pg13, wt1r, wt1i,
605  ut10, ut11, ut12, ut13, ut14, ut15,
606  vt10, vt11, vt12, vt13, vt14, vt15);
607  mult_udv(pg13, wt2r, wt2i,
608  ut10, ut11, ut12, ut13, ut14, ut15,
609  vt20, vt21, vt22, vt23, vt24, vt25);
610 
611  int skip = (VLENY + 1) / 2;
612  load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)], svidx);
613  load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)], svidx);
614  load_add_gather(pg2, wt2r, &buf[skip * (2 * ic + NVC)], svidx);
615  load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 + NVC)], svidx);
616 
617  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
618  }
619  }
620 
621 
622 //====================================================================
623  template<typename REALTYPE>
624  inline void mult_wilson_eo_xm2(svbool_t& pg1, svbool_t& pg2,
625  svbool_t& pg3, svuint_t& idx,
626  Vsimd_t *v2, REALTYPE *u,
627  REALTYPE *v1, REALTYPE *buf)
628  {
629  svbool_t pg = set_predicate();
630 
631  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
632  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
633 
634  set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
635  set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
636  set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
637 
638  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
639  svreal_t wt1r, wt1i, wt2r, wt2i;
640 
641  for (int ic = 0; ic < NC; ++ic) {
642  load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
643  &u[VLEN * NVC * ic]);
644  svbool_t pg13 = sveor_z(pg, pg1, pg3);
645  mult_udv(pg13, wt1r, wt1i,
646  ut10, ut11, ut12, ut13, ut14, ut15,
647  vt10, vt11, vt12, vt13, vt14, vt15);
648  mult_udv(pg13, wt2r, wt2i,
649  ut10, ut11, ut12, ut13, ut14, ut15,
650  vt20, vt21, vt22, vt23, vt24, vt25);
651 
652  int skip = (VLENY + 1) / 2;
653  load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)], idx, skip);
654  load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)], idx, skip);
655  load_add_gather(pg2, wt2r, &buf[skip * (2 * ic + NVC)], idx, skip);
656  load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 + NVC)], idx, skip);
657 
658  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
659  }
660  }
661 
662 
663 //====================================================================
664  template<typename REALTYPE>
665  inline void mult_wilson_eo_xm2(svbool_t& pg1, svbool_t& pg2,
666  svbool_t& pg3, svint_t& svidx,
667  REALTYPE *__restrict v2,
668  REALTYPE *__restrict u,
669  REALTYPE *__restrict v1,
670  REALTYPE *__restrict buf)
671  {
672  svbool_t pg = set_predicate();
673 
674  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
675  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
676 
677  set_sp2_xm2(pg1, pg3, vt10, vt11, vt20, vt21, v1, 0);
678  set_sp2_xm2(pg1, pg3, vt12, vt13, vt22, vt23, v1, 1);
679  set_sp2_xm2(pg1, pg3, vt14, vt15, vt24, vt25, v1, 2);
680 
681  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
682  svreal_t wt1r, wt1i, wt2r, wt2i;
683 
684  for (int ic = 0; ic < NC; ++ic) {
685  load_udag_xm2_eo(pg1, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
686  &u[VLEN * NVC * ic]);
687  svbool_t pg13 = sveor_z(pg, pg1, pg3);
688  mult_udv(pg13, wt1r, wt1i,
689  ut10, ut11, ut12, ut13, ut14, ut15,
690  vt10, vt11, vt12, vt13, vt14, vt15);
691  mult_udv(pg13, wt2r, wt2i,
692  ut10, ut11, ut12, ut13, ut14, ut15,
693  vt20, vt21, vt22, vt23, vt24, vt25);
694 
695  int skip = (VLENY + 1) / 2;
696  load_add_gather(pg2, wt1r, &buf[skip * (2 * ic)], svidx);
697  load_add_gather(pg2, wt1i, &buf[skip * (2 * ic + 1)], svidx);
698  load_add_gather(pg2, wt2r, &buf[skip * (2 * ic + NVC)], svidx);
699  load_add_gather(pg2, wt2i, &buf[skip * (2 * ic + 1 + NVC)], svidx);
700 
701  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
702  }
703  }
704 
705 
706 //====================================================================
707  template<typename REALTYPE>
708  inline void set_sp2_xm(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
709  svbool_t& pg3,
710  svreal_t& vt1r, svreal_t& vt1i,
711  svreal_t& vt2r, svreal_t& vt2i,
712  REALTYPE *vx, REALTYPE *vn, int ic)
713  {
714  int icr = ND * 2 * ic;
715  int ici = ND * 2 * ic + 1;
716  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
717 
718  shift_vec_xfw(pg1, pg2, pg3, w1r, &vx[VLEN * (icr + ID1)],
719  &vn[VLEN * (icr + ID1)]);
720  shift_vec_xfw(pg1, pg2, pg3, w1i, &vx[VLEN * (ici + ID1)],
721  &vn[VLEN * (ici + ID1)]);
722 
723  shift_vec_xfw(pg1, pg2, pg3, w2r, &vx[VLEN * (icr + ID2)],
724  &vn[VLEN * (icr + ID2)]);
725  shift_vec_xfw(pg1, pg2, pg3, w2i, &vx[VLEN * (ici + ID2)],
726  &vn[VLEN * (ici + ID2)]);
727 
728  shift_vec_xfw(pg1, pg2, pg3, w3r, &vx[VLEN * (icr + ID3)],
729  &vn[VLEN * (icr + ID3)]);
730  shift_vec_xfw(pg1, pg2, pg3, w3i, &vx[VLEN * (ici + ID3)],
731  &vn[VLEN * (ici + ID3)]);
732 
733  shift_vec_xfw(pg1, pg2, pg3, w4r, &vx[VLEN * (icr + ID4)],
734  &vn[VLEN * (icr + ID4)]);
735  shift_vec_xfw(pg1, pg2, pg3, w4i, &vx[VLEN * (ici + ID4)],
736  &vn[VLEN * (ici + ID4)]);
737 
738  sub_vec(pg, vt1r, w1r, w4i);
739  add_vec(pg, vt1i, w1i, w4r);
740  sub_vec(pg, vt2r, w2r, w3i);
741  add_vec(pg, vt2i, w2i, w3r);
742  }
743 
744 
745 //====================================================================
746  template<typename REALTYPE>
747  inline void mult_wilson_eo_xmb(svbool_t& pg1, svbool_t& pg2,
748  svbool_t& pg3, Vsimd_t *v2,
749  REALTYPE *u, REALTYPE *un,
750  REALTYPE *v1, REALTYPE *v1n)
751  {
752  svbool_t pg = set_predicate();
753 
754  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
755  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
756 
757  set_sp2_xm(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
758  set_sp2_xm(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
759  set_sp2_xm(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
760 
761  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
762  svreal_t wt1r, wt1i, wt2r, wt2i;
763 
764  for (int ic = 0; ic < NC; ++ic) {
765  load_udag_xm_eo(pg1, pg2, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
766  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
767 
768  mult_udv(pg, wt1r, wt1i,
769  ut10, ut11, ut12, ut13, ut14, ut15,
770  vt10, vt11, vt12, vt13, vt14, vt15);
771  mult_udv(pg, wt2r, wt2i,
772  ut10, ut11, ut12, ut13, ut14, ut15,
773  vt20, vt21, vt22, vt23, vt24, vt25);
774  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
775  }
776  }
777 
778 
779 //====================================================================
780  template<typename REALTYPE>
781  inline void set_sp2_xm(svbool_t& pg, svbool_t& pg1, svuint_t& idx1,
782  svreal_t& vt1r, svreal_t& vt1i,
783  svreal_t& vt2r, svreal_t& vt2i,
784  REALTYPE *vx, REALTYPE *vn, int ic)
785  {
786  int icr = ND * 2 * ic;
787  int ici = ND * 2 * ic + 1;
788  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
789 
790  shift_vec(pg1, idx1, w1r, &vx[VLEN * (icr + ID1)],
791  &vn[VLEN * (icr + ID1)]);
792  shift_vec(pg1, idx1, w1i, &vx[VLEN * (ici + ID1)],
793  &vn[VLEN * (ici + ID1)]);
794 
795  shift_vec(pg1, idx1, w2r, &vx[VLEN * (icr + ID2)],
796  &vn[VLEN * (icr + ID2)]);
797  shift_vec(pg1, idx1, w2i, &vx[VLEN * (ici + ID2)],
798  &vn[VLEN * (ici + ID2)]);
799 
800  shift_vec(pg1, idx1, w3r, &vx[VLEN * (icr + ID3)],
801  &vn[VLEN * (icr + ID3)]);
802  shift_vec(pg1, idx1, w3i, &vx[VLEN * (ici + ID3)],
803  &vn[VLEN * (ici + ID3)]);
804 
805  shift_vec(pg1, idx1, w4r, &vx[VLEN * (icr + ID4)],
806  &vn[VLEN * (icr + ID4)]);
807  shift_vec(pg1, idx1, w4i, &vx[VLEN * (ici + ID4)],
808  &vn[VLEN * (ici + ID4)]);
809 
810  sub_vec(pg, vt1r, w1r, w4i);
811  add_vec(pg, vt1i, w1i, w4r);
812  sub_vec(pg, vt2r, w2r, w3i);
813  add_vec(pg, vt2i, w2i, w3r);
814  }
815 
816 
817 //====================================================================
818  template<typename REALTYPE>
819  inline void mult_wilson_eo_xmb(svbool_t& pg1, svuint_t& idx1,
820  Vsimd_t *v2,
821  REALTYPE *u, REALTYPE *un,
822  REALTYPE *v1, REALTYPE *v1n)
823  {
824  svbool_t pg = set_predicate();
825 
826  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
827  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
828 
829  set_sp2_xm(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
830  set_sp2_xm(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
831  set_sp2_xm(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
832 
833  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
834  svreal_t wt1r, wt1i, wt2r, wt2i;
835 
836  for (int ic = 0; ic < NC; ++ic) {
837  load_udag(pg1, idx1, ut10, ut11, ut12, ut13, ut14, ut15,
838  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
839 
840  mult_udv(pg, wt1r, wt1i,
841  ut10, ut11, ut12, ut13, ut14, ut15,
842  vt10, vt11, vt12, vt13, vt14, vt15);
843  mult_udv(pg, wt2r, wt2i,
844  ut10, ut11, ut12, ut13, ut14, ut15,
845  vt20, vt21, vt22, vt23, vt24, vt25);
846  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
847  }
848  }
849 
850 
851 //====================================================================
852  template<typename REALTYPE>
853  inline void mult_wilson_eo_xmb(svbool_t& pg1, svbool_t& pg2,
854  svbool_t& pg3,
855  REALTYPE *__restrict v2,
856  REALTYPE *u, REALTYPE *un,
857  REALTYPE *v1, REALTYPE *v1n)
858  {
859  svbool_t pg = set_predicate();
860 
861  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
862  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
863 
864  set_sp2_xm(pg, pg1, pg2, pg3, vt10, vt11, vt20, vt21, v1, v1n, 0);
865  set_sp2_xm(pg, pg1, pg2, pg3, vt12, vt13, vt22, vt23, v1, v1n, 1);
866  set_sp2_xm(pg, pg1, pg2, pg3, vt14, vt15, vt24, vt25, v1, v1n, 2);
867 
868  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
869  svreal_t wt1r, wt1i, wt2r, wt2i;
870 
871  for (int ic = 0; ic < NC; ++ic) {
872  load_udag_xm_eo(pg1, pg2, pg3, ut10, ut11, ut12, ut13, ut14, ut15,
873  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
874 
875  mult_udv(pg, wt1r, wt1i,
876  ut10, ut11, ut12, ut13, ut14, ut15,
877  vt10, vt11, vt12, vt13, vt14, vt15);
878  mult_udv(pg, wt2r, wt2i,
879  ut10, ut11, ut12, ut13, ut14, ut15,
880  vt20, vt21, vt22, vt23, vt24, vt25);
881  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
882  }
883  }
884 } // nameless namespace end
885 
886 #endif
887 //============================================================END=====
ID1
#define ID1
Definition: fopr_Wilson_impl_SU2-inc.h:18
VLEN
#define VLEN
Definition: bridgeQXS_Clover_coarse_double.cpp:12
Vsimd_t
Definition: vsimd_double-inc.h:13
Isimd_t
Definition: vsimd_double-inc.h:20
ID2
#define ID2
Definition: fopr_Wilson_impl_SU2-inc.h:19
ID4
#define ID4
Definition: fopr_Wilson_impl_SU2-inc.h:21
NC
#define NC
Definition: field_F_imp_SU2-inc.h:2
AIndex_eo_qxs::idx
int idx(const int in, const int Nin, const int ist, const int Nx2, const int Ny, const int leo, const int Nvol2, const int ex)
Definition: aindex_eo.h:27
ND
#define ND
Definition: field_F_imp_SU2-inc.h:5
ID3
#define ID3
Definition: fopr_Wilson_impl_SU2-inc.h:20
Usimd_t
Definition: vsimd_double-inc.h:25
VLENY
#define VLENY
Definition: bridgeQXS_Clover_coarse_double.cpp:14
NVC
#define NVC
Definition: fopr_Wilson_impl_SU2-inc.h:15
svbool_t
Definition: vsimd_double-inc.h:30