Bridge++  Ver. 2.0.2
mult_Wilson_parts_qxs-inc.h
Go to the documentation of this file.
1 
10 #ifndef MULT_WILSON_PARTS_QXS_H
11 #define MULT_WILSON_PARTS_QXS_H
12 
13 namespace {
14 //====================================================================
15  template<typename REALTYPE>
16  inline void mult_wilson_xp1(svbool_t& pg2, svint_t& svidx,
17  REALTYPE *__restrict buf,
18  REALTYPE *__restrict v1)
19  {
20  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
21  svreal_t v1r, v1i, v2r, v2i;
22 
23  for (int ic = 0; ic < NC; ++ic) {
24  int icr = ND * 2 * ic;
25  int ici = ND * 2 * ic + 1;
26 
27  load_vec(pg2, w1r, &v1[VLEN * (icr + ID1)]);
28  load_vec(pg2, w1i, &v1[VLEN * (ici + ID1)]);
29  load_vec(pg2, w2r, &v1[VLEN * (icr + ID2)]);
30  load_vec(pg2, w2i, &v1[VLEN * (ici + ID2)]);
31  load_vec(pg2, w3r, &v1[VLEN * (icr + ID3)]);
32  load_vec(pg2, w3i, &v1[VLEN * (ici + ID3)]);
33  load_vec(pg2, w4r, &v1[VLEN * (icr + ID4)]);
34  load_vec(pg2, w4i, &v1[VLEN * (ici + ID4)]);
35 
36  add_vec(pg2, v1r, w1r, w4i);
37  sub_vec(pg2, v1i, w1i, w4r);
38  add_vec(pg2, v2r, w2r, w3i);
39  sub_vec(pg2, v2i, w2i, w3r);
40 
41  save_vec_scatter(pg2, &buf[VLENY * (2 * ic)], v1r, svidx);
42  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + 1)], v1i, svidx);
43  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + NVC)], v2r, svidx);
44  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + 1 + NVC)], v2i, svidx);
45  }
46  }
47 
48 
49 //====================================================================
50  template<typename REALTYPE>
51  inline void set_sp2_xp2(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
52  svreal_t& vt1r, svreal_t& vt1i,
53  svreal_t& vt2r, svreal_t& vt2i,
54  REALTYPE *v, REALTYPE *buf, svint_t& index, int ic)
55  {
56  int icr = ND * 2 * ic;
57  int ici = ND * 2 * ic + 1;
58 
59  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
60  load_vec(pg1, w1r, &v[VLEN * (icr + ID1) + 1]);
61  load_vec(pg1, w1i, &v[VLEN * (ici + ID1) + 1]);
62  load_vec(pg1, w2r, &v[VLEN * (icr + ID2) + 1]);
63  load_vec(pg1, w2i, &v[VLEN * (ici + ID2) + 1]);
64  load_vec(pg1, w3r, &v[VLEN * (icr + ID3) + 1]);
65  load_vec(pg1, w3i, &v[VLEN * (ici + ID3) + 1]);
66  load_vec(pg1, w4r, &v[VLEN * (icr + ID4) + 1]);
67  load_vec(pg1, w4i, &v[VLEN * (ici + ID4) + 1]);
68 
69  add_vec(pg1, vt1r, w1r, w4i);
70  sub_vec(pg1, vt1i, w1i, w4r);
71  add_vec(pg1, vt2r, w2r, w3i);
72  sub_vec(pg1, vt2i, w2i, w3r);
73 
74  load_add_gather(pg2, vt1r, &buf[VLENY * (2 * ic)], index);
75  load_add_gather(pg2, vt1i, &buf[VLENY * (2 * ic + 1)], index);
76  load_add_gather(pg2, vt2r, &buf[VLENY * (2 * ic + NVC)], index);
77  load_add_gather(pg2, vt2i, &buf[VLENY * (2 * ic + 1 + NVC)], index);
78  }
79 
80 
81 //====================================================================
82  template<typename REALTYPE>
83  inline void mult_wilson_xp2(svbool_t& pg1, svbool_t& pg2, svint_t& svidx,
84  Vsimd_t *v2, REALTYPE *u,
85  REALTYPE *v1, REALTYPE *buf)
86  {
87  svbool_t pg = set_predicate();
88 
89  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
90  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
91 
92  set_sp2_xp2(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
93  set_sp2_xp2(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
94  set_sp2_xp2(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
95 
96  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
97  svreal_t wt1r, wt1i, wt2r, wt2i;
98 
99  for (int ic = 0; ic < NC; ++ic) {
100  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
101  &u[VLEN * (2 * ic)]);
102  mult_uv(pg, wt1r, wt1i,
103  ut10, ut11, ut12, ut13, ut14, ut15,
104  vt10, vt11, vt12, vt13, vt14, vt15);
105  mult_uv(pg, wt2r, wt2i,
106  ut10, ut11, ut12, ut13, ut14, ut15,
107  vt20, vt21, vt22, vt23, vt24, vt25);
108  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
109  }
110  }
111 
112 
113 //====================================================================
114  template<typename REALTYPE>
115  inline void mult_wilson_xp2(svbool_t& pg1, svbool_t& pg2, svint_t& svidx,
116  REALTYPE *__restrict v2,
117  REALTYPE *__restrict u,
118  REALTYPE *__restrict v1,
119  REALTYPE *__restrict buf)
120  {
121  svbool_t pg = set_predicate();
122 
123  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
124  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
125 
126  set_sp2_xp2(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, buf, svidx, 0);
127  set_sp2_xp2(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, buf, svidx, 1);
128  set_sp2_xp2(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, buf, svidx, 2);
129 
130  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
131  svreal_t wt1r, wt1i, wt2r, wt2i;
132 
133  for (int ic = 0; ic < NC; ++ic) {
134  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
135  &u[VLEN * (2 * ic)]);
136  mult_uv(pg, wt1r, wt1i,
137  ut10, ut11, ut12, ut13, ut14, ut15,
138  vt10, vt11, vt12, vt13, vt14, vt15);
139  mult_uv(pg, wt2r, wt2i,
140  ut10, ut11, ut12, ut13, ut14, ut15,
141  vt20, vt21, vt22, vt23, vt24, vt25);
142  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
143  }
144  }
145 
146 
147 //====================================================================
148  template<typename REALTYPE>
149  inline void set_sp2_xp(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
150  svreal_t& vt1r, svreal_t& vt1i,
151  svreal_t& vt2r, svreal_t& vt2i,
152  REALTYPE *v, REALTYPE *vn, int ic)
153  {
154  int icr = ND * 2 * ic;
155  int ici = ND * 2 * ic + 1;
156  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
157 
158  shift_vec_xbw(pg1, pg2, w1r, &v[VLEN * (icr + ID1)],
159  &vn[VLEN * (icr + ID1)]);
160  shift_vec_xbw(pg1, pg2, w1i, &v[VLEN * (ici + ID1)],
161  &vn[VLEN * (ici + ID1)]);
162 
163  shift_vec_xbw(pg1, pg2, w2r, &v[VLEN * (icr + ID2)],
164  &vn[VLEN * (icr + ID2)]);
165  shift_vec_xbw(pg1, pg2, w2i, &v[VLEN * (ici + ID2)],
166  &vn[VLEN * (ici + ID2)]);
167 
168  shift_vec_xbw(pg1, pg2, w3r, &v[VLEN * (icr + ID3)],
169  &vn[VLEN * (icr + ID3)]);
170  shift_vec_xbw(pg1, pg2, w3i, &v[VLEN * (ici + ID3)],
171  &vn[VLEN * (ici + ID3)]);
172 
173  shift_vec_xbw(pg1, pg2, w4r, &v[VLEN * (icr + ID4)],
174  &vn[VLEN * (icr + ID4)]);
175  shift_vec_xbw(pg1, pg2, w4i, &v[VLEN * (ici + ID4)],
176  &vn[VLEN * (ici + ID4)]);
177 
178  add_vec(pg, vt1r, w1r, w4i);
179  sub_vec(pg, vt1i, w1i, w4r);
180  add_vec(pg, vt2r, w2r, w3i);
181  sub_vec(pg, vt2i, w2i, w3r);
182  }
183 
184 
185 //====================================================================
186  template<typename REALTYPE>
187  inline void mult_wilson_xpb(svbool_t& pg1, svbool_t& pg2, Vsimd_t *v2,
188  REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
189  {
190  svbool_t pg = set_predicate();
191 
192  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
193  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
194 
195  set_sp2_xp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
196  set_sp2_xp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
197  set_sp2_xp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
198 
199  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
200  svreal_t wt1r, wt1i, wt2r, wt2i;
201 
202  for (int ic = 0; ic < NC; ++ic) {
203  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[VLEN * (2 * ic)]);
204  mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
205  vt10, vt11, vt12, vt13, vt14, vt15);
206  mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
207  vt20, vt21, vt22, vt23, vt24, vt25);
208  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
209  }
210  }
211 
212 
213 //====================================================================
214  template<typename REALTYPE>
215  inline void mult_wilson_xpb(svbool_t& pg1, svbool_t& pg2,
216  REALTYPE *__restrict v2,
217  REALTYPE *u, REALTYPE *v1, REALTYPE *v1n)
218  {
219  svbool_t pg = set_predicate();
220 
221  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
222  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
223 
224  set_sp2_xp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
225  set_sp2_xp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
226  set_sp2_xp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
227 
228 
229  for (int ic = 0; ic < NC; ++ic) {
230  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
231  svreal_t wt1r, wt1i, wt2r, wt2i;
232  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15, &u[VLEN * (2 * ic)]);
233  mult_uv(pg, wt1r, wt1i, ut10, ut11, ut12, ut13, ut14, ut15,
234  vt10, vt11, vt12, vt13, vt14, vt15);
235  mult_uv(pg, wt2r, wt2i, ut10, ut11, ut12, ut13, ut14, ut15,
236  vt20, vt21, vt22, vt23, vt24, vt25);
237  set_sp4_xp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
238  }
239  }
240 
241 
242 //====================================================================
243  template<typename REALTYPE>
244  inline void set_sp2_xm1(svbool_t& pg,
245  svreal_t& vt1r, svreal_t& vt1i,
246  svreal_t& vt2r, svreal_t& vt2i,
247  REALTYPE *vx, int ic)
248  {
249  int icr = ND * 2 * ic;
250  int ici = ND * 2 * ic + 1;
251  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
252 
253  load_vec(pg, w1r, &vx[VLEN * (icr + ID1)]);
254  load_vec(pg, w1i, &vx[VLEN * (ici + ID1)]);
255 
256  load_vec(pg, w2r, &vx[VLEN * (icr + ID2)]);
257  load_vec(pg, w2i, &vx[VLEN * (ici + ID2)]);
258 
259  load_vec(pg, w3r, &vx[VLEN * (icr + ID3)]);
260  load_vec(pg, w3i, &vx[VLEN * (ici + ID3)]);
261 
262  load_vec(pg, w4r, &vx[VLEN * (icr + ID4)]);
263  load_vec(pg, w4i, &vx[VLEN * (ici + ID4)]);
264 
265  sub_vec(pg, vt1r, w1r, w4i);
266  add_vec(pg, vt1i, w1i, w4r);
267  sub_vec(pg, vt2r, w2r, w3i);
268  add_vec(pg, vt2i, w2i, w3r);
269  }
270 
271 
272 //====================================================================
273  template<typename REALTYPE>
274  inline void mult_wilson_xm1(svbool_t& pg2, svint_t& svidx,
275  REALTYPE *__restrict buf,
276  REALTYPE *__restrict u,
277  REALTYPE *__restrict v1)
278  {
279  svbool_t pg = set_predicate();
280 
281  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
282  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
283 
284  set_sp2_xm1(pg2, vt10, vt11, vt20, vt21, v1, 0);
285  set_sp2_xm1(pg2, vt12, vt13, vt22, vt23, v1, 1);
286  set_sp2_xm1(pg2, vt14, vt15, vt24, vt25, v1, 2);
287 
288  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
289  svreal_t wt1r, wt1i, wt2r, wt2i;
290 
291  for (int ic = 0; ic < NC; ++ic) {
292  load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
293  &u[VLEN * NVC * ic]);
294 
295  mult_udv(pg2, wt1r, wt1i,
296  ut10, ut11, ut12, ut13, ut14, ut15,
297  vt10, vt11, vt12, vt13, vt14, vt15);
298  mult_udv(pg2, wt2r, wt2i,
299  ut10, ut11, ut12, ut13, ut14, ut15,
300  vt20, vt21, vt22, vt23, vt24, vt25);
301 
302  save_vec_scatter(pg2, &buf[VLENY * (2 * ic)], wt1r, svidx);
303  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + 1)], wt1i, svidx);
304  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + NVC)], wt2r, svidx);
305  save_vec_scatter(pg2, &buf[VLENY * (2 * ic + 1 + NVC)], wt2i, svidx);
306  }
307  }
308 
309 
310 //====================================================================
311  template<typename REALTYPE>
312  inline void set_sp2_xm2(svbool_t& pg,
313  svreal_t& vt1r, svreal_t& vt1i,
314  svreal_t& vt2r, svreal_t& vt2i,
315  REALTYPE *vx, int ic)
316  {
317  int icr = ND * 2 * ic;
318  int ici = ND * 2 * ic + 1;
319  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
320 
321  load_vec(pg, w1r, &vx[VLEN * (icr + ID1) - 1]);
322  load_vec(pg, w1i, &vx[VLEN * (ici + ID1) - 1]);
323  load_vec(pg, w2r, &vx[VLEN * (icr + ID2) - 1]);
324  load_vec(pg, w2i, &vx[VLEN * (ici + ID2) - 1]);
325  load_vec(pg, w3r, &vx[VLEN * (icr + ID3) - 1]);
326  load_vec(pg, w3i, &vx[VLEN * (ici + ID3) - 1]);
327  load_vec(pg, w4r, &vx[VLEN * (icr + ID4) - 1]);
328  load_vec(pg, w4i, &vx[VLEN * (ici + ID4) - 1]);
329 
330  sub_vec(pg, vt1r, w1r, w4i);
331  add_vec(pg, vt1i, w1i, w4r);
332  sub_vec(pg, vt2r, w2r, w3i);
333  add_vec(pg, vt2i, w2i, w3r);
334  }
335 
336 
337 //====================================================================
338  template<typename REALTYPE>
339  inline void mult_wilson_xm2(svbool_t& pg1, svbool_t& pg2, svint_t& svidx,
340  Vsimd_t *v2, REALTYPE *u,
341  REALTYPE *v1, REALTYPE *buf)
342  {
343  svbool_t pg = set_predicate();
344 
345  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
346  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
347 
348  set_sp2_xm2(pg1, vt10, vt11, vt20, vt21, v1, 0);
349  set_sp2_xm2(pg1, vt12, vt13, vt22, vt23, v1, 1);
350  set_sp2_xm2(pg1, vt14, vt15, vt24, vt25, v1, 2);
351 
352  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
353  svreal_t wt1r, wt1i, wt2r, wt2i;
354 
355  for (int ic = 0; ic < NC; ++ic) {
356  load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
357  &u[VLEN * NVC * ic - 1]);
358  mult_udv(pg1, wt1r, wt1i,
359  ut10, ut11, ut12, ut13, ut14, ut15,
360  vt10, vt11, vt12, vt13, vt14, vt15);
361  mult_udv(pg1, wt2r, wt2i,
362  ut10, ut11, ut12, ut13, ut14, ut15,
363  vt20, vt21, vt22, vt23, vt24, vt25);
364 
365  load_add_gather(pg2, wt1r, &buf[VLENY * (2 * ic)], svidx);
366  load_add_gather(pg2, wt1i, &buf[VLENY * (2 * ic + 1)], svidx);
367  load_add_gather(pg2, wt2r, &buf[VLENY * (2 * ic + NVC)], svidx);
368  load_add_gather(pg2, wt2i, &buf[VLENY * (2 * ic + 1 + NVC)], svidx);
369 
370  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
371  }
372  }
373 
374 
375 //====================================================================
376  template<typename REALTYPE>
377  inline void mult_wilson_xm2(svbool_t& pg1, svbool_t& pg2, svint_t& svidx,
378  REALTYPE *__restrict v2,
379  REALTYPE *__restrict u,
380  REALTYPE *__restrict v1,
381  REALTYPE *__restrict buf)
382  {
383  svbool_t pg = set_predicate();
384 
385  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
386  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
387 
388  set_sp2_xm2(pg1, vt10, vt11, vt20, vt21, v1, 0);
389  set_sp2_xm2(pg1, vt12, vt13, vt22, vt23, v1, 1);
390  set_sp2_xm2(pg1, vt14, vt15, vt24, vt25, v1, 2);
391 
392  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
393  svreal_t wt1r, wt1i, wt2r, wt2i;
394 
395  for (int ic = 0; ic < NC; ++ic) {
396  load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
397  &u[VLEN * NVC * ic - 1]);
398  mult_udv(pg1, wt1r, wt1i,
399  ut10, ut11, ut12, ut13, ut14, ut15,
400  vt10, vt11, vt12, vt13, vt14, vt15);
401  mult_udv(pg1, wt2r, wt2i,
402  ut10, ut11, ut12, ut13, ut14, ut15,
403  vt20, vt21, vt22, vt23, vt24, vt25);
404 
405  load_add_gather(pg2, wt1r, &buf[VLENY * (2 * ic)], svidx);
406  load_add_gather(pg2, wt1i, &buf[VLENY * (2 * ic + 1)], svidx);
407  load_add_gather(pg2, wt2r, &buf[VLENY * (2 * ic + NVC)], svidx);
408  load_add_gather(pg2, wt2i, &buf[VLENY * (2 * ic + 1 + NVC)], svidx);
409 
410  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
411  }
412  }
413 
414 
415 //====================================================================
416  template<typename REALTYPE>
417  inline void set_sp2_xm(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
418  svreal_t& vt1r, svreal_t& vt1i,
419  svreal_t& vt2r, svreal_t& vt2i,
420  REALTYPE *vx, REALTYPE *vn, int ic)
421  {
422  int icr = ND * 2 * ic;
423  int ici = ND * 2 * ic + 1;
424  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
425 
426  shift_vec_xfw(pg1, pg2, w1r, &vx[VLEN * (icr + ID1)],
427  &vn[VLEN * (icr + ID1)]);
428  shift_vec_xfw(pg1, pg2, w1i, &vx[VLEN * (ici + ID1)],
429  &vn[VLEN * (ici + ID1)]);
430 
431  shift_vec_xfw(pg1, pg2, w2r, &vx[VLEN * (icr + ID2)],
432  &vn[VLEN * (icr + ID2)]);
433  shift_vec_xfw(pg1, pg2, w2i, &vx[VLEN * (ici + ID2)],
434  &vn[VLEN * (ici + ID2)]);
435 
436  shift_vec_xfw(pg1, pg2, w3r, &vx[VLEN * (icr + ID3)],
437  &vn[VLEN * (icr + ID3)]);
438  shift_vec_xfw(pg1, pg2, w3i, &vx[VLEN * (ici + ID3)],
439  &vn[VLEN * (ici + ID3)]);
440 
441  shift_vec_xfw(pg1, pg2, w4r, &vx[VLEN * (icr + ID4)],
442  &vn[VLEN * (icr + ID4)]);
443  shift_vec_xfw(pg1, pg2, w4i, &vx[VLEN * (ici + ID4)],
444  &vn[VLEN * (ici + ID4)]);
445 
446  sub_vec(pg, vt1r, w1r, w4i);
447  add_vec(pg, vt1i, w1i, w4r);
448  sub_vec(pg, vt2r, w2r, w3i);
449  add_vec(pg, vt2i, w2i, w3r);
450  }
451 
452 
453 //====================================================================
454  template<typename REALTYPE>
455  inline void mult_wilson_xmb(svbool_t& pg1, svbool_t& pg2, Vsimd_t *v2,
456  REALTYPE *u, REALTYPE *un,
457  REALTYPE *v1, REALTYPE *v1n)
458  {
459  svbool_t pg = set_predicate();
460 
461  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
462  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
463 
464  set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
465  set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
466  set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
467 
468  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
469  svreal_t wt1r, wt1i, wt2r, wt2i;
470 
471  for (int ic = 0; ic < NC; ++ic) {
472  load_udag_xm(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
473  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
474 
475  mult_udv(pg, wt1r, wt1i,
476  ut10, ut11, ut12, ut13, ut14, ut15,
477  vt10, vt11, vt12, vt13, vt14, vt15);
478  mult_udv(pg, wt2r, wt2i,
479  ut10, ut11, ut12, ut13, ut14, ut15,
480  vt20, vt21, vt22, vt23, vt24, vt25);
481  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
482  }
483  }
484 
485 
486 //====================================================================
487  template<typename REALTYPE>
488  inline void mult_wilson_xmb(svbool_t& pg1, svbool_t& pg2,
489  REALTYPE *__restrict *v2,
490  REALTYPE *u, REALTYPE *un,
491  REALTYPE *v1, REALTYPE *v1n)
492  {
493  svbool_t pg = set_predicate();
494 
495  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
496  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
497 
498  set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
499  set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
500  set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
501 
502  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
503  svreal_t wt1r, wt1i, wt2r, wt2i;
504 
505  for (int ic = 0; ic < NC; ++ic) {
506  load_udag_xm(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
507  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
508 
509  mult_udv(pg, wt1r, wt1i,
510  ut10, ut11, ut12, ut13, ut14, ut15,
511  vt10, vt11, vt12, vt13, vt14, vt15);
512  mult_udv(pg, wt2r, wt2i,
513  ut10, ut11, ut12, ut13, ut14, ut15,
514  vt20, vt21, vt22, vt23, vt24, vt25);
515  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
516  }
517  }
518 
519 
520 //====================================================================
521  template<typename REALTYPE>
522  inline void mult_wilson_xmb(svbool_t& pg1, svbool_t& pg2, Vsimd_t *v2,
523  Vsimd_t *u,
524  REALTYPE *v1, REALTYPE *v1n)
525  {
526  svbool_t pg = set_predicate();
527 
528  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
529  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
530 
531  set_sp2_xm(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
532  set_sp2_xm(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
533  set_sp2_xm(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
534 
535  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
536  svreal_t wt1r, wt1i, wt2r, wt2i;
537 
538  for (int ic = 0; ic < NC; ++ic) {
539  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
540  &u[NVC * ic].v[0]);
541 
542  mult_udv(pg, wt1r, wt1i,
543  ut10, ut11, ut12, ut13, ut14, ut15,
544  vt10, vt11, vt12, vt13, vt14, vt15);
545  mult_udv(pg, wt2r, wt2i,
546  ut10, ut11, ut12, ut13, ut14, ut15,
547  vt20, vt21, vt22, vt23, vt24, vt25);
548  set_sp4_xm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
549  }
550  }
551 
552 
553 //====================================================================
554  template<typename REALTYPE>
555  inline void mult_wilson_yp1(svbool_t& pg2,
556  REALTYPE *__restrict buf,
557  REALTYPE *__restrict v1)
558  {
559  svbool_t pg = set_predicate();
560 
561  for (int ic = 0; ic < NC; ++ic) {
562  svreal_t vt1r, vt1i, vt2r, vt2i;
563  set_sp2_yp(pg2, vt1r, vt1i, vt2r, vt2i, v1, ic);
564 
565  save_vec(pg2, &buf[VLENX * (2 * ic)], vt1r);
566  save_vec(pg2, &buf[VLENX * (2 * ic + 1)], vt1i);
567  save_vec(pg2, &buf[VLENX * (2 * ic + NVC)], vt2r);
568  save_vec(pg2, &buf[VLENX * (2 * ic + 1 + NVC)], vt2i);
569  }
570  }
571 
572 
573 //====================================================================
574  template<typename REALTYPE>
575  inline void mult_wilson_yp2(svbool_t& pg1, svbool_t& pg2,
576  Vsimd_t *v2, REALTYPE *u,
577  REALTYPE *v1, REALTYPE *buf)
578  {
579  svbool_t pg = set_predicate();
580 
581  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
582  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
583 
584  set_sp2_yp(pg1, vt10, vt11, vt20, vt21, &v1[VLENX], 0);
585  set_sp2_yp(pg1, vt12, vt13, vt22, vt23, &v1[VLENX], 1);
586  set_sp2_yp(pg1, vt14, vt15, vt24, vt25, &v1[VLENX], 2);
587 
588  int offset = -VLENX * (VLENY - 1);
589  int ic = 0;
590  load_add(pg2, vt10, &buf[offset + VLENX * (2 * ic)]);
591  load_add(pg2, vt11, &buf[offset + VLENX * (2 * ic + 1)]);
592  load_add(pg2, vt20, &buf[offset + VLENX * (2 * ic + NVC)]);
593  load_add(pg2, vt21, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
594  ic = 1;
595  load_add(pg2, vt12, &buf[offset + VLENX * (2 * ic)]);
596  load_add(pg2, vt13, &buf[offset + VLENX * (2 * ic + 1)]);
597  load_add(pg2, vt22, &buf[offset + VLENX * (2 * ic + NVC)]);
598  load_add(pg2, vt23, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
599  ic = 2;
600  load_add(pg2, vt14, &buf[offset + VLENX * (2 * ic)]);
601  load_add(pg2, vt15, &buf[offset + VLENX * (2 * ic + 1)]);
602  load_add(pg2, vt24, &buf[offset + VLENX * (2 * ic + NVC)]);
603  load_add(pg2, vt25, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
604 
605  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
606  svreal_t wt1r, wt1i, wt2r, wt2i;
607 
608  for (int ic = 0; ic < NC; ++ic) {
609  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
610  &u[VLEN * (2 * ic)]);
611  mult_uv(pg, wt1r, wt1i,
612  ut10, ut11, ut12, ut13, ut14, ut15,
613  vt10, vt11, vt12, vt13, vt14, vt15);
614  mult_uv(pg, wt2r, wt2i,
615  ut10, ut11, ut12, ut13, ut14, ut15,
616  vt20, vt21, vt22, vt23, vt24, vt25);
617  set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
618  }
619  }
620 
621 
622 //====================================================================
623  template<typename REALTYPE>
624  inline void mult_wilson_yp2(svbool_t& pg1, svbool_t& pg2,
625  REALTYPE *__restrict v2,
626  REALTYPE *__restrict u,
627  REALTYPE *__restrict v1,
628  REALTYPE *__restrict buf)
629  {
630  svbool_t pg = set_predicate();
631 
632  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
633  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
634 
635  set_sp2_yp(pg1, vt10, vt11, vt20, vt21, &v1[VLENX], 0);
636  set_sp2_yp(pg1, vt12, vt13, vt22, vt23, &v1[VLENX], 1);
637  set_sp2_yp(pg1, vt14, vt15, vt24, vt25, &v1[VLENX], 2);
638 
639  int offset = -VLENX * (VLENY - 1);
640  int ic = 0;
641  load_add(pg2, vt10, &buf[offset + VLENX * (2 * ic)]);
642  load_add(pg2, vt11, &buf[offset + VLENX * (2 * ic + 1)]);
643  load_add(pg2, vt20, &buf[offset + VLENX * (2 * ic + NVC)]);
644  load_add(pg2, vt21, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
645  ic = 1;
646  load_add(pg2, vt12, &buf[offset + VLENX * (2 * ic)]);
647  load_add(pg2, vt13, &buf[offset + VLENX * (2 * ic + 1)]);
648  load_add(pg2, vt22, &buf[offset + VLENX * (2 * ic + NVC)]);
649  load_add(pg2, vt23, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
650  ic = 2;
651  load_add(pg2, vt14, &buf[offset + VLENX * (2 * ic)]);
652  load_add(pg2, vt15, &buf[offset + VLENX * (2 * ic + 1)]);
653  load_add(pg2, vt24, &buf[offset + VLENX * (2 * ic + NVC)]);
654  load_add(pg2, vt25, &buf[offset + VLENX * (2 * ic + 1 + NVC)]);
655 
656  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
657  svreal_t wt1r, wt1i, wt2r, wt2i;
658 
659  for (int ic = 0; ic < NC; ++ic) {
660  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
661  &u[VLEN * (2 * ic)]);
662  mult_uv(pg, wt1r, wt1i,
663  ut10, ut11, ut12, ut13, ut14, ut15,
664  vt10, vt11, vt12, vt13, vt14, vt15);
665  mult_uv(pg, wt2r, wt2i,
666  ut10, ut11, ut12, ut13, ut14, ut15,
667  vt20, vt21, vt22, vt23, vt24, vt25);
668  set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
669  }
670  }
671 
672 
673 //====================================================================
674  template<typename REALTYPE>
675  inline void set_sp2_yp(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
676  svreal_t& vt1r, svreal_t& vt1i,
677  svreal_t& vt2r, svreal_t& vt2i,
678  REALTYPE *v, REALTYPE *vn, int ic)
679  {
680  int icr = ND * 2 * ic;
681  int ici = ND * 2 * ic + 1;
682  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
683 
684 #if VLENY > 1
685  shift_vec_ybw(pg1, pg2, w1r, &v[VLEN * (icr + ID1)],
686  &vn[VLEN * (icr + ID1)]);
687  shift_vec_ybw(pg1, pg2, w1i, &v[VLEN * (ici + ID1)],
688  &vn[VLEN * (ici + ID1)]);
689 
690  shift_vec_ybw(pg1, pg2, w2r, &v[VLEN * (icr + ID2)],
691  &vn[VLEN * (icr + ID2)]);
692  shift_vec_ybw(pg1, pg2, w2i, &v[VLEN * (ici + ID2)],
693  &vn[VLEN * (ici + ID2)]);
694 
695  shift_vec_ybw(pg1, pg2, w3r, &v[VLEN * (icr + ID3)],
696  &vn[VLEN * (icr + ID3)]);
697  shift_vec_ybw(pg1, pg2, w3i, &v[VLEN * (ici + ID3)],
698  &vn[VLEN * (ici + ID3)]);
699 
700  shift_vec_ybw(pg1, pg2, w4r, &v[VLEN * (icr + ID4)],
701  &vn[VLEN * (icr + ID4)]);
702  shift_vec_ybw(pg1, pg2, w4i, &v[VLEN * (ici + ID4)],
703  &vn[VLEN * (ici + ID4)]);
704 #else
705  load_vec(pg, w1r, &vn[VLEN * (icr + ID1)]);
706  load_vec(pg, w1i, &vn[VLEN * (ici + ID1)]);
707 
708  load_vec(pg, w2r, &vn[VLEN * (icr + ID2)]);
709  load_vec(pg, w2i, &vn[VLEN * (ici + ID2)]);
710 
711  load_vec(pg, w3r, &vn[VLEN * (icr + ID3)]);
712  load_vec(pg, w3i, &vn[VLEN * (ici + ID3)]);
713 
714  load_vec(pg, w4r, &vn[VLEN * (icr + ID4)]);
715  load_vec(pg, w4i, &vn[VLEN * (ici + ID4)]);
716 #endif
717 
718  sub_vec(pg, vt1r, w1r, w4r);
719  sub_vec(pg, vt1i, w1i, w4i);
720  add_vec(pg, vt2r, w2r, w3r);
721  add_vec(pg, vt2i, w2i, w3i);
722  }
723 
724 
725 //====================================================================
726  template<typename REALTYPE>
727  inline void set_sp2_yp(svbool_t& pg, svbool_t& pg1, svuint_t& idx1,
728  svreal_t& vt1r, svreal_t& vt1i,
729  svreal_t& vt2r, svreal_t& vt2i,
730  REALTYPE *v, REALTYPE *vn, int ic)
731  {
732  int icr = ND * 2 * ic;
733  int ici = ND * 2 * ic + 1;
734  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
735 
736 #if VLENY > 1
737 
738  /*
739  shift_vec(pg1, idx1, w1r, &v[ VLEN * (icr + ID1)],
740  &vn[VLEN * (icr + ID1)]);
741  shift_vec(pg1, idx1, w1i, &v[ VLEN * (ici + ID1)],
742  &vn[VLEN * (ici + ID1)]);
743 
744  shift_vec(pg1, idx1, w2r, &v[ VLEN * (icr + ID2)],
745  &vn[VLEN * (icr + ID2)]);
746  shift_vec(pg1, idx1, w2i, &v[ VLEN * (ici + ID2)],
747  &vn[VLEN * (ici + ID2)]);
748 
749  shift_vec(pg1, idx1, w3r, &v[ VLEN * (icr + ID3)],
750  &vn[VLEN * (icr + ID3)]);
751  shift_vec(pg1, idx1, w3i, &v[ VLEN * (ici + ID3)],
752  &vn[VLEN * (ici + ID3)]);
753 
754  shift_vec(pg1, idx1, w4r, &v[ VLEN * (icr + ID4)],
755  &vn[VLEN * (icr + ID4)]);
756  shift_vec(pg1, idx1, w4i, &v[ VLEN * (ici + ID4)],
757  &vn[VLEN * (ici + ID4)]);
758  */
759 
760  shift_vec_ybw(w1r, &v[VLEN * (icr + ID1)],
761  &vn[VLEN * (icr + ID1)]);
762  shift_vec_ybw(w1i, &v[VLEN * (ici + ID1)],
763  &vn[VLEN * (ici + ID1)]);
764 
765  shift_vec_ybw(w2r, &v[VLEN * (icr + ID2)],
766  &vn[VLEN * (icr + ID2)]);
767  shift_vec_ybw(w2i, &v[VLEN * (ici + ID2)],
768  &vn[VLEN * (ici + ID2)]);
769 
770  shift_vec_ybw(w3r, &v[VLEN * (icr + ID3)],
771  &vn[VLEN * (icr + ID3)]);
772  shift_vec_ybw(w3i, &v[VLEN * (ici + ID3)],
773  &vn[VLEN * (ici + ID3)]);
774 
775  shift_vec_ybw(w4r, &v[VLEN * (icr + ID4)],
776  &vn[VLEN * (icr + ID4)]);
777  shift_vec_ybw(w4i, &v[VLEN * (ici + ID4)],
778  &vn[VLEN * (ici + ID4)]);
779 #else
780  load_vec(pg, w1r, &vn[VLEN * (icr + ID1)]);
781  load_vec(pg, w1i, &vn[VLEN * (ici + ID1)]);
782 
783  load_vec(pg, w2r, &vn[VLEN * (icr + ID2)]);
784  load_vec(pg, w2i, &vn[VLEN * (ici + ID2)]);
785 
786  load_vec(pg, w3r, &vn[VLEN * (icr + ID3)]);
787  load_vec(pg, w3i, &vn[VLEN * (ici + ID3)]);
788 
789  load_vec(pg, w4r, &vn[VLEN * (icr + ID4)]);
790  load_vec(pg, w4i, &vn[VLEN * (ici + ID4)]);
791 #endif
792 
793  sub_vec(pg, vt1r, w1r, w4r);
794  sub_vec(pg, vt1i, w1i, w4i);
795  add_vec(pg, vt2r, w2r, w3r);
796  add_vec(pg, vt2i, w2i, w3i);
797  }
798 
799 
800 //====================================================================
801  template<typename REALTYPE>
802  inline void mult_wilson_ypb(svbool_t& pg1, svuint_t& idx1,
803  Vsimd_t *v2, REALTYPE *u,
804  REALTYPE *v1, REALTYPE *v1n)
805  {
806  svbool_t pg = set_predicate();
807 
808  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
809  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
810 
811  set_sp2_yp(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
812  set_sp2_yp(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
813  set_sp2_yp(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
814 
815  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
816  svreal_t wt1r, wt1i, wt2r, wt2i;
817 
818  for (int ic = 0; ic < NC; ++ic) {
819  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
820  &u[VLEN * (2 * ic)]);
821  mult_uv(pg, wt1r, wt1i,
822  ut10, ut11, ut12, ut13, ut14, ut15,
823  vt10, vt11, vt12, vt13, vt14, vt15);
824  mult_uv(pg, wt2r, wt2i,
825  ut10, ut11, ut12, ut13, ut14, ut15,
826  vt20, vt21, vt22, vt23, vt24, vt25);
827  set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
828  }
829  }
830 
831 
832 //====================================================================
833  template<typename REALTYPE>
834  inline void mult_wilson_ypb(svbool_t& pg1, svbool_t& pg2,
835  Vsimd_t *v2, REALTYPE *u,
836  REALTYPE *v1, REALTYPE *v1n)
837  {
838  svbool_t pg = set_predicate();
839 
840  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
841  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
842 
843  set_sp2_yp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
844  set_sp2_yp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
845  set_sp2_yp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
846 
847  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
848  svreal_t wt1r, wt1i, wt2r, wt2i;
849 
850  for (int ic = 0; ic < NC; ++ic) {
851  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
852  &u[VLEN * (2 * ic)]);
853  mult_uv(pg, wt1r, wt1i,
854  ut10, ut11, ut12, ut13, ut14, ut15,
855  vt10, vt11, vt12, vt13, vt14, vt15);
856  mult_uv(pg, wt2r, wt2i,
857  ut10, ut11, ut12, ut13, ut14, ut15,
858  vt20, vt21, vt22, vt23, vt24, vt25);
859  set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
860  }
861  }
862 
863 
864 //====================================================================
865  template<typename REALTYPE>
866  inline void mult_wilson_ypb(svbool_t& pg1, svbool_t& pg2,
867  REALTYPE *__restrict v2,
868  REALTYPE *__restrict u,
869  REALTYPE *v1, REALTYPE *v1n)
870  {
871  // always v2 != u, but can be v1==v1n
872  svbool_t pg = set_predicate();
873 
874  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
875  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
876 
877  set_sp2_yp(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
878  set_sp2_yp(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
879  set_sp2_yp(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
880 
881 
882  for (int ic = 0; ic < NC; ++ic) {
883  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
884  svreal_t wt1r, wt1i, wt2r, wt2i;
885 
886  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
887  &u[VLEN * (2 * ic)]);
888  mult_uv(pg, wt1r, wt1i,
889  ut10, ut11, ut12, ut13, ut14, ut15,
890  vt10, vt11, vt12, vt13, vt14, vt15);
891  mult_uv(pg, wt2r, wt2i,
892  ut10, ut11, ut12, ut13, ut14, ut15,
893  vt20, vt21, vt22, vt23, vt24, vt25);
894  set_sp4_yp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
895  }
896  }
897 
898 
899 //====================================================================
900  template<typename REALTYPE>
901  inline void mult_wilson_ym1(svbool_t& pg2,
902  REALTYPE *__restrict buf,
903  REALTYPE *__restrict u,
904  REALTYPE *__restrict v1)
905  {
906  svbool_t pg = set_predicate();
907 
908  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
909  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
910  set_sp2_ym(pg2, vt10, vt11, vt20, vt21, v1, 0);
911  set_sp2_ym(pg2, vt12, vt13, vt22, vt23, v1, 1);
912  set_sp2_ym(pg2, vt14, vt15, vt24, vt25, v1, 2);
913 
914  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
915  svreal_t wt1r, wt1i, wt2r, wt2i;
916 
917  for (int ic = 0; ic < NC; ++ic) {
918  load_udag(pg2, ut10, ut11, ut12, ut13, ut14, ut15,
919  &u[VLEN * NVC * ic]);
920  mult_udv(pg2, wt1r, wt1i,
921  ut10, ut11, ut12, ut13, ut14, ut15,
922  vt10, vt11, vt12, vt13, vt14, vt15);
923  mult_udv(pg2, wt2r, wt2i,
924  ut10, ut11, ut12, ut13, ut14, ut15,
925  vt20, vt21, vt22, vt23, vt24, vt25);
926 
927  int offset = -VLENX * (VLENY - 1);
928 
929  save_vec(pg2, &buf[offset + VLENX * (2 * ic)], wt1r);
930  save_vec(pg2, &buf[offset + VLENX * (2 * ic + 1)], wt1i);
931  save_vec(pg2, &buf[offset + VLENX * (2 * ic + NVC)], wt2r);
932  save_vec(pg2, &buf[offset + VLENX * (2 * ic + 1 + NVC)], wt2i);
933  }
934  }
935 
936 
937 //====================================================================
938  template<typename REALTYPE>
939  inline void mult_wilson_ym2(svbool_t& pg1, svbool_t& pg2,
940  Vsimd_t *v2, REALTYPE *u,
941  REALTYPE *v1, REALTYPE *buf)
942  {
943  svbool_t pg = set_predicate();
944 
945  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
946  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
947 
948  set_sp2_ym(pg1, vt10, vt11, vt20, vt21, &v1[-VLENX], 0);
949  set_sp2_ym(pg1, vt12, vt13, vt22, vt23, &v1[-VLENX], 1);
950  set_sp2_ym(pg1, vt14, vt15, vt24, vt25, &v1[-VLENX], 2);
951 
952  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
953  svreal_t wt1r, wt1i, wt2r, wt2i;
954 
955  for (int ic = 0; ic < NC; ++ic) {
956  load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
957  &u[VLEN * NVC * ic - VLENX]);
958  mult_udv(pg1, wt1r, wt1i,
959  ut10, ut11, ut12, ut13, ut14, ut15,
960  vt10, vt11, vt12, vt13, vt14, vt15);
961  mult_udv(pg1, wt2r, wt2i,
962  ut10, ut11, ut12, ut13, ut14, ut15,
963  vt20, vt21, vt22, vt23, vt24, vt25);
964 
965  load_add(pg2, wt1r, &buf[VLENX * (2 * ic)]);
966  load_add(pg2, wt1i, &buf[VLENX * (2 * ic + 1)]);
967  load_add(pg2, wt2r, &buf[VLENX * (2 * ic + NVC)]);
968  load_add(pg2, wt2i, &buf[VLENX * (2 * ic + 1 + NVC)]);
969 
970  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
971  }
972  }
973 
974 
975 //====================================================================
976  template<typename REALTYPE>
977  inline void mult_wilson_ym2(svbool_t& pg1, svbool_t& pg2,
978  REALTYPE *__restrict v2,
979  REALTYPE *__restrict u,
980  REALTYPE *__restrict v1,
981  REALTYPE *__restrict buf)
982  {
983  svbool_t pg = set_predicate();
984 
985  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
986  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
987 
988  set_sp2_ym(pg1, vt10, vt11, vt20, vt21, &v1[-VLENX], 0);
989  set_sp2_ym(pg1, vt12, vt13, vt22, vt23, &v1[-VLENX], 1);
990  set_sp2_ym(pg1, vt14, vt15, vt24, vt25, &v1[-VLENX], 2);
991 
992  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
993  svreal_t wt1r, wt1i, wt2r, wt2i;
994 
995  for (int ic = 0; ic < NC; ++ic) {
996  load_udag(pg1, ut10, ut11, ut12, ut13, ut14, ut15,
997  &u[VLEN * NVC * ic - VLENX]);
998  mult_udv(pg1, wt1r, wt1i,
999  ut10, ut11, ut12, ut13, ut14, ut15,
1000  vt10, vt11, vt12, vt13, vt14, vt15);
1001  mult_udv(pg1, wt2r, wt2i,
1002  ut10, ut11, ut12, ut13, ut14, ut15,
1003  vt20, vt21, vt22, vt23, vt24, vt25);
1004 
1005  load_add(pg2, wt1r, &buf[VLENX * (2 * ic)]);
1006  load_add(pg2, wt1i, &buf[VLENX * (2 * ic + 1)]);
1007  load_add(pg2, wt2r, &buf[VLENX * (2 * ic + NVC)]);
1008  load_add(pg2, wt2i, &buf[VLENX * (2 * ic + 1 + NVC)]);
1009 
1010  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1011  }
1012  }
1013 
1014 
1015 //====================================================================
1016  template<typename REALTYPE>
1017  inline void set_sp2_ym(svbool_t& pg, svbool_t& pg1, svbool_t& pg2,
1018  svreal_t& vt1r, svreal_t& vt1i,
1019  svreal_t& vt2r, svreal_t& vt2i,
1020  REALTYPE *vx, REALTYPE *vn, int ic)
1021  {
1022  int icr = ND * 2 * ic;
1023  int ici = ND * 2 * ic + 1;
1024  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
1025 
1026 #if VLENY > 1
1027  shift_vec_yfw(pg1, pg2, w1r, &vx[VLEN * (icr + ID1)],
1028  &vn[VLEN * (icr + ID1)]);
1029  shift_vec_yfw(pg1, pg2, w1i, &vx[VLEN * (ici + ID1)],
1030  &vn[VLEN * (ici + ID1)]);
1031 
1032  shift_vec_yfw(pg1, pg2, w2r, &vx[VLEN * (icr + ID2)],
1033  &vn[VLEN * (icr + ID2)]);
1034  shift_vec_yfw(pg1, pg2, w2i, &vx[VLEN * (ici + ID2)],
1035  &vn[VLEN * (ici + ID2)]);
1036 
1037  shift_vec_yfw(pg1, pg2, w3r, &vx[VLEN * (icr + ID3)],
1038  &vn[VLEN * (icr + ID3)]);
1039  shift_vec_yfw(pg1, pg2, w3i, &vx[VLEN * (ici + ID3)],
1040  &vn[VLEN * (ici + ID3)]);
1041 
1042  shift_vec_yfw(pg1, pg2, w4r, &vx[VLEN * (icr + ID4)],
1043  &vn[VLEN * (icr + ID4)]);
1044  shift_vec_yfw(pg1, pg2, w4i, &vx[VLEN * (ici + ID4)],
1045  &vn[VLEN * (ici + ID4)]);
1046 #else
1047  load_vec(pg, w1r, &vn[VLEN * (icr + ID1)]);
1048  load_vec(pg, w1i, &vn[VLEN * (ici + ID1)]);
1049 
1050  load_vec(pg, w2r, &vn[VLEN * (icr + ID2)]);
1051  load_vec(pg, w2i, &vn[VLEN * (ici + ID2)]);
1052 
1053  load_vec(pg, w3r, &vn[VLEN * (icr + ID3)]);
1054  load_vec(pg, w3i, &vn[VLEN * (ici + ID3)]);
1055 
1056  load_vec(pg, w4r, &vn[VLEN * (icr + ID4)]);
1057  load_vec(pg, w4i, &vn[VLEN * (ici + ID4)]);
1058 #endif
1059  add_vec(pg, vt1r, w1r, w4r);
1060  add_vec(pg, vt1i, w1i, w4i);
1061  sub_vec(pg, vt2r, w2r, w3r);
1062  sub_vec(pg, vt2i, w2i, w3i);
1063  }
1064 
1065 
1066 //====================================================================
1067  template<typename REALTYPE>
1068  inline void mult_wilson_ymb(svbool_t& pg1, svbool_t& pg2,
1069  Vsimd_t *v2, REALTYPE *u, REALTYPE *un,
1070  REALTYPE *v1, REALTYPE *v1n)
1071  {
1072  svbool_t pg = set_predicate();
1073 
1074  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1075  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1076 
1077  set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1078  set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1079  set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1080 
1081  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1082  svreal_t wt1r, wt1i, wt2r, wt2i;
1083 
1084  for (int ic = 0; ic < NC; ++ic) {
1085  load_udag_ym(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
1086  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
1087  mult_udv(pg, wt1r, wt1i,
1088  ut10, ut11, ut12, ut13, ut14, ut15,
1089  vt10, vt11, vt12, vt13, vt14, vt15);
1090  mult_udv(pg, wt2r, wt2i,
1091  ut10, ut11, ut12, ut13, ut14, ut15,
1092  vt20, vt21, vt22, vt23, vt24, vt25);
1093  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1094  }
1095  }
1096 
1097 
1098 //====================================================================
1099  template<typename REALTYPE>
1100  inline void set_sp2_ym(svbool_t& pg, svbool_t& pg1, svuint_t& idx1,
1101  svreal_t& vt1r, svreal_t& vt1i,
1102  svreal_t& vt2r, svreal_t& vt2i,
1103  REALTYPE *vx, REALTYPE *vn, int ic)
1104  {
1105  int icr = ND * 2 * ic;
1106  int ici = ND * 2 * ic + 1;
1107  svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
1108 
1109 #if VLENY > 1
1110  shift_vec_yfw(w1r, &vx[VLEN * (icr + ID1)],
1111  &vn[VLEN * (icr + ID1)]);
1112  shift_vec_yfw(w1i, &vx[VLEN * (ici + ID1)],
1113  &vn[VLEN * (ici + ID1)]);
1114 
1115  shift_vec_yfw(w2r, &vx[VLEN * (icr + ID2)],
1116  &vn[VLEN * (icr + ID2)]);
1117  shift_vec_yfw(w2i, &vx[VLEN * (ici + ID2)],
1118  &vn[VLEN * (ici + ID2)]);
1119 
1120  shift_vec_yfw(w3r, &vx[VLEN * (icr + ID3)],
1121  &vn[VLEN * (icr + ID3)]);
1122  shift_vec_yfw(w3i, &vx[VLEN * (ici + ID3)],
1123  &vn[VLEN * (ici + ID3)]);
1124 
1125  shift_vec_yfw(w4r, &vx[VLEN * (icr + ID4)],
1126  &vn[VLEN * (icr + ID4)]);
1127  shift_vec_yfw(w4i, &vx[VLEN * (ici + ID4)],
1128  &vn[VLEN * (ici + ID4)]);
1129 
1130  /*
1131  shift_vec(pg1, idx1, w1r, &vx[VLEN * (icr + ID1)],
1132  &vn[VLEN * (icr + ID1)]);
1133  shift_vec(pg1, idx1, w1i, &vx[VLEN * (ici + ID1)],
1134  &vn[VLEN * (ici + ID1)]);
1135 
1136  shift_vec(pg1, idx1, w2r, &vx[VLEN * (icr + ID2)],
1137  &vn[VLEN * (icr + ID2)]);
1138  shift_vec(pg1, idx1, w2i, &vx[VLEN * (ici + ID2)],
1139  &vn[VLEN * (ici + ID2)]);
1140 
1141  shift_vec(pg1, idx1, w3r, &vx[VLEN * (icr + ID3)],
1142  &vn[VLEN * (icr + ID3)]);
1143  shift_vec(pg1, idx1, w3i, &vx[VLEN * (ici + ID3)],
1144  &vn[VLEN * (ici + ID3)]);
1145 
1146  shift_vec(pg1, idx1, w4r, &vx[VLEN * (icr + ID4)],
1147  &vn[VLEN * (icr + ID4)]);
1148  shift_vec(pg1, idx1, w4i, &vx[VLEN * (ici + ID4)],
1149  &vn[VLEN * (ici + ID4)]);
1150  */
1151 #else
1152  load_vec(pg, w1r, &vn[VLEN * (icr + ID1)]);
1153  load_vec(pg, w1i, &vn[VLEN * (ici + ID1)]);
1154 
1155  load_vec(pg, w2r, &vn[VLEN * (icr + ID2)]);
1156  load_vec(pg, w2i, &vn[VLEN * (ici + ID2)]);
1157 
1158  load_vec(pg, w3r, &vn[VLEN * (icr + ID3)]);
1159  load_vec(pg, w3i, &vn[VLEN * (ici + ID3)]);
1160 
1161  load_vec(pg, w4r, &vn[VLEN * (icr + ID4)]);
1162  load_vec(pg, w4i, &vn[VLEN * (ici + ID4)]);
1163 #endif
1164  add_vec(pg, vt1r, w1r, w4r);
1165  add_vec(pg, vt1i, w1i, w4i);
1166  sub_vec(pg, vt2r, w2r, w3r);
1167  sub_vec(pg, vt2i, w2i, w3i);
1168  }
1169 
1170 
1171 //====================================================================
1172  template<typename REALTYPE>
1173  inline void mult_wilson_ymb(svbool_t& pg1, svuint_t& idx1,
1174  Vsimd_t *v2, REALTYPE *u, REALTYPE *un,
1175  REALTYPE *v1, REALTYPE *v1n)
1176  {
1177  svbool_t pg = set_predicate();
1178 
1179  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1180  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1181 
1182  set_sp2_ym(pg, pg1, idx1, vt10, vt11, vt20, vt21, v1, v1n, 0);
1183  set_sp2_ym(pg, pg1, idx1, vt12, vt13, vt22, vt23, v1, v1n, 1);
1184  set_sp2_ym(pg, pg1, idx1, vt14, vt15, vt24, vt25, v1, v1n, 2);
1185 
1186  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1187  svreal_t wt1r, wt1i, wt2r, wt2i;
1188 
1189  for (int ic = 0; ic < NC; ++ic) {
1190  load_udag_ym(ut10, ut11, ut12, ut13, ut14, ut15,
1191  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
1192  mult_udv(pg, wt1r, wt1i,
1193  ut10, ut11, ut12, ut13, ut14, ut15,
1194  vt10, vt11, vt12, vt13, vt14, vt15);
1195  mult_udv(pg, wt2r, wt2i,
1196  ut10, ut11, ut12, ut13, ut14, ut15,
1197  vt20, vt21, vt22, vt23, vt24, vt25);
1198  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1199  }
1200  }
1201 
1202 
1203 //====================================================================
1204  template<typename REALTYPE>
1205  inline void mult_wilson_ymb(svbool_t& pg1, svbool_t& pg2,
1206  REALTYPE *__restrict v2,
1207  REALTYPE *u, REALTYPE *un,
1208  REALTYPE *v1, REALTYPE *v1n)
1209  {
1210  svbool_t pg = set_predicate();
1211 
1212  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1213  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1214 
1215  set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1216  set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1217  set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1218 
1219  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1220  svreal_t wt1r, wt1i, wt2r, wt2i;
1221 
1222  for (int ic = 0; ic < NC; ++ic) {
1223  load_udag_ym(pg1, pg2, ut10, ut11, ut12, ut13, ut14, ut15,
1224  &u[VLEN * NVC * ic], &un[VLEN * NVC * ic]);
1225  mult_udv(pg, wt1r, wt1i,
1226  ut10, ut11, ut12, ut13, ut14, ut15,
1227  vt10, vt11, vt12, vt13, vt14, vt15);
1228  mult_udv(pg, wt2r, wt2i,
1229  ut10, ut11, ut12, ut13, ut14, ut15,
1230  vt20, vt21, vt22, vt23, vt24, vt25);
1231  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1232  }
1233  }
1234 
1235 
1236 //====================================================================
1237  template<typename REALTYPE>
1238  inline void mult_wilson_ymb(svbool_t& pg1, svbool_t& pg2,
1239  Vsimd_t *v2, Vsimd_t *u,
1240  REALTYPE *v1, REALTYPE *v1n)
1241  {
1242  svbool_t pg = set_predicate();
1243 
1244  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1245  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1246 
1247  set_sp2_ym(pg, pg1, pg2, vt10, vt11, vt20, vt21, v1, v1n, 0);
1248  set_sp2_ym(pg, pg1, pg2, vt12, vt13, vt22, vt23, v1, v1n, 1);
1249  set_sp2_ym(pg, pg1, pg2, vt14, vt15, vt24, vt25, v1, v1n, 2);
1250 
1251  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1252  svreal_t wt1r, wt1i, wt2r, wt2i;
1253 
1254  for (int ic = 0; ic < NC; ++ic) {
1255  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1256  &u[NVC * ic].v[0]);
1257  mult_udv(pg, wt1r, wt1i,
1258  ut10, ut11, ut12, ut13, ut14, ut15,
1259  vt10, vt11, vt12, vt13, vt14, vt15);
1260  mult_udv(pg, wt2r, wt2i,
1261  ut10, ut11, ut12, ut13, ut14, ut15,
1262  vt20, vt21, vt22, vt23, vt24, vt25);
1263  set_sp4_ym(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1264  }
1265  }
1266 
1267 
1268 //====================================================================
1269  template<typename REALTYPE>
1270  inline void mult_wilson_zp1(REALTYPE *__restrict buf,
1271  REALTYPE *__restrict v1)
1272  {
1273  svbool_t pg = set_predicate();
1274 
1275  for (int ic = 0; ic < NC; ++ic) {
1276  svreal_t vt1r, vt1i, vt2r, vt2i;
1277  set_sp2_zp(pg, vt1r, vt1i, vt2r, vt2i, v1, ic);
1278  save_vec(pg, &buf[VLEN * (2 * ic)], vt1r);
1279  save_vec(pg, &buf[VLEN * (2 * ic + 1)], vt1i);
1280  save_vec(pg, &buf[VLEN * (2 * ic + NVC)], vt2r);
1281  save_vec(pg, &buf[VLEN * (2 * ic + 1 + NVC)], vt2i);
1282  }
1283  }
1284 
1285 
1286 //====================================================================
1287  template<typename REALTYPE>
1288  inline void mult_wilson_zp2(Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
1289  {
1290  svbool_t pg = set_predicate();
1291 
1292  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1293  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1294 
1295  load_vec(pg, vt10, &buf[VLEN * 0]);
1296  load_vec(pg, vt11, &buf[VLEN * 1]);
1297  load_vec(pg, vt12, &buf[VLEN * 2]);
1298  load_vec(pg, vt13, &buf[VLEN * 3]);
1299  load_vec(pg, vt14, &buf[VLEN * 4]);
1300  load_vec(pg, vt15, &buf[VLEN * 5]);
1301 
1302  load_vec(pg, vt20, &buf[VLEN * (0 + NVC)]);
1303  load_vec(pg, vt21, &buf[VLEN * (1 + NVC)]);
1304  load_vec(pg, vt22, &buf[VLEN * (2 + NVC)]);
1305  load_vec(pg, vt23, &buf[VLEN * (3 + NVC)]);
1306  load_vec(pg, vt24, &buf[VLEN * (4 + NVC)]);
1307  load_vec(pg, vt25, &buf[VLEN * (5 + NVC)]);
1308 
1309  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1310  svreal_t wt1r, wt1i, wt2r, wt2i;
1311 
1312  for (int ic = 0; ic < NC; ++ic) {
1313  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1314  &u[VLEN * (2 * ic)]);
1315  mult_uv(pg, wt1r, wt1i,
1316  ut10, ut11, ut12, ut13, ut14, ut15,
1317  vt10, vt11, vt12, vt13, vt14, vt15);
1318  mult_uv(pg, wt2r, wt2i,
1319  ut10, ut11, ut12, ut13, ut14, ut15,
1320  vt20, vt21, vt22, vt23, vt24, vt25);
1321  set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1322  }
1323  }
1324 
1325 
1326 //====================================================================
1327  template<typename REALTYPE>
1328  inline void mult_wilson_zp2(REALTYPE *__restrict v2,
1329  REALTYPE *__restrict u,
1330  REALTYPE *__restrict buf)
1331  {
1332  svbool_t pg = set_predicate();
1333 
1334  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1335  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1336 
1337  load_vec(pg, vt10, &buf[VLEN * 0]);
1338  load_vec(pg, vt11, &buf[VLEN * 1]);
1339  load_vec(pg, vt12, &buf[VLEN * 2]);
1340  load_vec(pg, vt13, &buf[VLEN * 3]);
1341  load_vec(pg, vt14, &buf[VLEN * 4]);
1342  load_vec(pg, vt15, &buf[VLEN * 5]);
1343 
1344  load_vec(pg, vt20, &buf[VLEN * (0 + NVC)]);
1345  load_vec(pg, vt21, &buf[VLEN * (1 + NVC)]);
1346  load_vec(pg, vt22, &buf[VLEN * (2 + NVC)]);
1347  load_vec(pg, vt23, &buf[VLEN * (3 + NVC)]);
1348  load_vec(pg, vt24, &buf[VLEN * (4 + NVC)]);
1349  load_vec(pg, vt25, &buf[VLEN * (5 + NVC)]);
1350 
1351  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1352  svreal_t wt1r, wt1i, wt2r, wt2i;
1353 
1354  for (int ic = 0; ic < NC; ++ic) {
1355  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1356  &u[VLEN * (2 * ic)]);
1357  mult_uv(pg, wt1r, wt1i,
1358  ut10, ut11, ut12, ut13, ut14, ut15,
1359  vt10, vt11, vt12, vt13, vt14, vt15);
1360  mult_uv(pg, wt2r, wt2i,
1361  ut10, ut11, ut12, ut13, ut14, ut15,
1362  vt20, vt21, vt22, vt23, vt24, vt25);
1363  set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1364  }
1365  }
1366 
1367 
1368 //====================================================================
1369  template<typename REALTYPE>
1370  inline void mult_wilson_zpb(Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1371  {
1372  svbool_t pg = set_predicate();
1373 
1374  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1375  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1376 
1377  set_sp2_zp(pg, vt10, vt11, vt20, vt21, v1, 0);
1378  set_sp2_zp(pg, vt12, vt13, vt22, vt23, v1, 1);
1379  set_sp2_zp(pg, vt14, vt15, vt24, vt25, v1, 2);
1380 
1381  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1382  svreal_t wt1r, wt1i, wt2r, wt2i;
1383 
1384  for (int ic = 0; ic < NC; ++ic) {
1385  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1386  &u[VLEN * (2 * ic)]);
1387  mult_uv(pg, wt1r, wt1i,
1388  ut10, ut11, ut12, ut13, ut14, ut15,
1389  vt10, vt11, vt12, vt13, vt14, vt15);
1390  mult_uv(pg, wt2r, wt2i,
1391  ut10, ut11, ut12, ut13, ut14, ut15,
1392  vt20, vt21, vt22, vt23, vt24, vt25);
1393  set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1394  }
1395  }
1396 
1397 
1398 //====================================================================
1399  template<typename REALTYPE>
1400  inline void mult_wilson_zpb(REALTYPE *__restrict v2,
1401  REALTYPE *__restrict u,
1402  REALTYPE *__restrict v1)
1403  {
1404  svbool_t pg = set_predicate();
1405 
1406  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1407  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1408 
1409  set_sp2_zp(pg, vt10, vt11, vt20, vt21, v1, 0);
1410  set_sp2_zp(pg, vt12, vt13, vt22, vt23, v1, 1);
1411  set_sp2_zp(pg, vt14, vt15, vt24, vt25, v1, 2);
1412 
1413  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1414  svreal_t wt1r, wt1i, wt2r, wt2i;
1415 
1416  for (int ic = 0; ic < NC; ++ic) {
1417  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1418  &u[VLEN * (2 * ic)]);
1419  mult_uv(pg, wt1r, wt1i,
1420  ut10, ut11, ut12, ut13, ut14, ut15,
1421  vt10, vt11, vt12, vt13, vt14, vt15);
1422  mult_uv(pg, wt2r, wt2i,
1423  ut10, ut11, ut12, ut13, ut14, ut15,
1424  vt20, vt21, vt22, vt23, vt24, vt25);
1425  set_sp4_zp(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1426  }
1427  }
1428 
1429 
1430 //====================================================================
1431  template<typename REALTYPE>
1432  inline void mult_wilson_zm1(REALTYPE *__restrict buf,
1433  REALTYPE *__restrict u,
1434  REALTYPE *__restrict v1)
1435  {
1436  svbool_t pg = set_predicate();
1437 
1438  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1439  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1440 
1441  set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1442  set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1443  set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1444 
1445  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1446  svreal_t wt1r, wt1i, wt2r, wt2i;
1447 
1448  for (int ic = 0; ic < NC; ++ic) {
1449  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1450  &u[VLEN * NVC * ic]);
1451  mult_udv(pg, wt1r, wt1i,
1452  ut10, ut11, ut12, ut13, ut14, ut15,
1453  vt10, vt11, vt12, vt13, vt14, vt15);
1454 
1455  mult_udv(pg, wt2r, wt2i,
1456  ut10, ut11, ut12, ut13, ut14, ut15,
1457  vt20, vt21, vt22, vt23, vt24, vt25);
1458 
1459  save_vec(pg, &buf[VLEN * (2 * ic)], wt1r);
1460  save_vec(pg, &buf[VLEN * (2 * ic + 1)], wt1i);
1461 
1462  save_vec(pg, &buf[VLEN * (2 * ic + NVC)], wt2r);
1463  save_vec(pg, &buf[VLEN * (2 * ic + 1 + NVC)], wt2i);
1464  }
1465  }
1466 
1467 
1468 //====================================================================
1469  template<typename REALTYPE>
1470  inline void mult_wilson_zm2(Vsimd_t *v2, REALTYPE *buf)
1471  {
1472  svbool_t pg = set_predicate();
1473 
1474  for (int ic = 0; ic < NC; ++ic) {
1475  svreal_t wt1r, wt1i, wt2r, wt2i;
1476  load_vec(pg, wt1r, &buf[VLEN * (2 * ic)]);
1477  load_vec(pg, wt1i, &buf[VLEN * (2 * ic + 1)]);
1478  load_vec(pg, wt2r, &buf[VLEN * (2 * ic + NVC)]);
1479  load_vec(pg, wt2i, &buf[VLEN * (2 * ic + 1 + NVC)]);
1480  set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1481  }
1482  }
1483 
1484 
1485 //====================================================================
1486  template<typename REALTYPE>
1487  inline void mult_wilson_zm2(REALTYPE *__restrict v2,
1488  REALTYPE *__restrict buf)
1489  {
1490  svbool_t pg = set_predicate();
1491 
1492  for (int ic = 0; ic < NC; ++ic) {
1493  svreal_t wt1r, wt1i, wt2r, wt2i;
1494  load_vec(pg, wt1r, &buf[VLEN * (2 * ic)]);
1495  load_vec(pg, wt1i, &buf[VLEN * (2 * ic + 1)]);
1496  load_vec(pg, wt2r, &buf[VLEN * (2 * ic + NVC)]);
1497  load_vec(pg, wt2i, &buf[VLEN * (2 * ic + 1 + NVC)]);
1498  set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1499  }
1500  }
1501 
1502 
1503 //====================================================================
1504  template<typename REALTYPE>
1505  inline void mult_wilson_zmb(Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1506  {
1507  svbool_t pg = set_predicate();
1508 
1509  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1510  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1511 
1512  set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1513  set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1514  set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1515 
1516  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1517  svreal_t wt1r, wt1i, wt2r, wt2i;
1518 
1519  for (int ic = 0; ic < NC; ++ic) {
1520  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1521  &u[VLEN * NVC * ic]);
1522  mult_udv(pg, wt1r, wt1i,
1523  ut10, ut11, ut12, ut13, ut14, ut15,
1524  vt10, vt11, vt12, vt13, vt14, vt15);
1525  mult_udv(pg, wt2r, wt2i,
1526  ut10, ut11, ut12, ut13, ut14, ut15,
1527  vt20, vt21, vt22, vt23, vt24, vt25);
1528  set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1529  }
1530  }
1531 
1532 
1533 //====================================================================
1534  template<typename REALTYPE>
1535  inline void mult_wilson_zmb(REALTYPE *__restrict v2,
1536  REALTYPE *__restrict u,
1537  REALTYPE *__restrict v1)
1538  {
1539  svbool_t pg = set_predicate();
1540 
1541  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1542  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1543 
1544  set_sp2_zm(pg, vt10, vt11, vt20, vt21, v1, 0);
1545  set_sp2_zm(pg, vt12, vt13, vt22, vt23, v1, 1);
1546  set_sp2_zm(pg, vt14, vt15, vt24, vt25, v1, 2);
1547 
1548  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1549  svreal_t wt1r, wt1i, wt2r, wt2i;
1550 
1551  for (int ic = 0; ic < NC; ++ic) {
1552  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1553  &u[VLEN * NVC * ic]);
1554  mult_udv(pg, wt1r, wt1i,
1555  ut10, ut11, ut12, ut13, ut14, ut15,
1556  vt10, vt11, vt12, vt13, vt14, vt15);
1557  mult_udv(pg, wt2r, wt2i,
1558  ut10, ut11, ut12, ut13, ut14, ut15,
1559  vt20, vt21, vt22, vt23, vt24, vt25);
1560  set_sp4_zm(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1561  }
1562  }
1563 
1564 
1565 //====================================================================
1566  template<typename REALTYPE>
1567  inline void mult_wilson_tp1_dirac(REALTYPE *__restrict buf,
1568  REALTYPE *__restrict v1)
1569  {
1570  svbool_t pg = set_predicate();
1571 
1572  for (int ic = 0; ic < NC; ++ic) {
1573  svreal_t vt1r, vt1i, vt2r, vt2i;
1574  set_sp2_tp_dirac(pg, vt1r, vt1i, vt2r, vt2i, v1, ic);
1575  save_vec(pg, &buf[VLEN * (2 * ic)], vt1r);
1576  save_vec(pg, &buf[VLEN * (2 * ic + 1)], vt1i);
1577  save_vec(pg, &buf[VLEN * (2 * ic + NVC)], vt2r);
1578  save_vec(pg, &buf[VLEN * (2 * ic + 1 + NVC)], vt2i);
1579  }
1580  }
1581 
1582 
1583 //====================================================================
1584  template<typename REALTYPE>
1585  inline void mult_wilson_tp2_dirac(Vsimd_t *v2, REALTYPE *u, REALTYPE *buf)
1586  {
1587  svbool_t pg = set_predicate();
1588 
1589  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1590  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1591 
1592  load_vec(pg, vt10, &buf[VLEN * 0]);
1593  load_vec(pg, vt11, &buf[VLEN * 1]);
1594  load_vec(pg, vt12, &buf[VLEN * 2]);
1595  load_vec(pg, vt13, &buf[VLEN * 3]);
1596  load_vec(pg, vt14, &buf[VLEN * 4]);
1597  load_vec(pg, vt15, &buf[VLEN * 5]);
1598 
1599  load_vec(pg, vt20, &buf[VLEN * (0 + NVC)]);
1600  load_vec(pg, vt21, &buf[VLEN * (1 + NVC)]);
1601  load_vec(pg, vt22, &buf[VLEN * (2 + NVC)]);
1602  load_vec(pg, vt23, &buf[VLEN * (3 + NVC)]);
1603  load_vec(pg, vt24, &buf[VLEN * (4 + NVC)]);
1604  load_vec(pg, vt25, &buf[VLEN * (5 + NVC)]);
1605 
1606  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1607  svreal_t wt1r, wt1i, wt2r, wt2i;
1608 
1609  for (int ic = 0; ic < NC; ++ic) {
1610  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1611  &u[VLEN * (2 * ic)]);
1612  mult_uv(pg, wt1r, wt1i,
1613  ut10, ut11, ut12, ut13, ut14, ut15,
1614  vt10, vt11, vt12, vt13, vt14, vt15);
1615  mult_uv(pg, wt2r, wt2i,
1616  ut10, ut11, ut12, ut13, ut14, ut15,
1617  vt20, vt21, vt22, vt23, vt24, vt25);
1618  set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1619  }
1620  }
1621 
1622 
1623 //====================================================================
1624  template<typename REALTYPE>
1625  inline void mult_wilson_tp2_dirac(REALTYPE *__restrict v2,
1626  REALTYPE *__restrict u,
1627  REALTYPE *__restrict buf)
1628  {
1629  svbool_t pg = set_predicate();
1630 
1631  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1632  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1633 
1634  load_vec(pg, vt10, &buf[VLEN * 0]);
1635  load_vec(pg, vt11, &buf[VLEN * 1]);
1636  load_vec(pg, vt12, &buf[VLEN * 2]);
1637  load_vec(pg, vt13, &buf[VLEN * 3]);
1638  load_vec(pg, vt14, &buf[VLEN * 4]);
1639  load_vec(pg, vt15, &buf[VLEN * 5]);
1640 
1641  load_vec(pg, vt20, &buf[VLEN * (0 + NVC)]);
1642  load_vec(pg, vt21, &buf[VLEN * (1 + NVC)]);
1643  load_vec(pg, vt22, &buf[VLEN * (2 + NVC)]);
1644  load_vec(pg, vt23, &buf[VLEN * (3 + NVC)]);
1645  load_vec(pg, vt24, &buf[VLEN * (4 + NVC)]);
1646  load_vec(pg, vt25, &buf[VLEN * (5 + NVC)]);
1647 
1648  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1649  svreal_t wt1r, wt1i, wt2r, wt2i;
1650 
1651  for (int ic = 0; ic < NC; ++ic) {
1652  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1653  &u[VLEN * (2 * ic)]);
1654  mult_uv(pg, wt1r, wt1i,
1655  ut10, ut11, ut12, ut13, ut14, ut15,
1656  vt10, vt11, vt12, vt13, vt14, vt15);
1657  mult_uv(pg, wt2r, wt2i,
1658  ut10, ut11, ut12, ut13, ut14, ut15,
1659  vt20, vt21, vt22, vt23, vt24, vt25);
1660  set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1661  }
1662  }
1663 
1664 
1665 //====================================================================
1666  template<typename REALTYPE>
1667  inline void mult_wilson_tpb_dirac(Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1668  {
1669  svbool_t pg = set_predicate();
1670 
1671  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1672  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1673 
1674  set_sp2_tp_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1675  set_sp2_tp_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1676  set_sp2_tp_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1677 
1678  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1679  svreal_t wt1r, wt1i, wt2r, wt2i;
1680 
1681  for (int ic = 0; ic < NC; ++ic) {
1682  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1683  &u[VLEN * (2 * ic)]);
1684  mult_uv(pg, wt1r, wt1i,
1685  ut10, ut11, ut12, ut13, ut14, ut15,
1686  vt10, vt11, vt12, vt13, vt14, vt15);
1687  mult_uv(pg, wt2r, wt2i,
1688  ut10, ut11, ut12, ut13, ut14, ut15,
1689  vt20, vt21, vt22, vt23, vt24, vt25);
1690  set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1691  }
1692  }
1693 
1694 
1695 //====================================================================
1696  template<typename REALTYPE>
1697  inline void mult_wilson_tpb_dirac(REALTYPE *__restrict v2,
1698  REALTYPE *__restrict u,
1699  REALTYPE *__restrict v1)
1700  {
1701  svbool_t pg = set_predicate();
1702 
1703  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1704  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1705 
1706  set_sp2_tp_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1707  set_sp2_tp_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1708  set_sp2_tp_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1709 
1710  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1711  svreal_t wt1r, wt1i, wt2r, wt2i;
1712 
1713  for (int ic = 0; ic < NC; ++ic) {
1714  load_u(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1715  &u[VLEN * (2 * ic)]);
1716  mult_uv(pg, wt1r, wt1i,
1717  ut10, ut11, ut12, ut13, ut14, ut15,
1718  vt10, vt11, vt12, vt13, vt14, vt15);
1719  mult_uv(pg, wt2r, wt2i,
1720  ut10, ut11, ut12, ut13, ut14, ut15,
1721  vt20, vt21, vt22, vt23, vt24, vt25);
1722  set_sp4_tp_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1723  }
1724  }
1725 
1726 
1727 //====================================================================
1728  template<typename REALTYPE>
1729  inline void mult_wilson_tm1_dirac(REALTYPE *__restrict buf,
1730  REALTYPE *__restrict u,
1731  REALTYPE *__restrict v1)
1732  {
1733  svbool_t pg = set_predicate();
1734 
1735  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1736  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1737 
1738  set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1739  set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1740  set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1741 
1742  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1743  svreal_t wt1r, wt1i, wt2r, wt2i;
1744 
1745  for (int ic = 0; ic < NC; ++ic) {
1746  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1747  &u[VLEN * NVC * ic]);
1748  mult_udv(pg, wt1r, wt1i,
1749  ut10, ut11, ut12, ut13, ut14, ut15,
1750  vt10, vt11, vt12, vt13, vt14, vt15);
1751 
1752  mult_udv(pg, wt2r, wt2i,
1753  ut10, ut11, ut12, ut13, ut14, ut15,
1754  vt20, vt21, vt22, vt23, vt24, vt25);
1755 
1756  save_vec(pg, &buf[VLEN * (2 * ic)], wt1r);
1757  save_vec(pg, &buf[VLEN * (2 * ic + 1)], wt1i);
1758 
1759  save_vec(pg, &buf[VLEN * (2 * ic + NVC)], wt2r);
1760  save_vec(pg, &buf[VLEN * (2 * ic + 1 + NVC)], wt2i);
1761  }
1762  }
1763 
1764 
1765 //====================================================================
1766  template<typename REALTYPE>
1767  inline void mult_wilson_tm2_dirac(Vsimd_t *v2, REALTYPE *buf)
1768  {
1769  svbool_t pg = set_predicate();
1770 
1771  for (int ic = 0; ic < NC; ++ic) {
1772  svreal_t wt1r, wt1i, wt2r, wt2i;
1773  load_vec(pg, wt1r, &buf[VLEN * (2 * ic)]);
1774  load_vec(pg, wt1i, &buf[VLEN * (2 * ic + 1)]);
1775  load_vec(pg, wt2r, &buf[VLEN * (2 * ic + NVC)]);
1776  load_vec(pg, wt2i, &buf[VLEN * (2 * ic + 1 + NVC)]);
1777  set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1778  }
1779  }
1780 
1781 
1782 //====================================================================
1783  template<typename REALTYPE>
1784  inline void mult_wilson_tm2_dirac(REALTYPE *__restrict v2,
1785  REALTYPE *__restrict buf)
1786  {
1787  svbool_t pg = set_predicate();
1788 
1789  for (int ic = 0; ic < NC; ++ic) {
1790  svreal_t wt1r, wt1i, wt2r, wt2i;
1791  load_vec(pg, wt1r, &buf[VLEN * (2 * ic)]);
1792  load_vec(pg, wt1i, &buf[VLEN * (2 * ic + 1)]);
1793  load_vec(pg, wt2r, &buf[VLEN * (2 * ic + NVC)]);
1794  load_vec(pg, wt2i, &buf[VLEN * (2 * ic + 1 + NVC)]);
1795  set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1796  }
1797  }
1798 
1799 
1800 //====================================================================
1801  template<typename REALTYPE>
1802  inline void mult_wilson_tmb_dirac(Vsimd_t *v2, REALTYPE *u, REALTYPE *v1)
1803  {
1804  svbool_t pg = set_predicate();
1805 
1806  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1807  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1808 
1809  set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1810  set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1811  set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1812 
1813  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1814  svreal_t wt1r, wt1i, wt2r, wt2i;
1815 
1816  for (int ic = 0; ic < NC; ++ic) {
1817  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1818  &u[VLEN * NVC * ic]);
1819  mult_udv(pg, wt1r, wt1i,
1820  ut10, ut11, ut12, ut13, ut14, ut15,
1821  vt10, vt11, vt12, vt13, vt14, vt15);
1822  mult_udv(pg, wt2r, wt2i,
1823  ut10, ut11, ut12, ut13, ut14, ut15,
1824  vt20, vt21, vt22, vt23, vt24, vt25);
1825  set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1826  }
1827  }
1828 
1829 
1830 //====================================================================
1831  template<typename REALTYPE>
1832  inline void mult_wilson_tmb_dirac(REALTYPE *__restrict v2,
1833  REALTYPE *__restrict u,
1834  REALTYPE *__restrict v1)
1835  {
1836  svbool_t pg = set_predicate();
1837 
1838  svreal_t vt10, vt11, vt12, vt13, vt14, vt15;
1839  svreal_t vt20, vt21, vt22, vt23, vt24, vt25;
1840 
1841  set_sp2_tm_dirac(pg, vt10, vt11, vt20, vt21, v1, 0);
1842  set_sp2_tm_dirac(pg, vt12, vt13, vt22, vt23, v1, 1);
1843  set_sp2_tm_dirac(pg, vt14, vt15, vt24, vt25, v1, 2);
1844 
1845  svreal_t ut10, ut11, ut12, ut13, ut14, ut15;
1846  svreal_t wt1r, wt1i, wt2r, wt2i;
1847 
1848  for (int ic = 0; ic < NC; ++ic) {
1849  load_udag(pg, ut10, ut11, ut12, ut13, ut14, ut15,
1850  &u[VLEN * NVC * ic]);
1851  mult_udv(pg, wt1r, wt1i,
1852  ut10, ut11, ut12, ut13, ut14, ut15,
1853  vt10, vt11, vt12, vt13, vt14, vt15);
1854  mult_udv(pg, wt2r, wt2i,
1855  ut10, ut11, ut12, ut13, ut14, ut15,
1856  vt20, vt21, vt22, vt23, vt24, vt25);
1857  set_sp4_tm_dirac(pg, v2, wt1r, wt1i, wt2r, wt2i, ic);
1858  }
1859  }
1860 
1861 
1862 //====================================================================
1863  template<typename REALTYPE>
1864  inline void mult_wilson_aypx_save(REALTYPE *__restrict v2, REALTYPE a,
1865  Vsimd_t *__restrict v2v, REALTYPE *__restrict v1)
1866  {
1867  svreal_t v2F, v1F;
1868  svbool_t pg = set_predicate();
1869 
1870  for (int i = 0; i < NVCD; ++i) {
1871  load_vec(pg, v1F, &v1[VLEN * i]);
1872  load_vec(pg, v2F, &v2v[i].v[0]);
1873  // v2F = svmla_m(pg, v1F, v2F, a); // v1F = v1F + v2F * a
1874  // save_vec(pg, &v2[VLEN*i], v2F);
1875  axpy_vec(pg, v1F, a, v2F); // v1F = v1F + v2F * a
1876  save_vec(pg, &v2[VLEN * i], v1F);
1877  }
1878  }
1879 
1880 
1881 //====================================================================
1882 } // nameless namespace end
1883 #endif
ID1
#define ID1
Definition: fopr_Wilson_impl_SU2-inc.h:18
NVCD
#define NVCD
Definition: define_params_SU3.h:20
VLEN
#define VLEN
Definition: bridgeQXS_Clover_coarse_double.cpp:12
Vsimd_t
Definition: vsimd_double-inc.h:13
Isimd_t
Definition: vsimd_double-inc.h:20
ID2
#define ID2
Definition: fopr_Wilson_impl_SU2-inc.h:19
ID4
#define ID4
Definition: fopr_Wilson_impl_SU2-inc.h:21
NC
#define NC
Definition: field_F_imp_SU2-inc.h:2
ND
#define ND
Definition: field_F_imp_SU2-inc.h:5
ID3
#define ID3
Definition: fopr_Wilson_impl_SU2-inc.h:20
Usimd_t
Definition: vsimd_double-inc.h:25
VLENY
#define VLENY
Definition: bridgeQXS_Clover_coarse_double.cpp:14
NVC
#define NVC
Definition: fopr_Wilson_impl_SU2-inc.h:15
svbool_t
Definition: vsimd_double-inc.h:30
VLENX
#define VLENX
Definition: bridgeQXS_Clover_coarse_double.cpp:13