9 #ifndef QXS_VSIMD_DOMAINWALL_SU3_DOUBLE_INC_INCLUDED
10 #define QXS_VSIMD_DOMAINWALL_SU3_DOUBLE_INC_INCLUDED
// set_aPp5_dirac_vec (Vsimd_t variant): overwrites v with scaled component
// differences of w. For every mapped index ivc2 and every lane k the visible
// statements compute
//   v[ID1] = a * (w[ID1] - w[ID3]),   v[ID2] = a * (w[ID2] - w[ID4]),
//   v[ID3] = a * (w[ID3] - w[ID1]),   v[ID4] = a * (w[ID4] - w[ID2]).
// NOTE(review): `a` and `w` are declared on signature lines that are not
// visible in this excerpt. Judging by the name this is presumably a * (1 + gamma5)
// applied to w in the Dirac representation -- confirm against the full header.
29 template<
typename REALTYPE>
30 inline void set_aPp5_dirac_vec(
Vsimd_t *v,
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
// NOTE(review): presumably low bit = real/imaginary, ivc / 2 = color -- confirm.
33 for (
int ivc = 0; ivc <
NVC; ++ivc) {
34 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Element-wise over the VLEN lanes held in each Vsimd_t::v array.
35 for (
int k = 0; k <
VLEN; ++k) {
36 v[
ID1 + ivc2].
v[k] = a * (w[
ID1 + ivc2].
v[k] - w[
ID3 + ivc2].
v[k]);
37 v[
ID2 + ivc2].
v[k] = a * (w[
ID2 + ivc2].
v[k] - w[
ID4 + ivc2].
v[k]);
38 v[
ID3 + ivc2].
v[k] = a * (w[
ID3 + ivc2].
v[k] - w[
ID1 + ivc2].
v[k]);
39 v[
ID4 + ivc2].
v[k] = a * (w[
ID4 + ivc2].
v[k] - w[
ID2 + ivc2].
v[k]);
// set_aPm5_dirac_vec (Vsimd_t variant): overwrites v with scaled component
// sums of w. For every mapped index ivc2 and every lane k the visible
// statements compute
//   v[ID1] = a * (w[ID1] + w[ID3]),   v[ID2] = a * (w[ID2] + w[ID4]),
//   v[ID3] = a * (w[ID3] + w[ID1]),   v[ID4] = a * (w[ID4] + w[ID2]).
// NOTE(review): `a` and `w` are declared on signature lines that are not
// visible in this excerpt. Judging by the name this is presumably a * (1 - gamma5)
// applied to w in the Dirac representation -- confirm against the full header.
45 template<
typename REALTYPE>
46 inline void set_aPm5_dirac_vec(
Vsimd_t *v,
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
49 for (
int ivc = 0; ivc <
NVC; ++ivc) {
50 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Element-wise over the VLEN lanes held in each Vsimd_t::v array.
51 for (
int k = 0; k <
VLEN; ++k) {
52 v[
ID1 + ivc2].
v[k] = a * (w[
ID1 + ivc2].
v[k] + w[
ID3 + ivc2].
v[k]);
53 v[
ID2 + ivc2].
v[k] = a * (w[
ID2 + ivc2].
v[k] + w[
ID4 + ivc2].
v[k]);
54 v[
ID3 + ivc2].
v[k] = a * (w[
ID3 + ivc2].
v[k] + w[
ID1 + ivc2].
v[k]);
55 v[
ID4 + ivc2].
v[k] = a * (w[
ID4 + ivc2].
v[k] + w[
ID2 + ivc2].
v[k]);
// add_aPp5_dirac_vec (Vsimd_t variant): accumulating counterpart of the
// "set" form -- the same scaled differences are added to v with += instead
// of being assigned:
//   v[ID1] += a * (w[ID1] - w[ID3]),   v[ID2] += a * (w[ID2] - w[ID4]),
//   v[ID3] += a * (w[ID3] - w[ID1]),   v[ID4] += a * (w[ID4] - w[ID2]).
// NOTE(review): `a` and `w` are declared on signature lines that are not
// visible in this excerpt.
61 template<
typename REALTYPE>
62 inline void add_aPp5_dirac_vec(
Vsimd_t *v,
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
65 for (
int ivc = 0; ivc <
NVC; ++ivc) {
66 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Element-wise over the VLEN lanes held in each Vsimd_t::v array.
67 for (
int k = 0; k <
VLEN; ++k) {
68 v[
ID1 + ivc2].
v[k] += a * (w[
ID1 + ivc2].
v[k] - w[
ID3 + ivc2].
v[k]);
69 v[
ID2 + ivc2].
v[k] += a * (w[
ID2 + ivc2].
v[k] - w[
ID4 + ivc2].
v[k]);
70 v[
ID3 + ivc2].
v[k] += a * (w[
ID3 + ivc2].
v[k] - w[
ID1 + ivc2].
v[k]);
71 v[
ID4 + ivc2].
v[k] += a * (w[
ID4 + ivc2].
v[k] - w[
ID2 + ivc2].
v[k]);
// add_aPm5_dirac_vec (Vsimd_t variant): accumulating counterpart of the
// "set" form -- the scaled sums are added to v with += instead of being
// assigned:
//   v[ID1] += a * (w[ID1] + w[ID3]),   v[ID2] += a * (w[ID2] + w[ID4]),
//   v[ID3] += a * (w[ID3] + w[ID1]),   v[ID4] += a * (w[ID4] + w[ID2]).
// NOTE(review): `a` and `w` are declared on signature lines that are not
// visible in this excerpt.
77 template<
typename REALTYPE>
78 inline void add_aPm5_dirac_vec(
Vsimd_t *v,
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
81 for (
int ivc = 0; ivc <
NVC; ++ivc) {
82 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Element-wise over the VLEN lanes held in each Vsimd_t::v array.
83 for (
int k = 0; k <
VLEN; ++k) {
84 v[
ID1 + ivc2].
v[k] += a * (w[
ID1 + ivc2].
v[k] + w[
ID3 + ivc2].
v[k]);
85 v[
ID2 + ivc2].
v[k] += a * (w[
ID2 + ivc2].
v[k] + w[
ID4 + ivc2].
v[k]);
86 v[
ID3 + ivc2].
v[k] += a * (w[
ID3 + ivc2].
v[k] + w[
ID1 + ivc2].
v[k]);
87 v[
ID4 + ivc2].
v[k] += a * (w[
ID4 + ivc2].
v[k] + w[
ID2 + ivc2].
v[k]);
// Load-and-combine kernel reading directly from the flat array w.
// NOTE(review): the line carrying the function name and the leading
// parameters (pg, vt1r ... presumably by reference) is not visible in this
// excerpt; the add-then-scale pattern matches the "set ... Pm5" family, but
// confirm the name against the full header.
// Visible behavior: the ID1/ID2 components are loaded into vt1*/vt2*, the
// ID3/ID4 components into w3*/w4*, then vt1 is combined with w3 and vt2 with
// w4 through add_vec and both are passed to scal_vec with the factor a.
// NOTE(review): add_vec / scal_vec are defined elsewhere; presumably
// x += y and x *= a under predicate pg -- verify.
93 template<
typename REALTYPE>
98 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset of the (is, ic) element inside w, in units of VLEN-wide vectors.
101 int off_up = 2 *
ND * ic +
NVCD * is;
// "+ 1" selects the entry that is loaded into the *i (imaginary) register.
104 load_vec(pg, vt1r, &w[
VLEN * (
ID1 + off_up)]);
105 load_vec(pg, vt1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
106 load_vec(pg, vt2r, &w[
VLEN * (
ID2 + off_up)]);
107 load_vec(pg, vt2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
109 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
110 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
111 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
112 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// Component 1 combined with component 3, then scaled by a.
114 add_vec(pg, vt1r, w3r);
115 add_vec(pg, vt1i, w3i);
116 scal_vec(pg, vt1r, a);
117 scal_vec(pg, vt1i, a);
// Component 2 combined with component 4, then scaled by a.
122 add_vec(pg, vt2r, w4r);
123 add_vec(pg, vt2i, w4i);
124 scal_vec(pg, vt2r, a);
125 scal_vec(pg, vt2i, a);
// Load-and-accumulate kernel reading directly from the flat array w.
// NOTE(review): the line carrying the function name and the leading
// parameters (pg, vt1r ... vt4i) is not visible in this excerpt; the
// subtract-then-axpy(+a / -a) pattern matches the "add ... Pp5" family, but
// confirm the name against the full header.
// Visible behavior: all four components of w at (is, ic) are loaded, w1 is
// combined with w3 and w2 with w4 through sub_vec, and the results are fed to
// axpy_vec with +a for vt1*/vt2* and -a for vt3*/vt4*.
// NOTE(review): sub_vec / axpy_vec are defined elsewhere; presumably
// x -= y and y_out += a * x under predicate pg -- verify.
132 template<
typename REALTYPE>
137 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset of the (is, ic) element inside w, in units of VLEN-wide vectors.
140 int off_up = 2 *
ND * ic +
NVCD * is;
141 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// "+ 1" selects the entry that is loaded into the *i (imaginary) register.
143 load_vec(pg, w1r, &w[
VLEN * (
ID1 + off_up)]);
144 load_vec(pg, w1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
145 load_vec(pg, w2r, &w[
VLEN * (
ID2 + off_up)]);
146 load_vec(pg, w2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
147 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
148 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
149 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
150 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// w1 and w2 are reused as scratch for the (1,3) and (2,4) combinations.
152 sub_vec(pg, w1r, w3r);
153 sub_vec(pg, w1i, w3i);
154 sub_vec(pg, w2r, w4r);
155 sub_vec(pg, w2i, w4i);
// The same combination feeds vt1 with +a and vt3 with -a.
157 axpy_vec(pg, vt1r, a, w1r);
158 axpy_vec(pg, vt1i, a, w1i);
159 axpy_vec(pg, vt3r, -a, w1r);
160 axpy_vec(pg, vt3i, -a, w1i);
// Likewise vt2 with +a and vt4 with -a.
162 axpy_vec(pg, vt2r, a, w2r);
163 axpy_vec(pg, vt2i, a, w2i);
164 axpy_vec(pg, vt4r, -a, w2r);
165 axpy_vec(pg, vt4i, -a, w2i);
// Load-and-accumulate kernel reading directly from the flat array w.
// NOTE(review): the line carrying the function name and the leading
// parameters (pg, vt1r ... vt4i) is not visible in this excerpt; the
// add-then-axpy(+a) pattern matches the "add ... Pm5" family, but confirm
// the name against the full header.
// Visible behavior: all four components of w at (is, ic) are loaded, w1 is
// combined with w3 and w2 with w4 through add_vec, and the results are fed to
// axpy_vec with the same factor a for vt1*/vt3* and for vt2*/vt4*.
// NOTE(review): add_vec / axpy_vec are defined elsewhere; presumably
// x += y and y_out += a * x under predicate pg -- verify.
169 template<
typename REALTYPE>
174 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset of the (is, ic) element inside w, in units of VLEN-wide vectors.
177 int off_up = 2 *
ND * ic +
NVCD * is;
178 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// "+ 1" selects the entry that is loaded into the *i (imaginary) register.
180 load_vec(pg, w1r, &w[
VLEN * (
ID1 + off_up)]);
181 load_vec(pg, w1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
182 load_vec(pg, w2r, &w[
VLEN * (
ID2 + off_up)]);
183 load_vec(pg, w2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
184 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
185 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
186 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
187 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// w1 and w2 are reused as scratch for the (1,3) and (2,4) combinations.
189 add_vec(pg, w1r, w3r);
190 add_vec(pg, w1i, w3i);
191 add_vec(pg, w2r, w4r);
192 add_vec(pg, w2i, w4i);
// The same combination feeds both vt1 and vt3 with factor a.
194 axpy_vec(pg, vt1r, a, w1r);
195 axpy_vec(pg, vt1i, a, w1i);
196 axpy_vec(pg, vt3r, a, w1r);
197 axpy_vec(pg, vt3i, a, w1i);
// Likewise both vt2 and vt4 with factor a.
199 axpy_vec(pg, vt2r, a, w2r);
200 axpy_vec(pg, vt2i, a, w2i);
201 axpy_vec(pg, vt4r, a, w2r);
202 axpy_vec(pg, vt4i, a, w2i);
// add_aPp5_dirac_vec (register variant): works on values already held in
// vector variables instead of memory.
// Visible behavior: yt1 = xt1 - xt3 and yt2 = xt2 - xt4 (real and imaginary
// parts separately, via svsub_m under predicate pg); these are then passed to
// axpy_vec with +a for vt1*/vt2* and -a for vt3*/vt4*.
// NOTE(review): the remaining parameters and the declarations of yt* are on
// lines not visible in this excerpt; axpy_vec is defined elsewhere and
// presumably performs vt += a * yt -- verify.
206 template<
typename REALTYPE>
207 inline void add_aPp5_dirac_vec(
svbool_t pg,
219 yt1r = svsub_m(pg, xt1r, xt3r);
220 yt1i = svsub_m(pg, xt1i, xt3i);
221 yt2r = svsub_m(pg, xt2r, xt4r);
222 yt2i = svsub_m(pg, xt2i, xt4i);
// Components 1/2 receive +a times the difference ...
223 axpy_vec(pg, vt1r, a, yt1r);
224 axpy_vec(pg, vt1i, a, yt1i);
225 axpy_vec(pg, vt2r, a, yt2r);
226 axpy_vec(pg, vt2i, a, yt2i);
// ... and components 3/4 receive -a times the same difference.
227 axpy_vec(pg, vt3r, -a, yt1r);
228 axpy_vec(pg, vt3i, -a, yt1i);
229 axpy_vec(pg, vt4r, -a, yt2r);
230 axpy_vec(pg, vt4i, -a, yt2i);
// add_aPm5_dirac_vec (register variant): works on values already held in
// vector variables instead of memory.
// Visible behavior: yt1 = xt1 + xt3 and yt2 = xt2 + xt4 (real and imaginary
// parts separately, via svadd_m under predicate pg); these are then passed to
// axpy_vec with the same factor a for vt1*/vt2* and for vt3*/vt4*.
// NOTE(review): the remaining parameters and the declarations of yt* are on
// lines not visible in this excerpt; axpy_vec is defined elsewhere and
// presumably performs vt += a * yt -- verify.
234 template<
typename REALTYPE>
235 inline void add_aPm5_dirac_vec(
svbool_t pg,
247 yt1r = svadd_m(pg, xt1r, xt3r);
248 yt1i = svadd_m(pg, xt1i, xt3i);
249 yt2r = svadd_m(pg, xt2r, xt4r);
250 yt2i = svadd_m(pg, xt2i, xt4i);
// Components 1/2 receive a times the sum ...
251 axpy_vec(pg, vt1r, a, yt1r);
252 axpy_vec(pg, vt1i, a, yt1i);
253 axpy_vec(pg, vt2r, a, yt2r);
254 axpy_vec(pg, vt2i, a, yt2i);
// ... and components 3/4 receive a times the same sum.
255 axpy_vec(pg, vt3r, a, yt1r);
256 axpy_vec(pg, vt3i, a, yt1i);
257 axpy_vec(pg, vt4r, a, yt2r);
258 axpy_vec(pg, vt4i, a, yt2i);
// set_aPp5_dirac_vec (register variant): overwrites vt1*/vt2* with the
// differences xt1 - xt3 and xt2 - xt4 (svsub_m under predicate pg), then
// passes vt1*/vt2* to scal_vec with factor a and vt3*/vt4* with factor -a.
// NOTE(review): the statements that give vt3*/vt4* their values before the
// scaling sit on lines not visible in this excerpt (presumably copies of
// vt1*/vt2*) -- confirm against the full header. scal_vec is defined
// elsewhere and presumably multiplies in place.
262 template<
typename REALTYPE>
263 inline void set_aPp5_dirac_vec(
svbool_t pg,
274 vt1r = svsub_m(pg, xt1r, xt3r);
275 vt1i = svsub_m(pg, xt1i, xt3i);
276 vt2r = svsub_m(pg, xt2r, xt4r);
277 vt2i = svsub_m(pg, xt2i, xt4i);
// Components 1/2 are scaled by +a, components 3/4 by -a.
283 scal_vec(pg, vt1r, a);
284 scal_vec(pg, vt1i, a);
285 scal_vec(pg, vt2r, a);
286 scal_vec(pg, vt2i, a);
287 scal_vec(pg, vt3r, -a);
288 scal_vec(pg, vt3i, -a);
289 scal_vec(pg, vt4r, -a);
290 scal_vec(pg, vt4i, -a);
// set_aPm5_dirac_vec (register variant): overwrites vt1*/vt2* with the sums
// xt1 + xt3 and xt2 + xt4 (svadd_m under predicate pg) and passes them to
// scal_vec with factor a.
// NOTE(review): no statement touching vt3*/vt4* is visible in this excerpt;
// they are presumably set on the lines that follow (not shown) -- confirm
// against the full header.
294 template<
typename REALTYPE>
295 inline void set_aPm5_dirac_vec(
svbool_t pg,
306 vt1r = svadd_m(pg, xt1r, xt3r);
307 vt1i = svadd_m(pg, xt1i, xt3i);
308 vt2r = svadd_m(pg, xt2r, xt4r);
309 vt2i = svadd_m(pg, xt2i, xt4i);
310 scal_vec(pg, vt1r, a);
311 scal_vec(pg, vt1i, a);
312 scal_vec(pg, vt2r, a);
313 scal_vec(pg, vt2i, a);
// load_mult_gm5_dirac_vec (register variant): loads the four components of w
// with the upper and lower pairs exchanged -- the ID1 entry goes to vt3*, ID2
// to vt4*, ID3 to vt1* and ID4 to vt2*.
// NOTE(review): only the loads are visible here; any sign flip or scaling
// that completes the gamma5 multiplication sits on lines not shown in this
// excerpt (the scalar variants in this file negate) -- confirm against the
// full header. The remaining parameters are likewise not visible.
321 template<
typename REALTYPE>
322 inline void load_mult_gm5_dirac_vec(
svbool_t pg,
// "+ 1" selects the entry that is loaded into the *i (imaginary) register.
329 load_vec(pg, vt3r, &w[
VLEN * (
ID1)]);
330 load_vec(pg, vt3i, &w[
VLEN * (
ID1 + 1)]);
334 load_vec(pg, vt4r, &w[
VLEN * (
ID2)]);
335 load_vec(pg, vt4i, &w[
VLEN * (
ID2 + 1)]);
339 load_vec(pg, vt1r, &w[
VLEN * (
ID3)]);
340 load_vec(pg, vt1i, &w[
VLEN * (
ID3 + 1)]);
344 load_vec(pg, vt2r, &w[
VLEN * (
ID4)]);
345 load_vec(pg, vt2i, &w[
VLEN * (
ID4 + 1)]);
// Body fragment of a Vsimd_t kernel whose template header and signature are
// not visible in this excerpt.
// Visible behavior: for every mapped index ivc2 and every lane k,
//   v[ID1] = -w[ID3],  v[ID2] = -w[ID4],  v[ID3] = -w[ID1],  v[ID4] = -w[ID2],
// i.e. the upper and lower component pairs are exchanged and negated.
// NOTE(review): presumably the gamma5 multiplication in the Dirac
// representation -- confirm the function name against the full header.
353 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
354 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
355 for (
int k = 0; k <
VLEN; ++k) {
356 v[
ID1 + ivc2].
v[k] = -w[
ID3 + ivc2].
v[k];
357 v[
ID2 + ivc2].
v[k] = -w[
ID4 + ivc2].
v[k];
358 v[
ID3 + ivc2].
v[k] = -w[
ID1 + ivc2].
v[k];
359 v[
ID4 + ivc2].
v[k] = -w[
ID2 + ivc2].
v[k];
// load_mult_gm5_dirac_vec (Vsimd_t destination, flat REALTYPE source, no
// scale factor).
// Only the loop headers are visible in this excerpt: the iteration is over
// NVC indices (mapped to ivc2) and VLEN lanes, as in the sibling kernels.
// NOTE(review): the assignments inside the inner loop are on lines not shown
// here; presumably they match the scaled overload below with a == 1 --
// confirm against the full header. Nc is not referenced by any visible line.
365 template<
typename REALTYPE>
366 inline void load_mult_gm5_dirac_vec(
Vsimd_t *v, REALTYPE *w,
int Nc)
368 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
369 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
370 for (
int k = 0; k <
VLEN; ++k) {
// load_mult_gm5_dirac_vec (Vsimd_t destination, flat REALTYPE source, with
// scale factor a).
// For every mapped index ivc2 and every lane k the visible statements read
// lane k of vector number (IDn + ivc2) from the flat array w, i.e. element
// w[k + VLEN * (IDn + ivc2)], and store
//   v[ID1] = -a * w(ID3),  v[ID2] = -a * w(ID4),
//   v[ID3] = -a * w(ID1),  v[ID4] = -a * w(ID2).
// Nc is not referenced by any visible line.
// NOTE(review): presumably a * gamma5 * w in the Dirac representation --
// confirm against the full header.
380 template<
typename REALTYPE>
381 inline void load_mult_gm5_dirac_vec(
Vsimd_t *v,
382 REALTYPE a, REALTYPE *w,
int Nc)
384 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps the low bit of ivc and strides the remaining part by 2 * ND.
385 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
386 for (
int k = 0; k <
VLEN; ++k) {
387 v[
ID1 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID3 + ivc2)];
388 v[
ID2 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID4 + ivc2)];
389 v[
ID3 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID1 + ivc2)];
390 v[
ID4 + ivc2].
v[k] = -a * w[k +
VLEN * (
ID2 + ivc2)];
// dw_5dir_axpy: produces two output vectors at position `index` from the
// pair (wt, zt), where wt is loaded from w[VLEN * index].
// Visible behavior:
//   vt is built by set_vec with (a1, wt) followed by axpy_vec with (a2, zt)
//   and saved to v[VLEN * index];
//   yt is built the same way with coefficients -0.5 * b1 and -0.5 * b2 and
//   saved to y[VLEN * index].
// NOTE(review): the trailing parameters (including `index`), the local
// declarations and the statement that gives zt its value are on lines not
// visible in this excerpt. set_vec / axpy_vec are defined elsewhere;
// presumably vt = a1 * wt and vt += a2 * zt -- verify.
396 template<
typename REALTYPE>
397 inline void dw_5dir_axpy(
svbool_t pg, REALTYPE *v,
398 REALTYPE *y, REALTYPE *w,
399 REALTYPE a1, REALTYPE a2,
400 REALTYPE b1, REALTYPE b2,
406 load_vec(pg, wt, &w[
VLEN * index]);
// First output: combination with coefficients a1, a2.
407 set_vec(pg, vt, a1, wt);
408 axpy_vec(pg, vt, a2, zt);
409 save_vec(pg, &v[
VLEN * index], vt);
// Second output: same inputs, coefficients -0.5 * b1 and -0.5 * b2.
411 set_vec(pg, yt, -0.5 * b1, wt);
412 axpy_vec(pg, yt, -0.5 * b2, zt);
413 save_vec(pg, &y[
VLEN * index], yt);
// dw_5dir_dag: fills vt1 ... vt4 (real and imaginary parts) from w and y at
// vector offset `index`.
// Visible behavior:
//   1. w is loaded with the component pairs exchanged: ID3 goes to vt1*, ID4 to
//      vt2*, ID1 to vt3*, ID2 to vt4*.
//   2. For each n = 1..4, the ID<n> entry of y is loaded into yt<n>*, vt<n>*
//      is passed to scal_vec with factor -a1, and then to axpy_vec with
//      (a2, yt<n>*).
// NOTE(review): scal_vec / axpy_vec are defined elsewhere; presumably the net
// effect is vt<n> = -a1 * (swapped w) + a2 * y<n> -- verify. The parameters
// between `pg` and `w` and the declarations of yt* are on lines not visible
// in this excerpt, and the function may continue past the last visible line.
417 template<
typename REALTYPE>
418 inline void dw_5dir_dag(
svbool_t pg,
423 REALTYPE *w, REALTYPE *y,
424 REALTYPE a1, REALTYPE a2,
int index)
// Swapped load of w: (ID3, ID4, ID1, ID2) goes to (vt1, vt2, vt3, vt4).
426 load_vec(pg, vt1r, &w[
VLEN * (
ID3 + index)]);
427 load_vec(pg, vt1i, &w[
VLEN * (
ID3 + 1 + index)]);
428 load_vec(pg, vt2r, &w[
VLEN * (
ID4 + index)]);
429 load_vec(pg, vt2i, &w[
VLEN * (
ID4 + 1 + index)]);
430 load_vec(pg, vt3r, &w[
VLEN * (
ID1 + index)]);
431 load_vec(pg, vt3i, &w[
VLEN * (
ID1 + 1 + index)]);
432 load_vec(pg, vt4r, &w[
VLEN * (
ID2 + index)]);
433 load_vec(pg, vt4i, &w[
VLEN * (
ID2 + 1 + index)]);
// Component 1: scale by -a1, then combine with a2 times y's ID1 entry.
436 load_vec(pg, yt1r, &y[
VLEN * (
ID1 + index)]);
437 load_vec(pg, yt1i, &y[
VLEN * (
ID1 + 1 + index)]);
438 scal_vec(pg, vt1r, -a1);
439 scal_vec(pg, vt1i, -a1);
440 axpy_vec(pg, vt1r, a2, yt1r);
441 axpy_vec(pg, vt1i, a2, yt1i);
// Component 2: same pattern with y's ID2 entry.
444 load_vec(pg, yt2r, &y[
VLEN * (
ID2 + index)]);
445 load_vec(pg, yt2i, &y[
VLEN * (
ID2 + 1 + index)]);
446 scal_vec(pg, vt2r, -a1);
447 scal_vec(pg, vt2i, -a1);
448 axpy_vec(pg, vt2r, a2, yt2r);
449 axpy_vec(pg, vt2i, a2, yt2i);
// Component 3: same pattern with y's ID3 entry.
452 load_vec(pg, yt3r, &y[
VLEN * (
ID3 + index)]);
453 load_vec(pg, yt3i, &y[
VLEN * (
ID3 + 1 + index)]);
454 scal_vec(pg, vt3r, -a1);
455 scal_vec(pg, vt3i, -a1);
456 axpy_vec(pg, vt3r, a2, yt3r);
457 axpy_vec(pg, vt3i, a2, yt3i);
// Component 4: same pattern with y's ID4 entry.
460 load_vec(pg, yt4r, &y[
VLEN * (
ID4 + index)]);
461 load_vec(pg, yt4i, &y[
VLEN * (
ID4 + 1 + index)]);
462 scal_vec(pg, vt4r, -a1);
463 scal_vec(pg, vt4i, -a1);
464 axpy_vec(pg, vt4r, a2, yt4r);
465 axpy_vec(pg, vt4i, a2, yt4i);