9 #ifndef QXS_VSIMD_DOMAINWALL_SU3_DOUBLE_INC_INCLUDED
10 #define QXS_VSIMD_DOMAINWALL_SU3_DOUBLE_INC_INCLUDED
// Element-wise "set" kernel on Vsimd_t arrays. For every ivc in [0, NVC) and
// every lane k in [0, VLEN) it overwrites four entries of v with a scaled
// difference of two entries of w (all indices additionally shifted by ivc2):
//   v[ID1] = a * (w[ID1] - w[ID3])      v[ID2] = a * (w[ID2] - w[ID4])
//   v[ID3] = a * (w[ID3] - w[ID1])      v[ID4] = a * (w[ID4] - w[ID2])
// so the ID3/ID4 results are the negatives of the ID1/ID2 results.
// NOTE(review): the rest of the parameter list (where `a` and `w` are
// declared) and the closing braces are not visible in this excerpt.
// NOTE(review): the ID3/ID4 statements read w[ID1]/w[ID2] after v[ID1]/v[ID2]
// have been written, so the result changes if v aliases w -- confirm callers
// always pass distinct buffers.
29 template<
typename REALTYPE>
30 inline void set_aPp5_dirac_vec(
Vsimd_t *v,
33 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps (ivc % 2) as the low part and strides (ivc / 2) by 2 * ND.
// Presumably (re, im) within a colour and the colour index -- TODO confirm.
34 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Scalar loop over the VLEN lanes stored in the .v[] member of each Vsimd_t.
35 for (
int k = 0; k <
VLEN; ++k) {
36 v[
ID1 + ivc2].
v[k] = a * (w[
ID1 + ivc2].
v[k] - w[
ID3 + ivc2].
v[k]);
37 v[
ID2 + ivc2].
v[k] = a * (w[
ID2 + ivc2].
v[k] - w[
ID4 + ivc2].
v[k]);
38 v[
ID3 + ivc2].
v[k] = a * (w[
ID3 + ivc2].
v[k] - w[
ID1 + ivc2].
v[k]);
39 v[
ID4 + ivc2].
v[k] = a * (w[
ID4 + ivc2].
v[k] - w[
ID2 + ivc2].
v[k]);
// Element-wise "set" kernel on Vsimd_t arrays. For every ivc in [0, NVC) and
// every lane k in [0, VLEN) it overwrites four entries of v with a scaled
// sum of two entries of w (all indices additionally shifted by ivc2):
//   v[ID1] = a * (w[ID1] + w[ID3])      v[ID2] = a * (w[ID2] + w[ID4])
//   v[ID3] = a * (w[ID3] + w[ID1])      v[ID4] = a * (w[ID4] + w[ID2])
// so the ID3/ID4 results equal the ID1/ID2 results.
// NOTE(review): the rest of the parameter list (where `a` and `w` are
// declared) and the closing braces are not visible in this excerpt.
// NOTE(review): the ID3/ID4 statements read w[ID1]/w[ID2] after v[ID1]/v[ID2]
// have been written, so the result changes if v aliases w -- confirm callers
// always pass distinct buffers.
45 template<
typename REALTYPE>
46 inline void set_aPm5_dirac_vec(
Vsimd_t *v,
49 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps (ivc % 2) as the low part and strides (ivc / 2) by 2 * ND.
// Presumably (re, im) within a colour and the colour index -- TODO confirm.
50 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Scalar loop over the VLEN lanes stored in the .v[] member of each Vsimd_t.
51 for (
int k = 0; k <
VLEN; ++k) {
52 v[
ID1 + ivc2].
v[k] = a * (w[
ID1 + ivc2].
v[k] + w[
ID3 + ivc2].
v[k]);
53 v[
ID2 + ivc2].
v[k] = a * (w[
ID2 + ivc2].
v[k] + w[
ID4 + ivc2].
v[k]);
54 v[
ID3 + ivc2].
v[k] = a * (w[
ID3 + ivc2].
v[k] + w[
ID1 + ivc2].
v[k]);
55 v[
ID4 + ivc2].
v[k] = a * (w[
ID4 + ivc2].
v[k] + w[
ID2 + ivc2].
v[k]);
// Accumulating variant of the Vsimd_t difference kernel. For every ivc in
// [0, NVC) and every lane k in [0, VLEN) it adds to four entries of v
// (all indices additionally shifted by ivc2):
//   v[ID1] += a * (w[ID1] - w[ID3])     v[ID2] += a * (w[ID2] - w[ID4])
//   v[ID3] += a * (w[ID3] - w[ID1])     v[ID4] += a * (w[ID4] - w[ID2])
// NOTE(review): the rest of the parameter list (where `a` and `w` are
// declared) and the closing braces are not visible in this excerpt.
// NOTE(review): the ID3/ID4 statements read w[ID1]/w[ID2] after v[ID1]/v[ID2]
// have been updated, so the result changes if v aliases w -- confirm callers
// always pass distinct buffers.
61 template<
typename REALTYPE>
62 inline void add_aPp5_dirac_vec(
Vsimd_t *v,
65 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps (ivc % 2) as the low part and strides (ivc / 2) by 2 * ND.
// Presumably (re, im) within a colour and the colour index -- TODO confirm.
66 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Scalar loop over the VLEN lanes stored in the .v[] member of each Vsimd_t.
67 for (
int k = 0; k <
VLEN; ++k) {
68 v[
ID1 + ivc2].
v[k] += a * (w[
ID1 + ivc2].
v[k] - w[
ID3 + ivc2].
v[k]);
69 v[
ID2 + ivc2].
v[k] += a * (w[
ID2 + ivc2].
v[k] - w[
ID4 + ivc2].
v[k]);
70 v[
ID3 + ivc2].
v[k] += a * (w[
ID3 + ivc2].
v[k] - w[
ID1 + ivc2].
v[k]);
71 v[
ID4 + ivc2].
v[k] += a * (w[
ID4 + ivc2].
v[k] - w[
ID2 + ivc2].
v[k]);
// Accumulating variant of the Vsimd_t sum kernel. For every ivc in
// [0, NVC) and every lane k in [0, VLEN) it adds to four entries of v
// (all indices additionally shifted by ivc2):
//   v[ID1] += a * (w[ID1] + w[ID3])     v[ID2] += a * (w[ID2] + w[ID4])
//   v[ID3] += a * (w[ID3] + w[ID1])     v[ID4] += a * (w[ID4] + w[ID2])
// NOTE(review): the rest of the parameter list (where `a` and `w` are
// declared) and the closing braces are not visible in this excerpt.
// NOTE(review): the ID3/ID4 statements read w[ID1]/w[ID2] after v[ID1]/v[ID2]
// have been updated, so the result changes if v aliases w -- confirm callers
// always pass distinct buffers.
77 template<
typename REALTYPE>
78 inline void add_aPm5_dirac_vec(
Vsimd_t *v,
81 for (
int ivc = 0; ivc <
NVC; ++ivc) {
// ivc2 keeps (ivc % 2) as the low part and strides (ivc / 2) by 2 * ND.
// Presumably (re, im) within a colour and the colour index -- TODO confirm.
82 int ivc2 = (ivc % 2) + 2 *
ND * (ivc / 2);
// Scalar loop over the VLEN lanes stored in the .v[] member of each Vsimd_t.
83 for (
int k = 0; k <
VLEN; ++k) {
84 v[
ID1 + ivc2].
v[k] += a * (w[
ID1 + ivc2].
v[k] + w[
ID3 + ivc2].
v[k]);
85 v[
ID2 + ivc2].
v[k] += a * (w[
ID2 + ivc2].
v[k] + w[
ID4 + ivc2].
v[k]);
86 v[
ID3 + ivc2].
v[k] += a * (w[
ID3 + ivc2].
v[k] + w[
ID1 + ivc2].
v[k]);
87 v[
ID4 + ivc2].
v[k] += a * (w[
ID4 + ivc2].
v[k] + w[
ID2 + ivc2].
v[k]);
// Predicated vector kernel; its name and leading parameters (pg and the
// vt1r..vt4i accumulators) are not visible in this excerpt.
// Visible behaviour: loads eight vectors from w at element offsets
// VLEN * (IDn [+ 1] + off_up), n = 1..4, combines the pairs (w1, w3) and
// (w2, w4) with sub_vec, then passes the results to axpy_vec with
// coefficient a for vt1*/vt2* and coefficient -a for vt3*/vt4*.
// NOTE(review): presumably sub_vec(pg, x, y) is x -= y and
// axpy_vec(pg, y, a, x) is y += a * x, which would make this the vector
// counterpart of the Vsimd_t add_aPp5_dirac_vec -- confirm against the
// helper definitions.
93 template<
typename REALTYPE>
98 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset into w: ic is strided by 2 * ND and is by NVCD.
// Presumably ic is a colour index and is a 5th-dimension index -- TODO confirm.
101 int off_up = 2 *
ND * ic +
NVCD * is;
102 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// The "+ 1" offsets fill the *i variables (presumably imaginary parts).
104 load_vec(pg, w1r, &w[
VLEN * (
ID1 + off_up)]);
105 load_vec(pg, w1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
106 load_vec(pg, w2r, &w[
VLEN * (
ID2 + off_up)]);
107 load_vec(pg, w2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
108 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
109 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
110 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
111 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// Combine components 1 with 3 and 2 with 4; w3*/w4* are not used afterwards.
113 sub_vec(pg, w1r, w3r);
114 sub_vec(pg, w1i, w3i);
115 sub_vec(pg, w2r, w4r);
116 sub_vec(pg, w2i, w4i);
// The same combined vector feeds vt1 with +a and vt3 with -a.
118 axpy_vec(pg, vt1r, a, w1r);
119 axpy_vec(pg, vt1i, a, w1i);
120 axpy_vec(pg, vt3r, -a, w1r);
121 axpy_vec(pg, vt3i, -a, w1i);
// Likewise for vt2 (+a) and vt4 (-a).
123 axpy_vec(pg, vt2r, a, w2r);
124 axpy_vec(pg, vt2i, a, w2i);
125 axpy_vec(pg, vt4r, -a, w2r);
126 axpy_vec(pg, vt4i, -a, w2i);
// Predicated vector kernel; its name and leading parameters (pg and the
// vt1r..vt4i accumulators) are not visible in this excerpt.
// Visible behaviour: loads eight vectors from w at element offsets
// VLEN * (IDn [+ 1] + off_up), n = 1..4, combines the pairs (w1, w3) and
// (w2, w4) with add_vec, then passes each result to axpy_vec with
// coefficient a for both vt1*/vt3* and vt2*/vt4*.
// NOTE(review): presumably add_vec(pg, x, y) is x += y and
// axpy_vec(pg, y, a, x) is y += a * x, which would make this the vector
// counterpart of the Vsimd_t add_aPm5_dirac_vec -- confirm against the
// helper definitions.
130 template<
typename REALTYPE>
135 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset into w: ic is strided by 2 * ND and is by NVCD.
// Presumably ic is a colour index and is a 5th-dimension index -- TODO confirm.
138 int off_up = 2 *
ND * ic +
NVCD * is;
139 svreal_t w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i;
// The "+ 1" offsets fill the *i variables (presumably imaginary parts).
141 load_vec(pg, w1r, &w[
VLEN * (
ID1 + off_up)]);
142 load_vec(pg, w1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
143 load_vec(pg, w2r, &w[
VLEN * (
ID2 + off_up)]);
144 load_vec(pg, w2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
145 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
146 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
147 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
148 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// Combine components 1 with 3 and 2 with 4; w3*/w4* are not used afterwards.
150 add_vec(pg, w1r, w3r);
151 add_vec(pg, w1i, w3i);
152 add_vec(pg, w2r, w4r);
153 add_vec(pg, w2i, w4i);
// The same combined vector feeds both vt1 and vt3 with coefficient a.
155 axpy_vec(pg, vt1r, a, w1r);
156 axpy_vec(pg, vt1i, a, w1i);
157 axpy_vec(pg, vt3r, a, w1r);
158 axpy_vec(pg, vt3i, a, w1i);
// Likewise for vt2 and vt4.
160 axpy_vec(pg, vt2r, a, w2r);
161 axpy_vec(pg, vt2i, a, w2i);
162 axpy_vec(pg, vt4r, a, w2r);
163 axpy_vec(pg, vt4i, a, w2i);
// Predicated vector kernel; its name, leading parameters and the declaration
// of w3r..w4i are not visible in this excerpt.
// Visible behaviour: loads vt1*/vt2* directly from w at IDn [+ 1] + off_up
// for n = 1, 2 and w3*/w4* for n = 3, 4, then applies add_vec followed by
// scal_vec with a to vt1* and vt2*.
// NOTE(review): presumably add_vec(pg, x, y) is x += y and scal_vec(pg, x, a)
// is x *= a, giving vt1 = a * (w1 + w3) and vt2 = a * (w2 + w4) -- confirm.
// Original lines between the vt1 and vt2 groups and after the vt2 group are
// missing here, so how vt3*/vt4* are produced cannot be seen.
167 template<
typename REALTYPE>
172 REALTYPE a, REALTYPE *w,
int is,
int ic)
// Offset into w: ic is strided by 2 * ND and is by NVCD.
// Presumably ic is a colour index and is a 5th-dimension index -- TODO confirm.
175 int off_up = 2 *
ND * ic +
NVCD * is;
// Components 1 and 2 are loaded straight into the output registers.
178 load_vec(pg, vt1r, &w[
VLEN * (
ID1 + off_up)]);
179 load_vec(pg, vt1i, &w[
VLEN * (
ID1 + 1 + off_up)]);
180 load_vec(pg, vt2r, &w[
VLEN * (
ID2 + off_up)]);
181 load_vec(pg, vt2i, &w[
VLEN * (
ID2 + 1 + off_up)]);
// Components 3 and 4 go to temporaries.
183 load_vec(pg, w3r, &w[
VLEN * (
ID3 + off_up)]);
184 load_vec(pg, w3i, &w[
VLEN * (
ID3 + 1 + off_up)]);
185 load_vec(pg, w4r, &w[
VLEN * (
ID4 + off_up)]);
186 load_vec(pg, w4i, &w[
VLEN * (
ID4 + 1 + off_up)]);
// vt1: combine with w3, then scale by a.
188 add_vec(pg, vt1r, w3r);
189 add_vec(pg, vt1i, w3i);
190 scal_vec(pg, vt1r, a);
191 scal_vec(pg, vt1i, a);
// vt2: combine with w4, then scale by a.
196 add_vec(pg, vt2r, w4r);
197 add_vec(pg, vt2i, w4i);
198 scal_vec(pg, vt2r, a);
199 scal_vec(pg, vt2i, a);
// Produces two output vectors at element offset VLEN * index from the same
// pair of inputs (wt, zt): one stored to v using coefficients (a1, a2) and
// one stored to y using coefficients (-0.5 * b1, -0.5 * b2).
// wt is loaded from w at VLEN * index; where zt comes from is not visible in
// this excerpt (the end of the parameter list and the local declarations are
// missing).
// NOTE(review): presumably set_vec(pg, x, a, w) is x = a * w and
// axpy_vec(pg, x, a, z) is x += a * z, i.e. v = a1 * w + a2 * z and
// y = -0.5 * (b1 * w + b2 * z) -- confirm against the helper definitions.
206 template<
typename REALTYPE>
207 inline void dw_5dir_axpy(
svbool_t pg, REALTYPE *v,
208 REALTYPE *y, REALTYPE *w,
209 REALTYPE a1, REALTYPE a2,
210 REALTYPE b1, REALTYPE b2,
216 load_vec(pg, wt, &w[
VLEN * index]);
// First output: combination with (a1, a2), written to v.
217 set_vec(pg, vt, a1, wt);
218 axpy_vec(pg, vt, a2, zt);
219 save_vec(pg, &v[
VLEN * index], vt);
// Second output: combination with (-0.5 * b1, -0.5 * b2), written to y.
// The -0.5 literal is a double; it is multiplied with REALTYPE b1/b2 before
// being passed to the helper.
221 set_vec(pg, yt, -0.5 * b1, wt);
222 axpy_vec(pg, yt, -0.5 * b2, zt);
223 save_vec(pg, &y[
VLEN * index], yt);
// Fills vt1*..vt4* from w with the component pairs swapped (vt1 <- ID3,
// vt2 <- ID4, vt3 <- ID1, vt4 <- ID2), then for each n = 1..4 loads ytn*
// from y at IDn [+ 1] + index, applies scal_vec with -a1 to vtn* and
// axpy_vec with coefficient a2 and ytn*.
// NOTE(review): presumably scal_vec(pg, x, a) is x *= a and
// axpy_vec(pg, x, a, y) is x += a * y, giving
//   vt1 = -a1 * w[ID3] + a2 * y[ID1],   vt2 = -a1 * w[ID4] + a2 * y[ID2],
//   vt3 = -a1 * w[ID1] + a2 * y[ID3],   vt4 = -a1 * w[ID2] + a2 * y[ID4]
// -- confirm against the helper definitions.
// Part of the parameter list and the declarations of vt*/yt* are not visible
// in this excerpt; no store of the results appears in the visible lines.
227 template<
typename REALTYPE>
228 inline void dw_5dir_dag(
svbool_t pg,
233 REALTYPE *w, REALTYPE *y,
234 REALTYPE a1, REALTYPE a2,
int index)
// Swapped load: outputs 1/2 take w components 3/4 and outputs 3/4 take 1/2.
236 load_vec(pg, vt1r, &w[
VLEN * (
ID3 + index)]);
237 load_vec(pg, vt1i, &w[
VLEN * (
ID3 + 1 + index)]);
238 load_vec(pg, vt2r, &w[
VLEN * (
ID4 + index)]);
239 load_vec(pg, vt2i, &w[
VLEN * (
ID4 + 1 + index)]);
240 load_vec(pg, vt3r, &w[
VLEN * (
ID1 + index)]);
241 load_vec(pg, vt3i, &w[
VLEN * (
ID1 + 1 + index)]);
242 load_vec(pg, vt4r, &w[
VLEN * (
ID2 + index)]);
243 load_vec(pg, vt4i, &w[
VLEN * (
ID2 + 1 + index)]);
// Component 1: y is read un-swapped (ID1).
246 load_vec(pg, yt1r, &y[
VLEN * (
ID1 + index)]);
247 load_vec(pg, yt1i, &y[
VLEN * (
ID1 + 1 + index)]);
248 scal_vec(pg, vt1r, -a1);
249 scal_vec(pg, vt1i, -a1);
250 axpy_vec(pg, vt1r, a2, yt1r);
251 axpy_vec(pg, vt1i, a2, yt1i);
// Component 2.
254 load_vec(pg, yt2r, &y[
VLEN * (
ID2 + index)]);
255 load_vec(pg, yt2i, &y[
VLEN * (
ID2 + 1 + index)]);
256 scal_vec(pg, vt2r, -a1);
257 scal_vec(pg, vt2i, -a1);
258 axpy_vec(pg, vt2r, a2, yt2r);
259 axpy_vec(pg, vt2i, a2, yt2i);
// Component 3.
262 load_vec(pg, yt3r, &y[
VLEN * (
ID3 + index)]);
263 load_vec(pg, yt3i, &y[
VLEN * (
ID3 + 1 + index)]);
264 scal_vec(pg, vt3r, -a1);
265 scal_vec(pg, vt3i, -a1);
266 axpy_vec(pg, vt3r, a2, yt3r);
267 axpy_vec(pg, vt3i, a2, yt3i);
// Component 4.
270 load_vec(pg, yt4r, &y[
VLEN * (
ID4 + index)]);
271 load_vec(pg, yt4i, &y[
VLEN * (
ID4 + 1 + index)]);
272 scal_vec(pg, vt4r, -a1);
273 scal_vec(pg, vt4i, -a1);
274 axpy_vec(pg, vt4r, a2, yt4r);
275 axpy_vec(pg, vt4i, a2, yt4i);
// Register-level overload taking an SVE predicate. Visible behaviour: forms
// yt1 = xt1 - xt3 and yt2 = xt2 - xt4 (real and imaginary registers
// separately) with svsub_m, then passes them to axpy_vec with coefficient a
// for vt1*/vt2* and coefficient -a for vt3*/vt4*.
// NOTE(review): presumably axpy_vec(pg, y, a, x) is y += a * x, making this
// vt1 += a*(x1 - x3), vt3 -= a*(x1 - x3), and likewise for 2/4 -- confirm.
// The remaining parameters and the local declarations are not visible in
// this excerpt.
279 template<
typename REALTYPE>
280 inline void add_aPp5_dirac_vec(
svbool_t pg,
// svsub_m is the merging-predicated form: per the Arm ACLE, lanes inactive
// in pg take the value of the first operand (xt1*/xt2*).
292 yt1r = svsub_m(pg, xt1r, xt3r);
293 yt1i = svsub_m(pg, xt1i, xt3i);
294 yt2r = svsub_m(pg, xt2r, xt4r);
295 yt2i = svsub_m(pg, xt2i, xt4i);
// Components 1/2 receive the differences with +a ...
296 axpy_vec(pg, vt1r, a, yt1r);
297 axpy_vec(pg, vt1i, a, yt1i);
298 axpy_vec(pg, vt2r, a, yt2r);
299 axpy_vec(pg, vt2i, a, yt2i);
// ... and components 3/4 receive the same differences with -a.
300 axpy_vec(pg, vt3r, -a, yt1r);
301 axpy_vec(pg, vt3i, -a, yt1i);
302 axpy_vec(pg, vt4r, -a, yt2r);
303 axpy_vec(pg, vt4i, -a, yt2i);
// Register-level overload taking an SVE predicate. Visible behaviour: sets
// vt1 = xt1 - xt3 and vt2 = xt2 - xt4 (real and imaginary registers
// separately) with svsub_m, then applies scal_vec with a to vt1*/vt2* and
// with -a to vt3*/vt4*.
// NOTE(review): the original lines between the subtractions and the scaling
// are missing from this excerpt, so how vt3*/vt4* are initialised cannot be
// seen here (presumably copied from vt1*/vt2*) -- confirm in the full file.
// NOTE(review): presumably scal_vec(pg, x, a) is x *= a -- confirm.
307 template<
typename REALTYPE>
308 inline void set_aPp5_dirac_vec(
svbool_t pg,
// svsub_m is the merging-predicated form: per the Arm ACLE, lanes inactive
// in pg take the value of the first operand (xt1*/xt2*).
319 vt1r = svsub_m(pg, xt1r, xt3r);
320 vt1i = svsub_m(pg, xt1i, xt3i);
321 vt2r = svsub_m(pg, xt2r, xt4r);
322 vt2i = svsub_m(pg, xt2i, xt4i);
// Components 1/2 are scaled by +a, components 3/4 by -a.
328 scal_vec(pg, vt1r, a);
329 scal_vec(pg, vt1i, a);
330 scal_vec(pg, vt2r, a);
331 scal_vec(pg, vt2i, a);
332 scal_vec(pg, vt3r, -a);
333 scal_vec(pg, vt3i, -a);
334 scal_vec(pg, vt4r, -a);
335 scal_vec(pg, vt4i, -a);
// Register-level overload taking an SVE predicate. Visible behaviour: forms
// yt1 = xt1 + xt3 and yt2 = xt2 + xt4 (real and imaginary registers
// separately) with svadd_m, then passes them to axpy_vec with coefficient a
// for all of vt1*..vt4*.
// NOTE(review): presumably axpy_vec(pg, y, a, x) is y += a * x, making this
// vt1 += a*(x1 + x3), vt3 += a*(x1 + x3), and likewise for 2/4 -- confirm.
// The remaining parameters and the local declarations are not visible in
// this excerpt.
339 template<
typename REALTYPE>
340 inline void add_aPm5_dirac_vec(
svbool_t pg,
// svadd_m is the merging-predicated form: per the Arm ACLE, lanes inactive
// in pg take the value of the first operand (xt1*/xt2*).
352 yt1r = svadd_m(pg, xt1r, xt3r);
353 yt1i = svadd_m(pg, xt1i, xt3i);
354 yt2r = svadd_m(pg, xt2r, xt4r);
355 yt2i = svadd_m(pg, xt2i, xt4i);
// Components 1/2 receive the sums with coefficient a ...
356 axpy_vec(pg, vt1r, a, yt1r);
357 axpy_vec(pg, vt1i, a, yt1i);
358 axpy_vec(pg, vt2r, a, yt2r);
359 axpy_vec(pg, vt2i, a, yt2i);
// ... and components 3/4 receive the same sums with the same coefficient.
360 axpy_vec(pg, vt3r, a, yt1r);
361 axpy_vec(pg, vt3i, a, yt1i);
362 axpy_vec(pg, vt4r, a, yt2r);
363 axpy_vec(pg, vt4i, a, yt2i);
// Register-level overload taking an SVE predicate. Visible behaviour: sets
// vt1 = xt1 + xt3 and vt2 = xt2 + xt4 (real and imaginary registers
// separately) with svadd_m, then applies scal_vec with a to vt1*/vt2*.
// NOTE(review): the excerpt ends here; any handling of vt3*/vt4* and the end
// of the function are not visible -- check the full file.
// NOTE(review): presumably scal_vec(pg, x, a) is x *= a -- confirm.
367 template<
typename REALTYPE>
368 inline void set_aPm5_dirac_vec(
svbool_t pg,
// svadd_m is the merging-predicated form: per the Arm ACLE, lanes inactive
// in pg take the value of the first operand (xt1*/xt2*).
379 vt1r = svadd_m(pg, xt1r, xt3r);
380 vt1i = svadd_m(pg, xt1i, xt3i);
381 vt2r = svadd_m(pg, xt2r, xt4r);
382 vt2i = svadd_m(pg, xt2i, xt4i);
// Scale the sums by a.
383 scal_vec(pg, vt1r, a);
384 scal_vec(pg, vt1i, a);
385 scal_vec(pg, vt2r, a);
386 scal_vec(pg, vt2i, a);