10 #ifndef QXS_AFIELD_DD_INC_INCLUDED
11 #define QXS_AFIELD_DD_INC_INCLUDED
18 #include "lib_alt_QXS/inline/afield_th-inc.h"
// NOTE(review): garbled extraction — the original file's own line numbers
// ("23", "47", ...) are fused into the text and several source lines are
// missing here (function names, declarations of pg/kx/ky/kz/kt/Nin/Nex/Nstv,
// the vp/wp pointers, vector temporaries, closing braces). Code is left
// byte-identical; only comments are added.
//
// Fragment 1: declaration of a two-field block reduction overload
// (name missing from this view — presumably the all-parity variant).
23 template<
typename INDEX,
typename AFIELD>
25 const AFIELD& w,
const INDEX& block_index)
// Fragment 2: even/odd variant. Computes, per coarse block of parity `ieo`,
// the complex dot product with the first field conjugated:
//   out[block] = sum_sites( (vr*wr + vi*wi) + i*(vr*wi - vi*wr) )
// i.e. conj(v) . w accumulated over the block, reduced across SIMD lanes.
32 template<
typename INDEX,
typename AFIELD>
35 const int ieo,
const INDEX& block_index)
// Fine-lattice extents in units of SIMD vectors; assumes the x/y extents
// are divisible by VLENX/VLENY — TODO confirm against INDEX's contract.
47 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
48 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
49 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
51 int Nblock = block_index.coarse_nvol();
52 int NBx = block_index.coarse_lattice_size(0);
53 int NBy = block_index.coarse_lattice_size(1);
54 int NBz = block_index.coarse_lattice_size(2);
55 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents, again in SIMD-vector units.
57 int Bsize = block_index.block_nvol() /
VLEN;
58 int Bxv = block_index.block_size(0) /
VLENX;
59 int Byv = block_index.block_size(1) /
VLENY;
60 int Bz = block_index.block_size(2);
61 int Bt = block_index.block_size(3);
// Partition the Nblock coarse blocks over threads: this thread works [is, ns).
64 set_threadtask(ith, nth, is, ns, Nblock);
// Skip blocks of the opposite parity to `ieo`.
66 int ieo_skip = 1 - ieo;
67 for (
int block = is; block < ns; ++block) {
68 if (block_index.block_eo(block) == ieo_skip) {
// Decompose the linear block index into 4D coarse coordinates.
73 int ibx = block % NBx;
74 int iby = (block / NBx) % NBy;
75 int ibz = (block / (NBx * NBy)) % NBz;
76 int ibt = block / (NBx * NBy * NBz);
// Per-block accumulators (re/im), zeroed before the site loop.
80 set_vec(pg, ytr, 0.0);
81 set_vec(pg, yti, 0.0);
83 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords kx/ky/kz/kt (declarations missing in this view)
// combined with block offsets give the fine-lattice site index.
85 int ix = kx + Bxv * ibx;
86 int kyzt = bsite / Bxv;
88 int iy = ky + Byv * iby;
91 int iz = kz + Bz * ibz;
93 int it = kt + Bt * ibt;
94 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
// Per-site partial sums, zeroed, then folded into ytr/yti below —
// presumably to shorten dependency chains; verify against original.
98 set_vec(pg, xtr, 0.0);
99 set_vec(pg, xti, 0.0);
101 for (
int ex = 0; ex < Nex; ++ex) {
102 for (
int in2 = 0; in2 < Nin / 2; ++in2) {
// inr/ini are the interleaved real/imag component indices (inr = 2*in2).
104 int ini = 2 * in2 + 1;
107 load_vec(pg, vtr, &vp[
VLEN * (inr + Nin * (site + Nstv * ex))]);
108 load_vec(pg, vti, &vp[
VLEN * (ini + Nin * (site + Nstv * ex))]);
109 load_vec(pg, wtr, &wp[
VLEN * (inr + Nin * (site + Nstv * ex))]);
110 load_vec(pg, wti, &wp[
VLEN * (ini + Nin * (site + Nstv * ex))]);
// re += vr*wr + vi*wi ; im += vr*wi - vi*wr  (conjugate on v).
111 add_dot_vec(pg, xtr, vtr, wtr);
112 add_dot_vec(pg, xtr, vti, wti);
113 add_dot_vec(pg, xti, vtr, wti);
114 sub_dot_vec(pg, xti, vti, wtr);
118 add_vec(pg, ytr, xtr);
119 add_vec(pg, yti, xti);
// Horizontal reduction across SIMD lanes, then store the block result.
123 reduce_vec(pg, atr, ytr);
124 reduce_vec(pg, ati, yti);
125 out[block] = cmplx(atr, ati);
// NOTE(review): extraction fragment — function names and several lines
// (kz/kt/ky declarations, pg, vt/xt/yt temporaries, vp, closing braces,
// and the final store of the reduced value) are missing. Code is left
// byte-identical; only comments are added.
//
// Declaration of the all-parity squared-norm overload (name not visible).
131 template<
typename INDEX,
typename AFIELD>
133 const INDEX& block_index)
// Even/odd variant: per coarse block of parity `ieo`, accumulates
// sum(vt*vt) over all components/sites of the block (squared norm),
// then lane-reduces it into scalar `a`.
140 template<
typename INDEX,
typename AFIELD>
142 const AFIELD& v,
const int ieo,
143 const INDEX& block_index)
// Fine-lattice extents in SIMD-vector units (x/y assumed divisible by
// VLENX/VLENY — TODO confirm).
153 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
154 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
155 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
157 int Nblock = block_index.coarse_nvol();
158 int NBx = block_index.coarse_lattice_size(0);
159 int NBy = block_index.coarse_lattice_size(1);
160 int NBz = block_index.coarse_lattice_size(2);
161 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents in SIMD-vector units.
163 int Bsize = block_index.block_nvol() /
VLEN;
164 int Bxv = block_index.block_size(0) /
VLENX;
165 int Byv = block_index.block_size(1) /
VLENY;
166 int Bz = block_index.block_size(2);
167 int Bt = block_index.block_size(3);
// Thread-local block range [is, ns).
169 int ith, nth, is, ns;
170 set_threadtask(ith, nth, is, ns, Nblock);
171 int ieo_skip = 1 - ieo;
173 for (
int block = is; block < ns; ++block) {
// Skip blocks of the opposite parity.
174 if (block_index.block_eo(block) == ieo_skip) {
// Linear block index -> 4D coarse coordinates.
179 int ibx = block % NBx;
180 int iby = (block / NBx) % NBy;
181 int ibz = (block / (NBx * NBy)) % NBz;
182 int ibt = block / (NBx * NBy * NBz);
// Per-block accumulator.
186 set_vec(pg, yt, 0.0);
188 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords (ky/kz/kt declarations missing from this view)
// plus block offsets give the fine-lattice site index.
189 int kx = bsite % Bxv;
190 int ix = kx + Bxv * ibx;
191 int kyzt = bsite / Bxv;
193 int iy = ky + Byv * iby;
194 int kzt = kyzt / Byv;
196 int iz = kz + Bz * ibz;
198 int it = kt + Bt * ibt;
199 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
// Per-site partial sum.
202 set_vec(pg, xt, 0.0);
204 for (
int ex = 0; ex < Nex; ++ex) {
205 for (
int in = 0; in < Nin; ++in) {
206 load_vec(pg, vt, &vp[
VLEN * (in + Nin * (site + Nstv * ex))]);
// xt += vt * vt  (squared magnitude of each real component).
207 add_dot_vec(pg, xt, vt, vt);
// Lane reduction; the store of `a` into the output is not visible here.
213 reduce_vec(pg, a, yt);
// NOTE(review): extraction fragment — function names, the scaling factor
// `at` source, ky/kz/kt declarations, pg, vp, and closing braces are missing.
// Code is left byte-identical; only comments are added.
//
// Declaration of the all-parity block-scaling overload (name not visible).
220 template<
typename INDEX,
typename AFIELD>
222 const INDEX& block_index)
// Even/odd variant: scales, in place, every component of every site in
// each coarse block of parity `ieo` by a real factor `at`
// (where `at` comes from is not visible in this view — TODO confirm,
// presumably a per-block coefficient).
229 template<
typename INDEX,
typename AFIELD>
231 const int ieo,
const INDEX& block_index)
// Fine-lattice extents in SIMD-vector units.
241 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
242 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
243 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
245 int Nblock = block_index.coarse_nvol();
246 int NBx = block_index.coarse_lattice_size(0);
247 int NBy = block_index.coarse_lattice_size(1);
248 int NBz = block_index.coarse_lattice_size(2);
249 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents in SIMD-vector units.
251 int Bsize = block_index.block_nvol() /
VLEN;
252 int Bxv = block_index.block_size(0) /
VLENX;
253 int Byv = block_index.block_size(1) /
VLENY;
254 int Bz = block_index.block_size(2);
255 int Bt = block_index.block_size(3);
// Thread-local block range [is, ns).
257 int ith, nth, is, ns;
258 set_threadtask(ith, nth, is, ns, Nblock);
259 int ieo_skip = 1 - ieo;
263 for (
int block = is; block < ns; ++block) {
// Skip opposite-parity blocks.
264 if (block_index.block_eo(block) == ieo_skip)
continue;
// Linear block index -> 4D coarse coordinates.
265 int ibx = block % NBx;
266 int iby = (block / NBx) % NBy;
267 int ibz = (block / (NBx * NBy)) % NBz;
268 int ibt = block / (NBx * NBy * NBz);
271 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords (ky/kz/kt declarations missing from this view).
272 int kx = bsite % Bxv;
273 int ix = kx + Bxv * ibx;
274 int kyzt = bsite / Bxv;
276 int iy = ky + Byv * iby;
277 int kzt = kyzt / Byv;
279 int iz = kz + Bz * ibz;
281 int it = kt + Bt * ibt;
282 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
284 for (
int ex = 0; ex < Nex; ++ex) {
285 for (
int in = 0; in < Nin; ++in) {
// Read-modify-write: vt = at * vt, stored back to the same address.
287 load_vec(pg, vt, &vp[
VLEN * (in + Nin * (site + Nstv * ex))]);
288 scal_vec(pg, vt, at);
289 save_vec(pg, &vp[
VLEN * (in + Nin * (site + Nstv * ex))], vt);
// NOTE(review): extraction fragment — function names, the source of the
// complex factor (atr, ati), ky/kz/kt declarations, pg, vp, and closing
// braces are missing. Code is left byte-identical; only comments are added.
//
// Declaration of the all-parity complex block-scaling overload (name not
// visible in this view).
298 template<
typename INDEX,
typename AFIELD>
300 const INDEX& block_index)
// Even/odd variant: multiplies, in place, every complex component of each
// coarse block of parity `ieo` by a complex factor a = atr + i*ati:
//   new_re = atr*re - ati*im ; new_im = atr*im + ati*re
// (where atr/ati come from is not visible — presumably per-block; confirm.)
307 template<
typename INDEX,
typename AFIELD>
309 const int ieo,
const INDEX& block_index)
// Fine-lattice extents in SIMD-vector units.
320 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
321 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
322 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
324 int Nblock = block_index.coarse_nvol();
325 int NBx = block_index.coarse_lattice_size(0);
326 int NBy = block_index.coarse_lattice_size(1);
327 int NBz = block_index.coarse_lattice_size(2);
328 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents in SIMD-vector units.
330 int Bsize = block_index.block_nvol() /
VLEN;
331 int Bxv = block_index.block_size(0) /
VLENX;
332 int Byv = block_index.block_size(1) /
VLENY;
333 int Bz = block_index.block_size(2);
334 int Bt = block_index.block_size(3);
// Thread-local block range [is, ns).
336 int ith, nth, is, ns;
337 set_threadtask(ith, nth, is, ns, Nblock);
338 int ieo_skip = 1 - ieo;
342 for (
int block = is; block < ns; ++block) {
// Skip opposite-parity blocks.
343 if (block_index.block_eo(block) == ieo_skip)
continue;
// Linear block index -> 4D coarse coordinates.
344 int ibx = block % NBx;
345 int iby = (block / NBx) % NBy;
346 int ibz = (block / (NBx * NBy)) % NBz;
347 int ibt = block / (NBx * NBy * NBz);
349 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords (ky/kz/kt declarations missing from this view).
350 int kx = bsite % Bxv;
351 int ix = kx + Bxv * ibx;
352 int kyzt = bsite / Bxv;
354 int iy = ky + Byv * iby;
355 int kzt = kyzt / Byv;
357 int iz = kz + Bz * ibz;
359 int it = kt + Bt * ibt;
360 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
366 for (
int ex = 0; ex < Nex; ++ex) {
367 for (
int in2 = 0; in2 < Nin / 2; ++in2) {
// inr/ini: interleaved real/imag component indices (inr = 2*in2).
369 int ini = 2 * in2 + 1;
371 load_vec(pg, wtr, &vp[
VLEN * (inr + Nin * (site + Nstv * ex))]);
372 load_vec(pg, wti, &vp[
VLEN * (ini + Nin * (site + Nstv * ex))]);
// Complex multiply: vtr = atr*wtr - ati*wti ; vti = atr*wti + ati*wtr.
373 set_vec(pg, vtr, atr, wtr);
374 axpy_vec(pg, vtr, -ati, wti);
375 set_vec(pg, vti, atr, wti);
376 axpy_vec(pg, vti, ati, wtr);
// Store back in place.
377 save_vec(pg, &vp[
VLEN * (inr + Nin * (site + Nstv * ex))], vtr);
378 save_vec(pg, &vp[
VLEN * (ini + Nin * (site + Nstv * ex))], vti);
// NOTE(review): extraction fragment — function names, the `fac`/`a`
// parameter declarations, ky/kz/kt declarations, real_t/pg/vp/wp, and
// closing braces are missing. Code is left byte-identical; only comments
// are added.
//
// Declaration of the all-parity per-block axpy overload (name not visible).
387 template<
typename INDEX,
typename AFIELD>
390 const INDEX& block_index)
// Even/odd variant: for each coarse block of parity `ieo`, performs the
// in-place update v += (fac * a[block]) * w over all components of the
// block, with a real per-block coefficient a[block] and global scale fac.
397 template<
typename INDEX,
typename AFIELD>
399 const AFIELD& w,
const int ieo,
401 const INDEX& block_index)
// Fine-lattice extents in SIMD-vector units.
412 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
413 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
414 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
416 int Nblock = block_index.coarse_nvol();
417 int NBx = block_index.coarse_lattice_size(0);
418 int NBy = block_index.coarse_lattice_size(1);
419 int NBz = block_index.coarse_lattice_size(2);
420 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents in SIMD-vector units.
422 int Bsize = block_index.block_nvol() /
VLEN;
423 int Bxv = block_index.block_size(0) /
VLENX;
424 int Byv = block_index.block_size(1) /
VLENY;
425 int Bz = block_index.block_size(2);
426 int Bt = block_index.block_size(3);
// Thread-local block range [is, ns).
430 int ith, nth, is, ns;
431 set_threadtask(ith, nth, is, ns, Nblock);
432 int ieo_skip = 1 - ieo;
434 for (
int block = is; block < ns; ++block) {
// Skip opposite-parity blocks.
435 if (block_index.block_eo(block) == ieo_skip)
continue;
// Linear block index -> 4D coarse coordinates.
436 int ibx = block % NBx;
437 int iby = (block / NBx) % NBy;
438 int ibz = (block / (NBx * NBy)) % NBz;
439 int ibt = block / (NBx * NBy * NBz);
441 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords (ky/kz/kt declarations missing from this view).
442 int kx = bsite % Bxv;
443 int ix = kx + Bxv * ibx;
444 int kyzt = bsite / Bxv;
446 int iy = ky + Byv * iby;
447 int kzt = kyzt / Byv;
449 int iz = kz + Bz * ibz;
451 int it = kt + Bt * ibt;
452 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
// Per-block real coefficient (NOTE: recomputed per bsite — hoisting it
// above the bsite loop would be loop-invariant; flagged, not changed).
454 real_t at = fac * a[block];
456 for (
int ex = 0; ex < Nex; ++ex) {
457 for (
int in = 0; in < Nin; ++in) {
// v := v + at * w, stored back in place.
459 load_vec(pg, wt, &wp[
VLEN * (in + Nin * (site + Nstv * ex))]);
460 load_vec(pg, vt, &vp[
VLEN * (in + Nin * (site + Nstv * ex))]);
461 axpy_vec(pg, vt, at, wt);
462 save_vec(pg, &vp[
VLEN * (in + Nin * (site + Nstv * ex))], vt);
// NOTE(review): extraction fragment, truncated at the end of this view —
// function names, the source of the complex coefficient `at`, the
// `fac` parameter, ky/kz/kt declarations, real_t/pg/vp/wp, and all closing
// braces are missing. Code is left byte-identical; only comments are added.
//
// Declaration of the all-parity complex per-block axpy overload
// (name not visible in this view).
471 template<
typename INDEX,
typename AFIELD>
474 const INDEX& block_index)
// Even/odd variant: for each coarse block of parity `ieo`, performs the
// in-place complex update v += (fac * at) * w component-wise:
//   vr += atr*wr - ati*wi ; vi += atr*wi + ati*wr
// with atr = fac*real(at), ati = fac*imag(at).
481 template<
typename INDEX,
typename AFIELD>
483 const AFIELD& w,
const int ieo,
485 const INDEX& block_index)
// Fine-lattice extents in SIMD-vector units.
497 int Nxv = block_index.fine_lattice_size(0) /
VLENX;
498 int Nyv = block_index.fine_lattice_size(1) /
VLENY;
499 int Nz = block_index.fine_lattice_size(2);
// Coarse (block) lattice geometry.
501 int Nblock = block_index.coarse_nvol();
502 int NBx = block_index.coarse_lattice_size(0);
503 int NBy = block_index.coarse_lattice_size(1);
504 int NBz = block_index.coarse_lattice_size(2);
505 int NBt = block_index.coarse_lattice_size(3);
// Block volume / extents in SIMD-vector units.
507 int Bsize = block_index.block_nvol() /
VLEN;
508 int Bxv = block_index.block_size(0) /
VLENX;
509 int Byv = block_index.block_size(1) /
VLENY;
510 int Bz = block_index.block_size(2);
511 int Bt = block_index.block_size(3);
// Thread-local block range [is, ns).
515 int ith, nth, is, ns;
516 set_threadtask(ith, nth, is, ns, Nblock);
517 int ieo_skip = 1 - ieo;
519 for (
int block = is; block < ns; ++block) {
// Skip opposite-parity blocks.
520 if (block_index.block_eo(block) == ieo_skip)
continue;
// Linear block index -> 4D coarse coordinates.
521 int ibx = block % NBx;
522 int iby = (block / NBx) % NBy;
523 int ibz = (block / (NBx * NBy)) % NBz;
524 int ibt = block / (NBx * NBy * NBz);
526 for (
int bsite = 0; bsite < Bsize; ++bsite) {
// Intra-block coords (ky/kz/kt declarations missing from this view).
527 int kx = bsite % Bxv;
528 int ix = kx + Bxv * ibx;
529 int kyzt = bsite / Bxv;
531 int iy = ky + Byv * iby;
532 int kzt = kyzt / Byv;
534 int iz = kz + Bz * ibz;
536 int it = kt + Bt * ibt;
537 int site = ix + Nxv * (iy + Nyv * (iz + Nz * it));
// Scaled real/imag parts of the coefficient (NOTE: loop-invariant with
// respect to bsite if `at` is per-block — hoisting flagged, not changed;
// the definition of `at` is not visible here).
540 real_t atr = fac * real(at);
541 real_t ati = fac * imag(at);
543 for (
int ex = 0; ex < Nex; ++ex) {
544 for (
int in2 = 0; in2 < Nin / 2; ++in2) {
// inr/ini: interleaved real/imag component indices (inr = 2*in2).
546 int ini = 2 * in2 + 1;
548 load_vec(pg, wtr, &wp[
VLEN * (inr + Nin * (site + Nstv * ex))]);
549 load_vec(pg, wti, &wp[
VLEN * (ini + Nin * (site + Nstv * ex))]);
550 load_vec(pg, vtr, &vp[
VLEN * (inr + Nin * (site + Nstv * ex))]);
551 load_vec(pg, vti, &vp[
VLEN * (ini + Nin * (site + Nstv * ex))]);
// Complex axpy: vr += atr*wr - ati*wi ; vi += atr*wi + ati*wr.
552 axpy_vec(pg, vtr, atr, wtr);
553 axpy_vec(pg, vtr, -ati, wti);
554 axpy_vec(pg, vti, atr, wti);
555 axpy_vec(pg, vti, ati, wtr);
// Store back in place (view truncates shortly after this point).
556 save_vec(pg, &vp[
VLEN * (inr + Nin * (site + Nstv * ex))], vtr);
557 save_vec(pg, &vp[
VLEN * (ini + Nin * (site + Nstv * ex))], vti);