10 #ifndef MULT_COARSE_QXS_INCLUDED
11 #define MULT_COARSE_QXS_INCLUDED
37 int ncol,
const int *do_comm)
// NOTE(review): the function name and its leading parameters are outside the
// visible lines; only the trailing `ncol` and `do_comm` parameters are shown.
// The visible body splits a combined per-direction task list across threads
// and, for each task, calls the mult_coarse_{x,y,z,t}{p,m}1 kernels, each of
// which is handed a location inside one of the buf1_* buffers.

// Product of the four entries of Nsize.
40 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
// Square of the ncol argument.
47 int Nc2 = ncol * ncol;
// Predicates for the x/y plus and minus directions, filled in by the
// set_predicate_* helpers (their definitions are not visible here).
51 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
52 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
53 set_predicate_xp(pg1_xp, pg2_xp);
54 set_predicate_xm(pg1_xm, pg2_xm);
55 set_predicate_yp(pg1_yp, pg2_yp);
56 set_predicate_ym(pg1_ym, pg2_ym);
// Index vectors for the same four directions, filled in by set_index_*.
57 svint_t svidx_xp, svidx_xm, svidx_yp, svidx_ym;
58 set_index_xp(svidx_xp);
59 set_index_xm(svidx_xm);
60 set_index_yp(svidx_yp);
61 set_index_ym(svidx_ym);
// Number of tasks per direction: the product of the three other extents when
// do_comm[dir] is positive, otherwise zero so the direction is skipped.
64 int taskx = (do_comm[0] > 0) ? (Nyv * Nz * Nt) : 0;
65 int tasky = (do_comm[1] > 0) ? (Nxv * Nz * Nt) : 0;
66 int taskz = (do_comm[2] > 0) ? (Nxv * Nyv * Nt) : 0;
67 int taskt = (do_comm[3] > 0) ? (Nxv * Nyv * Nz) : 0;
// All four directions are concatenated into one task list and divided among
// threads; presumably set_threadtask returns this thread's range in (is, ns)
// -- confirm against its definition.
68 int task_total = taskx + tasky + taskz + taskt;
69 set_threadtask(ith, nth, is, ns, task_total);
// Per-direction sub-ranges: each upper bound is capped at the direction's task
// count and each lower bound is floored at zero.
// NOTE(review): `is`/`ns` are presumably shifted by the previous direction's
// task count between these statements (those lines are not visible) -- confirm.
72 int nsx = (ns > taskx) ? taskx : ns;
75 int isy = (is < 0) ? 0 : is;
76 int nsy = (ns > tasky) ? tasky : ns;
79 int isz = (is < 0) ? 0 : is;
80 int nsz = (ns > taskz) ? taskz : ns;
83 int ist = (is < 0) ? 0 : is;
// NOTE(review): unlike nsx/nsy/nsz, nst is floored at 0 instead of being
// capped at taskt -- confirm this asymmetry is intentional.
84 int nst = (ns < 0) ? 0 : ns;
// ---- x direction ----
86 for (
int sitex = isx; sitex < nsx; ++sitex) {
// Offset of this task's slot inside the x buffers (`iyzt` is defined on a
// line that is not visible here).
88 int ibf =
VLENY * Nvc * iyzt;
// Linear site index from ix and iyzt (`ix` is set on a line not visible here).
93 int site = ix + Nxv * iyzt;
// Predicate/index form of the xp1 call. Note that it is passed the *xm*
// predicate and index vector.
95 mult_coarse_xp1(pg2_xm, svidx_xm,
96 &buf1_xp[ibf], &v1[
VLEN * Nvc * site], Nc);
// Plain form of the same call; presumably the other branch of a preprocessor
// conditional that is not visible here.
98 mult_coarse_xp1(&buf1_xp[ibf], &v1[
VLEN * Nvc * site], Nc);
103 int site = ix + Nxv * iyzt;
// xm1 additionally reads the `u` field at this site, and is passed the *xp*
// predicate and index vector.
105 mult_coarse_xm1(pg2_xp, svidx_xp,
106 &buf1_xm[ibf], &u[
VLEN * Ndf * site],
107 &v1[
VLEN * Nvc * site], Nc);
// Plain form of the xm1 call (see the note on the plain xp1 call).
109 mult_coarse_xm1(&buf1_xm[ibf], &u[
VLEN * Ndf * site],
110 &v1[
VLEN * Nvc * site], Nc);
// ---- y direction ----
115 for (
int sitey = isy; sitey < nsy; sitey++) {
// Split the task index into an x coordinate and a combined z-t index.
117 int ix = sitey % Nxv;
118 int izt = sitey / Nxv;
// Offset of this task's slot inside the y buffers (`ixzt` is defined on a
// line that is not visible here).
119 int ibf =
VLENX * Nvc * ixzt;
// Linear site index (`iy` is set on a line not visible here).
124 int site = ix + Nxv * iy + Nxv * Nyv * izt;
// Same pattern as x: yp1 receives the *ym* predicate and index vector.
126 mult_coarse_yp1(pg2_ym, svidx_ym,
127 &buf1_yp[ibf], &v1[
VLEN * Nvc * site], Nc);
// Plain form of the yp1 call.
129 mult_coarse_yp1(&buf1_yp[ibf], &v1[
VLEN * Nvc * site], Nc);
134 int site = ix + Nxv * iy + Nxv * Nyv * izt;
// ym1 reads `u` at this site and receives the *yp* predicate and index vector.
136 mult_coarse_ym1(pg2_yp, svidx_yp,
137 &buf1_ym[ibf], &u[
VLEN * Ndf * site],
138 &v1[
VLEN * Nvc * site], Nc);
// Plain form of the ym1 call.
140 mult_coarse_ym1(&buf1_ym[ibf], &u[
VLEN * Ndf * site],
141 &v1[
VLEN * Nvc * site], Nc);
// ---- z direction (no predicate/index arguments are used here) ----
146 for (
int sitez = isz; sitez < nsz; sitez++) {
// Split the task index into a combined x-y index and a t coordinate.
148 int ixy = sitez % (Nxv * Nyv);
149 int it = sitez / (Nxv * Nyv);
// Linear site index (`iz` and `ixyt` are set on lines not visible here).
154 int site = ixy + Nxv * Nyv * (iz + Nz * it);
155 mult_coarse_zp1(&buf1_zp[
VLEN * Nvc * ixyt], &v1[
VLEN * Nvc * site], Nc);
159 int site = ixy + Nxv * Nyv * (iz + Nz * it);
// zm1 additionally reads `u` at this site.
160 mult_coarse_zm1(&buf1_zm[
VLEN * Nvc * ixyt],
161 &u[
VLEN * Ndf * site], &v1[
VLEN * Nvc * site], Nc);
// ---- t direction ----
165 for (
int sitet = ist; sitet < nst; sitet++) {
// Linear site index (`ixyz` and `it` are set on lines not visible here).
171 int site = ixyz + Nxv * Nyv * Nz * it;
172 mult_coarse_tp1(&buf1_tp[
VLEN * Nvc * ixyz], &v1[
VLEN * Nvc * site], Nc);
176 int site = ixyz + Nxv * Nyv * Nz * it;
// tm1 additionally reads `u` at this site.
177 mult_coarse_tm1(&buf1_tm[
VLEN * Nvc * ixyz],
178 &u[
VLEN * Ndf * site], &v1[
VLEN * Nvc * site], Nc);
188 const int *Nsize,
int ncol,
189 const int *do_comm,
real_t *work)
// NOTE(review): the function name and its leading parameters are outside the
// visible lines. The visible body iterates over this thread's share of the
// Nstv sites, initialises the output at each site via set_mult_u, and then
// calls one kernel per direction using a neighbour taken from the local
// lattice.

191 int ith, nth, is, ns;
// Product of the four entries of Nsize.
192 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
// Square of the ncol argument.
199 int Nc2 = ncol * ncol;
// Predicates for the x/y plus and minus directions, filled in by the
// set_predicate_* helpers (their definitions are not visible here).
203 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
204 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
205 set_predicate_xp(pg1_xp, pg2_xp);
206 set_predicate_xm(pg1_xm, pg2_xm);
207 set_predicate_yp(pg1_yp, pg2_yp);
208 set_predicate_ym(pg1_ym, pg2_ym);
// Per-site stride used below to index `u` and `c0` (`nv`, the stride used for
// v1/v2, is defined on a line not visible here).
213 int nv2 =
VLEN * Ndf;
// Presumably yields this thread's site range in (is, ns) -- confirm against
// the definition of set_threadtask.
214 set_threadtask(ith, nth, is, ns, Nstv);
216 for (
int site = is; site < ns; ++site) {
// Output location for this site.
217 real_t *out = &v2[nv * site];
// First contribution to `out`, computed from v1 and c0 at the same site.
221 set_mult_u(out, &v1[nv * site],
222 &c0[nv2 * site], Nc);
// Loop over the nv entries of the site (its body is not visible here).
224 for (
int i = 0; i < nv; i++) {
// Combined y-z-t index of the site.
229 int iyzt = site / Nxv;
// x+ : handled here unless the site is on the upper x edge AND do_comm[0] is
// non-zero. When do_comm[0] == 0 the neighbour of the edge site wraps to ix = 0.
235 if ((ix < Nxv - 1) || (do_comm[0] == 0)) {
236 int nei = (ix + 1) + Nxv * iyzt;
237 if (ix == Nxv - 1) nei = 0 + Nxv * iyzt;
239 mult_coarse_xpb(pg1_xp, pg2_xp, out,
241 &v1[nv * site], &v1[nv * nei], Nc, work);
245 &v1[nv * site], &v1[nv * nei], Nc, work);
// x- : handled here unless the site is on the lower x edge AND do_comm[0] is
// non-zero. The modulo wraps ix = 0 to Nxv - 1.
251 if ((ix > 0) || (do_comm[0] == 0)) {
252 int ix2 = (ix - 1 + Nxv) % Nxv;
253 int nei = ix2 + Nxv * iyzt;
// The minus-direction kernel receives `u` at both the site and the neighbour.
255 mult_coarse_xmb(pg1_xm, pg2_xm, out,
256 &u[nv2 * site], &u[nv2 * nei],
257 &v1[nv * site], &v1[nv * nei],
261 &u[nv2 * site], &u[nv2 * nei],
262 &v1[nv * site], &v1[nv * nei],
// Combined z-t index of the site.
270 int izt = iyzt / Nyv;
// y+ : same edge rule as x+, with wrap-around via the modulo.
275 if ((iy < Nyv - 1) || (do_comm[1] == 0)) {
276 int iy2 = (iy + 1) % Nyv;
277 int nei = ix + Nxv * (iy2 + Nyv * izt);
279 mult_coarse_ypb(pg1_yp, pg2_yp, out,
281 &v1[nv * site], &v1[nv * nei],
286 &v1[nv * site], &v1[nv * nei],
// y- : same edge rule as x-.
// NOTE(review): this condition indexes do_comm[idir] whereas the sibling
// branches use literal indices (do_comm[1] for y+); `idir` is not defined in
// the visible lines -- confirm it equals 1 at this point.
292 if ((iy != 0) || (do_comm[idir] == 0)) {
293 int iy2 = (iy - 1 + Nyv) % Nyv;
294 int nei = ix + Nxv * (iy2 + Nyv * izt);
296 mult_coarse_ymb(pg1_ym, pg2_ym, out,
297 &u[nv2 * site], &u[nv2 * nei],
298 &v1[nv * site], &v1[nv * nei],
302 &u[nv2 * site], &u[nv2 * nei],
303 &v1[nv * site], &v1[nv * nei],
// Combined x-y index and its extent, used for the z and t neighbours.
310 int ixy = ix + Nxv * iy;
313 int Nxyv = Nxv * Nyv;
// z+ : skipped only on the upper z edge when do_comm[2] is non-zero. The call
// uses `u` at the site and v1 at the neighbour.
319 if ((iz != Nz - 1) || (do_comm[2] == 0)) {
320 int iz2 = (iz + 1) % Nz;
321 int nei = ixy + Nxyv * (iz2 + Nz * it);
323 &u[nv2 * site], &v1[nv * nei], Nc);
// z- : skipped only on the lower z edge when do_comm[2] is non-zero. The call
// uses both `u` and v1 at the neighbour.
327 if ((iz > 0) || (do_comm[2] == 0)) {
328 int iz2 = (iz - 1 + Nz) % Nz;
329 int nei = ixy + Nxyv * (iz2 + Nz * it);
331 &u[nv2 * nei], &v1[nv * nei], Nc);
// Spatial volume and the site's index within its time slice.
336 int Nxyzv = Nxyv * Nz;
337 int ixyz = site - it * Nxyzv;
// t+ : skipped only on the upper t edge when do_comm[3] is non-zero.
343 if ((it < Nt - 1) || (do_comm[3] == 0)) {
344 int it2 = (it + 1) % Nt;
345 int nei = ixyz + Nxyzv * it2;
347 &u[nv2 * site], &v1[nv * nei], Nc);
// t- : skipped only on the lower t edge when do_comm[3] is non-zero.
351 if ((it > 0) || (do_comm[3] == 0)) {
352 int it2 = (it - 1 + Nt) % Nt;
353 int nei = ixyz + Nxyzv * it2;
355 &u[nv2 * nei], &v1[nv * nei], Nc);
369 const int *Nsize,
int ncol,
const int *do_comm,
371 std::vector<int>& list)
// NOTE(review): the function name and its leading parameters are outside the
// visible lines. The visible body loops over the entries of `list` and, for
// each direction with do_comm[dir] == 1, calls the mult_coarse_*2 kernels,
// passing them a location inside the matching buf2_* buffer.

373 int ith, nth, is, ns;
// Product of the four entries of Nsize.
374 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
// Square of the ncol argument.
381 int Nc2 = ncol * ncol;
// Predicates for the x/y plus and minus directions, filled in by the
// set_predicate_* helpers (their definitions are not visible here).
385 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
386 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
387 set_predicate_xp(pg1_xp, pg2_xp);
388 set_predicate_xm(pg1_xm, pg2_xm);
389 set_predicate_yp(pg1_yp, pg2_yp);
390 set_predicate_ym(pg1_ym, pg2_ym);
// Index vectors for the same four directions, filled in by set_index_*.
391 svint_t svidx_xp, svidx_xm, svidx_yp, svidx_ym;
392 set_index_xp(svidx_xp);
393 set_index_xm(svidx_xm);
394 set_index_yp(svidx_yp);
395 set_index_ym(svidx_ym);
// Per-site stride used below to index `u` (`nv`, the stride used for v1/v2 and
// the z/t buffers, is defined on a line not visible here).
399 int nv2 =
VLEN * Ndf;
// NOTE(review): `i` is a signed int compared against list.size(), which is
// unsigned for std::vector -- a signed/unsigned comparison.
// `site` is presumably taken from list[i] on a line not visible here.
401 for (
int i = 0; i < list.size(); i++) {
// Output location for this site.
403 real_t *out = &v2[nv * site];
// Split the site index into an x coordinate and a combined y-z-t index.
405 const int ix = site % Nxv;
406 const int iyzt = site / Nxv;
// ---- x direction ----
// NOTE(review): this routine tests do_comm[dir] == 1, while other code in this
// file tests do_comm[dir] > 0 and do_comm[dir] == 0 -- confirm that do_comm
// only ever holds 0 or 1.
408 if (do_comm[0] == 1) {
// Offset of this site's slot inside the x buffers.
410 int ibf =
VLENY * Nvc * iyzt;
// Base of the `u` data for direction idir inside u0 (`idir` is set on a line
// not visible here).
411 real_t *u = u0 + nv2 * Nstv * idir;
// Predicate/index form of the xp2 call.
415 mult_coarse_xp2(pg1_xp, pg2_xp, svidx_xp,
417 &v1[nv * site], &buf2_xp[ibf], Nc, work);
// Plain form of the same call; presumably the other branch of a preprocessor
// conditional that is not visible here.
419 mult_coarse_xp2(out, &u[nv2 * site],
420 &v1[nv * site], &buf2_xp[ibf], Nc, work);
// xm2: the same two call forms, reading from buf2_xm and without `work`.
427 mult_coarse_xm2(pg1_xm, pg2_xm, svidx_xm,
429 &v1[nv * site], &buf2_xm[ibf], Nc);
431 mult_coarse_xm2(out, &u[nv2 * site],
432 &v1[nv * site], &buf2_xm[ibf], Nc);
// Split the combined y-z-t index into a y coordinate and a combined z-t index.
439 const int iy = iyzt % Nyv;
440 const int izt = iyzt / Nyv;
// ---- y direction ----
442 if (do_comm[1] == 1) {
// Index of this site with the y coordinate dropped, and the matching offset
// inside the y buffers.
444 int ixzt = ix + Nxv * izt;
445 int ibf =
VLENX * Nvc * ixzt;
446 real_t *u = u0 + nv2 * Nstv * idir;
450 mult_coarse_yp2(pg1_yp, pg2_yp, svidx_yp,
453 &v1[nv * site], &buf2_yp[ibf], Nc, work);
457 &v1[nv * site], &buf2_yp[ibf], Nc, work);
464 mult_coarse_ym2(pg1_ym, pg2_ym, svidx_ym,
467 &v1[nv * site], &buf2_ym[ibf], Nc);
471 &v1[nv * site], &buf2_ym[ibf], Nc);
// Remaining coordinates needed for the z and t directions.
478 const int ixy = ix + Nxv * iy;
479 const int iz = izt % Nz;
480 const int it = izt / Nz;
481 const int Nxyv = Nxv * Nyv;
// ---- z direction ----
483 if (do_comm[2] == 1) {
// Index of this site with the z coordinate dropped; the z buffers are indexed
// with stride nv.
485 int ixyt = ixy + Nxyv * it;
486 real_t *u = u0 + nv2 * Nstv * idir;
490 &u[nv2 * site], &buf2_zp[nv * ixyt], Nc);
496 &buf2_zm[nv * ixyt], Nc);
// ---- t direction ----
501 if (do_comm[3] == 1) {
// Index of this site with the t coordinate dropped; the t buffers are indexed
// with stride nv.
503 int ixyz = ixy + Nxyv * iz;
504 real_t *u = u0 + nv2 * Nstv * idir;
508 &u[nv2 * site], &buf2_tp[nv * ixyz], Nc);
514 &buf2_tm[nv * ixyz], Nc);