10 #ifndef MULT_COARSE_QXS_INCLUDED
11 #define MULT_COARSE_QXS_INCLUDED
37 int ncol,
const int *do_comm)
// NOTE(review): the start of this signature and a number of body lines are
// not visible in this excerpt. The comments below describe only what the
// visible statements do; anything about out-of-view helpers is hedged.

// Nstv is the product of the four entries of Nsize.
40 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
// Nc2 is ncol squared.
47 int Nc2 = ncol * ncol;
// SVE predicate pairs for the +x/-x and +y/-y directions, filled in by the
// set_predicate_* helpers (defined elsewhere).
50 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
51 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
52 set_predicate_xp(pg1_xp, pg2_xp);
53 set_predicate_xm(pg1_xm, pg2_xm);
54 set_predicate_yp(pg1_yp, pg2_yp);
55 set_predicate_ym(pg1_ym, pg2_ym);
// Index vectors for the x direction; svidx_xp / svidx_xm are declared on
// lines not shown here.
57 set_index_xp(svidx_xp);
58 set_index_xm(svidx_xm);
// Per-direction task counts: zero when do_comm[d] is not positive, otherwise
// the product of the extents of the other three directions.
60 int taskx = (do_comm[0] > 0) ? (Nyv * Nz * Nt) : 0;
61 int tasky = (do_comm[1] > 0) ? (Nxv * Nz * Nt) : 0;
62 int taskz = (do_comm[2] > 0) ? (Nxv * Nyv * Nt) : 0;
63 int taskt = (do_comm[3] > 0) ? (Nxv * Nyv * Nz) : 0;
64 int task_total = taskx + tasky + taskz + taskt;
// Presumably sets is/ns to the caller's share [is, ns) of task_total tasks --
// set_threadtask is defined elsewhere; confirm.
65 set_threadtask(ith, nth, is, ns, task_total);
// Derive a per-direction range from is/ns. Upper bounds are capped at the
// direction's task count; lower bounds are clamped to zero.
// NOTE(review): the declaration of isx and any adjustment of is/ns between
// the directions are on lines not visible in this excerpt.
68 int nsx = (ns > taskx) ? taskx : ns;
71 int isy = (is < 0) ? 0 : is;
72 int nsy = (ns > tasky) ? tasky : ns;
75 int isz = (is < 0) ? 0 : is;
76 int nsz = (ns > taskz) ? taskz : ns;
79 int ist = (is < 0) ? 0 : is;
// Unlike nsx/nsy/nsz, nst is only clamped from below (negative -> 0).
80 int nst = (ns < 0) ? 0 : ns;
// ---- x direction: one iteration per sitex in [isx, nsx) ----
82 for (
int sitex = isx; sitex < nsx; ++sitex) {
// Offset into buf1_xp / buf1_xm: VLENY * Nvc elements per iyzt
// (iyzt and ix are defined on lines not shown).
84 int ibf =
VLENY * Nvc * iyzt;
89 int site = ix + Nxv * iyzt;
// The xp1 kernel is given the "xm" predicate and index vector, the buf1_xp
// slot at ibf, and the v1 block of this site. set_index_xm is re-invoked
// here although it was already called before the loops.
90 set_index_xm(svidx_xm);
91 mult_coarse_xp1(pg2_xm, svidx_xm,
92 &buf1_xp[ibf], &v1[
VLEN * Nvc * site], Nc);
96 int site = ix + Nxv * iyzt;
// The xm1 kernel is given the "xp" predicate and index vector, the buf1_xm
// slot at ibf, and the u and v1 blocks of this site.
97 set_index_xp(svidx_xp);
98 mult_coarse_xm1(pg2_xp, svidx_xp,
99 &buf1_xm[ibf], &u[
VLEN * Ndf * site],
100 &v1[
VLEN * Nvc * site], Nc);
// ---- y direction: one iteration per sitey in [isy, nsy) ----
104 for (
int sitey = isy; sitey < nsy; sitey++) {
// sitey is split into ix (fastest) and izt.
106 int ix = sitey % Nxv;
107 int izt = sitey / Nxv;
// Offset into buf1_yp / buf1_ym: VLENX * Nvc elements per ixzt
// (ixzt and iy are defined on lines not shown).
108 int ibf =
VLENX * Nvc * ixzt;
113 int site = ix + Nxv * iy + Nxv * Nyv * izt;
// The yp1 kernel is given the "ym" predicate pg2_ym.
114 mult_coarse_yp1(pg2_ym,
115 &buf1_yp[ibf], &v1[
VLEN * Nvc * site], Nc);
119 int site = ix + Nxv * iy + Nxv * Nyv * izt;
// The ym1 kernel is given the "yp" predicate pg2_yp plus the u block.
120 mult_coarse_ym1(pg2_yp,
121 &buf1_ym[ibf], &u[
VLEN * Ndf * site],
122 &v1[
VLEN * Nvc * site], Nc);
// ---- z direction: one iteration per sitez in [isz, nsz) ----
126 for (
int sitez = isz; sitez < nsz; sitez++) {
// sitez is split into ixy (fastest) and it.
128 int ixy = sitez % (Nxv * Nyv);
129 int it = sitez / (Nxv * Nyv);
// iz and ixyt (the buffer slot index) are defined on lines not shown.
134 int site = ixy + Nxv * Nyv * (iz + Nz * it);
135 mult_coarse_zp1(&buf1_zp[
VLEN * Nvc * ixyt], &v1[
VLEN * Nvc * site], Nc);
139 int site = ixy + Nxv * Nyv * (iz + Nz * it);
140 mult_coarse_zm1(&buf1_zm[
VLEN * Nvc * ixyt],
141 &u[
VLEN * Ndf * site], &v1[
VLEN * Nvc * site], Nc);
// ---- t direction: one iteration per sitet in [ist, nst) ----
145 for (
int sitet = ist; sitet < nst; sitet++) {
// ixyz (the buffer slot index) and it are defined on lines not shown.
151 int site = ixyz + Nxv * Nyv * Nz * it;
152 mult_coarse_tp1(&buf1_tp[
VLEN * Nvc * ixyz], &v1[
VLEN * Nvc * site], Nc);
156 int site = ixyz + Nxv * Nyv * Nz * it;
157 mult_coarse_tm1(&buf1_tm[
VLEN * Nvc * ixyz],
158 &u[
VLEN * Ndf * site], &v1[
VLEN * Nvc * site], Nc);
168 const int *Nsize,
int ncol,
169 const int *do_comm,
real_t *work)
// NOTE(review): the start of this signature and a number of body lines are
// not visible in this excerpt; several kernel calls below appear only as
// their trailing argument lines. Comments describe visible statements only.
171 int ith, nth, is, ns;
// Nstv is the product of the four entries of Nsize.
172 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
179 int Nc2 = ncol * ncol;
// SVE predicate pairs for the +x/-x and +y/-y directions.
182 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
183 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
184 set_predicate_xp(pg1_xp, pg2_xp);
185 set_predicate_xm(pg1_xm, pg2_xm);
186 set_predicate_yp(pg1_yp, pg2_yp);
187 set_predicate_ym(pg1_ym, pg2_ym);
// Per-site stride used to index u and c0 (nv, the stride for v1/v2, is
// defined on a line not shown).
190 int nv2 =
VLEN * Ndf;
// Presumably sets is/ns to the caller's share [is, ns) of the Nstv sites --
// set_threadtask is defined elsewhere; confirm.
191 set_threadtask(ith, nth, is, ns, Nstv);
193 for (
int site = is; site < ns; ++site) {
// Output block of this site in v2.
194 real_t *out = &v2[nv * site];
// On-site term: set_mult_u receives out, the v1 block and the c0 block of
// this site.
198 set_mult_u(out, &v1[nv * site],
199 &c0[nv2 * site], Nc);
201 for (
int i = 0; i < nv; i++) {
// ---- x direction (ix is defined on a line not shown) ----
206 int iyzt = site / Nxv;
// +x neighbour: handled here unless the site is on the upper x boundary
// and do_comm[0] is nonzero. nei wraps to ix = 0 at ix == Nxv - 1.
212 if ((ix < Nxv - 1) || (do_comm[0] == 0)) {
213 int nei = (ix + 1) + Nxv * iyzt;
214 if (ix == Nxv - 1) nei = 0 + Nxv * iyzt;
215 mult_coarse_xpb(pg1_xp, pg2_xp, out,
217 &v1[nv * site], &v1[nv * nei], Nc, work);
// -x neighbour: handled here unless ix == 0 and do_comm[0] is nonzero.
// ix2 wraps to Nxv - 1 at ix == 0.
222 if ((ix > 0) || (do_comm[0] == 0)) {
223 int ix2 = (ix - 1 + Nxv) % Nxv;
224 int nei = ix2 + Nxv * iyzt;
225 mult_coarse_xmb(pg1_xm, pg2_xm, out,
226 &u[nv2 * site], &u[nv2 * nei],
227 &v1[nv * site], &v1[nv * nei],
// ---- y direction (iy is defined on a line not shown) ----
234 int izt = iyzt / Nyv;
// +y neighbour with periodic wrap of iy.
239 if ((iy < Nyv - 1) || (do_comm[1] == 0)) {
240 int iy2 = (iy + 1) % Nyv;
241 int nei = ix + Nxv * (iy2 + Nyv * izt);
242 mult_coarse_ypb(pg1_yp, pg2_yp, out,
244 &v1[nv * site], &v1[nv * nei],
// -y neighbour with periodic wrap of iy.
// NOTE(review): this guard indexes do_comm with idir while the +y guard
// uses the literal 1; idir is declared on a line not shown -- confirm it
// equals 1 at this point.
249 if ((iy != 0) || (do_comm[idir] == 0)) {
250 int iy2 = (iy - 1 + Nyv) % Nyv;
251 int nei = ix + Nxv * (iy2 + Nyv * izt);
252 mult_coarse_ymb(pg1_ym, pg2_ym, out,
253 &u[nv2 * site], &u[nv2 * nei],
254 &v1[nv * site], &v1[nv * nei],
// ---- z direction (iz and it are defined on lines not shown) ----
260 int ixy = ix + Nxv * iy;
263 int Nxyv = Nxv * Nyv;
// +z neighbour: the visible call tail passes u at this site and v1 at the
// neighbour.
269 if ((iz != Nz - 1) || (do_comm[2] == 0)) {
270 int iz2 = (iz + 1) % Nz;
271 int nei = ixy + Nxyv * (iz2 + Nz * it);
273 &u[nv2 * site], &v1[nv * nei], Nc);
// -z neighbour: the visible call tail passes both u and v1 at the neighbour.
277 if ((iz > 0) || (do_comm[2] == 0)) {
278 int iz2 = (iz - 1 + Nz) % Nz;
279 int nei = ixy + Nxyv * (iz2 + Nz * it);
281 &u[nv2 * nei], &v1[nv * nei], Nc);
// ---- t direction ----
286 int Nxyzv = Nxyv * Nz;
// Spatial part of the site index (site with the it * Nxyzv term removed).
287 int ixyz = site - it * Nxyzv;
// +t neighbour: u at this site, v1 at the neighbour.
293 if ((it < Nt - 1) || (do_comm[3] == 0)) {
294 int it2 = (it + 1) % Nt;
295 int nei = ixyz + Nxyzv * it2;
297 &u[nv2 * site], &v1[nv * nei], Nc);
// -t neighbour: both u and v1 at the neighbour.
301 if ((it > 0) || (do_comm[3] == 0)) {
302 int it2 = (it - 1 + Nt) % Nt;
303 int nei = ixyz + Nxyzv * it2;
305 &u[nv2 * nei], &v1[nv * nei], Nc);
319 const int *Nsize,
int ncol,
const int *do_comm,
321 std::vector<int>& list)
// NOTE(review): the start of this signature and a number of body lines are
// not visible in this excerpt; several kernel calls below appear only as
// their trailing argument lines. Comments describe visible statements only.
323 int ith, nth, is, ns;
// Nstv is the product of the four entries of Nsize.
324 int Nstv = Nsize[0] * Nsize[1] * Nsize[2] * Nsize[3];
331 int Nc2 = ncol * ncol;
// SVE predicate pairs for the +x/-x and +y/-y directions.
334 svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
335 svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
336 set_predicate_xp(pg1_xp, pg2_xp);
337 set_predicate_xm(pg1_xm, pg2_xm);
338 set_predicate_yp(pg1_yp, pg2_yp);
339 set_predicate_ym(pg1_ym, pg2_ym);
// Index vectors for the x direction; svidx_xp / svidx_xm are declared on
// lines not shown here.
341 set_index_xp(svidx_xp);
342 set_index_xm(svidx_xm);
// Per-site stride used to index u (nv, the stride for v1/v2 and the z/t
// buffers, is defined on a line not shown).
345 int nv2 =
VLEN * Ndf;
// One iteration per entry of list; site is presumably taken from list[i] on
// a line not shown -- confirm.
// NOTE(review): i is a signed int compared against list.size(), which is
// unsigned.
347 for (
int i = 0; i < list.size(); i++) {
// Output block of this site in v2.
349 real_t *out = &v2[nv * site];
// Split site into ix (fastest) and the combined y/z/t index.
351 const int ix = site % Nxv;
352 const int iyzt = site / Nxv;
// ---- x direction: only when do_comm[0] equals 1 ----
354 if (do_comm[0] == 1) {
// Offset into buf2_xp / buf2_xm: VLENY * Nvc elements per iyzt.
356 int ibf =
VLENY * Nvc * iyzt;
// u points nv2 * Nstv * idir elements into u0 (idir and u0 are declared on
// lines not shown).
357 real_t *u = u0 + nv2 * Nstv * idir;
// The xp2 kernel call ends with the v1 block of this site, the buf2_xp slot
// at ibf, Nc and work.
360 set_index_xp(svidx_xp);
361 mult_coarse_xp2(pg1_xp, pg2_xp, svidx_xp,
363 &v1[nv * site], &buf2_xp[ibf], Nc, work);
// The xm2 kernel call ends with the v1 block, the buf2_xm slot and Nc
// (no work argument).
368 set_index_xm(svidx_xm);
369 mult_coarse_xm2(pg1_xm, pg2_xm, svidx_xm,
371 &v1[nv * site], &buf2_xm[ibf], Nc);
// Split iyzt further into iy and the combined z/t index.
377 const int iy = iyzt % Nyv;
378 const int izt = iyzt / Nyv;
// ---- y direction: only when do_comm[1] equals 1 ----
380 if (do_comm[1] == 1) {
// Offset into buf2_yp / buf2_ym: VLENX * Nvc elements per ixzt.
382 int ixzt = ix + Nxv * izt;
383 int ibf =
VLENX * Nvc * ixzt;
384 real_t *u = u0 + nv2 * Nstv * idir;
387 mult_coarse_yp2(pg1_yp, pg2_yp,
390 &v1[nv * site], &buf2_yp[ibf], Nc, work);
395 mult_coarse_ym2(pg1_ym, pg2_ym,
398 &v1[nv * site], &buf2_ym[ibf], Nc);
// Split izt into iz and it, and form the combined x/y index.
404 const int ixy = ix + Nxv * iy;
405 const int iz = izt % Nz;
406 const int it = izt / Nz;
407 const int Nxyv = Nxv * Nyv;
// ---- z direction: only when do_comm[2] equals 1 ----
409 if (do_comm[2] == 1) {
// The z buffers are indexed by nv * ixyt.
411 int ixyt = ixy + Nxyv * it;
412 real_t *u = u0 + nv2 * Nstv * idir;
// Tail of a call (its start is not shown) taking u at this site and the
// buf2_zp slot.
416 &u[nv2 * site], &buf2_zp[nv * ixyt], Nc);
// Tail of a call (its start is not shown) taking the buf2_zm slot.
422 &buf2_zm[nv * ixyt], Nc);
// ---- t direction: only when do_comm[3] equals 1 ----
427 if (do_comm[3] == 1) {
// The t buffers are indexed by nv * ixyz.
429 int ixyz = ixy + Nxyv * iz;
430 real_t *u = u0 + nv2 * Nstv * idir;
// Tail of a call (its start is not shown) taking u at this site and the
// buf2_tp slot.
434 &u[nv2 * site], &buf2_tp[nv * ixyz], Nc);
// Tail of a call (its start is not shown) taking the buf2_tm slot.
440 &buf2_tm[nv * ixyz], Nc);