20 #include "lib_alt_QXS/inline/define_vlen.h"
24 #include "lib_alt_QXS/inline/afield_th-inc.h"
// NOTE(review): this extract is mangled — the leading numerals on lines are
// extraction artifacts of the original file's line numbers, not code.
// Globals defined elsewhere in the library; 64-byte aligned for SIMD access.
31 extern __attribute__((aligned(64))) pglud_t glud; // gauge-link field, double precision (name suggests; confirm)
32 extern __attribute__((aligned(64))) pglus_t glus; // gauge-link field, single precision (name suggests; confirm)
33 extern __attribute__((aligned(64))) pclvd_t clvd; // clover term, double precision (name suggests; confirm)
34 extern __attribute__((aligned(64))) pclvs_t clvs; // clover term, single precision (name suggests; confirm)
35 extern
// Hopping parameter, its square, and (presumably) -kappa.
double kappa, kappa2, mkappa;
36 extern
// Local lattice extents and derived x-extents/volumes for each precision.
int nt, nz, ny, nx, nxh, nxd, vold, nxs, vols;
37 extern
// This rank's coordinates in the 4-D process grid.
int px, py, pz, pt;
38 extern
// Indices of the even and odd halves in domain-decomposed storage.
int domain_e, domain_o;
// Lexicographic site index for j = (x,y,z,t): x + nx*(y + ny*(z + nz*t)).
// (Opening brace elided in this extract.)
50 int std_xyzt2i_(
int *j)
52 return j[0] + nx * (j[1] + ny * (j[2] + nz * j[3]));
// Site j -> double-precision SIMD-lane ("k") index. Body elided in this
// extract; presumably the lane offset within a VLEND-wide vector — confirm.
56 int std_xyzt2kd_(
int *j)
// Site j -> single-precision SIMD-lane ("k") index. Body elided in this
// extract; presumably the lane offset within a VLENS-wide vector — confirm.
64 int std_xyzt2ks_(
int *j)
// Site j -> double-precision vector-block index. ivx, ivy and nvx are
// computed on lines elided from this extract (presumably x/y split by the
// vector length — confirm against the full source).
72 int std_xyzt2ivd_(
int *j)
77 int nvxy = nx * ny /
VLEND;
78 return ivx + nvx * ivy + nvxy * (j[2] + nz * j[3]);
// Site j -> single-precision vector-block index; VLENS-wide counterpart of
// std_xyzt2ivd_. ivx, ivy, nvx come from elided lines — confirm.
82 int std_xyzt2ivs_(
int *j)
87 int nvxy = nx * ny /
VLENS;
88 return ivx + nvx * ivy + nvxy * (j[2] + nz * j[3]);
// (x, vectorized-y block ivy, combined zt) -> double-precision vector-block
// index. ivx and nvx are derived on elided lines (presumably from x and
// VLENXD — confirm).
92 int std_xvyzt2ivd_(
int x,
int ivy,
int zt)
96 int nvxy = nx * ny /
VLEND;
97 return ivx + nvx * ivy + nvxy * zt;
// (x, y-lane ky) -> double-precision SIMD-lane index. Body elided in this
// extract — semantics inferred from the call sites below; confirm.
101 int std_xky2kd_(
int x,
int ky)
// (x, vectorized-y block ivy, combined zt) -> single-precision vector-block
// index; VLENS-wide counterpart of std_xvyzt2ivd_. ivx/nvx from elided lines.
108 int std_xvyzt2ivs_(
int x,
int ivy,
int zt)
112 int nvxy = nx * ny /
VLENS;
113 return ivx + nvx * ivy + nvxy * zt;
// (x, y-lane ky) -> single-precision SIMD-lane index. Body elided in this
// extract — semantics inferred from the call sites below; confirm.
117 int std_xky2ks_(
int x,
int ky)
// Even/odd parity of site j, shifted by a global parity offset:
// (x + y + z + t + eo_offset) % 2. Returns 0 (even) or 1 (odd).
124 int e_o_(
int *j,
int eo_offset)
126 return (j[0] + j[1] + j[2] + j[3] + eo_offset) % 2;
// Import a Bridge++-layout gauge field `u` into the QWS internal layouts:
// glud (double) and glus (float), for all NDIM directions and both parities.
// kappa is presumably taken from *in_kappa on an elided line; kappa2 is
// derived here. volh_tot is unused in the visible code — confirm.
130 void qws_loadg_bridge_(
const double *u,
const int *volh_tot,
const double *in_kappa)
132 int x, y, z, t, j[4], mu, c1, c2, eo_offset;
135 kappa2 = kappa * kappa;
// Global parity of this rank's local origin.
136 eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
138 for (t = 0; t < nt; t++) {
139 for (z = 0; z < nz; z++) {
140 for (y = 0; y < ny; y++) {
141 for (x = 0; x < nx; x++) {
// Source-layout indices (j[] is filled on lines elided from this extract).
149 int ikd = std_xyzt2kd_(j);
150 int ivd = std_xyzt2ivd_(j);
155 int eo = e_o_(j, eo_offset);
// Destination word/lane indices: iwsd/ixxd for double, iwss/ixxs for float;
// x is halved because even/odd sites are stored in separate halves.
156 int iwsd = x / 2 /
VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t +
NDIM * vold * eo;
157 int ixxd = (x / 2) %
VLEND;
158 int iwss = x / 2 /
VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t +
NDIM * vols * eo;
159 int ixxs = (x / 2) %
VLENS;
161 for (mu = 0; mu <
NDIM; mu++) {
// 18 = full 3x3 complex matrix; 12 = first two rows only (third row
// reconstructed on the fly elsewhere).
162 #if SU3_RECONSTRUCT_D == 18
163 for (c1 = 0; c1 <
NCOL; ++c1) {
164 for (c2 = 0; c2 <
NCOL; ++c2) {
171 glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
172 = u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
173 glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
174 = u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
177 #elif SU3_RECONSTRUCT_D == 12
178 for (c1 = 0; c1 <
NCOL; ++c1) {
179 for (c2 = 0; c2 <
NCOL - 1; ++c2) {
180 glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
181 = u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
182 glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
183 = u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
// Single-precision copy of the same links, demoted with (float) casts.
187 #if SU3_RECONSTRUCT_S == 18
188 for (c1 = 0; c1 <
NCOL; ++c1) {
189 for (c2 = 0; c2 <
NCOL; ++c2) {
190 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
191 = (float)u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
192 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
193 = (float)u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
196 #elif SU3_RECONSTRUCT_S == 12
197 for (c1 = 0; c1 <
NCOL; ++c1) {
198 for (c2 = 0; c2 <
NCOL - 1; ++c2) {
199 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
200 = (float)u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
201 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
202 = (float)u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
// Import a Bridge++-layout double-precision fermion field `in` into the QWS
// layout `out` (array of scd_t): real/imag pairs per spin s and color c.
214 void qws_loadfd_bridge_(scd_t *out,
const double *in)
216 int x, y, z, t, j[4], mu, c, s, eo_offset;
217 eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
// NOTE(review): leftover debug print ("hoge"); consider removing or gating
// behind a verbosity flag. FIX: the format string had an unbalanced '((' in
// "(nx,ny,nz,nt)=((%d,%d,%d,%d)"; corrected to balanced parentheses.
218 printf(
"hoge: %s: eo_offset=%d: (px,py,pz,pt)=(%d,%d,%d,%d): (nx,ny,nz,nt)=(%d,%d,%d,%d), VLENXD=%d\n",
219 __func__, eo_offset, px, py, pz, pt, nx, ny, nz, nt,
VLENXD);
220 for (t = 0; t < nt; t++) {
221 for (z = 0; z < nz; z++) {
222 for (y = 0; y < ny; y++) {
223 for (x = 0; x < nx; x++) {
// Source indices (j[] is filled on lines elided from this extract).
230 int ikd = std_xyzt2kd_(j);
231 int ivd = std_xyzt2ivd_(j);
234 int eo = e_o_(j, eo_offset);
// Destination word/lane indices (even/odd handled via elided code; eo unused
// in the visible lines — confirm against the full source).
236 int iwsd = x / 2 /
VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t;
237 int ixxd = (x / 2) %
VLEND;
241 for (c = 0; c <
NCOL; c++) {
242 for (s = 0; s < 4; s++) {
243 out[iwsd].c[c][s][0][ixxd]
244 = in[
VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd];
245 out[iwsd].c[c][s][1][ixxd]
246 = in[
VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd];
// Import a Bridge++-layout single-precision fermion field `in` into the QWS
// layout `out` (array of scs_t); float analogue of qws_loadfd_bridge_.
257 void qws_loadfs_bridge_(scs_t *out,
const float *in)
259 int x, y, z, t, j[4], mu, c, s, eo_offset;
260 eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
262 for (t = 0; t < nt; t++) {
263 for (z = 0; z < nz; z++) {
264 for (y = 0; y < ny; y++) {
265 for (x = 0; x < nx; x++) {
// Source indices (j[] is filled on lines elided from this extract).
272 int iks = std_xyzt2ks_(j);
273 int ivs = std_xyzt2ivs_(j);
276 int eo = e_o_(j, eo_offset);
// Destination word/lane indices (eo unused in the visible lines — confirm).
278 int iwss = x / 2 /
VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t;
279 int ixxs = (x / 2) %
VLENS;
281 for (c = 0; c <
NCOL; c++) {
282 for (s = 0; s < 4; s++) {
283 out[iwss].c[c][s][0][ixxs] = in[
VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks];
284 out[iwss].c[c][s][1][ixxs] = in[
VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks];
// Export a QWS-layout double-precision fermion field `in` back into the
// Bridge++ layout `out`; inverse of qws_loadfd_bridge_.
295 void qws_storefd_bridge_(
double *out,
const scd_t *in)
297 int x, y, z, t, j[4], mu, c, s, eo_offset;
298 eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
300 for (t = 0; t < nt; t++) {
301 for (z = 0; z < nz; z++) {
302 for (y = 0; y < ny; y++) {
303 for (x = 0; x < nx; x++) {
// Destination (Bridge++) indices; j[] is filled on elided lines.
310 int ikd = std_xyzt2kd_(j);
311 int ivd = std_xyzt2ivd_(j);
314 int eo = e_o_(j, eo_offset);
// Source (QWS) word/lane indices (eo unused in the visible lines — confirm).
316 int iwsd = x / 2 /
VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t;
317 int ixxd = (x / 2) %
VLEND;
319 for (c = 0; c <
NCOL; c++) {
320 for (s = 0; s < 4; s++) {
321 out[
VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd]
322 = in[iwsd].c[c][s][0][ixxd];
323 out[
VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd]
324 = in[iwsd].c[c][s][1][ixxd];
// Export a QWS-layout single-precision fermion field `in` back into the
// Bridge++ layout `out`; float analogue of qws_storefd_bridge_.
335 void qws_storefs_bridge_(
float *out,
const scs_t *in)
337 int x, y, z, t, j[4], mu, c, s, eo_offset;
338 eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
340 for (t = 0; t < nt; t++) {
341 for (z = 0; z < nz; z++) {
342 for (y = 0; y < ny; y++) {
343 for (x = 0; x < nx; x++) {
// Destination (Bridge++) indices; j[] is filled on elided lines.
350 int iks = std_xyzt2ks_(j);
351 int ivs = std_xyzt2ivs_(j);
354 int eo = e_o_(j, eo_offset);
// Source (QWS) word/lane indices (eo unused in the visible lines — confirm).
356 int iwss = x / 2 /
VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t;
357 int ixxs = (x / 2) %
VLENS;
359 for (c = 0; c <
NCOL; c++) {
360 for (s = 0; s < 4; s++) {
361 out[
VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks]
362 = in[iwss].c[c][s][0][ixxs];
363 out[
VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks]
364 = in[iwss].c[c][s][1][ixxs];
// Import a Bridge++-layout gauge field `u` into the QWS domain-decomposed
// layouts glud (double) and glus (float). Work is split over threads along
// the combined y-vector/z/t axis via set_threadtask (presumably a macro that
// writes ith/nth/is/ns — confirm). vy, zt, y are derived on elided lines.
375 void qws_loadg_dd_bridge_(
const double *u,
const double *in_kappa)
377 int x, y, z, t, mu, c1, c2;
380 kappa2 = kappa * kappa;
382 int nyztv = nt * nz * nvy;
383 int ith, nth, is, ns;
384 set_threadtask(ith, nth, is, ns, nyztv);
386 for (
int yztv = is; yztv < ns; yztv++) {
389 for (
int ky = 0; ky <
VLENYD; ky++) {
391 for (x = 0; x < nx; x++) {
// Source indices in the Bridge++ layout.
393 int ikd = std_xky2kd_(x, ky);
394 int ivd = std_xvyzt2ivd_(x, vy, zt);
// Destination word/lane indices; x/nxh selects the even/odd domain half.
397 int iwsd = (x % nxh) /
VLEND + nxd * y + nxd * ny * zt +
NDIM * vold * (x / nxh);
398 int ixxd = (x % nxh) %
VLEND;
399 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt +
NDIM * vols * (x / nxh);
400 int ixxs = (x % nxh) %
VLENS;
401 for (mu = 0; mu <
NDIM; mu++) {
402 #if SU3_RECONSTRUCT_D == 18
403 for (c1 = 0; c1 <
NCOL; c1++) {
404 for (c2 = 0; c2 <
NCOL; c2++) {
// FIX: removed a dead local (`double val = u[...]`) that loaded the same
// element as the store below but was never used — leftover debug code.
406 glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
407 = u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
408 glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
409 = u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
412 #elif SU3_RECONSTRUCT_D == 12
413 for (c1 = 0; c1 <
NCOL; c1++) {
414 for (c2 = 0; c2 <
NCOL - 1; c2++) {
415 glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
416 = u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
417 glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
418 = u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
// Single-precision copy of the same links, demoted with (float) casts.
422 #if SU3_RECONSTRUCT_S == 18
423 for (c1 = 0; c1 <
NCOL; c1++) {
424 for (c2 = 0; c2 <
NCOL; c2++) {
425 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
426 = (float)u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
427 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
428 = (float)u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
431 #elif SU3_RECONSTRUCT_S == 12
432 for (c1 = 0; c1 <
NCOL; c1++) {
433 for (c2 = 0; c2 <
NCOL - 1; c2++) {
434 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
435 = (float)u[ikd +
VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
436 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
437 = (float)u[ikd +
VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
// Import a single-precision Bridge++-layout gauge field `u` directly into
// the QWS domain-decomposed float layout glus. Thread split and vy/zt/y
// derivations mirror qws_loadg_dd_bridge_ (some on elided lines).
449 void qws_loadgs_dd_bridge_(
const float *u,
const double *in_kappa)
451 int x, y, z, t, mu, c1, c2;
454 kappa2 = kappa * kappa;
456 int nyztv = nt * nz * nvy;
457 int ith, nth, is, ns;
458 set_threadtask(ith, nth, is, ns, nyztv);
460 for (
int yztv = is; yztv < ns; yztv++) {
463 for (
int ky = 0; ky <
VLENYS; ky++) {
465 for (x = 0; x < nx; x++) {
// Source indices in the Bridge++ layout (single precision).
467 int iks = std_xky2ks_(x, ky);
468 int ivs = std_xvyzt2ivs_(x, vy, zt);
// Destination word/lane indices; x/nxh selects the even/odd domain half.
471 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt +
NDIM * vols * (x / nxh);
472 int ixxs = (x % nxh) %
VLENS;
473 for (mu = 0; mu <
NDIM; mu++) {
474 #if SU3_RECONSTRUCT_S == 18
475 for (c1 = 0; c1 <
NCOL; c1++) {
476 for (c2 = 0; c2 <
NCOL; c2++) {
477 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
478 = u[iks +
VLENS * (0 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
479 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
480 = u[iks +
VLENS * (1 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
483 #elif SU3_RECONSTRUCT_S == 12
484 for (c1 = 0; c1 <
NCOL; c1++) {
485 for (c2 = 0; c2 <
NCOL - 1; c2++) {
// FIX: this branch referenced undeclared `iws` and `ivd` (copy/paste from
// the double-precision loader); only iks/ivs exist in this function. The
// == 18 branch above uses iks/ivs — corrected to match, so this branch now
// compiles and indexes `u` consistently.
486 glus[iwss + vols * mu].c[c2][c1][0][ixxs]
487 = u[iks +
VLENS * (0 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
488 glus[iwss + vols * mu].c[c2][c1][1][ixxs]
489 = u[iks +
VLENS * (1 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
// Import a Bridge++-layout double-precision clover term `c` into the QWS
// domain-decomposed layouts clvd (double) and clvs (float). Each site holds
// two 36-real blocks (ud = upper/lower chirality, presumably — confirm).
501 void qws_loadc_dd_bridge_(
const double *c)
529 int x, y, ky, yztv, ud;
531 int nyztv = nt * nz * nvy;
532 int ith, nth, is, ns;
533 set_threadtask(ith, nth, is, ns, nyztv);
535 for (yztv = is; yztv < ns; yztv++) {
538 for (ky = 0; ky <
VLENYD; ky++) {
540 for (x = 0; x < nx; x++) {
// Source indices (vy, zt, y derived on elided lines).
542 int ikd = std_xky2kd_(x, ky);
543 int ivd = std_xvyzt2ivd_(x, vy, zt);
545 int iwsd = (x % nxh) /
VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
546 int ixxd = (x % nxh) %
VLEND;
547 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
548 int ixxs = (x % nxh) %
VLENS;
550 for (ud = 0; ud < 2; ud++) {
551 int offset = 72 * ivd + 36 * ud;
552 for (
int ij = 0; ij < 36; ij++) {
553 clvd[iwsd].c[ud][ij][ixxd] = c[
VLEND * (ij + offset) + ikd];
// Single-precision copy of the same block.
557 for (
int ij = 0; ij < 36; ij++) {
558 clvs[iwss].c[ud][ij][ixxs] = (float)c[
VLEND * (ij + offset) + ikd];
// Import a single-precision Bridge++-layout clover term `c` directly into
// the QWS domain-decomposed float layout clvs; float-only analogue of
// qws_loadc_dd_bridge_.
567 void qws_loadcs_dd_bridge_(
const float *c)
586 int x, y, ky, yztv, ud;
588 int nyztv = nt * nz * nvy;
589 int ith, nth, is, ns;
590 set_threadtask(ith, nth, is, ns, nyztv);
592 for (yztv = is; yztv < ns; yztv++) {
595 for (ky = 0; ky <
VLENYS; ky++) {
597 for (x = 0; x < nx; x++) {
// Source indices (vy, zt, y derived on elided lines).
599 int iks = std_xky2ks_(x, ky);
600 int ivs = std_xvyzt2ivs_(x, vy, zt);
602 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
603 int ixxs = (x % nxh) %
VLENS;
605 for (ud = 0; ud < 2; ud++) {
606 int offset = 72 * ivs + 36 * ud;
607 for (
int ij = 0; ij < 36; ij++) {
608 clvs[iwss].c[ud][ij][ixxs] = c[
VLENS * (ij + offset) + iks];
// Import a Bridge++-layout double-precision fermion field `in` into the QWS
// domain-decomposed layout `out` (scd_t array), threaded over y-vector/z/t.
617 void qws_loadfd_dd_bridge_(scd_t *out,
const double *in)
619 int x, y, z, t, s, c;
621 int nyztv = nt * nz * nvy;
622 int ith, nth, is, ns;
623 set_threadtask(ith, nth, is, ns, nyztv);
625 for (
int yztv = is; yztv < ns; yztv++) {
628 for (
int ky = 0; ky <
VLENYD; ky++) {
630 for (x = 0; x < nx; x++) {
// Source indices (vy, zt, y derived on elided lines).
632 int ikd = std_xky2kd_(x, ky);
633 int ivd = std_xvyzt2ivd_(x, vy, zt);
// Destination word/lane; x/nxh selects the even/odd domain half.
637 int iwsd = (x % nxh) /
VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
638 int ixxd = (x % nxh) %
VLEND;
639 for (c = 0; c <
NCOL; c++) {
640 for (s = 0; s < 4; s++) {
641 out[iwsd].c[c][s][0][ixxd]
642 = in[
VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd];
643 out[iwsd].c[c][s][1][ixxd]
644 = in[
VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd];
// Export a QWS domain-decomposed double-precision fermion field `in` back
// into the Bridge++ layout `out`; inverse of qws_loadfd_dd_bridge_.
654 void qws_storefd_dd_bridge_(
double *out,
const scd_t *in)
656 int x, y, z, t, s, c;
658 int nyztv = nt * nz * nvy;
659 int ith, nth, is, ns;
660 set_threadtask(ith, nth, is, ns, nyztv);
662 for (
int yztv = is; yztv < ns; yztv++) {
665 for (
int ky = 0; ky <
VLENYD; ky++) {
667 for (x = 0; x < nx; x++) {
// Destination (Bridge++) indices; vy, zt, y derived on elided lines.
669 int ikd = std_xky2kd_(x, ky);
670 int ivd = std_xvyzt2ivd_(x, vy, zt);
673 int iwsd = (x % nxh) /
VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
674 int ixxd = (x % nxh) %
VLEND;
675 for (c = 0; c <
NCOL; c++) {
676 for (s = 0; s < 4; s++) {
677 out[
VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd] = in[iwsd].c[c][s][0][ixxd];
678 out[
VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd] = in[iwsd].c[c][s][1][ixxd];
// Import a Bridge++-layout single-precision fermion field `in` into the QWS
// domain-decomposed layout `out` (scs_t array); float analogue of
// qws_loadfd_dd_bridge_.
688 void qws_loadfs_dd_bridge_(scs_t *out,
const float *in)
690 int x, y, z, t, s, c;
692 int nyztv = nt * nz * nvy;
693 int ith, nth, is, ns;
694 set_threadtask(ith, nth, is, ns, nyztv);
696 for (
int yztv = is; yztv < ns; yztv++) {
699 for (
int ky = 0; ky <
VLENYS; ky++) {
701 for (x = 0; x < nx; x++) {
// Source indices (vy, zt, y derived on elided lines).
703 int iks = std_xky2ks_(x, ky);
704 int ivs = std_xvyzt2ivs_(x, vy, zt);
707 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
708 int ixxs = (x % nxh) %
VLENS;
709 for (c = 0; c <
NCOL; c++) {
710 for (s = 0; s < 4; s++) {
711 out[iwss].c[c][s][0][ixxs]
712 = in[
VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks];
713 out[iwss].c[c][s][1][ixxs]
714 = in[
VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks];
// Export a QWS domain-decomposed single-precision fermion field `in` back
// into the Bridge++ layout `out`; inverse of qws_loadfs_dd_bridge_.
724 void qws_storefs_dd_bridge_(
float *out,
const scs_t *in)
726 int x, y, z, t, s, c;
728 int nyztv = nt * nz * nvy;
729 int ith, nth, is, ns;
730 set_threadtask(ith, nth, is, ns, nyztv);
732 for (
int yztv = is; yztv < ns; yztv++) {
735 for (
int ky = 0; ky <
VLENYS; ky++) {
737 for (x = 0; x < nx; x++) {
// Destination (Bridge++) indices; vy, zt, y derived on elided lines.
739 int iks = std_xky2ks_(x, ky);
740 int ivs = std_xvyzt2ivs_(x, vy, zt);
743 int iwss = (x % nxh) /
VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
744 int ixxs = (x % nxh) %
VLENS;
745 for (c = 0; c <
NCOL; c++) {
746 for (s = 0; s < 4; s++) {
747 out[
VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks] = in[iwss].c[c][s][0][ixxs];
748 out[
VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks] = in[iwss].c[c][s][1][ixxs];
// Forward declarations of the domain-decomposed Dirac-operator pieces.
// Naming suggests (confirm against definitions): _pre_ = pack/send boundary
// data, _pos_ = receive/apply boundary contribution scaled by `factor`,
// _in_ = interior part; _d_/_s_ = double/float; _noprl = caller supplies the
// parallel region; _no_timer = no timing instrumentation.
759 void ddd_out_pre_d_(scd_t *in,
int *domain);
760 void ddd_out_pos_d_(scd_t *out, scd_t *in,
int *domain);
761 void ddd_in_d_(scd_t *out, scd_t *in,
int *domain);
763 void ddd_out_pre_s_(scs_t *in,
int *domain);
764 void ddd_out_pre_s_noprl_(scs_t *in,
int *domain);
765 void ddd_out_pre_s_no_timer_(scs_t *in,
int *domain);
766 void ddd_out_pre_s_noprl_no_timer_(scs_t *in,
int *domain);
767 void ddd_out_pos_s_(scs_t *out, scs_t *in,
int *domain,
float factor);
768 void ddd_out_pos_s_no_timer_(scs_t *out, scs_t *in,
int *domain,
float factor);
769 void ddd_in_s_(scs_t *out, scs_t *in,
int *domain);
770 void ddd_out_pos_s_noprl_(scs_t *out, scs_t *in,
int *domain,
float factor);
771 void ddd_out_pos_s_noprl_no_timer_(scs_t *out, scs_t *in,
int *domain,
float factor);
// NOTE: ddd_in_s_noprl has no trailing underscore, unlike its siblings —
// presumably intentional (C-linkage name), but worth confirming.
772 void ddd_in_s_noprl(scs_t *out, scs_t *in,
int *domain);
773 void jinv_ddd_in_s_noprl_(scs_t *x, scs_t *b,
int *domain,
int *maxiter);
// Single-precision DD Dirac operator applied to one parity domain:
// pack boundaries of `in`, apply the interior operator on this domain's
// half, then fold in the boundary contribution scaled by mkappa.
789 void ddd_eo_s_noprl_(scs_t *out, scs_t *in,
int *domain)
791 ddd_out_pre_s_noprl_no_timer_(in, domain);
// vols * (*domain) offsets into the even or odd half of the field.
792 ddd_in_s_noprl(&out[vols * (*domain)], &in[vols * (*domain)], domain);
793 ddd_out_pos_s_noprl_no_timer_(&out[vols * (*domain)], in, domain, (
float)mkappa);
// Full single-precision DD Dirac operator: apply the even-domain part then
// the odd-domain part.
798 void ddd_s_noprl_(scs_t *out, scs_t *in)
800 ddd_eo_s_noprl_(out, in, &domain_e);
801 ddd_eo_s_noprl_(out, in, &domain_o);
// Whole-volume single-precision vector helpers defined elsewhere:
// copy, zero, out += in, out -= in, and out = in1 - in2 (names suggest;
// confirm against definitions).
806 extern void copy_vols2_s_noprl_(scs_t *out, scs_t *in);
807 extern void zero_vols_s_noprl_(scs_t *out);
808 extern void accum_add_vols_s_noprl_(scs_t *out, scs_t *in);
809 extern void accum_sub_vols_s_noprl_(scs_t *out, scs_t *in);
810 extern void accum_addsub_vols_s_noprl_(scs_t *out, scs_t *in1, scs_t *in2);
// SAP (Schwarz Alternating Procedure) preconditioner, single precision:
// *nsap sweeps alternating approximate solves (jinv_ddd_in_s_noprl_, *nm
// inner iterations) on the even and odd domains, with residual updates via
// the DD operator pieces; then a final half-sweep into `out`.
// s and q are caller-provided work fields. Statement order is critical —
// each step consumes the previous step's halo exchange.
813 void prec_s_noprl_(scs_t *out, scs_t *in,
int *nsap,
int *nm, scs_t *s, scs_t *q)
815 #ifdef COMPILE_TIME_DIM_SIZE
816 const int vols = VOLS;
// C++ scope resolution: shadow the globals with local copies.
818 const int vols = ::vols;
820 float kappa = ::kappa;
823 copy_vols2_s_noprl_(s, in);
// npe[] (process grid extents, presumably) is declared on elided lines;
// the odd-domain work field needs zeroing when any transverse dim is local.
824 if ((npe[1] == 1) || (npe[2] == 1) || (npe[3] == 1)) zero_vols_s_noprl_(&q[vols * domain_o]);
825 for (
int isap = 0; isap < *nsap; isap++) {
// Even-domain approximate solve, then propagate to the odd domain.
826 jinv_ddd_in_s_noprl_(&q[vols * domain_e], &s[vols * domain_e], &domain_e, nm);
827 ddd_out_pre_s_noprl_no_timer_(q, &domain_o);
828 ddd_in_s_noprl(&q[vols * domain_o], &q[vols * domain_e], &domain_e);
829 accum_addsub_vols_s_noprl_(&s[vols * domain_e], &in[vols * domain_e], &q[vols * domain_o]);
830 ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_o], q, &domain_o, (
float)kappa);
// Odd-domain approximate solve, then propagate back to the even domain.
831 jinv_ddd_in_s_noprl_(&q[vols * domain_o], &s[vols * domain_o], &domain_o, nm);
832 ddd_out_pre_s_noprl_no_timer_(q, &domain_e);
833 ddd_in_s_noprl(&q[vols * domain_e], &q[vols * domain_o], &domain_o);
834 accum_addsub_vols_s_noprl_(&s[vols * domain_o], &in[vols * domain_o], &q[vols * domain_e]);
835 ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_e], q, &domain_e, (
float)kappa);
// Final half-sweep writes the preconditioned result into `out`.
837 if ((npe[1] == 1) || (npe[2] == 1) || (npe[3] == 1)) zero_vols_s_noprl_(&out[vols * domain_o]);
838 jinv_ddd_in_s_noprl_(&out[vols * domain_e], &s[vols * domain_e], &domain_e, nm);
839 ddd_out_pre_s_noprl_no_timer_(out, &domain_o);
840 ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_o], out, &domain_o, (
float)kappa);
841 jinv_ddd_in_s_noprl_(&out[vols * domain_o], &s[vols * domain_o], &domain_o, nm);
// In-place clover-term multiply, single precision, on one parity domain
// (*deo selects even/odd half). Loops share an enclosing OpenMP parallel
// region (only the `for` pragma is here); x/y/z/t declared on elided lines.
847 void clv_s_dd_(
int *deo, scs_t *inout)
849 __attribute__((aligned(64))) scs_t tmp;
851 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
852 for (t = 0; t < nt; t++) {
853 for (z = 0; z < nz; z++) {
854 for (y = 0; y < ny; y++) {
855 for (x = 0; x < nxs; x++) {
856 int i0 = x + nxs * y + nxs * ny * z + nxs * ny * nz * t + vols * (*deo);
// Load spinor into an aligned temporary, multiply by the clover block,
// store back in place.
857 __load_sc_s(tmp.c, inout[i0].c);
858 __mult_clvs(tmp.cv, clvs[i0].cv);
859 __store_sc_s(inout[i0].c, tmp.c);
// Out-of-place clover matrix-vector multiply, single precision:
// out = clv * in on the *deo parity domain. Caller supplies the clover
// field, unlike clv_s_dd_ which uses the global clvs.
867 void clv_matvec_s_dd_(
int *deo, scs_t *out, clvs_t *clv, scs_t *in)
869 __attribute__((aligned(64))) scs_t tmp;
871 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
872 for (t = 0; t < nt; t++) {
873 for (z = 0; z < nz; z++) {
874 for (y = 0; y < ny; y++) {
875 for (x = 0; x < nxs; x++) {
876 int i0 = x + nxs * y + nxs * ny * z + nxs * ny * nz * t + vols * (*deo);
877 __load_sc_s(tmp.c, in[i0].c);
878 __mult_clvs(tmp.cv, clv[i0].cv);
879 __store_sc_s(out[i0].c, tmp.c);
// In-place clover-term multiply, double precision, on one parity domain
// (*deo selects even/odd half); double analogue of clv_s_dd_.
888 void clv_d_dd_(
int *deo, scd_t *inout)
890 __attribute__((aligned(64))) scd_t tmp;
892 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
893 for (t = 0; t < nt; t++) {
894 for (z = 0; z < nz; z++) {
895 for (y = 0; y < ny; y++) {
896 for (x = 0; x < nxd; x++) {
897 int i0 = x + nxd * y + nxd * ny * z + nxd * ny * nz * t + vold * (*deo);
898 __load_sc(tmp.c, inout[i0].c);
899 __mult_clvd(tmp.cv, clvd[i0].cv);
900 __store_sc(inout[i0].c, tmp.c);
// Out-of-place clover matrix-vector multiply, double precision:
// out = clv * in on the *deo parity domain; double analogue of
// clv_matvec_s_dd_.
908 void clv_matvec_d_dd_(
int *deo, scd_t *out, clvd_t *clv, scd_t *in)
910 __attribute__((aligned(64))) scd_t tmp;
912 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
913 for (t = 0; t < nt; t++) {
914 for (z = 0; z < nz; z++) {
915 for (y = 0; y < ny; y++) {
916 for (x = 0; x < nxd; x++) {
917 int i0 = x + nxd * y + nxd * ny * z + nxd * ny * nz * t + vold * (*deo);
918 __load_sc(tmp.c, in[i0].c);
919 __mult_clvd(tmp.cv, clv[i0].cv);
920 __store_sc(out[i0].c, tmp.c);