Bridge++  Ver. 2.0.2
qws_bridge.cpp
Go to the documentation of this file.
1 
10 #ifdef USE_QWSLIB
11 
12 #include <qws.h>
13 #include <clover_d.h>
14 #include <clover_s.h>
15 #include <wilson_d.h>
16 #include <wilson_s.h>
17 #include <mpi.h>
18 
19 
20 #include "lib_alt_QXS/inline/define_vlen.h"
21 
22 // for thread parallelization a la Bridge
24 #include "lib_alt_QXS/inline/afield_th-inc.h"
25 
26 
27 #ifdef __cplusplus
28 extern "C" {
29 #endif
30 
31 extern __attribute__((aligned(64))) pglud_t glud;
32 extern __attribute__((aligned(64))) pglus_t glus;
33 extern __attribute__((aligned(64))) pclvd_t clvd;
34 extern __attribute__((aligned(64))) pclvs_t clvs;
35 extern double kappa, kappa2, mkappa;
36 extern int nt, nz, ny, nx, nxh, nxd, vold, nxs, vols;
37 extern int px, py, pz, pt;
38 extern int domain_e, domain_o;
39 extern int npe[4];
40 // extern int volse;
41 
42 // cf. qws.cc
43 // nt, nz, ny, nx: local lattice size
44 // nxh = nx/2
45 // nxd = nx/2/VLEND;
46 // nxs = nx/2/VLENS;
47 // vold = nxd * ny * nz * nt; ( = local_vol /2/VLEND )
48 // vols = nxs * ny * nz * nt; ( = local_vol /2/VLENS )
49 
50 int std_xyzt2i_(int *j)
51 {
52  return j[0] + nx * (j[1] + ny * (j[2] + nz * j[3]));
53 }
54 
55 
56 int std_xyzt2kd_(int *j) // in-vector index (double)
57 {
58  int kx = j[0] % VLENXD;
59  int ky = j[1] % VLENYD;
60  return kx + VLENXD * ky;
61 }
62 
63 
64 int std_xyzt2ks_(int *j) // in-vector index (single)
65 {
66  int kx = j[0] % VLENXS;
67  int ky = j[1] % VLENYS;
68  return kx + VLENXS * ky;
69 }
70 
71 
72 int std_xyzt2ivd_(int *j) // vector index (double)
73 {
74  int ivx = j[0] / VLENXD;
75  int ivy = j[1] / VLENYD;
76  int nvx = nx / VLENXD;
77  int nvxy = nx * ny / VLEND;
78  return ivx + nvx * ivy + nvxy * (j[2] + nz * j[3]);
79 }
80 
81 
82 int std_xyzt2ivs_(int *j) // vector index (single)
83 {
84  int ivx = j[0] / VLENXS;
85  int ivy = j[1] / VLENYS;
86  int nvx = nx / VLENXS;
87  int nvxy = nx * ny / VLENS;
88  return ivx + nvx * ivy + nvxy * (j[2] + nz * j[3]);
89 }
90 
91 
92 int std_xvyzt2ivd_(int x, int ivy, int zt) // vector index (double)
93 {
94  int ivx = x / VLENXD;
95  int nvx = nx / VLENXD;
96  int nvxy = nx * ny / VLEND;
97  return ivx + nvx * ivy + nvxy * zt;
98 }
99 
100 
101 int std_xky2kd_(int x, int ky) // in-vector index (double)
102 {
103  int ikx = x % VLENXD;
104  return ikx + VLENXD * ky;
105 }
106 
107 
108 int std_xvyzt2ivs_(int x, int ivy, int zt) // vector index (single)
109 {
110  int ivx = x / VLENXS;
111  int nvx = nx / VLENXS;
112  int nvxy = nx * ny / VLENS;
113  return ivx + nvx * ivy + nvxy * zt;
114 }
115 
116 
117 int std_xky2ks_(int x, int ky) // in-vector index (single)
118 {
119  int ikx = x % VLENXS;
120  return ikx + VLENXS * ky;
121 }
122 
123 
124 int e_o_(int *j, int eo_offset)
125 {
126  return (j[0] + j[1] + j[2] + j[3] + eo_offset) % 2;
127 }
128 
129 
130 void qws_loadg_bridge_(const double *u, const int *volh_tot, const double *in_kappa)
131 {
132  int x, y, z, t, j[4], mu, c1, c2, eo_offset;
133  kappa = *in_kappa;
134  mkappa = -kappa;
135  kappa2 = kappa * kappa;
136  eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
137 
138  for (t = 0; t < nt; t++) {
139  for (z = 0; z < nz; z++) {
140  for (y = 0; y < ny; y++) {
141  for (x = 0; x < nx; x++) {
142  j[0] = x;
143  j[1] = y;
144  j[2] = z;
145  j[3] = t;
146 
147  // index in bridge (lexical)
148  // Note that input AField is in double prec.
149  int ikd = std_xyzt2kd_(j);
150  int ivd = std_xyzt2ivd_(j);
151  //int iks = std_xyzt2ks_(j);
152  //int ivs = std_xyzt2ivs_(j);
153 
154  // index in qws (even-odd)
155  int eo = e_o_(j, eo_offset);
156  int iwsd = x / 2 / VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t + NDIM * vold * eo;
157  int ixxd = (x / 2) % VLEND;
158  int iwss = x / 2 / VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t + NDIM * vols * eo;
159  int ixxs = (x / 2) % VLENS;
160 
161  for (mu = 0; mu < NDIM; mu++) {
162 #if SU3_RECONSTRUCT_D == 18
163  for (c1 = 0; c1 < NCOL; ++c1) {
164  for (c2 = 0; c2 < NCOL; ++c2) {
165  // double val=u[ikd + VLEND*(0 + 2*c1 + 6*c2 + 18*(ivd + mu*vold*2))];
166  // if(val>0.1){
167  // printf("hoge: val=%15.7e: (x,y,z,t)=(%d,%d,%d,%d), eo=%d, mu=%d, c1=%d, c2=%d, ikd=%d, ixxd=%d, ivd=%d, iwsd=%d\n",
168  // val, x,y,z,t, eo, mu, c1, c2, ikd, ixxd, ivd, iwsd);
169  // }
170 
171  glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
172  = u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
173  glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
174  = u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
175  }
176  }
177 #elif SU3_RECONSTRUCT_D == 12
178  for (c1 = 0; c1 < NCOL; ++c1) {
179  for (c2 = 0; c2 < NCOL - 1; ++c2) {
180  glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
181  = u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
182  glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
183  = u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
184  }
185  }
186 #endif
187 #if SU3_RECONSTRUCT_S == 18
188  for (c1 = 0; c1 < NCOL; ++c1) {
189  for (c2 = 0; c2 < NCOL; ++c2) {
190  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
191  = (float)u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
192  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
193  = (float)u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
194  }
195  }
196 #elif SU3_RECONSTRUCT_S == 12
197  for (c1 = 0; c1 < NCOL; ++c1) {
198  for (c2 = 0; c2 < NCOL - 1; ++c2) {
199  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
200  = (float)u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
201  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
202  = (float)u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
203  }
204  }
205 #endif
206  }
207  }
208  }
209  }
210  }
211 }
212 
213 
214 void qws_loadfd_bridge_(scd_t *out, const double *in)
215 {
216  int x, y, z, t, j[4], mu, c, s, eo_offset;
217  eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
218  printf("hoge: %s: eo_offset=%d: (px,py,pz,pt)=(%d,%d,%d,%d): (nx,ny,nz,nt)=((%d,%d,%d,%d), VLENXD=%d\n",
219  __func__, eo_offset, px, py, pz, pt, nx, ny, nz, nt, VLENXD);
220  for (t = 0; t < nt; t++) {
221  for (z = 0; z < nz; z++) {
222  for (y = 0; y < ny; y++) {
223  for (x = 0; x < nx; x++) {
224  j[0] = x;
225  j[1] = y;
226  j[2] = z;
227  j[3] = t;
228 
229  // index in bridge (lexical)
230  int ikd = std_xyzt2kd_(j);
231  int ivd = std_xyzt2ivd_(j);
232 
233  // index in qws (even-odd)
234  int eo = e_o_(j, eo_offset);
235  if (eo == 0) {
236  int iwsd = x / 2 / VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t;
237  int ixxd = (x / 2) % VLEND;
238  // printf("hoge: (x,y,z,t)=(%d,%d,%d,%d): ivd=%d, iwsd=%d, ixxd=%d, ikd=%d\n",
239  // x,y,z,t, ivd, iwsd, ixxd, ikd);
240 
241  for (c = 0; c < NCOL; c++) {
242  for (s = 0; s < 4; s++) {
243  out[iwsd].c[c][s][0][ixxd]
244  = in[VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd];
245  out[iwsd].c[c][s][1][ixxd]
246  = in[VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd];
247  } //s
248  } // c
249  }
250  }
251  }
252  }
253  }
254 }
255 
256 
257 void qws_loadfs_bridge_(scs_t *out, const float *in)
258 {
259  int x, y, z, t, j[4], mu, c, s, eo_offset;
260  eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
261 
262  for (t = 0; t < nt; t++) {
263  for (z = 0; z < nz; z++) {
264  for (y = 0; y < ny; y++) {
265  for (x = 0; x < nx; x++) {
266  j[0] = x;
267  j[1] = y;
268  j[2] = z;
269  j[3] = t;
270 
271  // index in bridge (lexical)
272  int iks = std_xyzt2ks_(j);
273  int ivs = std_xyzt2ivs_(j);
274 
275  // index in qws (even-odd)
276  int eo = e_o_(j, eo_offset);
277  if (eo == 0) {
278  int iwss = x / 2 / VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t;
279  int ixxs = (x / 2) % VLENS;
280 
281  for (c = 0; c < NCOL; c++) {
282  for (s = 0; s < 4; s++) {
283  out[iwss].c[c][s][0][ixxs] = in[VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks];
284  out[iwss].c[c][s][1][ixxs] = in[VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks];
285  } //s
286  } // c
287  } // eo==0
288  }
289  }
290  }
291  }
292 }
293 
294 
295 void qws_storefd_bridge_(double *out, const scd_t *in)
296 {
297  int x, y, z, t, j[4], mu, c, s, eo_offset;
298  eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
299 
300  for (t = 0; t < nt; t++) {
301  for (z = 0; z < nz; z++) {
302  for (y = 0; y < ny; y++) {
303  for (x = 0; x < nx; x++) {
304  j[0] = x;
305  j[1] = y;
306  j[2] = z;
307  j[3] = t;
308 
309  // index in bridge (lexical)
310  int ikd = std_xyzt2kd_(j);
311  int ivd = std_xyzt2ivd_(j);
312 
313  // index in qws (even-odd)
314  int eo = e_o_(j, eo_offset);
315  if (eo == 0) {
316  int iwsd = x / 2 / VLEND + nxd * y + nxd * ny * z + nxd * ny * nz * t;
317  int ixxd = (x / 2) % VLEND;
318 
319  for (c = 0; c < NCOL; c++) {
320  for (s = 0; s < 4; s++) {
321  out[VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd]
322  = in[iwsd].c[c][s][0][ixxd];
323  out[VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd]
324  = in[iwsd].c[c][s][1][ixxd];
325  } //s
326  } // c
327  } // eo==0
328  }
329  }
330  }
331  }
332 }
333 
334 
335 void qws_storefs_bridge_(float *out, const scs_t *in)
336 {
337  int x, y, z, t, j[4], mu, c, s, eo_offset;
338  eo_offset = (px * nx + py * ny + pz * nz + pt * nt) % 2;
339 
340  for (t = 0; t < nt; t++) {
341  for (z = 0; z < nz; z++) {
342  for (y = 0; y < ny; y++) {
343  for (x = 0; x < nx; x++) {
344  j[0] = x;
345  j[1] = y;
346  j[2] = z;
347  j[3] = t;
348 
349  // index in bridge (lexical)
350  int iks = std_xyzt2ks_(j);
351  int ivs = std_xyzt2ivs_(j);
352 
353  // index in qws (even-odd)
354  int eo = e_o_(j, eo_offset);
355  if (eo == 0) {
356  int iwss = x / 2 / VLENS + nxs * y + nxs * ny * z + nxs * ny * nz * t;
357  int ixxs = (x / 2) % VLENS;
358 
359  for (c = 0; c < NCOL; c++) {
360  for (s = 0; s < 4; s++) {
361  out[VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks]
362  = in[iwss].c[c][s][0][ixxs];
363  out[VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks]
364  = in[iwss].c[c][s][1][ixxs];
365  } //s
366  } // c
367  } // eo==0
368  }
369  }
370  }
371  }
372 }
373 
374 
375 void qws_loadg_dd_bridge_(const double *u, const double *in_kappa)
376 {
377  int x, y, z, t, mu, c1, c2;
378  kappa = *in_kappa;
379  mkappa = -kappa;
380  kappa2 = kappa * kappa;
381  int nvy = ny / VLENYD;
382  int nyztv = nt * nz * nvy;
383  int ith, nth, is, ns;
384  set_threadtask(ith, nth, is, ns, nyztv);
385 #pragma omp barrier
386  for (int yztv = is; yztv < ns; yztv++) {
387  int vy = yztv % nvy;
388  int zt = yztv / nvy;
389  for (int ky = 0; ky < VLENYD; ky++) {
390  y = ky + VLENYD * vy;
391  for (x = 0; x < nx; x++) {
392  // index in bridge (lexical)
393  int ikd = std_xky2kd_(x, ky);
394  int ivd = std_xvyzt2ivd_(x, vy, zt);
395 
396  // index in qws (domain decomposed)
397  int iwsd = (x % nxh) / VLEND + nxd * y + nxd * ny * zt + NDIM * vold * (x / nxh);
398  int ixxd = (x % nxh) % VLEND;
399  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + NDIM * vols * (x / nxh);
400  int ixxs = (x % nxh) % VLENS;
401  for (mu = 0; mu < NDIM; mu++) {
402 #if SU3_RECONSTRUCT_D == 18
403  for (c1 = 0; c1 < NCOL; c1++) {
404  for (c2 = 0; c2 < NCOL; c2++) {
405  double val = u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
406  glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
407  = u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
408  glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
409  = u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
410  }
411  }
412 #elif SU3_RECONSTRUCT_D == 12
413  for (c1 = 0; c1 < NCOL; c1++) {
414  for (c2 = 0; c2 < NCOL - 1; c2++) {
415  glud[iwsd + vold * mu].c[c2][c1][0][ixxd]
416  = u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
417  glud[iwsd + vold * mu].c[c2][c1][1][ixxd]
418  = u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
419  }
420  }
421 #endif
422 #if SU3_RECONSTRUCT_S == 18
423  for (c1 = 0; c1 < NCOL; c1++) {
424  for (c2 = 0; c2 < NCOL; c2++) {
425  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
426  = (float)u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
427  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
428  = (float)u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
429  }
430  }
431 #elif SU3_RECONSTRUCT_S == 12
432  for (c1 = 0; c1 < NCOL; c1++) {
433  for (c2 = 0; c2 < NCOL - 1; c2++) {
434  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
435  = (float)u[ikd + VLEND * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
436  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
437  = (float)u[ikd + VLEND * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vold * 2))];
438  }
439  }
440 #endif
441  } //mu
442  } //x
443  } // ky
444  } // yvtv
445 #pragma omp barrier
446 }
447 
448 
449 void qws_loadgs_dd_bridge_(const float *u, const double *in_kappa)
450 {
451  int x, y, z, t, mu, c1, c2;
452  kappa = *in_kappa;
453  mkappa = -kappa;
454  kappa2 = kappa * kappa;
455  int nvy = ny / VLENYS;
456  int nyztv = nt * nz * nvy;
457  int ith, nth, is, ns;
458  set_threadtask(ith, nth, is, ns, nyztv);
459 #pragma omp barrier
460  for (int yztv = is; yztv < ns; yztv++) {
461  int vy = yztv % nvy;
462  int zt = yztv / nvy;
463  for (int ky = 0; ky < VLENYS; ky++) {
464  y = ky + VLENYS * vy;
465  for (x = 0; x < nx; x++) {
466  // index in bridge (lexical)
467  int iks = std_xky2ks_(x, ky);
468  int ivs = std_xvyzt2ivs_(x, vy, zt);
469 
470  // index in qws (domain decomposed)
471  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + NDIM * vols * (x / nxh);
472  int ixxs = (x % nxh) % VLENS;
473  for (mu = 0; mu < NDIM; mu++) {
474 #if SU3_RECONSTRUCT_S == 18
475  for (c1 = 0; c1 < NCOL; c1++) {
476  for (c2 = 0; c2 < NCOL; c2++) {
477  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
478  = u[iks + VLENS * (0 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
479  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
480  = u[iks + VLENS * (1 + 2 * c1 + 6 * c2 + 18 * (ivs + mu * vols * 2))];
481  }
482  }
483 #elif SU3_RECONSTRUCT_S == 12
484  for (c1 = 0; c1 < NCOL; c1++) {
485  for (c2 = 0; c2 < NCOL - 1; c2++) {
486  glus[iwss + vols * mu].c[c2][c1][0][ixxs]
487  = u[iws + VLENS * (0 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vols * 2))];
488  glus[iwss + vols * mu].c[c2][c1][1][ixxs]
489  = u[iws + VLENS * (1 + 2 * c1 + 6 * c2 + 18 * (ivd + mu * vols * 2))];
490  }
491  }
492 #endif
493  } // mu
494  } // x
495  } // ky
496  } // yztv
497 #pragma omp barrier
498 }
499 
500 
501 void qws_loadc_dd_bridge_(const double *c)
502 {
503  // notation in qws
504  // c[2][36] 2 x (6x6 Hermitian matrix)
505  // c[ud][0], c[ud][1],..., c[ud][5]: daigonal elements (real): C_ii
506  // (c[ud][6], c[ud][7]),..., (c[ud][14],c[ud][15]) C01, C02, C03, C04, C05
507  // (c[ud][16], c[ud][17]),..., (c[ud][22],c[ud][23]) C12, C13, C14, C15
508  // (c[ud][24], c[ud][25]),..., (c[ud][28],c[ud][29]) C23, C24, C25
509  // (c[ud][30], c[ud][31]),(c[ud][32],c[ud][33]) C34, C35
510  // (c[ud][34], c[ud][35] ) C45
511  // (spin, color) --> 3spin +color = 0,1,...,5
512  // IMPORTANT:
513  // the stored colver term is in the Chiral representation while the fermion field
514  // is in the Dirac represenation. The conversion of represenation of the fermion
515  // feild requires a factor 1/sqrt(2) for each of Dirac --> Chiral and Chiral --> Dirac,
516  // i.e., one needs to mulitply a factor 1/2. This 1/2 multiplication is moved to
517  // the Clover term.
518  // M^dag Clov[Chiral] M psi[Dirac]
519  // = ( sqrt(2) M^dag ) Clov[Chial]/2 (sqrt(2) M) psi[Dirac]
520  // and we store Clov[Chiral]/2
521  //
522  // 0 6 8 10 12 14
523  // 1 16 18 20 22
524  // 2 24 26 28
525  // 3 30 32
526  // 4 34
527  // 5
528 
529  int x, y, ky, yztv, ud;
530  int nvy = ny / VLENYD;
531  int nyztv = nt * nz * nvy;
532  int ith, nth, is, ns;
533  set_threadtask(ith, nth, is, ns, nyztv);
534 #pragma omp barrier
535  for (yztv = is; yztv < ns; yztv++) {
536  int vy = yztv % nvy;
537  int zt = yztv / nvy;
538  for (ky = 0; ky < VLENYD; ky++) {
539  y = ky + VLENYD * vy;
540  for (x = 0; x < nx; x++) {
541  // index in bridge (lexical)
542  int ikd = std_xky2kd_(x, ky);
543  int ivd = std_xvyzt2ivd_(x, vy, zt);
544 
545  int iwsd = (x % nxh) / VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
546  int ixxd = (x % nxh) % VLEND;
547  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
548  int ixxs = (x % nxh) % VLENS;
549 
550  for (ud = 0; ud < 2; ud++) {
551  int offset = 72 * ivd + 36 * ud;
552  for (int ij = 0; ij < 36; ij++) {
553  clvd[iwsd].c[ud][ij][ixxd] = c[VLEND * (ij + offset) + ikd];
554  }
555 
556  // single
557  for (int ij = 0; ij < 36; ij++) {
558  clvs[iwss].c[ud][ij][ixxs] = (float)c[VLEND * (ij + offset) + ikd];
559  }
560  } // ud
561  }
562  }
563  }
564 }
565 
566 
567 void qws_loadcs_dd_bridge_(const float *c)
568 {
569  // notation in qws
570  // c[2][36] 2 x (6x6 Hermitian matrix)
571  // c[ud][0], c[ud][1],..., c[ud][5]: daigonal elements (real): C_ii
572  // (c[ud][6], c[ud][7]),..., (c[ud][14],c[ud][15]) C01, C02, C03, C04, C05
573  // (c[ud][16], c[ud][17]),..., (c[ud][22],c[ud][23]) C12, C13, C14, C15
574  // (c[ud][24], c[ud][25]),..., (c[ud][28],c[ud][29]) C23, C24, C25
575  // (c[ud][30], c[ud][31]),(c[ud][32],c[ud][33]) C34, C35
576  // (c[ud][34], c[ud][35] ) C45
577 
578  // a naive 6x6 notation as real varibles
579  // 0 2 4 6 8 10
580  // 12 14 16 18 20 22
581  // 24 26 28 30 32 34
582  // 36 38 40 42 44 46
583  // 48 50 52 52 56 58
584  // 60 62 64 66 68 70
585 
586  int x, y, ky, yztv, ud;
587  int nvy = ny / VLENYS;
588  int nyztv = nt * nz * nvy;
589  int ith, nth, is, ns;
590  set_threadtask(ith, nth, is, ns, nyztv);
591 #pragma omp barrier
592  for (yztv = is; yztv < ns; yztv++) {
593  int vy = yztv % nvy;
594  int zt = yztv / nvy;
595  for (ky = 0; ky < VLENYS; ky++) {
596  y = ky + VLENYS * vy;
597  for (x = 0; x < nx; x++) {
598  // index in bridge (lexical)
599  int iks = std_xky2ks_(x, ky);
600  int ivs = std_xvyzt2ivs_(x, vy, zt);
601 
602  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
603  int ixxs = (x % nxh) % VLENS;
604 
605  for (ud = 0; ud < 2; ud++) {
606  int offset = 72 * ivs + 36 * ud;
607  for (int ij = 0; ij < 36; ij++) {
608  clvs[iwss].c[ud][ij][ixxs] = c[VLENS * (ij + offset) + iks];
609  }
610  } // ud
611  }
612  }
613  }
614 }
615 
616 
617 void qws_loadfd_dd_bridge_(scd_t *out, const double *in)
618 {
619  int x, y, z, t, s, c;
620  int nvy = ny / VLENYD;
621  int nyztv = nt * nz * nvy;
622  int ith, nth, is, ns;
623  set_threadtask(ith, nth, is, ns, nyztv);
624 #pragma omp barrier
625  for (int yztv = is; yztv < ns; yztv++) {
626  int vy = yztv % nvy;
627  int zt = yztv / nvy;
628  for (int ky = 0; ky < VLENYD; ky++) {
629  y = ky + VLENYD * vy;
630  for (x = 0; x < nx; x++) {
631  // index in bridge (lexical)
632  int ikd = std_xky2kd_(x, ky);
633  int ivd = std_xvyzt2ivd_(x, vy, zt);
634 
635 
636  // index in qws (domain decomposed)
637  int iwsd = (x % nxh) / VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
638  int ixxd = (x % nxh) % VLEND;
639  for (c = 0; c < NCOL; c++) {
640  for (s = 0; s < 4; s++) {
641  out[iwsd].c[c][s][0][ixxd]
642  = in[VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd];
643  out[iwsd].c[c][s][1][ixxd]
644  = in[VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd];
645  } //s
646  } // c
647  } //x
648  } // ky
649  } // yztv
650 #pragma omp barrier
651 }
652 
653 
654 void qws_storefd_dd_bridge_(double *out, const scd_t *in)
655 {
656  int x, y, z, t, s, c;
657  int nvy = ny / VLENYD;
658  int nyztv = nt * nz * nvy;
659  int ith, nth, is, ns;
660  set_threadtask(ith, nth, is, ns, nyztv);
661 #pragma omp barrier
662  for (int yztv = is; yztv < ns; yztv++) {
663  int vy = yztv % nvy;
664  int zt = yztv / nvy;
665  for (int ky = 0; ky < VLENYD; ky++) {
666  y = ky + VLENYD * vy;
667  for (x = 0; x < nx; x++) {
668  // index in bridge (lexical)
669  int ikd = std_xky2kd_(x, ky);
670  int ivd = std_xvyzt2ivd_(x, vy, zt);
671 
672  // index in qws (domain decomposed)
673  int iwsd = (x % nxh) / VLEND + nxd * y + nxd * ny * zt + vold * (x / nxh);
674  int ixxd = (x % nxh) % VLEND;
675  for (c = 0; c < NCOL; c++) {
676  for (s = 0; s < 4; s++) {
677  out[VLEND * (0 + 2 * s + 8 * c + 24 * ivd) + ikd] = in[iwsd].c[c][s][0][ixxd];
678  out[VLEND * (1 + 2 * s + 8 * c + 24 * ivd) + ikd] = in[iwsd].c[c][s][1][ixxd];
679  } //s
680  } // c
681  } // x
682  } // ky
683  } // yztv
684 #pragma omp barrier
685 }
686 
687 
688 void qws_loadfs_dd_bridge_(scs_t *out, const float *in)
689 {
690  int x, y, z, t, s, c;
691  int nvy = ny / VLENYS;
692  int nyztv = nt * nz * nvy;
693  int ith, nth, is, ns;
694  set_threadtask(ith, nth, is, ns, nyztv);
695 #pragma omp barrier
696  for (int yztv = is; yztv < ns; yztv++) {
697  int vy = yztv % nvy;
698  int zt = yztv / nvy;
699  for (int ky = 0; ky < VLENYS; ky++) {
700  y = ky + VLENYS * vy;
701  for (x = 0; x < nx; x++) {
702  // index in bridge (lexical)
703  int iks = std_xky2ks_(x, ky);
704  int ivs = std_xvyzt2ivs_(x, vy, zt);
705 
706  // index in qws (domain decomposed)
707  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
708  int ixxs = (x % nxh) % VLENS;
709  for (c = 0; c < NCOL; c++) {
710  for (s = 0; s < 4; s++) {
711  out[iwss].c[c][s][0][ixxs]
712  = in[VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks];
713  out[iwss].c[c][s][1][ixxs]
714  = in[VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks];
715  } //s
716  } // c
717  }
718  }
719  } // x,ky,yztv
720 #pragma omp barrier
721 }
722 
723 
724 void qws_storefs_dd_bridge_(float *out, const scs_t *in)
725 {
726  int x, y, z, t, s, c;
727  int nvy = ny / VLENYS;
728  int nyztv = nt * nz * nvy;
729  int ith, nth, is, ns;
730  set_threadtask(ith, nth, is, ns, nyztv);
731 #pragma omp barrier
732  for (int yztv = is; yztv < ns; yztv++) {
733  int vy = yztv % nvy;
734  int zt = yztv / nvy;
735  for (int ky = 0; ky < VLENYS; ky++) {
736  y = ky + VLENYS * vy;
737  for (x = 0; x < nx; x++) {
738  // index in bridge (lexical)
739  int iks = std_xky2ks_(x, ky);
740  int ivs = std_xvyzt2ivs_(x, vy, zt);
741 
742  // index in qws (domain decomposed)
743  int iwss = (x % nxh) / VLENS + nxs * y + nxs * ny * zt + vols * (x / nxh);
744  int ixxs = (x % nxh) % VLENS;
745  for (c = 0; c < NCOL; c++) {
746  for (s = 0; s < 4; s++) {
747  out[VLENS * (0 + 2 * s + 8 * c + 24 * ivs) + iks] = in[iwss].c[c][s][0][ixxs];
748  out[VLENS * (1 + 2 * s + 8 * c + 24 * ivs) + iks] = in[iwss].c[c][s][1][ixxs];
749  } //s
750  } // c
751  }
752  }
753  } // x,ky,yztv
754 #pragma omp barrier
755 }
756 
757 
758 //---------------------------------------------------------------------------------------- for DD solver
759 void ddd_out_pre_d_(scd_t *in, int *domain);
760 void ddd_out_pos_d_(scd_t *out, scd_t *in, int *domain);
761 void ddd_in_d_(scd_t *out, scd_t *in, int *domain);
762 
763 void ddd_out_pre_s_(scs_t *in, int *domain);
764 void ddd_out_pre_s_noprl_(scs_t *in, int *domain);
765 void ddd_out_pre_s_no_timer_(scs_t *in, int *domain);
766 void ddd_out_pre_s_noprl_no_timer_(scs_t *in, int *domain);
767 void ddd_out_pos_s_(scs_t *out, scs_t *in, int *domain, float factor);
768 void ddd_out_pos_s_no_timer_(scs_t *out, scs_t *in, int *domain, float factor);
769 void ddd_in_s_(scs_t *out, scs_t *in, int *domain);
770 void ddd_out_pos_s_noprl_(scs_t *out, scs_t *in, int *domain, float factor);
771 void ddd_out_pos_s_noprl_no_timer_(scs_t *out, scs_t *in, int *domain, float factor);
772 void ddd_in_s_noprl(scs_t *out, scs_t *in, int *domain);
773 void jinv_ddd_in_s_noprl_(scs_t *x, scs_t *b, int *domain, int *maxiter);
774 
775 //----------------------------------------------------------------------------------------
776 
777 /*
778 void ddd_d_(scd_t* out, scd_t* in){
779  printf("hoge: %s: kappa=%f\n", __func__,kappa); fflush(stdout);
780  ddd_out_pre_d_(in, &domain0);
781  ddd_in_d_( &out[vold*0], &in[vold*0], &domain0);
782  ddd_out_pos_d_(&out[vold*0], &in[vold*0], &domain0);
783  ddd_out_pre_d_(in, &domain1);
784  ddd_in_d_( &out[vold*1], &in[vold*1], &domain1);
785  ddd_out_pos_d_(&out[vold*1], &in[vold*0], &domain1);
786 }
787 */
788 //----------------------------------------------------------------------------------------
789 void ddd_eo_s_noprl_(scs_t *out, scs_t *in, int *domain)
790 {
791  ddd_out_pre_s_noprl_no_timer_(in, domain);
792  ddd_in_s_noprl(&out[vols * (*domain)], &in[vols * (*domain)], domain);
793  ddd_out_pos_s_noprl_no_timer_(&out[vols * (*domain)], in, domain, (float)mkappa);
794 }
795 
796 
797 //----------------------------------------------------------------------------------------
798 void ddd_s_noprl_(scs_t *out, scs_t *in)
799 {
800  ddd_eo_s_noprl_(out, in, &domain_e);
801  ddd_eo_s_noprl_(out, in, &domain_o);
802 }
803 
804 
805 // inline functions defiend in qws.c
806 extern void copy_vols2_s_noprl_(scs_t *out, scs_t *in);
807 extern void zero_vols_s_noprl_(scs_t *out);
808 extern void accum_add_vols_s_noprl_(scs_t *out, scs_t *in);
809 extern void accum_sub_vols_s_noprl_(scs_t *out, scs_t *in);
810 extern void accum_addsub_vols_s_noprl_(scs_t *out, scs_t *in1, scs_t *in2);
811 
812 //----------------------------------------------------------------------------------------
813 void prec_s_noprl_(scs_t *out, scs_t *in, int *nsap, int *nm, scs_t *s, scs_t *q)
814 {
815 #ifdef COMPILE_TIME_DIM_SIZE
816  const int vols = VOLS;
817 #else
818  const int vols = ::vols;
819 #endif
820  float kappa = ::kappa;
821 
822  {
823  copy_vols2_s_noprl_(s, in);
824  if ((npe[1] == 1) || (npe[2] == 1) || (npe[3] == 1)) zero_vols_s_noprl_(&q[vols * domain_o]);
825  for (int isap = 0; isap < *nsap; isap++) {
826  jinv_ddd_in_s_noprl_(&q[vols * domain_e], &s[vols * domain_e], &domain_e, nm);
827  ddd_out_pre_s_noprl_no_timer_(q, &domain_o);
828  ddd_in_s_noprl(&q[vols * domain_o], &q[vols * domain_e], &domain_e);
829  accum_addsub_vols_s_noprl_(&s[vols * domain_e], &in[vols * domain_e], &q[vols * domain_o]);
830  ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_o], q, &domain_o, (float)kappa);
831  jinv_ddd_in_s_noprl_(&q[vols * domain_o], &s[vols * domain_o], &domain_o, nm);
832  ddd_out_pre_s_noprl_no_timer_(q, &domain_e);
833  ddd_in_s_noprl(&q[vols * domain_e], &q[vols * domain_o], &domain_o);
834  accum_addsub_vols_s_noprl_(&s[vols * domain_o], &in[vols * domain_o], &q[vols * domain_e]);
835  ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_e], q, &domain_e, (float)kappa);
836  }
837  if ((npe[1] == 1) || (npe[2] == 1) || (npe[3] == 1)) zero_vols_s_noprl_(&out[vols * domain_o]);
838  jinv_ddd_in_s_noprl_(&out[vols * domain_e], &s[vols * domain_e], &domain_e, nm);
839  ddd_out_pre_s_noprl_no_timer_(out, &domain_o);
840  ddd_out_pos_s_noprl_no_timer_(&s[vols * domain_o], out, &domain_o, (float)kappa);
841  jinv_ddd_in_s_noprl_(&out[vols * domain_o], &s[vols * domain_o], &domain_o, nm);
842  }
843 }
844 
845 
846 //---------------------------------------------------------------------
847 void clv_s_dd_(int *deo, scs_t *inout)
848 {
849  __attribute__((aligned(64))) scs_t tmp;
850  int x, y, z, t;
851 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
852  for (t = 0; t < nt; t++) {
853  for (z = 0; z < nz; z++) {
854  for (y = 0; y < ny; y++) {
855  for (x = 0; x < nxs; x++) {
856  int i0 = x + nxs * y + nxs * ny * z + nxs * ny * nz * t + vols * (*deo);
857  __load_sc_s(tmp.c, inout[i0].c);
858  __mult_clvs(tmp.cv, clvs[i0].cv);
859  __store_sc_s(inout[i0].c, tmp.c);
860  }
861  }
862  }
863  }
864 }
865 
866 
867 void clv_matvec_s_dd_(int *deo, scs_t *out, clvs_t *clv, scs_t *in)
868 {
869  __attribute__((aligned(64))) scs_t tmp;
870  int x, y, z, t;
871 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
872  for (t = 0; t < nt; t++) {
873  for (z = 0; z < nz; z++) {
874  for (y = 0; y < ny; y++) {
875  for (x = 0; x < nxs; x++) {
876  int i0 = x + nxs * y + nxs * ny * z + nxs * ny * nz * t + vols * (*deo);
877  __load_sc_s(tmp.c, in[i0].c);
878  __mult_clvs(tmp.cv, clv[i0].cv);
879  __store_sc_s(out[i0].c, tmp.c);
880  }
881  }
882  }
883  }
884 }
885 
886 
887 //--------------------------------------------------------------------
888 void clv_d_dd_(int *deo, scd_t *inout)
889 {
890  __attribute__((aligned(64))) scd_t tmp;
891  int x, y, z, t;
892 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
893  for (t = 0; t < nt; t++) {
894  for (z = 0; z < nz; z++) {
895  for (y = 0; y < ny; y++) {
896  for (x = 0; x < nxd; x++) {
897  int i0 = x + nxd * y + nxd * ny * z + nxd * ny * nz * t + vold * (*deo);
898  __load_sc(tmp.c, inout[i0].c);
899  __mult_clvd(tmp.cv, clvd[i0].cv);
900  __store_sc(inout[i0].c, tmp.c);
901  }
902  }
903  }
904  }
905 }
906 
907 
908 void clv_matvec_d_dd_(int *deo, scd_t *out, clvd_t *clv, scd_t *in)
909 {
910  __attribute__((aligned(64))) scd_t tmp;
911  int x, y, z, t;
912 #pragma omp for private(x,y,z,t,tmp) collapse(3) schedule(static)
913  for (t = 0; t < nt; t++) {
914  for (z = 0; z < nz; z++) {
915  for (y = 0; y < ny; y++) {
916  for (x = 0; x < nxd; x++) {
917  int i0 = x + nxd * y + nxd * ny * z + nxd * ny * nz * t + vold * (*deo);
918  __load_sc(tmp.c, in[i0].c);
919  __mult_clvd(tmp.cv, clv[i0].cv);
920  __store_sc(out[i0].c, tmp.c);
921  }
922  }
923  }
924  }
925 }
926 
927 
928 #ifdef __cplusplus
929 } // extern "C"
930 #endif
931 
932 #endif
933 //==============================================================END===
VLEND
#define VLEND
Definition: define_vlen.h:42
VLENYS
#define VLENYS
Definition: define_vlen.h:27
VLENS
#define VLENS
Definition: define_vlen.h:41
NCOL
#define NCOL
Definition: field_G_imp_SU2-inc.h:2
threadManager.h
VLENXD
#define VLENXD
Definition: define_vlen.h:23
VLENYD
#define VLENYD
Definition: define_vlen.h:24
NDIM
#define NDIM
Definition: contract_4spinor.cpp:18
VLENXS
#define VLENXS
Definition: define_vlen.h:26