Bridge++  Ver. 2.0.2
mult_Domainwall_5din_eo_qxs-inc.h
Go to the documentation of this file.
1 
10 /*
11  Copyright
12  Bridge++ project and RIKEN (2022)
13 
14  Licence: GPL
15  see README.txt and LICENSE for more details
16 */
17 
18 #ifndef MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED
19 #define MULT_DOMAINWALL_5DIN_EO_QXS_INCLUDED
20 
21 #include "mult_common_th-inc.h"
22 #include "prefetch.h"
23 //====================================================================
/*
 * NOTE(review): the line holding the return type and the function name is
 * missing from this listing; only the parameter list and the body are visible.
 *
 * For each site in the range handed to the calling thread by set_threadtask(),
 * and for each (is, ic) pair, the body
 *   1. loads eight vectors from wp at VLEN * (offset + 0..7),
 *   2. scales all of them by bb = -0.5 * b[is],
 *   3. hands them to add_aPm5_dirac_vec() together with the coefficient Fup
 *      and the slice index is_up = (is + 1) % Ns of the same site of wp,
 *   4. hands them to add_aPp5_dirac_vec() together with the coefficient Fdn
 *      and the slice index is_dn = (is - 1 + Ns) % Ns,
 *   5. stores the eight vectors to yp at the same offsets.
 * What the two add_aP*5 helpers compute is defined elsewhere.
 *
 * Parameters, as used by the visible body:
 *   yp               output array, written at Nin5 * site + VLEN * (offset + k)
 *   wp               input array, read with the same indexing
 *   mq               Fup is multiplied by -mq when is == Ns - 1,
 *                    Fdn is multiplied by -mq when is == 0
 *   M0, bc, do_comm  not referenced in the body
 *   Ns               number of slices the `is` loop runs over
 *                    (presumably the fifth-dimension extent -- TODO confirm)
 *   b, c             coefficient arrays indexed by is
 *   Nsize            four extents; their product is the number of sites that
 *                    set_threadtask() distributes
 */
25  real_t *__restrict yp,
26  real_t *__restrict wp,
27  real_t mq, real_t M0, int Ns, int *bc,
28  real_t *__restrict b,
29  real_t *__restrict c,
30  int *Nsize, int *do_comm)
31 {
32  int Nx2v = Nsize[0];
33  int Ny = Nsize[1];
34  int Nz = Nsize[2];
35  int Nt = Nsize[3];
36  int Nst2v = Nx2v * Ny * Nz * Nt;
37 
  // Nin4: reals per site for one slice; Nin5: reals per site for all Ns slices.
38  int Nin4 = VLEN * NVCD;
39  int Nin5 = Nin4 * Ns;
40 
  // set_threadtask() fills ith/nth/site0/site1; only [site0, site1) is used below.
41  int ith, nth, site0, site1;
42  set_threadtask(ith, nth, site0, site1, Nst2v);
43 
44  for (int site = site0; site < site1; ++site) {
45  real_t *wp2 = &wp[Nin5 * site];
46  real_t *yp2 = &yp[Nin5 * site];
47 
48  svbool_t pg = set_predicate();
49  for (int is = 0; is < Ns; ++is) {
50  for (int ic = 0; ic < NC; ++ic) {
51  svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
  // Eight consecutive vectors per ic inside slice `is`.
52  int offset = 2 * ND * ic + NVCD * is;
53  real_t factor = -0.5;
54  real_t bb = b[is] * factor;
55 
56  load_vec(pg, vt1r, &wp2[VLEN * (offset + 0)]);
57  load_vec(pg, vt1i, &wp2[VLEN * (offset + 1)]);
58  load_vec(pg, vt2r, &wp2[VLEN * (offset + 2)]);
59  load_vec(pg, vt2i, &wp2[VLEN * (offset + 3)]);
60  load_vec(pg, vt3r, &wp2[VLEN * (offset + 4)]);
61  load_vec(pg, vt3i, &wp2[VLEN * (offset + 5)]);
62  load_vec(pg, vt4r, &wp2[VLEN * (offset + 6)]);
63  load_vec(pg, vt4i, &wp2[VLEN * (offset + 7)]);
64  scal_vec(pg, vt1r, bb);
65  scal_vec(pg, vt1i, bb);
66  scal_vec(pg, vt2r, bb);
67  scal_vec(pg, vt2i, bb);
68  scal_vec(pg, vt3r, bb);
69  scal_vec(pg, vt3i, bb);
70  scal_vec(pg, vt4r, bb);
71  scal_vec(pg, vt4i, bb);
72 
  // Cyclic "up" neighbour in is; the wrap-around slice picks up the factor -mq.
73  int is_up = (is + 1) % Ns;
74  real_t Fup = 0.5 * c[is] * factor;
75  if (is == Ns - 1) Fup *= -mq;
76  add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
77  vt3r, vt3i, vt4r, vt4i,
78  Fup, wp2, is_up, ic);
79 
  // Cyclic "down" neighbour in is; the wrap-around slice picks up the factor -mq.
80  int is_dn = (is - 1 + Ns) % Ns;
81  real_t Fdn = 0.5 * c[is] * factor;
82  if (is == 0) Fdn *= -mq;
83  add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
84  vt3r, vt3i, vt4r, vt4i,
85  Fdn, wp2, is_dn, ic);
86  save_vec(pg, &yp2[VLEN * (offset + 0)], vt1r);
87  save_vec(pg, &yp2[VLEN * (offset + 1)], vt1i);
88  save_vec(pg, &yp2[VLEN * (offset + 2)], vt2r);
89  save_vec(pg, &yp2[VLEN * (offset + 3)], vt2i);
90  save_vec(pg, &yp2[VLEN * (offset + 4)], vt3r);
91  save_vec(pg, &yp2[VLEN * (offset + 5)], vt3i);
92  save_vec(pg, &yp2[VLEN * (offset + 6)], vt4r);
93  save_vec(pg, &yp2[VLEN * (offset + 7)], vt4i);
94  }
95  } // is
96  } // site
97 }
98 
99 
100 //====================================================================
/*
 * NOTE(review): the line holding the return type and the function name is
 * missing from this listing; only the parameter list and the body are visible.
 *
 * For each site in the calling thread's range and each (is, ic) pair, calls
 * load_mult_gm5_dirac_vec() on the eight vectors starting at
 * wp[Nin5 * site + VLEN * offset] and stores the eight resulting vectors to vp
 * at offsets offset + 0..7. The helper is defined elsewhere; judging by its
 * name it applies gamma_5 while loading -- TODO confirm.
 *
 * Parameters:
 *   vp     output array
 *   wp     input array
 *   Ns     number of slices the `is` loop runs over
 *   Nsize  four extents; their product is the number of sites distributed
 *          by set_threadtask()
 */
102  real_t *__restrict vp,
103  real_t *__restrict wp,
104  int Ns, int *Nsize)
105 {
106  int Nxv = Nsize[0];
107  int Ny = Nsize[1];
108  int Nz = Nsize[2];
109  int Nt = Nsize[3];
110  int Nstv = Nxv * Ny * Nz * Nt;
111 
112  int Nin4 = VLEN * NVCD;
113  int Nin5 = Nin4 * Ns;
114 
115  int ith, nth, site0, site1;
116  set_threadtask(ith, nth, site0, site1, Nstv);
117 
118  for (int site = site0; site < site1; ++site) {
119  real_t *vp2 = &vp[Nin5 * site];
120  real_t *wp2 = &wp[Nin5 * site];
121  svbool_t pg = set_predicate();
122  for (int is = 0; is < Ns; ++is) {
123  for (int ic = 0; ic < NC; ++ic) {
124  svreal_t vt1r, vt1i, vt2r, vt2i;
125  svreal_t vt3r, vt3i, vt4r, vt4i;
  // Same (ic, is) -> offset mapping as the other site-local kernels in this file.
126  int offset = 2 * ND * ic + NVCD * is;
127 
128  load_mult_gm5_dirac_vec(pg, vt1r, vt1i, vt2r, vt2i,
129  vt3r, vt3i, vt4r, vt4i, &wp2[VLEN * offset]);
130  save_vec(pg, &vp2[VLEN * (offset + 0)], vt1r);
131  save_vec(pg, &vp2[VLEN * (offset + 1)], vt1i);
132  save_vec(pg, &vp2[VLEN * (offset + 2)], vt2r);
133  save_vec(pg, &vp2[VLEN * (offset + 3)], vt2i);
134  save_vec(pg, &vp2[VLEN * (offset + 4)], vt3r);
135  save_vec(pg, &vp2[VLEN * (offset + 5)], vt3i);
136  save_vec(pg, &vp2[VLEN * (offset + 6)], vt4r);
137  save_vec(pg, &vp2[VLEN * (offset + 7)], vt4i);
138  }
139  }
140  }
141 }
142 
143 
144 //====================================================================
/*
 * NOTE(review): the line holding the return type, the function name and the
 * whole parameter list is missing from this listing. The body reads vp, Ns
 * and Nsize, so those are presumably the parameters -- TODO confirm.
 *
 * Prepares one vector y with clear_vec() and stores it into every vector slot
 * (Ns * NVCD slots per site) of vp for the sites in the calling thread's
 * range. clear_vec() is defined elsewhere; presumably it zero-fills y.
 */
146 {
147  int Nxv = Nsize[0];
148  int Ny = Nsize[1];
149  int Nz = Nsize[2];
150  int Nt = Nsize[3];
151  int Nstv = Nxv * Ny * Nz * Nt;
152 
153  int Nin4 = VLEN * NVCD;
154  int Nin5 = Nin4 * Ns;
155 
156  int ith, nth, site0, site1;
157  set_threadtask(ith, nth, site0, site1, Nstv);
158 
  // y is set up once and reused for every store below.
159  svreal_t y;
160  svbool_t pg;
161  pg = set_predicate();
162  clear_vec(pg, y);
163  for (int site = site0; site < site1; ++site) {
164  real_t *vp2 = &vp[Nin5 * site];
165 
166  for (int is = 0; is < Ns; ++is) {
167  for (int i = 0; i < NVCD; ++i) {
168  save_vec(pg, &vp2[VLEN * (is * NVCD + i)], y);
169  }
170  }
171  }
172 }
173 
174 
175 //====================================================================
/*
 * NOTE(review): the line holding the return type and the function name is
 * missing from this listing; only the parameter list and the body are visible.
 *
 * For each site in the calling thread's range and each (is, ic) pair:
 *   1. loads eight vectors from yp, placing offsets 0..3 into vt3/vt4 and
 *      offsets 4..7 into vt1/vt2,
 *   2. scales all of them by bb = 0.5 * b[is],
 *   3. hands them to add_aPp5_dirac_vec() with coefficient Fup and slice
 *      is_up of yp, then to add_aPm5_dirac_vec() with coefficient Fdn and
 *      slice is_dn of yp,
 *   4. stores vt1/vt2 to offsets 0..3 and vt3/vt4 to offsets 4..7 of vp.
 * The two four-vector halves are therefore exchanged between load and store.
 *
 * Parameters, as used by the visible body:
 *   vp               output array
 *   yp               input array
 *   mq               Fup is multiplied by -mq when is == Ns - 1,
 *                    Fdn is multiplied by -mq when is == 0
 *   M0, bc, do_comm  not referenced in the body
 *   Ns               number of slices the `is` loop runs over
 *   b                coefficient array indexed by is
 *   c                coefficient array indexed by is_up / is_dn
 *   Nsize            four extents; their product is the number of sites
 *                    distributed by set_threadtask()
 */
177  real_t *__restrict vp,
178  real_t *__restrict yp,
179  real_t mq, real_t M0, int Ns, int *bc,
180  real_t *b, real_t *c,
181  int *Nsize, int *do_comm)
182 {
183  int Nx2v = Nsize[0];
184  int Ny = Nsize[1];
185  int Nz = Nsize[2];
186  int Nt = Nsize[3];
187  int Nst2v = Nx2v * Ny * Nz * Nt;
  // Nst2 is not referenced again in this function.
188  int Nst2 = Nst2v * VLEN;
189 
190  int Nin4 = VLEN * NVCD;
191  int Nin5 = Nin4 * Ns;
192 
193  int ith, nth, site0, site1;
194  set_threadtask(ith, nth, site0, site1, Nst2v);
195 
196  for (int site = site0; site < site1; ++site) {
197  real_t *vp2 = &vp[Nin5 * site];
198  real_t *yp2 = &yp[Nin5 * site];
199 
200  svbool_t pg = set_predicate();
201  for (int is = 0; is < Ns; ++is) {
202  for (int ic = 0; ic < NC; ++ic) {
203  svreal_t vt1r, vt1i, vt2r, vt2i, vt3r, vt3i, vt4r, vt4i;
204  int offset = 2 * ND * ic + NVCD * is;
205  real_t bb = 0.5 * b[is];
206 
  // Offsets 0..3 go to vt3/vt4 and offsets 4..7 to vt1/vt2 (halves exchanged).
207  load_vec(pg, vt3r, &yp2[VLEN * (offset + 0)]);
208  load_vec(pg, vt3i, &yp2[VLEN * (offset + 1)]);
209  load_vec(pg, vt4r, &yp2[VLEN * (offset + 2)]);
210  load_vec(pg, vt4i, &yp2[VLEN * (offset + 3)]);
211  load_vec(pg, vt1r, &yp2[VLEN * (offset + 4)]);
212  load_vec(pg, vt1i, &yp2[VLEN * (offset + 5)]);
213  load_vec(pg, vt2r, &yp2[VLEN * (offset + 6)]);
214  load_vec(pg, vt2i, &yp2[VLEN * (offset + 7)]);
215  scal_vec(pg, vt1r, bb);
216  scal_vec(pg, vt1i, bb);
217  scal_vec(pg, vt2r, bb);
218  scal_vec(pg, vt2i, bb);
219  scal_vec(pg, vt3r, bb);
220  scal_vec(pg, vt3i, bb);
221  scal_vec(pg, vt4r, bb);
222  scal_vec(pg, vt4i, bb);
223 
  // The coefficient comes from the neighbouring slice (c[is_up]), not c[is].
224  int is_up = (is + 1) % Ns;
225  real_t Fup = 0.5 * (-0.5) * c[is_up];
226  if (is == Ns - 1) Fup *= -mq;
227  add_aPp5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
228  vt3r, vt3i, vt4r, vt4i,
229  Fup, yp2, is_up, ic);
230 
  // NOTE(review): Fdn carries the opposite overall sign to Fup (-0.5 vs 0.5);
  // confirm this asymmetry is intended.
231  int is_dn = (is - 1 + Ns) % Ns;
232  real_t Fdn = -0.5 * (-0.5) * c[is_dn];
233  if (is == 0) Fdn *= -mq;
234  add_aPm5_dirac_vec(vt1r, vt1i, vt2r, vt2i,
235  vt3r, vt3i, vt4r, vt4i,
236  Fdn, yp2, is_dn, ic);
237  save_vec(pg, &vp2[VLEN * (offset + 0)], vt1r);
238  save_vec(pg, &vp2[VLEN * (offset + 1)], vt1i);
239  save_vec(pg, &vp2[VLEN * (offset + 2)], vt2r);
240  save_vec(pg, &vp2[VLEN * (offset + 3)], vt2i);
241  save_vec(pg, &vp2[VLEN * (offset + 4)], vt3r);
242  save_vec(pg, &vp2[VLEN * (offset + 5)], vt3i);
243  save_vec(pg, &vp2[VLEN * (offset + 6)], vt4r);
244  save_vec(pg, &vp2[VLEN * (offset + 7)], vt4i);
245  } // ic
246  } //is
247  } //site
248 }
249 
250 
251 //====================================================================
/*
 * NOTE(review): the line holding the return type and the function name is
 * missing from this listing; only the parameter list and the body are visible.
 *
 * Per-site loop over the calling thread's range that, for each of the four
 * directions idir = 0..3 and for the forward and the backward neighbour,
 * calls the matching mult_wilson_* helper once per slice `is`, passing the
 * destination &vp2[Nin4 * is], a link pointer into up and the neighbouring
 * site's slice of wp. A hop is executed only when the neighbour lies inside
 * the local extent or do_comm[idir] == 0; in the latter case the neighbour
 * index wraps around periodically. Whether the helpers overwrite or
 * accumulate into vp is not visible here.
 *
 * Parameters, as used by the visible body:
 *   vp               destination array, addressed as Nin5 * site + Nin4 * is
 *   up               link array; block (ieo + 2 * idir) is used for forward
 *                    hops and block (1 - ieo + 2 * idir) for backward hops,
 *                    each block being NvU2 reals long
 *   wp               source array, same addressing as vp
 *   mq, M0, bc, b, c not referenced in the body
 *   Ns               number of slices the `is` loops run over
 *   Leo              table read at VLENY * iyzt to derive the parity jeo
 *   Nsize            four local extents (Nx2v, Ny, Nz, Nt)
 *   do_comm          per-direction flag; a hop across the local boundary is
 *                    skipped here unless the flag is 0
 *   ieo              parity selector used in the link-block index and in jeo
 *
 * nei_p1 is only ever passed to the prefetch macros.
 */
253  real_t *vp, real_t *up, real_t *wp,
254  real_t mq, real_t M0, int Ns, int *bc,
255  real_t *b, real_t *c,
256  int *Leo, int *Nsize, int *do_comm,
257  const int ieo)
258 {
259  int Nx2v = Nsize[0];
260  int Ny = Nsize[1];
261  int Nz = Nsize[2];
262  int Nt = Nsize[3];
263  int Nst2v = Nx2v * Ny * Nz * Nt;
264  int Nst2 = Nst2v * VLEN;
265 
266  int Nin4 = VLEN * NVCD;
267  int Nin5 = Nin4 * Ns;
268 
  // Number of reals in one (parity, direction) block of the link array.
269  int NvU2 = NDF * Nst2;
270 
271  int Nxy2 = Nx2v * Ny;
272  int Nxyz2 = Nx2v * Ny * Nz;
273 
  // Predicates for x hops are built separately for parity 0 ("e") and 1 ("o").
274  svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
275  svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
276  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
277  set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
278  set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
279  set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
280  set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
281  set_predicate_yp(pg1_yp, pg2_yp);
282  set_predicate_ym(pg1_ym, pg2_ym);
283 
284  int ith, nth, site0, site1;
285  set_threadtask(ith, nth, site0, site1, Nst2v);
286 
  // NOTE(review): bufL is not referenced again in this function.
287  real_t bufL[Nin5];
288 
289  for (int site = site0; site < site1; ++site) {
  // Decompose the linear site index into (ix, iy, iz, it).
290  int ix = site % Nx2v;
291  int iyzt = site / Nx2v;
292  int iy = iyzt % Ny;
293  int izt = site / Nxy2;
294  int iz = izt % Nz;
295  int it = izt / Nz;
296  int ixy = ix + Nx2v * iy;
297  int ixyz = ixy + Nxy2 * iz;
298  int jeo = (ieo + Leo[VLENY * iyzt]) % 2;
299 
300  // index for prefetch
301  int ix_p1 = (site + 1) % Nx2v;
302  int iyzt_p1 = (site + 1) / Nx2v;
303  int iy_p1 = iyzt_p1 % Ny;
304  int izt_p1 = (site + 1) / Nxy2;
305  int iz_p1 = izt_p1 % Nz;
306  int it_p1 = izt_p1 / Nz;
307  int ixy_p1 = ix_p1 + Nx2v * iy_p1;
308  int ixyz_p1 = ixy_p1 + Nxy2 * iz_p1;
  // NOTE(review): jeo_p1 is never used, yet for site == Nst2v - 1 it reads
  // Leo[VLENY * Ny * Nz * Nt]; confirm Leo is long enough for that index.
309  int jeo_p1 = (ieo + Leo[VLENY * iyzt_p1]) % 2;
310 
311  int idir;
312 
313  real_t *wp2 = &wp[Nin5 * site];
314  real_t *vp2 = &vp[Nin5 * site];
315 
  // NOTE(review): z4 and vL are not referenced again in this function.
316  real_t z4[VLEN * NVCD];
317  Vsimd_t vL[NVCD];
318 
319  int nei_p1;
320 
  // ---- x direction, forward neighbour ----
321  idir = 0;
322 
323  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site + 1);
324 
325  if ((ix < Nx2v - 1) || (do_comm[0] == 0)) {
326  int nei = ix + 1 + Nx2v * iyzt;
327 
328  nei_p1 = ix - 1 + Nx2v * iyzt;
  // Periodic wrap; reachable only when do_comm[0] == 0.
329  if (ix == Nx2v - 1) nei = 0 + Nx2v * iyzt;
330 
331  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
332  real_t *wpn = &wp[Nin5 * nei];
333 
334  if (jeo == 0) {
335  for (int is = 0; is < Ns; ++is) {
336  __prefetch_load_hop_vec_l2(wp, site + 1, is);
337  __prefetch_write_hop_vec_l2(vp, site + 1, is);
338  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
339 
340  mult_wilson_eo_xpb(pg1e_xp, pg2e_xp, pg3e_xp,
341  &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
342  }
343  } else {
344  for (int is = 0; is < Ns; ++is) {
345  __prefetch_load_hop_vec_l2(wp, site + 1, is);
346  __prefetch_write_hop_vec_l2(vp, site + 1, is);
347  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
348 
349  mult_wilson_eo_xpb(pg1o_xp, pg2o_xp, pg3o_xp,
350  &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
351  }
352  }
353  }
354 
355 
  // ---- x direction, backward neighbour ----
356  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site + 1);
357 
358  if ((ix > 0) || (do_comm[0] == 0)) {
359  int nei = ix - 1 + Nx2v * iyzt;
360 
361  int iy2_p1 = (iy + 1) % Ny;
362  nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
363 
  // Periodic wrap; reachable only when do_comm[0] == 0.
364  if (ix == 0) nei = Nx2v - 1 + Nx2v * iyzt;
365 
  // Both the link at this site (u2) and at the neighbour (un) are passed on.
366  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
367  real_t *un = &up[VLEN * NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
368  real_t *wpn = &wp[Nin5 * nei];
369  if (jeo == 0) {
370  for (int is = 0; is < Ns; ++is) {
371  __prefetch_load_hop_vec_l2(wp, site + 1, is);
372  __prefetch_write_hop_vec_l2(vp, site + 1, is);
373  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
374 
375  mult_wilson_eo_xmb(pg1e_xm, pg2e_xm, pg3e_xm, &vp2[Nin4 * is],
376  u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
377  }
378  } else {
379  for (int is = 0; is < Ns; ++is) {
380  __prefetch_load_hop_vec_l2(wp, site + 1, is);
381  __prefetch_write_hop_vec_l2(vp, site + 1, is);
382  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
383 
384  mult_wilson_eo_xmb(pg1o_xm, pg2o_xm, pg3o_xm, &vp2[Nin4 * is],
385  u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
386  }
387  }
388  }
389 
390 
  // ---- y direction, forward neighbour ----
391  idir = 1;
392 
393  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site + 1);
394 
395  if ((iy < Ny - 1) || (do_comm[idir] == 0)) {
396  int iy2 = (iy + 1) % Ny;
397  int nei = ix + Nx2v * (iy2 + Ny * izt);
398 
399  int iy2_p1 = (iy - 1 + Ny) % Ny;
400  nei_p1 = ix + Nx2v * (iy2_p1 + Ny * izt);
401 
402 
403  real_t *wpn = &wp[Nin5 * nei];
404  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
405  for (int is = 0; is < Ns; ++is) {
406  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
407 
408  mult_wilson_ypb(pg1_yp, pg2_yp,
409  &vp2[Nin4 * is], u2, &wp2[Nin4 * is], &wpn[Nin4 * is]);
410  }
411  }
412 
  // ---- y direction, backward neighbour ----
413  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site + 1);
414 
415  if ((iy > 0) || (do_comm[idir] == 0)) {
416  int iy2 = (iy - 1 + Ny) % Ny;
417  int nei = ix + Nx2v * (iy2 + Ny * izt);
418 
419  int iz2_p1 = (iz + 1) % Nz;
420  nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
421 
422  real_t *wpn = &wp[Nin5 * nei];
423  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
424  real_t *un = &up[VLEN * NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
425  for (int is = 0; is < Ns; ++is) {
426  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
427  mult_wilson_ymb(pg1_ym, pg2_ym,
428  &vp2[Nin4 * is], u2, un, &wp2[Nin4 * is], &wpn[Nin4 * is]);
429  }
430  }
431 
  // ---- z direction, forward neighbour ----
432  idir = 2;
433 
434  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site + 1);
435 
436  if ((iz < Nz - 1) || (do_comm[idir] == 0)) {
437  int iz2 = (iz + 1) % Nz;
438  int nei = ixy + Nxy2 * (iz2 + Nz * it);
439 
440  int iz2_p1 = (iz - 1 + Nz) % Nz;
441  nei_p1 = ixy + Nxy2 * (iz2_p1 + Nz * it);
442 
443  real_t *wpn = &wp[Nin5 * nei];
444  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
445  for (int is = 0; is < Ns; ++is) {
446  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
447 
448  mult_wilson_zpb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
449  }
450  }
451 
  // ---- z direction, backward neighbour ----
452  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site + 1);
453 
454  if ((iz > 0) || (do_comm[idir] == 0)) {
455  int iz2 = (iz - 1 + Nz) % Nz;
456  int nei = ixy + Nxy2 * (iz2 + Nz * it);
457 
458  int it2_p1 = (it + 1) % Nt;
459  nei_p1 = ixyz + Nxyz2 * it2_p1;
460 
461  real_t *wpn = &wp[Nin5 * nei];
  // The link is taken at the neighbour site (nei), not at `site`.
462  real_t *u2 = &up[VLEN * NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
463  for (int is = 0; is < Ns; ++is) {
464  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
465 
466  mult_wilson_zmb(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
467  }
468  }
469 
  // ---- t direction, forward neighbour ----
470  idir = 3;
471 
472  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site + 1);
473 
474  if ((it < Nt - 1) || (do_comm[idir] == 0)) {
475  int it2 = (it + 1) % Nt;
476  int nei = ixyz + Nxyz2 * it2;
477 
478  int it2_p1 = (it - 1 + Nt) % Nt;
479  nei_p1 = ixyz + Nxyz2 * it2_p1;
480 
481  real_t *wpn = &wp[Nin5 * nei];
482  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
483  for (int is = 0; is < Ns; ++is) {
484  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
485 
486  mult_wilson_tpb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
487  }
488  }
489 
  // ---- t direction, backward neighbour ----
490  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site + 1);
491 
492  if ((it > 0) || (do_comm[idir] == 0)) {
493  int it2 = (it - 1 + Nt) % Nt;
494  int nei = ixyz + Nxyz2 * it2;
495 
  // Prefetch target: the x-forward neighbour of the next site in the loop.
496  nei_p1 = ix_p1 + 1 + Nx2v * iyzt_p1;
497 
498  real_t *wpn = &wp[Nin5 * nei];
  // The link is taken at the neighbour site (nei), not at `site`.
499  real_t *u2 = &up[VLEN * NDF * nei + NvU2 * (1 - ieo + 2 * idir)];
500  for (int is = 0; is < Ns; ++is) {
501  __prefetch_load_hop_vec_l1(wp, nei_p1, is);
502 
503  mult_wilson_tmb_dirac(&vp2[Nin4 * is], u2, &wpn[Nin4 * is]);
504  }
505  }
506  }
507 }
508 
509 
510 //====================================================================
/*
 * NOTE(review): the line holding the return type and the function name is
 * missing from this listing; only the parameter list and the body are visible.
 *
 * For every direction idir = 0..3 with do_comm[idir] == 1, walks the two
 * boundary faces of the local volume (coordinate 0 and coordinate extent - 1
 * along idir) and, per slice `is`, calls the matching mult_wilson_*1 helper
 * with a pointer into one of the buf1_* arrays and the site's slice of wp:
 *   coordinate 0           -> buf1_{x,y,z,t}p via *_xp1 / yp1 / zp1 / tp1_dirac
 *   coordinate extent - 1  -> buf1_{x,y,z,t}m via *_xm1 / ym1 / zm1 / tm1_dirac,
 *                             which additionally receive the link pointer u2
 * Presumably these helpers pack the data to be sent to neighbouring
 * processes -- TODO confirm against their definitions.
 * Each direction distributes its own face index range over threads with a
 * separate set_threadtask() call.
 *
 * Parameters, as used by the visible body:
 *   buf1_xp .. buf1_tm  eight buffers, one per direction and orientation
 *   up                  link array; block (1 - ieo + 2 * idir) is used
 *   wp                  source array, addressed as Nin5 * site + Nin4 * is
 *   mq, M0, bc          not referenced in the body
 *   Ns                  number of slices the `is` loops run over
 *   Leo                 table read at VLENY * iyzt to derive the parity jeo
 *   Nsize               four local extents (Nx2v, Ny, Nz, Nt)
 *   do_comm             per-direction flag; a direction is processed only
 *                       when its flag equals 1
 *   ieo                 parity selector used in the link-block index and in jeo
 *
 * All *_xp1, *_yp1, *_zp1, *_tp1 index variables feed prefetch macros only.
 */
512  real_t *buf1_xp, real_t *buf1_xm,
513  real_t *buf1_yp, real_t *buf1_ym,
514  real_t *buf1_zp, real_t *buf1_zm,
515  real_t *buf1_tp, real_t *buf1_tm,
516  real_t *up, real_t *wp,
517  real_t mq, real_t M0, int Ns, int *bc,
518  int *Leo, int *Nsize, int *do_comm,
519  const int ieo)
520 {
521  int Nx2v = Nsize[0];
522  int Ny = Nsize[1];
523  int Nz = Nsize[2];
524  int Nt = Nsize[3];
525  int Nst2v = Nx2v * Ny * Nz * Nt;
526  int Nst2 = Nst2v * VLEN;
527 
  // Nin4/Nin5: reals per site (one slice / all slices) in wp.
  // Nin4H/Nin5H: the corresponding sizes used to address the z and t buffers.
528  int Nin4 = VLEN * NVCD;
529  int Nin5 = Nin4 * Ns;
530  int Nin4H = VLEN * NVC * ND2;
531  int Nin5H = Nin4H * Ns;
532  int NvU2 = NDF * Nst2;
533 
534  int Nxy2 = Nx2v * Ny;
535  int Nxyz2 = Nx2v * Ny * Nz;
536 
537  svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
538  svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
539  set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
540  set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
541  set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
542  set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
543  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
544  set_predicate_yp(pg1_yp, pg2_yp);
545  set_predicate_ym(pg1_ym, pg2_ym);
546  svint_t svidx_xp, svidx_xm;
547  set_index_xp_eo(svidx_xp);
548  set_index_xm_eo(svidx_xm);
549 
  // ---- x faces ----
550  int idir = 0;
551  if (do_comm[idir] == 1) {
552  int Nyzt = Ny * Nz * Nt;
553 
554  int ith, nth, site0, site1;
555  set_threadtask(ith, nth, site0, site1, Nyzt);
556 
557  for (int iyzt = site0; iyzt < site1; ++iyzt) {
558  int iy = iyzt % Ny;
559  int iz = (iyzt / Ny) % Nz;
560  int it = iyzt / (Ny * Nz);
561 
562  int jeo = (ieo + Leo[VLENY * iyzt]) % 2;
563 
564  int Nskipx = (VLENY + 1) / 2;
565 
566  // index for prefetch
567 #if VLENY > 1
568  int ibf_up_xp1 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
569  int ibf_dn_xp1 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
570  int ibf_dn_xp1_2 = Nskipx * NVC * ND2 * Ns * iyzt;
571  int ibf_up_xp1_2 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
572 #else
573  int ibf_up_xp1 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
574  int ibf_dn_xp1 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
575  int ibf_dn_xp1_2 = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
576  int ibf_up_xp1_2 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
577 #endif
578 
  // i_p1 and i_p2 are not referenced in the x section.
579  int i_p1 = 1;
580  int i_p2 = 2;
581 
  // Buffer offset of this yzt line; with VLENY == 1 two consecutive iyzt
  // values map to the same offset (iyzt / 2).
582 #if VLENY > 1
583  int ibf_up = Nskipx * NVC * ND2 * Ns * iyzt;
584  int ibf_dn = Nskipx * NVC * ND2 * Ns * iyzt;
585 #else
586  int ibf_up = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
587  int ibf_dn = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
588 #endif
589 
  // Face ix == 0: fills buf1_xp.
590  {
591  int ix = 0;
592  int site = ix + Nx2v * iyzt;
593  real_t *wp2 = &wp[Nin5 * site];
594 
595  // index for prefetch
596  int site_xp1 = ix + Nx2v * (iyzt + 1);
597  int site_xp1_2 = Nx2v - 1 + Nx2v * (iyzt + 1);
598 
599  set_index_xm_eo(svidx_xm);
600  if (jeo == 0) {
  // For jeo == 0 this face does work only when VLENY > 1.
601 #if VLENY > 1
602  for (int is = 0; is < Ns; ++is) {
603  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
604  __prefetch_write_hop1_buf_x_l2(buf1_xp, ibf_up_xp1, is,
605  Nskipx * NVC * ND2);
606 
607  if ((!((it == 0) || (it == Nt - 1))) &&
608  (!((iz == 0) || (iz == Nz - 1))) &&
609  (!((iy == 0) || (iy == Ny - 1)))) {
610  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
611  __prefetch_write_hop1_buf_x_l1(buf1_xm, ibf_dn_xp1_2, is,
612  Nskipx * NVC * ND2);
613  }
614 
615  mult_wilson_eo_xp1(pg2o_xm, svidx_xm,
616  &buf1_xp[ibf_up + Nskipx * NVC * ND2 * is],
617  &wp2[Nin4 * is]);
618  }
619 #endif
620  } else {
621  for (int is = 0; is < Ns; ++is) {
622  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
623  __prefetch_write_hop1_buf_x_l2(buf1_xp, ibf_up_xp1, is,
624  Nskipx * NVC * ND2);
625 
626  if ((!((it == 0) || (it == Nt - 1))) &&
627  (!((iz == 0) || (iz == Nz - 1))) &&
628  (!((iy == 0) || (iy == Ny - 1)))) {
629  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
630  __prefetch_write_hop1_buf_x_l1(buf1_xm, ibf_dn_xp1_2, is,
631  Nskipx * NVC * ND2);
632  }
633 
634  mult_wilson_eo_xp1(pg2e_xm, svidx_xm,
635  &buf1_xp[ibf_up + Nskipx * NVC * ND2 * is],
636  &wp2[Nin4 * is]);
637  }
638  }
639  }
640 
  // Face ix == Nx2v - 1: fills buf1_xm, using the link at this site.
641  {
642  int ix = Nx2v - 1;
643  int site = ix + Nx2v * iyzt;
644  real_t *wp2 = &wp[Nin5 * site];
645  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
646 
647  int site_xp1 = ix + Nx2v * (iyzt + 1);
648  int site_xp1_2 = 0 + Nx2v * (iyzt + 1);
649 
650  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * 0, site_xp1);
651  // __prefetch_load_hop_u_l1(up,1-ieo+2*0,site_xp1);
652 
653  set_index_xp_eo(svidx_xp);
654  if (jeo == 0) {
655  for (int is = 0; is < Ns; ++is) {
656  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
657  __prefetch_write_hop1_buf_x_l2(buf1_xm, ibf_dn_xp1, is,
658  Nskipx * NVC * ND2);
659 
660  if ((!((it == 0) || (it == Nt - 1))) &&
661  (!((iz == 0) || (iz == Nz - 1))) &&
662  (!((iy == 0) || (iy == Ny - 1)))) {
663  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
664  __prefetch_write_hop1_buf_x_l1(buf1_xp, ibf_up_xp1_2, is,
665  Nskipx * NVC * ND2);
666  }
667 
668  mult_wilson_eo_xm1(pg2o_xp, svidx_xp,
669  &buf1_xm[ibf_dn + Nskipx * NVC * ND2 * is],
670  u2, &wp2[Nin4 * is]);
671  }
672  } else {
  // For jeo != 0 this face does work only when VLENY > 1.
673 #if VLENY > 1
674  for (int is = 0; is < Ns; ++is) {
675  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
676  __prefetch_write_hop1_buf_x_l2(buf1_xm, ibf_dn_xp1, is,
677  Nskipx * NVC * ND2);
678 
679  if ((!((it == 0) || (it == Nt - 1))) &&
680  (!((iz == 0) || (iz == Nz - 1))) &&
681  (!((iy == 0) || (iy == Ny - 1)))) {
682  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
683  __prefetch_write_hop1_buf_x_l1(buf1_xp, ibf_up_xp1_2, is,
684  Nskipx * NVC * ND2);
685  }
686 
687  mult_wilson_eo_xm1(pg2e_xp, svidx_xp,
688  &buf1_xm[ibf_dn + Nskipx * NVC * ND2 * is],
689  u2, &wp2[Nin4 * is]);
690  }
691 #endif
692  }
693  }
694  } // iyzt loop
695  }
696 
  // ---- y faces ----
697  idir = 1;
698  if (do_comm[idir] == 1) {
699  int Nxzt2 = Nx2v * Nz * Nt;
700 
701  int ith, nth, site0, site1;
702  set_threadtask(ith, nth, site0, site1, Nxzt2);
703 
704  for (int ixzt = site0; ixzt < site1; ++ixzt) {
705  int ix = ixzt % Nx2v;
706  int izt = ixzt / Nx2v;
707  int iz = izt % Nz;
708  int it = izt / Nz;
709 
710  // index for prefetch
711  int i_p1 = 1;
712  int i_p2 = 2;
713 
714  int ibf_yp1 = VLENX * NVC * ND2 * Ns *
715  ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
716  int ibf_yp2 = VLENX * NVC * ND2 * Ns *
717  ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
718 
  // Face iy == 0: fills buf1_yp.
719  {
720  int iy = 0;
721  int site = ix + Nx2v * (iy + Ny * izt);
722  int ibf = VLENX * NVC * ND2 * Ns * ixzt;
723  real_t *wp2 = &wp[Nin5 * site];
724 
725  int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
726  int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
727  int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
728  int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
729  int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
730  int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
731 
732  for (int is = 0; is < Ns; ++is) {
733  __prefetch_load_hop_vec_l2(wp, site_yp2, is);
734  __prefetch_write_hop1_buf_y_l2(buf1_yp, ibf_yp2, is, VLENX * NVC * ND2);
735 
736  if ((!((it == 0) || (it == Nt - 1))) &&
737  (!((iz == 0) || (iz == Nz - 1)))) {
738  __prefetch_load_hop_vec_l1(wp, site_yp1, is);
739  __prefetch_write_hop1_buf_y_l1(buf1_yp, ibf_yp1, is,
740  VLENX * NVC * ND2);
741  }
742  mult_wilson_yp1(pg2_ym,
743  &buf1_yp[ibf + VLENX * NVC * ND2 * is], &wp2[Nin4 * is]);
744  }
745  }
  // Face iy == Ny - 1: fills buf1_ym, using the link at this site.
746  {
747  int iy = Ny - 1;
748  int site = ix + Nx2v * (iy + Ny * izt);
749  int ibf = VLENX * NVC * ND2 * Ns * ixzt;
750  real_t *wp2 = &wp[Nin5 * site];
751  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
752 
753  int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
754  int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
755  int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
756  int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
757  int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
758  int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
759 
760  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site_yp2);
761 
762  for (int is = 0; is < Ns; ++is) {
763  __prefetch_load_hop_vec_l2(wp, site_yp2, is);
764  __prefetch_write_hop1_buf_y_l2(buf1_ym, ibf_yp2, is, VLENX * NVC * ND2);
765 
766  if ((!((it == 0) || (it == Nt - 1))) &&
767  (!((iz == 0) || (iz == Nz - 1)))) {
768  __prefetch_load_hop_vec_l1(wp, site_yp1, is);
769  __prefetch_write_hop1_buf_y_l1(buf1_ym, ibf_yp1, is,
770  VLENX * NVC * ND2);
771  }
772 
773  mult_wilson_ym1(pg2_yp,
774  &buf1_ym[ibf + VLENX * NVC * ND2 * is], u2, &wp2[Nin4 * is]);
775  }
776  }
777  }
778  }
779 
  // ---- z faces ----
780  idir = 2;
781  if (do_comm[idir] == 1) {
782  int Nxyt2 = Nxy2 * Nt;
783 
784  int ith, nth, site0, site1;
785  set_threadtask(ith, nth, site0, site1, Nxyt2);
786 
787  for (int ixyt = site0; ixyt < site1; ++ixyt) {
788  int ixy = ixyt % Nxy2;
789  int it = ixyt / Nxy2;
790 
791  int i_p1 = 1;
792  int i_p2 = 2;
793  int ibf_zp1 = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
794  int ibf_zp2 = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
795 
796  int ibf = Nin5H * (ixy + Nxy2 * it);
797 
  // Face iz == 0: fills buf1_zp.
798  {
799  int iz = 0;
800  int site = ixy + Nxy2 * (iz + Nz * it);
801  real_t *wp2 = &wp[Nin5 * site];
802 
  // The literal 2 in iz_p2 / site_zp2 equals i_p2.
803  int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
804  int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
805  int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
806  int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
807  int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
808  int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
809 
810  for (int is = 0; is < Ns; ++is) {
811  __prefetch_load_hop_vec_l2(wp, site_zp2, is);
812  __prefetch_write_hop1_buf_zt_l2(buf1_zp, ibf_zp2, is, Nin4H);
813 
814  if (!((it == 0) || (it == Nt - 1))) {
815  __prefetch_load_hop_vec_l1(wp, site_zp1, is);
816  __prefetch_write_hop1_buf_zt_l1(buf1_zp, ibf_zp1, is, Nin4H);
817  }
818 
819  mult_wilson_zp1(&buf1_zp[ibf + Nin4H * is], &wp2[Nin4 * is]);
820  }
821  }
822 
  // Face iz == Nz - 1: fills buf1_zm, using the link at this site.
823  {
824  int iz = Nz - 1;
825  int site = ixy + Nxy2 * (iz + Nz * it);
826  real_t *wp2 = &wp[Nin5 * site];
827  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
828 
829  int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
830  int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
831  int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
832  int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
833  int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
834  int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
835 
836  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site_zp2);
837 
838  for (int is = 0; is < Ns; ++is) {
839  __prefetch_load_hop_vec_l2(wp, site_zp2, is);
840  __prefetch_write_hop1_buf_zt_l2(buf1_zm, ibf_zp2, is, Nin4H);
841 
842  if (!((it == 0) || (it == Nt - 1))) {
843  __prefetch_load_hop_vec_l1(wp, site_zp1, is);
844  __prefetch_write_hop1_buf_zt_l1(buf1_zm, ibf_zp1, is, Nin4H);
845  }
846 
847  mult_wilson_zm1(&buf1_zm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
848  }
849  }
850  }
851  }
852 
  // ---- t faces ----
853  idir = 3;
854  if (do_comm[idir] == 1) {
855  int ith, nth, site0, site1;
856  set_threadtask(ith, nth, site0, site1, Nxyz2);
857 
858  for (int ixyz = site0; ixyz < site1; ++ixyz) {
859  int ibf = Nin5H * ixyz;
860 
861  int i_p1 = 1;
862  int i_p2 = 2;
863 
864  // index for prefetch
865  int ibf_tp1 = Nin5H * (ixyz + i_p1);
866  int ibf_tp2 = Nin5H * (ixyz + i_p2);
867 
  // Face it == 0: fills buf1_tp.
868  {
869  int it = 0;
870  int site = ixyz + Nxyz2 * it;
871  real_t *wp2 = &wp[Nin5 * site];
872 
873  // index for prefetch
874  int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
875  int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
876 
877  for (int is = 0; is < Ns; ++is) {
878  __prefetch_load_hop_vec_l2(wp, site_tp2, is);
879  __prefetch_write_hop1_buf_zt_l2(buf1_tp, ibf_tp2, is, Nin4H);
880 
881  __prefetch_load_hop_vec_l1(wp, site_tp1, is);
882  __prefetch_write_hop1_buf_zt_l1(buf1_tp, ibf_tp1, is, Nin4H);
883 
884  mult_wilson_tp1_dirac(&buf1_tp[ibf + Nin4H * is], &wp2[Nin4 * is]);
885  }
886  }
887 
  // Face it == Nt - 1: fills buf1_tm, using the link at this site.
888  {
889  int it = Nt - 1;
890  int site = ixyz + Nxyz2 * it;
891  real_t *wp2 = &wp[Nin5 * site];
892 
893  // index for prefetch
894  int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
895  int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
896 
897  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * idir, site_tp2);
898 
899  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (1 - ieo + 2 * idir)];
900  for (int is = 0; is < Ns; ++is) {
901  __prefetch_load_hop_vec_l2(wp, site_tp2, is);
902  __prefetch_write_hop1_buf_zt_l2(buf1_tm, ibf_tp2, is, Nin4H);
903 
904  __prefetch_load_hop_vec_l1(wp, site_tp1, is);
905  __prefetch_write_hop1_buf_zt_l1(buf1_tm, ibf_tp1, is, Nin4H);
906 
907  mult_wilson_tm1_dirac(&buf1_tm[ibf + Nin4H * is], u2, &wp2[Nin4 * is]);
908  }
909  }
910  }
911  }
912 }
913 
914 
915 //====================================================================
917  real_t *vp, real_t *up, real_t *wp,
918  real_t *buf2_xp, real_t *buf2_xm,
919  real_t *buf2_yp, real_t *buf2_ym,
920  real_t *buf2_zp, real_t *buf2_zm,
921  real_t *buf2_tp, real_t *buf2_tm,
922  real_t mq, real_t M0, int Ns, int *bc,
923  int *Leo, int *Nsize, int *do_comm,
924  const int ieo)
925 {
926  int Nx2v = Nsize[0];
927  int Ny = Nsize[1];
928  int Nz = Nsize[2];
929  int Nt = Nsize[3];
930  int Nst2v = Nx2v * Ny * Nz * Nt;
931  int Nst2 = Nst2v * VLEN;
932 
933  int Nin4 = VLEN * NVCD;
934  int Nin5 = Nin4 * Ns;
935  int Nin4H = VLEN * NVC * ND2;
936  int Nin5H = Nin4H * Ns;
937  int NvU2 = NDF * Nst2;
938 
939  int Nxy2 = Nx2v * Ny;
940  int Nxyz2 = Nx2v * Ny * Nz;
941 
942  svbool_t pg1e_xp, pg2e_xp, pg3e_xp, pg1e_xm, pg2e_xm, pg3e_xm;
943  svbool_t pg1o_xp, pg2o_xp, pg3o_xp, pg1o_xm, pg2o_xm, pg3o_xm;
944  set_predicate_xp_eo(pg1e_xp, pg2e_xp, pg3e_xp, 0);
945  set_predicate_xp_eo(pg1o_xp, pg2o_xp, pg3o_xp, 1);
946  set_predicate_xm_eo(pg1e_xm, pg2e_xm, pg3e_xm, 0);
947  set_predicate_xm_eo(pg1o_xm, pg2o_xm, pg3o_xm, 1);
948  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
949  set_predicate_yp(pg1_yp, pg2_yp);
950  set_predicate_ym(pg1_ym, pg2_ym);
951  svint_t svidx_xp, svidx_xm;
952  set_index_xp_eo(svidx_xp);
953  set_index_xm_eo(svidx_xm);
954 
955  int Nskipx = (VLENY + 1) / 2;
956 
957  int ith, nth, site0, site1;
958  set_threadtask(ith, nth, site0, site1, Nst2v);
959 
960  real_t bufL[Nin5];
961 
962  /* #pragma loop noprefetch */
963  /* #pragma fj loop prefetch_sequential soft */
964  for (int site = site0; site < site1; ++site) {
965  int ix = site % Nx2v;
966  int iyzt = site / Nx2v;
967  int iy = iyzt % Ny;
968  int izt = site / Nxy2;
969  int iz = izt % Nz;
970  int it = izt / Nz;
971  int ixy = ix + Nx2v * iy;
972  int ixyz = ixy + Nxy2 * iz;
973  int jeo = (ieo + Leo[VLENY * iyzt]) % 2;
974 
975  // index for prefetch
976  int ibf_up_xp1, ibf_dn_xp1, site_xp1, site_xp1_2, ibf_up_xp1_2, ibf_dn_xp1_2;
977 
978  ibf_up_xp1 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
979  if (VLENY == 1) {
980  ibf_up_xp1 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
981  }
982  ibf_dn_xp1 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
983  if (VLENY == 1) {
984  ibf_dn_xp1 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
985  }
986  site_xp1 = (iyzt + 1) * Nx2v + ix;
987 
988  if (ix == 0) {
989  site_xp1_2 = iyzt * Nx2v + Nx2v - 1;
990 
991  ibf_dn_xp1_2 = Nskipx * NVC * ND2 * Ns * iyzt;
992  if (VLENY == 1) {
993  ibf_dn_xp1_2 = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
994  }
995  } else if (ix == Nx2v - 1) {
996  site_xp1_2 = (iyzt + 1) * Nx2v + 0;
997 
998  ibf_up_xp1_2 = Nskipx * NVC * ND2 * Ns * (iyzt + 1);
999  if (VLENY == 1) {
1000  ibf_up_xp1_2 = Nskipx * NVC * ND2 * Ns * ((iyzt + 1) / 2);
1001  }
1002  }
1003 
1004  int i_p1 = 1;
1005  int i_p2 = 2;
1006 
1007  int ibf_yp1 = VLENX * NVC * ND2 * Ns * ((ix + i_p1) % Nx2v + Nx2v * (izt + (ix + i_p1) / Nx2v));
1008  int iy_p1 = ((iy + (ix + i_p1) / Nx2v) % Ny) * (Ny - 1);
1009  int izt_p1 = izt + (iy + (ix + i_p1) / Nx2v) / Ny;
1010  int site_yp1 = (ix + i_p1) % Nx2v + Nx2v * (izt_p1 * Ny + iy_p1);
1011 
1012  int ibf_yp2 = VLENX * NVC * ND2 * Ns * ((ix + i_p2) % Nx2v + Nx2v * (izt + (ix + i_p2) / Nx2v));
1013  int iy_p2 = ((iy + (ix + i_p2) / Nx2v) % Ny) * (Ny - 1);
1014  int izt_p2 = izt + (iy + (ix + i_p2) / Nx2v) / Ny;
1015  int site_yp2 = (ix + i_p2) % Nx2v + Nx2v * (izt_p2 * Ny + iy_p2);
1016 
1017  int ibf_zp1 = Nin5H * ((ixy + i_p1) % Nxy2 + Nxy2 * (it + (ixy + i_p1) / Nxy2));
1018  int iz_p1 = ((iz + (ixy + i_p1) / Nxy2) % Nz) * (Nz - 1);
1019  int it_p1 = it + (iz + (ixy + i_p1) / Nxy2) / Nz;
1020  int site_zp1 = (ixy + i_p1) % Nxy2 + Nxy2 * (it_p1 * Nz + iz_p1);
1021 
1022  int ibf_zp2 = Nin5H * ((ixy + i_p2) % Nxy2 + Nxy2 * (it + (ixy + i_p2) / Nxy2));
1023  int iz_p2 = ((iz + (ixy + 2) / Nxy2) % Nz) * (Nz - 1);
1024  int it_p2 = it + (iz + (ixy + i_p2) / Nxy2) / Nz;
1025  int site_zp2 = (ixy + 2) % Nxy2 + Nxy2 * (it_p2 * Nz + iz_p2);
1026 
1027  int ibf_tp1 = Nin5H * (ixyz + i_p1);
1028  int site_tp1 = ixyz + i_p1 + it * Nxy2 * Nz;
1029 
1030  int ibf_tp2 = Nin5H * (ixyz + i_p2);
1031  int site_tp2 = ixyz + i_p2 + it * Nxy2 * Nz;
1032  //
1033 
1034  int idir, nei;
1035 
1036  real_t *wp2 = &wp[Nin5 * site];
1037  real_t *vp2 = &vp[Nin5 * site];
1038 
1039  idir = 0;
1040  if (do_comm[idir] == 1) {
1041  if (ix == Nx2v - 1) {
1042  int ibf_up = Nskipx * NVC * ND2 * Ns * iyzt;
1043  if (VLENY == 1) {
1044  ibf_up = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
1045  }
1046 
1047 
1048  __prefetch_load_hop_u_l2(up, ieo + 2 * 0, site_xp1);
1049  real_t *u = &up[NDF * Nst2 * (ieo + 2 * 0)];
1050  set_index_xp_eo(svidx_xp);
1051  if (jeo == 0) {
1052  for (int is = 0; is < Ns; ++is) {
1053  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
1054  __prefetch_write_hop_vec_l2(vp, site_xp1, is);
1055  __prefetch_load_hop2_buf_x_l2(buf2_xp, ibf_up_xp1, is, Nskipx * NVC * ND2);
1056 
1057 
1058  if ((!((it == 0) || (it == Nt - 1))) &&
1059  (!((iz == 0) || (iz == Nz - 1))) &&
1060  (!((iy == 0) || (iy == Ny - 1)))) {
1061  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
1062  __prefetch_write_hop_vec_l1(vp, site_xp1_2, is);
1063  __prefetch_load_hop2_buf_x_l1(buf2_xm, ibf_dn_xp1_2, is, Nskipx * NVC * ND2);
1064  }
1065 
1066  mult_wilson_eo_xp2(pg1e_xp, pg2e_xp, pg3e_xp, svidx_xp,
1067  &vp2[Nin4 * is], &u[VLEN * NDF * site], &wp2[Nin4 * is],
1068  &buf2_xp[ibf_up + Nskipx * NVC * ND2 * is]);
1069  }
1070  } else {
1071  for (int is = 0; is < Ns; ++is) {
1072  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
1073  __prefetch_write_hop_vec_l2(vp, site_xp1, is);
1074  __prefetch_load_hop2_buf_x_l2(buf2_xp, ibf_up_xp1, is, Nskipx * NVC * ND2);
1075 
1076  if ((!((it == 0) || (it == Nt - 1))) &&
1077  (!((iz == 0) || (iz == Nz - 1))) &&
1078  (!((iy == 0) || (iy == Ny - 1)))) {
1079  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
1080  __prefetch_write_hop_vec_l1(vp, site_xp1_2, is);
1081  __prefetch_load_hop2_buf_x_l1(buf2_xm, ibf_dn_xp1_2, is, Nskipx * NVC * ND2);
1082  }
1083 
1084  mult_wilson_eo_xp2(pg1o_xp, pg2o_xp, pg3o_xp, svidx_xp,
1085  &vp2[Nin4 * is], &u[VLEN * NDF * site], &wp2[Nin4 * is],
1086  &buf2_xp[ibf_up + Nskipx * NVC * ND2 * is]);
1087  }
1088  }
1089  }
1090 
1091  if (ix == 0) {
1092  int ibf_dn = Nskipx * NVC * ND2 * Ns * iyzt;
1093  if (VLENY == 1) {
1094  ibf_dn = Nskipx * NVC * ND2 * Ns * (iyzt / 2);
1095  }
1096 
1097 
1098  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * 0, site_xp1);
1099  real_t *u = &up[NDF * Nst2 * (1 - ieo + 2 * 0)];
1100 
1101  set_index_xm_eo(svidx_xm);
1102  if (jeo == 0) {
1103  for (int is = 0; is < Ns; ++is) {
1104  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
1105  __prefetch_write_hop_vec_l2(vp, site_xp1, is);
1106  __prefetch_load_hop2_buf_x_l2(buf2_xm, ibf_dn_xp1, is, Nskipx * NVC * ND2);
1107 
1108  if ((!((it == 0) || (it == Nt - 1))) &&
1109  (!((iz == 0) || (iz == Nz - 1))) &&
1110  (!((iy == 0) || (iy == Ny - 1)))) {
1111  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
1112  __prefetch_write_hop_vec_l1(vp, site_xp1_2, is);
1113  __prefetch_write_hop1_buf_x_l1(buf2_xp, ibf_up_xp1_2, is, Nskipx * NVC * ND2)
1114  }
1115 
1116  mult_wilson_eo_xm2(pg1e_xm, pg2e_xm, pg3e_xm, svidx_xm,
1117  &vp2[Nin4 * is], &u[VLEN * NDF * site], &wp2[Nin4 * is],
1118  &buf2_xm[ibf_dn + Nskipx * NVC * ND2 * is]);
1119  }
1120  } else {
1121  for (int is = 0; is < Ns; ++is) {
1122  __prefetch_load_hop_vec_l2(wp, site_xp1, is);
1123  __prefetch_write_hop_vec_l2(vp, site_xp1, is);
1124  __prefetch_load_hop2_buf_x_l2(buf2_xm, ibf_dn_xp1, is, Nskipx * NVC * ND2);
1125 
1126  if ((!((it == 0) || (it == Nt - 1))) &&
1127  (!((iz == 0) || (iz == Nz - 1))) &&
1128  (!((iy == 0) || (iy == Ny - 1)))) {
1129  __prefetch_load_hop_vec_l1(wp, site_xp1_2, is);
1130  __prefetch_write_hop_vec_l1(vp, site_xp1_2, is);
1131  __prefetch_write_hop1_buf_x_l1(buf2_xp, ibf_up_xp1_2, is, Nskipx * NVC * ND2);
1132  }
1133 
1134  mult_wilson_eo_xm2(pg1o_xm, pg2o_xm, pg3o_xm, svidx_xm,
1135  &vp2[Nin4 * is], &u[VLEN * NDF * site], &wp2[Nin4 * is],
1136  &buf2_xm[ibf_dn + Nskipx * NVC * ND2 * is]);
1137  }
1138  }
1139  }
1140  }
1141 
1142  idir = 1;
1143  if (do_comm[idir] == 1) {
1144  if (iy == Ny - 1) {
1145  int ibf = VLENX * NVC * ND2 * Ns * (ix + Nx2v * izt);
1146 
1147 
1148  __prefetch_load_hop_u_l2(up, ieo + 2 * 1, site_yp2);
1149  real_t *u = &up[NDF * Nst2 * (ieo + 2 * 1)];
1150  for (int is = 0; is < Ns; ++is) {
1151  __prefetch_load_hop_vec_l2(wp, site_yp2, is);
1152  __prefetch_write_hop_vec_l2(vp, site_yp2, is);
1153  __prefetch_load_hop2_buf_y_l2(buf2_yp, ibf_yp2, is, VLENX * NVC * ND2);
1154 
1155  if ((!((it == 0) || (it == Nt - 1))) &&
1156  (!((iz == 0) || (iz == Nz - 1)))) {
1157  __prefetch_load_hop_vec_l1(wp, site_yp1, is);
1158  __prefetch_write_hop_vec_l1(vp, site_yp1, is);
1159  __prefetch_write_hop1_buf_y_l1(buf2_yp, ibf_yp1, is, VLENX * NVC * ND2);
1160  }
1161 
1162  mult_wilson_yp2(pg1_yp, pg2_yp,
1163  &vp2[Nin4 * is], &u[VLEN * NDF * site],
1164  &wp2[Nin4 * is], &buf2_yp[ibf + VLENX * NVC * ND2 * is]);
1165  }
1166  }
1167 
1168  if (iy == 0) {
1169  int ibf = VLENX * NVC * ND2 * Ns * (ix + Nx2v * izt);
1170 
1171  __prefetch_load_hop_u_l2(up, 1 - ieo + 2 * 1, site_yp2);
1172  real_t *u = &up[NDF * Nst2 * (1 - ieo + 2 * 1)];
1173  for (int is = 0; is < Ns; ++is) {
1174  __prefetch_load_hop_vec_l2(wp, site_yp2, is);
1175  __prefetch_write_hop_vec_l2(vp, site_yp2, is);
1176  __prefetch_load_hop2_buf_y_l2(buf2_ym, ibf_yp2, is, VLENX * NVC * ND2);
1177 
1178  if ((!((it == 0) || (it == Nt - 1))) &&
1179  (!((iz == 0) || (iz == Nz - 1)))) {
1180  __prefetch_load_hop_vec_l1(wp, site_yp1, is);
1181  __prefetch_write_hop_vec_l1(vp, site_yp1, is);
1182 
1183  __prefetch_write_hop1_buf_y_l1(buf2_ym, ibf_yp1, is, VLENX * NVC * ND2);
1184  }
1185 
1186  mult_wilson_ym2(pg1_ym, pg2_ym,
1187  &vp2[Nin4 * is], &u[VLEN * NDF * site],
1188  &wp2[Nin4 * is], &buf2_ym[ibf + VLENX * NVC * ND2 * is]);
1189  }
1190  }
1191  }
1192 
1193  idir = 2;
1194  if (do_comm[idir] == 1) {
1195  if (iz == Nz - 1) {
1196  int ibf = Nin5H * (ixy + Nxy2 * it);
1197 
1198  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site_zp2);
1199  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
1200  for (int is = 0; is < Ns; ++is) {
1201  __prefetch_write_hop_vec_l2(vp, site_zp2, is);
1202  __prefetch_load_hop2_buf_zt_l2(buf2_zp, ibf_zp2, is, Nin4H);
1203 
1204  if (!((it == 0) || (it == Nt - 1))) {
1205  __prefetch_write_hop_vec_l1(vp, site_zp1, is);
1206  __prefetch_write_hop1_buf_zt_l1(buf2_zp, ibf_zp1, is, Nin4H);
1207  }
1208 
1209  mult_wilson_zp2(&vp2[Nin4 * is], u2, &buf2_zp[ibf + Nin4H * is]);
1210  }
1211  }
1212 
1213  if (iz == 0) {
1214  int ibf = Nin5H * (ixy + Nxy2 * it);
1215 
1216  for (int is = 0; is < Ns; ++is) {
1217  __prefetch_write_hop_vec_l2(vp, site_zp2, is);
1218  __prefetch_load_hop2_buf_zt_l2(buf2_zm, ibf_zp2, is, Nin4H);
1219 
1220  if (!((it == 0) || (it == Nt - 1))) {
1221  __prefetch_write_hop_vec_l1(vp, site_zp1, is);
1222  __prefetch_write_hop1_buf_zt_l1(buf2_zm, ibf_zp1, is, Nin4H);
1223  }
1224 
1225  mult_wilson_zm2(&vp2[Nin4 * is], &buf2_zm[ibf + Nin4H * is]);
1226  }
1227  }
1228  }
1229 
1230  idir = 3;
1231  if (do_comm[idir] == 1) {
1232  if (it == Nt - 1) {
1233  int ibf = Nin5H * ixyz;
1234 
1235  __prefetch_load_hop_u_l2(up, ieo + 2 * idir, site_tp2);
1236 
1237  real_t *u2 = &up[VLEN * NDF * site + NvU2 * (ieo + 2 * idir)];
1238  for (int is = 0; is < Ns; ++is) {
1239  __prefetch_write_hop_vec_l2(vp, site_tp2, is);
1240  __prefetch_load_hop2_buf_zt_l2(buf2_tp, ibf_tp2, is, Nin4H);
1241 
1242  __prefetch_write_hop_vec_l1(vp, site_tp1, is);
1243  __prefetch_load_hop2_buf_zt_l1(buf2_tp, ibf_tp1, is, Nin4H);
1244 
1245  mult_wilson_tp2_dirac(&vp2[Nin4 * is], u2, &buf2_tp[ibf + Nin4H * is]);
1246  }
1247  }
1248 
1249  if (it == 0) {
1250  int ibf = Nin5H * ixyz;
1251 
1252 
1253  for (int is = 0; is < Ns; ++is) {
1254  __prefetch_write_hop_vec_l2(vp, site_tp2, is);
1255  __prefetch_load_hop2_buf_zt_l2(buf2_tp, ibf_tp2, is, Nin4H);
1256 
1257  __prefetch_write_hop_vec_l1(vp, site_tp1, is);
1258  __prefetch_load_hop2_buf_zt_l1(buf2_tp, ibf_tp1, is, Nin4H);
1259 
1260  mult_wilson_tm2_dirac(&vp2[Nin4 * is], &buf2_tm[ibf + Nin4H * is]);
1261  }
1262  }
1263  }
1264  }
1265 }
1266 
1267 
1268 //====================================================================
1270  real_t *vp, real_t *up, real_t *wp,
1271  real_t *yp,
1272  real_t mq, real_t M0, int Ns, int *bc,
1273  real_t *b, real_t *c,
1274  int *Leo, int *Nsize, int *do_comm,
1275  const int ieo)
1276 {
1277  mult_domainwall_5din_eo_clear(vp, Ns, Nsize);
1278 
1279  mult_domainwall_5din_eo_5dir_dirac(yp, wp, mq, M0, Ns, bc, b, c,
1280  Nsize, do_comm);
1281 
1282  mult_domainwall_5din_eo_hopb_dirac(vp, up, yp, mq, M0, Ns, bc, b, c,
1283  Leo, Nsize, do_comm, ieo);
1284 }
1285 
1286 
1287 //====================================================================
1289  real_t *__restrict vp,
1290  real_t *__restrict wp,
1291  int Ns, int *Nsize,
1292  real_t *e, real_t *dpinv, real_t *dm)
1293 {
1294  int Nx2v = Nsize[0];
1295  int Ny = Nsize[1];
1296  int Nz = Nsize[2];
1297  int Nt = Nsize[3];
1298  int Nst2v = Nx2v * Ny * Nz * Nt;
1299  int Nst2 = Nst2v * VLEN;
1300 
1301  int Nin4 = VLEN * NVCD;
1302  int Nin5 = Nin4 * Ns;
1303 
1304 
1305  int ith, nth, site0, site1;
1306  set_threadtask(ith, nth, site0, site1, Nst2v);
1307 
1308  for (int site = site0; site < site1; ++site) {
1309  real_t *vp2 = &vp[Nin5 * site];
1310  real_t *wp2 = &wp[Nin5 * site];
1311  svbool_t pg = set_predicate();
1312 
1313  for (int ic = 0; ic < NC; ++ic) {
1314  svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1315  svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1316  svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1317  int offset0 = 2 * ND * ic;
1318 
1319  __prefetch_load_luinv_l1(wp, offset0);
1320  __prefetch_write_luinv_l1(vp, offset0);
1321 
1322  load_vec(pg, v1r, &wp2[VLEN * (offset0 + 0)]);
1323  load_vec(pg, v1i, &wp2[VLEN * (offset0 + 1)]);
1324  load_vec(pg, v2r, &wp2[VLEN * (offset0 + 2)]);
1325  load_vec(pg, v2i, &wp2[VLEN * (offset0 + 3)]);
1326  load_vec(pg, v3r, &wp2[VLEN * (offset0 + 4)]);
1327  load_vec(pg, v3i, &wp2[VLEN * (offset0 + 5)]);
1328  load_vec(pg, v4r, &wp2[VLEN * (offset0 + 6)]);
1329  load_vec(pg, v4i, &wp2[VLEN * (offset0 + 7)]);
1330  save_vec(pg, &vp2[VLEN * (offset0 + 0)], v1r);
1331  save_vec(pg, &vp2[VLEN * (offset0 + 1)], v1i);
1332  save_vec(pg, &vp2[VLEN * (offset0 + 2)], v2r);
1333  save_vec(pg, &vp2[VLEN * (offset0 + 3)], v2i);
1334  save_vec(pg, &vp2[VLEN * (offset0 + 4)], v3r);
1335  save_vec(pg, &vp2[VLEN * (offset0 + 5)], v3i);
1336  save_vec(pg, &vp2[VLEN * (offset0 + 6)], v4r);
1337  save_vec(pg, &vp2[VLEN * (offset0 + 7)], v4i);
1338  y1r = v1r;
1339  scal_vec(pg, y1r, e[0]);
1340  y1i = v1i;
1341  scal_vec(pg, y1i, e[0]);
1342  y2r = v2r;
1343  scal_vec(pg, y2r, e[0]);
1344  y2i = v2i;
1345  scal_vec(pg, y2i, e[0]);
1346  y3r = v3r;
1347  scal_vec(pg, y3r, e[0]);
1348  y3i = v3i;
1349  scal_vec(pg, y3i, e[0]);
1350  y4r = v4r;
1351  scal_vec(pg, y4r, e[0]);
1352  y4i = v4i;
1353  scal_vec(pg, y4i, e[0]);
1354 
1355  for (int is = 1; is < Ns - 1; ++is) {
1356  x1r = v1r;
1357  x1i = v1i;
1358  x2r = v2r;
1359  x2i = v2i;
1360  x3r = v3r;
1361  x3i = v3i;
1362  x4r = v4r;
1363  x4i = v4i;
1364 
1365  int offset = 2 * ND * ic + NVCD * is;
1366 
1367 
1368  __prefetch_load_luinv_l1(wp, offset);
1369  __prefetch_write_luinv_l1(vp, offset);
1370 
1371  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1372  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1373  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1374  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1375  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1376  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1377  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1378  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1379 
1380  real_t a = real_t(0.5) * dm[is] * dpinv[is - 1];
1381 
1382  add_aPp5_dirac_vec(pg,
1383  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1384  a,
1385  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1386  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1387  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1388  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1389  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1390  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1391  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1392  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1393  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1394  axpy_vec(pg, y1r, e[is], v1r);
1395  axpy_vec(pg, y1i, e[is], v1i);
1396  axpy_vec(pg, y2r, e[is], v2r);
1397  axpy_vec(pg, y2i, e[is], v2i);
1398  axpy_vec(pg, y3r, e[is], v3r);
1399  axpy_vec(pg, y3i, e[is], v3i);
1400  axpy_vec(pg, y4r, e[is], v4r);
1401  axpy_vec(pg, y4i, e[is], v4i);
1402  }
1403  int is = Ns - 1;
1404  x1r = v1r;
1405  x1i = v1i;
1406  x2r = v2r;
1407  x2i = v2i;
1408  x3r = v3r;
1409  x3i = v3i;
1410  x4r = v4r;
1411  x4i = v4i;
1412 
1413  int offset = 2 * ND * ic + NVCD * is;
1414  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1415  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1416  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1417  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1418  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1419  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1420  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1421  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1422  real_t a = real_t(0.5) * dm[is] * dpinv[is - 1];
1423  add_aPp5_dirac_vec(pg,
1424  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1425  a,
1426  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1427 
1428  add_aPm5_dirac_vec(pg,
1429  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1430  real_t(-0.5),
1431  y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
1432  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1433  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1434  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1435  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1436  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1437  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1438  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1439  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1440  } //ic
1441  } //site
1442 
1443 #pragma omp barrier
1444 }
1445 
1446 
1447 //====================================================================
1449  real_t *__restrict vp,
1450  real_t *__restrict wp,
1451  int Ns, int *Nsize,
1452  real_t *f, real_t *dpinv, real_t *dm)
1453 {
1454  int Nx2v = Nsize[0];
1455  int Ny = Nsize[1];
1456  int Nz = Nsize[2];
1457  int Nt = Nsize[3];
1458  int Nst2v = Nx2v * Ny * Nz * Nt;
1459  int Nst2 = Nst2v * VLEN;
1460 
1461  int Nin4 = VLEN * NVCD;
1462  int Nin5 = Nin4 * Ns;
1463 
1464 
1465  int ith, nth, site0, site1;
1466  set_threadtask(ith, nth, site0, site1, Nst2v);
1467 
1468  for (int site = site0; site < site1; ++site) {
1469  real_t *vp2 = &vp[Nin5 * site];
1470  real_t *wp2 = &wp[Nin5 * site];
1471  svbool_t pg = set_predicate();
1472 
1473  for (int ic = 0; ic < NC; ++ic) {
1474  svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1475  svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1476  svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1477 
1478  int offset0 = 2 * ND * ic + NVCD * (Ns - 1);
1479 
1480  __prefetch_load_luinv_l1(wp, offset0);
1481  __prefetch_write_luinv_l1(vp, offset0);
1482 
1483  load_vec(pg, v1r, &wp2[VLEN * (offset0 + 0)]);
1484  load_vec(pg, v1i, &wp2[VLEN * (offset0 + 1)]);
1485  load_vec(pg, v2r, &wp2[VLEN * (offset0 + 2)]);
1486  load_vec(pg, v2i, &wp2[VLEN * (offset0 + 3)]);
1487  load_vec(pg, v3r, &wp2[VLEN * (offset0 + 4)]);
1488  load_vec(pg, v3i, &wp2[VLEN * (offset0 + 5)]);
1489  load_vec(pg, v4r, &wp2[VLEN * (offset0 + 6)]);
1490  load_vec(pg, v4i, &wp2[VLEN * (offset0 + 7)]);
1491 
1492  real_t a = dpinv[Ns - 1];
1493 
1494  scal_vec(pg, v1r, a);
1495  scal_vec(pg, v1i, a);
1496  scal_vec(pg, v2r, a);
1497  scal_vec(pg, v2i, a);
1498  scal_vec(pg, v3r, a);
1499  scal_vec(pg, v3i, a);
1500  scal_vec(pg, v4r, a);
1501  scal_vec(pg, v4i, a);
1502 
1503  save_vec(pg, &vp2[VLEN * (offset0 + 0)], v1r);
1504  save_vec(pg, &vp2[VLEN * (offset0 + 1)], v1i);
1505  save_vec(pg, &vp2[VLEN * (offset0 + 2)], v2r);
1506  save_vec(pg, &vp2[VLEN * (offset0 + 3)], v2i);
1507  save_vec(pg, &vp2[VLEN * (offset0 + 4)], v3r);
1508  save_vec(pg, &vp2[VLEN * (offset0 + 5)], v3i);
1509  save_vec(pg, &vp2[VLEN * (offset0 + 6)], v4r);
1510  save_vec(pg, &vp2[VLEN * (offset0 + 7)], v4i);
1511 
1512  set_aPp5_dirac_vec(pg,
1513  y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
1514  real_t(0.5),
1515  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
1516  for (int is = Ns - 2; is >= 0; --is) {
1517  x1r = v1r;
1518  x1i = v1i;
1519  x2r = v2r;
1520  x2i = v2i;
1521  x3r = v3r;
1522  x3i = v3i;
1523  x4r = v4r;
1524  x4i = v4i;
1525 
1526  int offset = 2 * ND * ic + NVCD * is;
1527 
1528  __prefetch_load_luinv_l1(wp, offset);
1529  __prefetch_write_luinv_l1(vp, offset);
1530 
1531  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1532  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1533  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1534  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1535  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1536  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1537  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1538  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1539 
1540  real_t a = real_t(0.5) * dm[is];
1541 
1542  add_aPm5_dirac_vec(pg,
1543  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1544  a,
1545  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1546 
1547  axpy_vec(pg, v1r, -f[is], y1r);
1548  axpy_vec(pg, v1i, -f[is], y1i);
1549  axpy_vec(pg, v2r, -f[is], y2r);
1550  axpy_vec(pg, v2i, -f[is], y2i);
1551  axpy_vec(pg, v3r, -f[is], y3r);
1552  axpy_vec(pg, v3i, -f[is], y3i);
1553  axpy_vec(pg, v4r, -f[is], y4r);
1554  axpy_vec(pg, v4i, -f[is], y4i);
1555 
1556  real_t aa = dpinv[is];
1557  scal_vec(pg, v1r, aa);
1558  scal_vec(pg, v1i, aa);
1559  scal_vec(pg, v2r, aa);
1560  scal_vec(pg, v2i, aa);
1561  scal_vec(pg, v3r, aa);
1562  scal_vec(pg, v3i, aa);
1563  scal_vec(pg, v4r, aa);
1564  scal_vec(pg, v4i, aa);
1565 
1566  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1567  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1568  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1569  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1570  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1571  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1572  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1573  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1574  }
1575  } //ic
1576  } //site
1577 #pragma omp barrier
1578 }
1579 
1580 
1581 //====================================================================
1583  real_t *__restrict vp,
1584  real_t *__restrict wp,
1585  int Ns, int *Nsize,
1586  real_t *f, real_t *dpinv, real_t *dm)
1587 {
1588  int Nx2v = Nsize[0];
1589  int Ny = Nsize[1];
1590  int Nz = Nsize[2];
1591  int Nt = Nsize[3];
1592  int Nst2v = Nx2v * Ny * Nz * Nt;
1593  int Nst2 = Nst2v * VLEN;
1594 
1595  int Nin4 = VLEN * NVCD;
1596  int Nin5 = Nin4 * Ns;
1597 
1598 
1599  int ith, nth, site0, site1;
1600  set_threadtask(ith, nth, site0, site1, Nst2v);
1601 
1602  for (int site = site0; site < site1; ++site) {
1603  real_t *vp2 = &vp[Nin5 * site];
1604  real_t *wp2 = &wp[Nin5 * site];
1605  svbool_t pg = set_predicate();
1606 
1607  for (int ic = 0; ic < NC; ++ic) {
1608  svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1609  svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1610  svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1611  int offset0 = 2 * ND * ic;
1612 
1613  __prefetch_load_luinv_l1(wp, offset0);
1614  __prefetch_write_luinv_l1(vp, offset0);
1615 
1616  load_vec(pg, v1r, &wp2[VLEN * (offset0 + 0)]);
1617  load_vec(pg, v1i, &wp2[VLEN * (offset0 + 1)]);
1618  load_vec(pg, v2r, &wp2[VLEN * (offset0 + 2)]);
1619  load_vec(pg, v2i, &wp2[VLEN * (offset0 + 3)]);
1620  load_vec(pg, v3r, &wp2[VLEN * (offset0 + 4)]);
1621  load_vec(pg, v3i, &wp2[VLEN * (offset0 + 5)]);
1622  load_vec(pg, v4r, &wp2[VLEN * (offset0 + 6)]);
1623  load_vec(pg, v4i, &wp2[VLEN * (offset0 + 7)]);
1624 
1625  real_t a = dpinv[0];
1626  scal_vec(pg, v1r, a);
1627  scal_vec(pg, v1i, a);
1628  scal_vec(pg, v2r, a);
1629  scal_vec(pg, v2i, a);
1630  scal_vec(pg, v3r, a);
1631  scal_vec(pg, v3i, a);
1632  scal_vec(pg, v4r, a);
1633  scal_vec(pg, v4i, a);
1634 
1635  save_vec(pg, &vp2[VLEN * (offset0 + 0)], v1r);
1636  save_vec(pg, &vp2[VLEN * (offset0 + 1)], v1i);
1637  save_vec(pg, &vp2[VLEN * (offset0 + 2)], v2r);
1638  save_vec(pg, &vp2[VLEN * (offset0 + 3)], v2i);
1639  save_vec(pg, &vp2[VLEN * (offset0 + 4)], v3r);
1640  save_vec(pg, &vp2[VLEN * (offset0 + 5)], v3i);
1641  save_vec(pg, &vp2[VLEN * (offset0 + 6)], v4r);
1642  save_vec(pg, &vp2[VLEN * (offset0 + 7)], v4i);
1643  y1r = v1r;
1644  scal_vec(pg, y1r, f[0]);
1645  y1i = v1i;
1646  scal_vec(pg, y1i, f[0]);
1647  y2r = v2r;
1648  scal_vec(pg, y2r, f[0]);
1649  y2i = v2i;
1650  scal_vec(pg, y2i, f[0]);
1651  y3r = v3r;
1652  scal_vec(pg, y3r, f[0]);
1653  y3i = v3i;
1654  scal_vec(pg, y3i, f[0]);
1655  y4r = v4r;
1656  scal_vec(pg, y4r, f[0]);
1657  y4i = v4i;
1658  scal_vec(pg, y4i, f[0]);
1659 
1660  for (int is = 1; is < Ns - 1; ++is) {
1661  x1r = v1r;
1662  x1i = v1i;
1663  x2r = v2r;
1664  x2i = v2i;
1665  x3r = v3r;
1666  x3i = v3i;
1667  x4r = v4r;
1668  x4i = v4i;
1669 
1670  int offset = 2 * ND * ic + NVCD * is;
1671 
1672  __prefetch_load_luinv_l1(wp, offset);
1673  __prefetch_write_luinv_l1(vp, offset);
1674 
1675  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1676  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1677  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1678  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1679  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1680  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1681  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1682  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1683 
1684  real_t a = real_t(0.5) * dm[is - 1];
1685 
1686  add_aPm5_dirac_vec(pg,
1687  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1688  a,
1689  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1690 
1691  real_t aa = dpinv[is];
1692  scal_vec(pg, v1r, aa);
1693  scal_vec(pg, v1i, aa);
1694  scal_vec(pg, v2r, aa);
1695  scal_vec(pg, v2i, aa);
1696  scal_vec(pg, v3r, aa);
1697  scal_vec(pg, v3i, aa);
1698  scal_vec(pg, v4r, aa);
1699  scal_vec(pg, v4i, aa);
1700 
1701  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1702  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1703  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1704  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1705  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1706  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1707  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1708  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1709  axpy_vec(pg, y1r, f[is], v1r);
1710  axpy_vec(pg, y1i, f[is], v1i);
1711  axpy_vec(pg, y2r, f[is], v2r);
1712  axpy_vec(pg, y2i, f[is], v2i);
1713  axpy_vec(pg, y3r, f[is], v3r);
1714  axpy_vec(pg, y3i, f[is], v3i);
1715  axpy_vec(pg, y4r, f[is], v4r);
1716  axpy_vec(pg, y4i, f[is], v4i);
1717  }
1718  int is = Ns - 1;
1719  x1r = v1r;
1720  x1i = v1i;
1721  x2r = v2r;
1722  x2i = v2i;
1723  x3r = v3r;
1724  x3i = v3i;
1725  x4r = v4r;
1726  x4i = v4i;
1727 
1728  int offset = 2 * ND * ic + NVCD * is;
1729  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1730  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1731  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1732  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1733  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1734  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1735  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1736  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1737  a = real_t(0.5) * dm[is - 1];
1738  add_aPm5_dirac_vec(pg,
1739  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1740  a,
1741  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1742  add_aPp5_dirac_vec(pg,
1743  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1744  real_t(-0.5),
1745  y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i);
1746  real_t aa = dpinv[is];
1747  scal_vec(pg, v1r, aa);
1748  scal_vec(pg, v1i, aa);
1749  scal_vec(pg, v2r, aa);
1750  scal_vec(pg, v2i, aa);
1751  scal_vec(pg, v3r, aa);
1752  scal_vec(pg, v3i, aa);
1753  scal_vec(pg, v4r, aa);
1754  scal_vec(pg, v4i, aa);
1755 
1756  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1757  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1758  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1759  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1760  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1761  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1762  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1763  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1764  } //ic
1765  } //site
1766 #pragma omp barrier
1767 }
1768 
1769 
1770 //====================================================================
1772  real_t *__restrict vp,
1773  real_t *__restrict wp,
1774  int Ns, int *Nsize,
1775  real_t *e, real_t *dpinv, real_t *dm)
1776 {
1777  int Nx2v = Nsize[0];
1778  int Ny = Nsize[1];
1779  int Nz = Nsize[2];
1780  int Nt = Nsize[3];
1781  int Nst2v = Nx2v * Ny * Nz * Nt;
1782  int Nst2 = Nst2v * VLEN;
1783 
1784  int Nin4 = VLEN * NVCD;
1785  int Nin5 = Nin4 * Ns;
1786 
1787 
1788  int ith, nth, site0, site1;
1789  set_threadtask(ith, nth, site0, site1, Nst2v);
1790 
1791  for (int site = site0; site < site1; ++site) {
1792  real_t *vp2 = &vp[Nin5 * site];
1793  real_t *wp2 = &wp[Nin5 * site];
1794  svbool_t pg = set_predicate();
1795 
1796  for (int ic = 0; ic < NC; ++ic) {
1797  svreal_t x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i;
1798  svreal_t v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i;
1799  svreal_t y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i;
1800 
1801  int offset0 = 2 * ND * ic + NVCD * (Ns - 1);
1802 
1803  __prefetch_load_luinv_l1(wp, offset0);
1804  __prefetch_write_luinv_l1(vp, offset0);
1805 
1806  load_vec(pg, v1r, &wp2[VLEN * (offset0 + 0)]);
1807  load_vec(pg, v1i, &wp2[VLEN * (offset0 + 1)]);
1808  load_vec(pg, v2r, &wp2[VLEN * (offset0 + 2)]);
1809  load_vec(pg, v2i, &wp2[VLEN * (offset0 + 3)]);
1810  load_vec(pg, v3r, &wp2[VLEN * (offset0 + 4)]);
1811  load_vec(pg, v3i, &wp2[VLEN * (offset0 + 5)]);
1812  load_vec(pg, v4r, &wp2[VLEN * (offset0 + 6)]);
1813  load_vec(pg, v4i, &wp2[VLEN * (offset0 + 7)]);
1814 
1815  save_vec(pg, &vp2[VLEN * (offset0 + 0)], v1r);
1816  save_vec(pg, &vp2[VLEN * (offset0 + 1)], v1i);
1817  save_vec(pg, &vp2[VLEN * (offset0 + 2)], v2r);
1818  save_vec(pg, &vp2[VLEN * (offset0 + 3)], v2i);
1819  save_vec(pg, &vp2[VLEN * (offset0 + 4)], v3r);
1820  save_vec(pg, &vp2[VLEN * (offset0 + 5)], v3i);
1821  save_vec(pg, &vp2[VLEN * (offset0 + 6)], v4r);
1822  save_vec(pg, &vp2[VLEN * (offset0 + 7)], v4i);
1823 
1824  set_aPm5_dirac_vec(pg,
1825  y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i,
1826  real_t(0.5),
1827  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i);
1828 
1829  for (int is = Ns - 2; is >= 0; --is) {
1830  x1r = v1r;
1831  x1i = v1i;
1832  x2r = v2r;
1833  x2i = v2i;
1834  x3r = v3r;
1835  x3i = v3i;
1836  x4r = v4r;
1837  x4i = v4i;
1838 
1839  int offset = 2 * ND * ic + NVCD * is;
1840 
1841  __prefetch_load_luinv_l1(wp, offset);
1842  __prefetch_write_luinv_l1(vp, offset);
1843 
1844  load_vec(pg, v1r, &wp2[VLEN * (offset + 0)]);
1845  load_vec(pg, v1i, &wp2[VLEN * (offset + 1)]);
1846  load_vec(pg, v2r, &wp2[VLEN * (offset + 2)]);
1847  load_vec(pg, v2i, &wp2[VLEN * (offset + 3)]);
1848  load_vec(pg, v3r, &wp2[VLEN * (offset + 4)]);
1849  load_vec(pg, v3i, &wp2[VLEN * (offset + 5)]);
1850  load_vec(pg, v4r, &wp2[VLEN * (offset + 6)]);
1851  load_vec(pg, v4i, &wp2[VLEN * (offset + 7)]);
1852 
1853  real_t a = real_t(0.5) * dm[is + 1] * dpinv[is];
1854 
1855  add_aPp5_dirac_vec(pg,
1856  v1r, v1i, v2r, v2i, v3r, v3i, v4r, v4i,
1857  a,
1858  x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i);
1859 
1860  axpy_vec(pg, v1r, -e[is], y1r);
1861  axpy_vec(pg, v1i, -e[is], y1i);
1862  axpy_vec(pg, v2r, -e[is], y2r);
1863  axpy_vec(pg, v2i, -e[is], y2i);
1864  axpy_vec(pg, v3r, -e[is], y3r);
1865  axpy_vec(pg, v3i, -e[is], y3i);
1866  axpy_vec(pg, v4r, -e[is], y4r);
1867  axpy_vec(pg, v4i, -e[is], y4i);
1868 
1869  save_vec(pg, &vp2[VLEN * (offset + 0)], v1r);
1870  save_vec(pg, &vp2[VLEN * (offset + 1)], v1i);
1871  save_vec(pg, &vp2[VLEN * (offset + 2)], v2r);
1872  save_vec(pg, &vp2[VLEN * (offset + 3)], v2i);
1873  save_vec(pg, &vp2[VLEN * (offset + 4)], v3r);
1874  save_vec(pg, &vp2[VLEN * (offset + 5)], v3i);
1875  save_vec(pg, &vp2[VLEN * (offset + 6)], v4r);
1876  save_vec(pg, &vp2[VLEN * (offset + 7)], v4i);
1877  }
1878  } //ic
1879  } //site
1880 #pragma omp barrier
1881 }
1882 
1883 
1884 #endif
1885 //============================================================END=====
BridgeQXS::mult_domainwall_5din_eo_hop2_dirac
void mult_domainwall_5din_eo_hop2_dirac(double *vp, double *up, double *wp, double *buf2_xp, double *buf2_xm, double *buf2_yp, double *buf2_ym, double *buf2_zp, double *buf2_zm, double *buf2_tp, double *buf2_tm, double mq, double M0, int Ns, int *bc, int *Leo, int *Nsize, int *do_comm, const int ieo)
Definition: mult_Domainwall_5din_eo_qxs-inc.h:916
__prefetch_load_hop2_buf_x_l1
#define __prefetch_load_hop2_buf_x_l1(a, idx, is, skip)
Definition: prefetch.h:116
__prefetch_load_hop_u_l2
#define __prefetch_load_hop_u_l2(a, dir, idx)
Definition: prefetch.h:43
NVCD
#define NVCD
Definition: define_params_SU3.h:20
VLEN
#define VLEN
Definition: bridgeQXS_Clover_coarse_double.cpp:12
NDF
#define NDF
Definition: field_F_imp_SU2-inc.h:4
Vsimd_t
Definition: vsimd_double-inc.h:13
Isimd_t
Definition: vsimd_double-inc.h:20
mult_common_th-inc.h
BridgeQXS::mult_domainwall_5din_eo_5dirdag_dirac
void mult_domainwall_5din_eo_5dirdag_dirac(double *vp, double *yp, double mq, double M0, int Ns, int *bc, double *b, double *c, int *Nsize, int *do_comm)
real_t
double real_t
Definition: bridgeQXS_Clover_coarse_double.cpp:16
__prefetch_write_hop_vec_l1
#define __prefetch_write_hop_vec_l1(a, idx, is)
Definition: prefetch.h:89
__prefetch_load_luinv_l1
#define __prefetch_load_luinv_l1(a, offset)
Definition: prefetch.h:32
BridgeQXS::mult_domainwall_5din_eo_Ldag_inv_dirac
void mult_domainwall_5din_eo_Ldag_inv_dirac(real_t *vp, real_t *wp, int Ns, int *Nsize, real_t *e, real_t *dpinv, real_t *dm)
__prefetch_write_hop1_buf_zt_l1
#define __prefetch_write_hop1_buf_zt_l1(a, idx, is, skip)
Definition: prefetch.h:160
NC
#define NC
Definition: field_F_imp_SU2-inc.h:2
__prefetch_load_hop_vec_l1
#define __prefetch_load_hop_vec_l1(a, idx, is)
Definition: prefetch.h:79
__prefetch_write_luinv_l1
#define __prefetch_write_luinv_l1(a, offset)
Definition: prefetch.h:37
ND
#define ND
Definition: field_F_imp_SU2-inc.h:5
__prefetch_load_hop2_buf_zt_l1
#define __prefetch_load_hop2_buf_zt_l1(a, idx, is, skip)
Definition: prefetch.h:126
__prefetch_write_hop1_buf_x_l1
#define __prefetch_write_hop1_buf_x_l1(a, idx, is, skip)
Definition: prefetch.h:150
__prefetch_write_hop1_buf_y_l1
#define __prefetch_write_hop1_buf_y_l1(a, idx, is, skip)
Definition: prefetch.h:155
__prefetch_load_hop2_buf_x_l2
#define __prefetch_load_hop2_buf_x_l2(a, idx, is, skip)
Definition: prefetch.h:99
BridgeQXS::mult_domainwall_5din_eo_Udag_inv_dirac
void mult_domainwall_5din_eo_Udag_inv_dirac(real_t *vp, real_t *wp, int Ns, int *Nsize, real_t *f, real_t *dpinv, real_t *dm)
BridgeQXS::mult_domainwall_5din_eo_5dir_dirac
void mult_domainwall_5din_eo_5dir_dirac(double *yp, double *wp, double mq, double M0, int Ns, int *bc, double *b, double *c, int *Nsize, int *do_comm)
BridgeQXS::mult_domainwall_5din_eo_bulk_dirac
void mult_domainwall_5din_eo_bulk_dirac(double *vp, double *up, double *wp, double *yp, double mq, double M0, int Ns, int *bc, double *b, double *c, int *Leo, int *Nsize, int *do_comm, const int ieo)
Definition: mult_Domainwall_5din_eo_qxs-inc.h:1269
VLENY
#define VLENY
Definition: bridgeQXS_Clover_coarse_double.cpp:14
__prefetch_write_hop1_buf_x_l2
#define __prefetch_write_hop1_buf_x_l2(a, idx, is, skip)
Definition: prefetch.h:133
NVC
#define NVC
Definition: fopr_Wilson_impl_SU2-inc.h:15
BridgeQXS::mult_domainwall_5din_eo_U_inv_dirac
void mult_domainwall_5din_eo_U_inv_dirac(real_t *vp, real_t *wp, int Ns, int *Nsize, real_t *f, real_t *dpinv, real_t *dm)
__prefetch_write_hop_vec_l2
#define __prefetch_write_hop_vec_l2(a, idx, is)
Definition: prefetch.h:61
svbool_t
Definition: vsimd_double-inc.h:30
BridgeQXS::mult_domainwall_5din_eo_L_inv_dirac
void mult_domainwall_5din_eo_L_inv_dirac(real_t *vp, real_t *wp, int Ns, int *Nsize, real_t *e, real_t *dpinv, real_t *dm)
BridgeQXS::mult_domainwall_5din_eo_mult_gm5_dirac
void mult_domainwall_5din_eo_mult_gm5_dirac(double *vp, double *wp, int Ns, int *Nsize)
VLENX
#define VLENX
Definition: bridgeQXS_Clover_coarse_double.cpp:13
__prefetch_load_hop_vec_l2
#define __prefetch_load_hop_vec_l2(a, idx, is)
Definition: prefetch.h:51
__prefetch_load_hop2_buf_zt_l2
#define __prefetch_load_hop2_buf_zt_l2(a, idx, is, skip)
Definition: prefetch.h:109
__prefetch_write_hop1_buf_zt_l2
#define __prefetch_write_hop1_buf_zt_l2(a, idx, is, skip)
Definition: prefetch.h:143
BridgeQXS::mult_domainwall_5din_eo_hop1_dirac
void mult_domainwall_5din_eo_hop1_dirac(double *buf1_xp, double *buf1_xm, double *buf1_yp, double *buf1_ym, double *buf1_zp, double *buf1_zm, double *buf1_tp, double *buf1_tm, double *up, double *wp, double mq, double M0, int Ns, int *bc, int *Leo, int *Nsize, int *do_comm, const int ieo)
Definition: mult_Domainwall_5din_eo_qxs-inc.h:511
__prefetch_load_hop2_buf_y_l2
#define __prefetch_load_hop2_buf_y_l2(a, idx, is, skip)
Definition: prefetch.h:104
BridgeQXS::mult_domainwall_5din_eo_hopb_dirac
void mult_domainwall_5din_eo_hopb_dirac(double *vp, double *up, double *wp, double mq, double M0, int Ns, int *bc, double *b, double *c, int *Leo, int *Nsize, int *do_comm, const int ieo)
Definition: mult_Domainwall_5din_eo_qxs-inc.h:252
ND2
#define ND2
Definition: define_params_SU3.h:18
BridgeQXS::mult_domainwall_5din_eo_clear
void mult_domainwall_5din_eo_clear(double *vp, int Ns, int *Nsize)
Definition: mult_Domainwall_5din_eo_qxs-inc.h:145
prefetch.h
__prefetch_write_hop1_buf_y_l2
#define __prefetch_write_hop1_buf_y_l2(a, idx, is, skip)
Definition: prefetch.h:138