// Bridge++  Ver. 2.0.2
// mult_Staggered_qxs-inc.h
#ifndef MULT_STAGGERED_QXS_INCLUDED
#define MULT_STAGGERED_QXS_INCLUDED

#include "mult_common_th-inc.h"

//====================================================================
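//! mult_staggered_phase: multiplies each of the Nin components of the
//! field v at every site by the site-local factor ph.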
void BridgeQXS::mult_staggered_phase(real_t *v, real_t *ph,
                                     int *Nsize, int Nin)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  svbool_t pg = set_predicate();

  for (int site = is; site < ns; ++site) {
    svreal_t vph;
    load_vec(pg, vph, &ph[VLEN * site]);

    for (int in = 0; in < Nin; ++in) {
      svreal_t vt;
      load_vec(pg, vt, &v[VLEN * (in + Nin * site)]);
      scal_vec(pg, vt, vph);
      save_vec(pg, &v[VLEN * (in + Nin * site)], vt);
    }
  }
}


//====================================================================
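//! mult_staggered_clear: sets all Nin components of the field v to zero.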
void BridgeQXS::mult_staggered_clear(real_t *v, int *Nsize, int Nin)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  svbool_t pg = set_predicate();
  svreal_t vz;
  clear_vec(pg, vz);

  for (int site = is; site < ns; ++site) {
    for (int in = 0; in < Nin; ++in) {
      //svst1(pg, &v[VLEN * (in + Nin * site)], vz);
      save_vec(pg, &v[VLEN * (in + Nin * site)], vz);
    }
  }
}


//====================================================================
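//! mult_staggered_axpby: computes v = b * v + a * w componentwise.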
void BridgeQXS::mult_staggered_axpby(real_t b, real_t *v,
                                     real_t a, real_t *w,
                                     int *Nsize, int Nin)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  svbool_t pg = set_predicate();

  for (int site = is; site < ns; ++site) {
    for (int in = 0; in < Nin; ++in) {
      svreal_t vt, wt;
      load_vec(pg, wt, &w[VLEN * (in + Nin * site)]);
      load_vec(pg, vt, &v[VLEN * (in + Nin * site)]);
      scal_vec(pg, vt, b);
      axpy_vec(pg, vt, a, wt);
      //svst1(pg, &v[VLEN * (in + Nin * site)], vt);
      save_vec(pg, &v[VLEN * (in + Nin * site)], vt);
    }
  }
}


//====================================================================
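//! mult_staggered_mult_Gn: site-diagonal multiplication v = u * w
//! with the color matrix field u.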
void BridgeQXS::mult_staggered_mult_Gn(real_t *v, real_t *u,
                                       real_t *w, int *Nsize)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  svbool_t pg = set_predicate();

  for (int site = is; site < ns; ++site) {
    svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
    load_vec(pg, wt0, &w[VLEN * (0 + NVC * site)]);
    load_vec(pg, wt1, &w[VLEN * (1 + NVC * site)]);
    load_vec(pg, wt2, &w[VLEN * (2 + NVC * site)]);
    load_vec(pg, wt3, &w[VLEN * (3 + NVC * site)]);
    load_vec(pg, wt4, &w[VLEN * (4 + NVC * site)]);
    load_vec(pg, wt5, &w[VLEN * (5 + NVC * site)]);

    for (int ic = 0; ic < NC; ++ic) {
      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (2 * ic + NDF * site)]);

      svreal_t xtr, xti;
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);

      //svst1(pg, &v[VLEN * (2*ic + NVC * site)], xtr);
      //svst1(pg, &v[VLEN * (2*ic+1 + NVC * site)], xti);
      save_vec(pg, &v[VLEN * (2 * ic + NVC * site)], xtr);
      save_vec(pg, &v[VLEN * (2 * ic + 1 + NVC * site)], xti);
    }
  }
}


//====================================================================
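//! mult_staggered_mult_Gd: site-diagonal multiplication v = u^dagger * w.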
void BridgeQXS::mult_staggered_mult_Gd(real_t *v, real_t *u, real_t *w,
                                       int *Nsize)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  svbool_t pg = set_predicate();

  for (int site = is; site < ns; ++site) {
    svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
    load_vec(pg, wt0, &w[VLEN * (0 + NVC * site)]);
    load_vec(pg, wt1, &w[VLEN * (1 + NVC * site)]);
    load_vec(pg, wt2, &w[VLEN * (2 + NVC * site)]);
    load_vec(pg, wt3, &w[VLEN * (3 + NVC * site)]);
    load_vec(pg, wt4, &w[VLEN * (4 + NVC * site)]);
    load_vec(pg, wt5, &w[VLEN * (5 + NVC * site)]);

    for (int ic = 0; ic < NC; ++ic) {
      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      load_udag(pg, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (NVC * ic + NDF * site)]);

      svreal_t xtr, xti;
      mult_udv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);

      // svst1(pg, &v[VLEN * (2*ic + NVC * site)], xtr);
      // svst1(pg, &v[VLEN * (2*ic+1 + NVC * site)], xti);
      save_vec(pg, &v[VLEN * (2 * ic + NVC * site)], xtr);
      save_vec(pg, &v[VLEN * (2 * ic + 1 + NVC * site)], xti);
    }
  }
}


//====================================================================
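//! mult_staggered_bulk: bulk (on-rank) part of the staggered operator;
//! accumulates the hopping contributions and stores
//! v2 = mq * v1 + 0.5 * jd * (hopping terms), skipping boundary neighbors
//! in directions with do_comm > 0.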
void BridgeQXS::mult_staggered_bulk(real_t *v2, real_t *up, real_t *v1,
                                    real_t mq, int jd,
                                    int *Nsize, int *do_comm)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;
  int Nst = Nstv * VLEN;
  real_t fac = 0.5 * real_t(jd);

  svbool_t pg = set_predicate();
  svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
  set_predicate_xp(pg1_xp, pg2_xp);
  set_predicate_xm(pg1_xm, pg2_xm);
  set_predicate_yp(pg1_yp, pg2_yp);
  set_predicate_ym(pg1_ym, pg2_ym);

  int Nxy = Nxv * Nyv;
  int Nxyz = Nxv * Nyv * Nz;

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  for (int site = is; site < ns; ++site) {
    int ix = site % Nxv;
    int iyzt = site / Nxv;
    int iy = iyzt % Nyv;
    int izt = site / Nxy;
    int iz = izt % Nz;
    int it = izt / Nz;
    int ixy = ix + Nxv * iy;
    int ixyz = ixy + Nxy * iz;

    svreal_t vt0, vt1, vt2, vt3, vt4, vt5;
    clear_vec(pg, vt0);
    clear_vec(pg, vt1);
    clear_vec(pg, vt2);
    clear_vec(pg, vt3);
    clear_vec(pg, vt4);
    clear_vec(pg, vt5);

    // hopping term in the +x direction
    if ((ix < Nxv - 1) || (do_comm[0] == 0)) {
      real_t *u = &up[NDF * Nst * 0];
      int nei = ix + 1 + Nxv * iyzt;
      if (ix == Nxv - 1) nei = 0 + Nxv * iyzt;
      mult_staggered_xp(pg, pg1_xp, pg2_xp,
                        vt0, vt1, vt2, vt3, vt4, vt5,
                        &u[VLEN * NDF * site],
                        &v1[VLEN * NVC * site], &v1[VLEN * NVC * nei]);
    }

    // hopping term in the -x direction
    if ((ix > 0) || (do_comm[0] == 0)) {
      real_t *u = &up[NDF * Nst * 0];
      int nei = ix - 1 + Nxv * iyzt;
      if (ix == 0) nei = Nxv - 1 + Nxv * iyzt;
      mult_staggered_xm(pg, pg1_xm, pg2_xm,
                        vt0, vt1, vt2, vt3, vt4, vt5,
                        &u[VLEN * NDF * site], &u[VLEN * NDF * nei],
                        &v1[VLEN * NVC * site], &v1[VLEN * NVC * nei]);
    }

    // hopping term in the +y direction
    if ((iy < Nyv - 1) || (do_comm[1] == 0)) {
      int iy2 = (iy + 1) % Nyv;
      int nei = ix + Nxv * (iy2 + Nyv * izt);
      real_t *u = &up[NDF * Nst * 1];
      mult_staggered_yp(pg, pg1_yp, pg2_yp,
                        vt0, vt1, vt2, vt3, vt4, vt5,
                        &u[VLEN * NDF * site],
                        &v1[VLEN * NVC * site], &v1[VLEN * NVC * nei]);
    }

    // hopping term in the -y direction
    if ((iy > 0) || (do_comm[1] == 0)) {
      int iy2 = (iy - 1 + Nyv) % Nyv;
      int nei = ix + Nxv * (iy2 + Nyv * izt);
      real_t *u = &up[NDF * Nst * 1];
      mult_staggered_ym(pg, pg1_ym, pg2_ym,
                        vt0, vt1, vt2, vt3, vt4, vt5,
                        &u[VLEN * NDF * site], &u[VLEN * NDF * nei],
                        &v1[VLEN * NVC * site], &v1[VLEN * NVC * nei]);
    }

    // hopping terms in the +/-z directions
    if ((iz < Nz - 1) || (do_comm[2] == 0)) {
      int iz2 = (iz + 1) % Nz;
      int nei = ixy + Nxy * (iz2 + Nz * it);
      mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (site + Nstv * 2)],
                        &v1[VLEN * NVC * nei]);
    }

    if ((iz > 0) || (do_comm[2] == 0)) {
      int iz2 = (iz - 1 + Nz) % Nz;
      int nei = ixy + Nxy * (iz2 + Nz * it);
      mult_staggered_dn(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (nei + Nstv * 2)],
                        &v1[VLEN * NVC * nei]);
    }

    // hopping terms in the +/-t directions
    if ((it < Nt - 1) || (do_comm[3] == 0)) {
      int it2 = (it + 1) % Nt;
      int nei = ixyz + Nxyz * it2;
      mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (site + Nstv * 3)],
                        &v1[VLEN * NVC * nei]);
    }

    if ((it > 0) || (do_comm[3] == 0)) {
      int it2 = (it - 1 + Nt) % Nt;
      int nei = ixyz + Nxyz * it2;
      mult_staggered_dn(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (nei + Nstv * 3)],
                        &v1[VLEN * NVC * nei]);
    }

    // v2 = mq * v1 + fac * (accumulated hopping terms)
    svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
    load_vec(pg, wt0, &v1[VLEN * (0 + NVC * site)]);
    load_vec(pg, wt1, &v1[VLEN * (1 + NVC * site)]);
    load_vec(pg, wt2, &v1[VLEN * (2 + NVC * site)]);
    load_vec(pg, wt3, &v1[VLEN * (3 + NVC * site)]);
    load_vec(pg, wt4, &v1[VLEN * (4 + NVC * site)]);
    load_vec(pg, wt5, &v1[VLEN * (5 + NVC * site)]);

    scal_vec(pg, vt0, fac);
    scal_vec(pg, vt1, fac);
    scal_vec(pg, vt2, fac);
    scal_vec(pg, vt3, fac);
    scal_vec(pg, vt4, fac);
    scal_vec(pg, vt5, fac);

    axpy_vec(pg, vt0, mq, wt0);
    axpy_vec(pg, vt1, mq, wt1);
    axpy_vec(pg, vt2, mq, wt2);
    axpy_vec(pg, vt3, mq, wt3);
    axpy_vec(pg, vt4, mq, wt4);
    axpy_vec(pg, vt5, mq, wt5);

    save_vec(pg, &v2[VLEN * (0 + NVC * site)], vt0);
    save_vec(pg, &v2[VLEN * (1 + NVC * site)], vt1);
    save_vec(pg, &v2[VLEN * (2 + NVC * site)], vt2);
    save_vec(pg, &v2[VLEN * (3 + NVC * site)], vt3);
    save_vec(pg, &v2[VLEN * (4 + NVC * site)], vt4);
    save_vec(pg, &v2[VLEN * (5 + NVC * site)], vt5);
  }
}


//====================================================================
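//! mult_staggered_1: packs the boundary faces of v1 into the communication
//! buffers for each direction with do_comm > 0; the lower face is copied
//! as is, the upper face is pre-multiplied by u^dagger.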
void BridgeQXS::mult_staggered_1(real_t *buf_xp, real_t *buf_xm,
                                 real_t *buf_yp, real_t *buf_ym,
                                 real_t *buf_zp, real_t *buf_zm,
                                 real_t *buf_tp, real_t *buf_tm,
                                 real_t *up, real_t *v1,
                                 int *Nsize, int *do_comm)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;
  int Nst = Nstv * VLEN;

  int Nxy = Nxv * Nyv;
  int Nxyz = Nxv * Nyv * Nz;

  svbool_t pg = set_predicate();
  svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
  set_predicate_xp(pg1_xp, pg2_xp);
  set_predicate_xm(pg1_xm, pg2_xm);
  set_predicate_yp(pg1_yp, pg2_yp);
  set_predicate_ym(pg1_ym, pg2_ym);
  svint_t svidx_xp, svidx_xm;
  set_index_xp(svidx_xp);
  set_index_xm(svidx_xm);


  if (do_comm[0] > 0) {  // pack the x-boundary faces
    int idir = 0;
    real_t *u = &up[NDF * Nst * idir];

    int Nyzt = Nyv * Nz * Nt;
    int ith, nth, isx, nsx;
    set_threadtask(ith, nth, isx, nsx, Nyzt);

    for (int iyzt = isx; iyzt < nsx; ++iyzt) {
      {
        int ix = 0;
        int site = ix + Nxv * iyzt;
        real_t *buf = &buf_xp[VLENY * NVC * iyzt];

        set_index_xm(svidx_xm);
        for (int ivc = 0; ivc < NVC; ++ivc) {
          svreal_t wt;
          load_vec(pg2_xm, wt, &v1[VLEN * (ivc + NVC * site)]);
          //svst1_scatter_index(pg2_xm, &buf[VLENY*ivc], svidx_xm, wt);
          save_vec_scatter(pg2_xm, &buf[VLENY * ivc], wt, svidx_xm);
        }
      }
      {
        int ix = Nxv - 1;
        int site = ix + Nxv * iyzt;
        real_t *buf = &buf_xm[VLENY * NVC * iyzt];

        svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
        load_vec(pg2_xp, wt0, &v1[VLEN * (0 + NVC * site)]);
        load_vec(pg2_xp, wt1, &v1[VLEN * (1 + NVC * site)]);
        load_vec(pg2_xp, wt2, &v1[VLEN * (2 + NVC * site)]);
        load_vec(pg2_xp, wt3, &v1[VLEN * (3 + NVC * site)]);
        load_vec(pg2_xp, wt4, &v1[VLEN * (4 + NVC * site)]);
        load_vec(pg2_xp, wt5, &v1[VLEN * (5 + NVC * site)]);

        set_index_xp(svidx_xp);
        for (int ic = 0; ic < NC; ++ic) {
          svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
          svreal_t xtr, xti;
          load_udag(pg2_xp, ut0, ut1, ut2, ut3, ut4, ut5,
                    &u[VLEN * (NVC * ic + NDF * site)]);
          mult_udv(pg2_xp, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
                   wt0, wt1, wt2, wt3, wt4, wt5);
          //svst1_scatter_index(pg2_xp, &buf[VLENY*(2*ic) ], svidx_xp, xtr);
          //svst1_scatter_index(pg2_xp, &buf[VLENY*(2*ic+1)], svidx_xp, xti);
          save_vec_scatter(pg2_xp, &buf[VLENY * (2 * ic)], xtr, svidx_xp);
          save_vec_scatter(pg2_xp, &buf[VLENY * (2 * ic + 1)], xti, svidx_xp);
        }
      }
    }
  }

  if (do_comm[1] > 0) {  // pack the y-boundary faces
    int idir = 1;
    real_t *u = &up[NDF * Nst * idir];

    int Nxzt = Nxv * Nz * Nt;
    int ith, nth, isy, nsy;
    set_threadtask(ith, nth, isy, nsy, Nxzt);

    for (int ixzt = isy; ixzt < nsy; ++ixzt) {
      int ix = ixzt % Nxv;
      int izt = ixzt / Nxv;
      {
        int iy = 0;
        int site = ix + Nxv * (iy + Nyv * izt);
        real_t *buf = &buf_yp[VLENX * NVC * ixzt];
        for (int ivc = 0; ivc < NVC; ++ivc) {
          svreal_t wt;
          load_vec(pg2_ym, wt, &v1[VLEN * (ivc + NVC * site)]);
          //svst1(pg2_ym, &buf[VLENX*ivc], wt);
          save_vec(pg2_ym, &buf[VLENX * ivc], wt);
        }
      }
      {
        int iy = Nyv - 1;
        int site = ix + Nxv * (iy + Nyv * izt);
        real_t *buf = &buf_ym[VLENX * NVC * ixzt];

        svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
        load_vec(pg2_yp, wt0, &v1[VLEN * (0 + NVC * site)]);
        load_vec(pg2_yp, wt1, &v1[VLEN * (1 + NVC * site)]);
        load_vec(pg2_yp, wt2, &v1[VLEN * (2 + NVC * site)]);
        load_vec(pg2_yp, wt3, &v1[VLEN * (3 + NVC * site)]);
        load_vec(pg2_yp, wt4, &v1[VLEN * (4 + NVC * site)]);
        load_vec(pg2_yp, wt5, &v1[VLEN * (5 + NVC * site)]);

        for (int ic = 0; ic < NC; ++ic) {
          svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
          svreal_t xtr, xti;
          load_udag(pg2_yp, ut0, ut1, ut2, ut3, ut4, ut5,
                    &u[VLEN * (NVC * ic + NDF * site)]);
          mult_udv(pg2_yp, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
                   wt0, wt1, wt2, wt3, wt4, wt5);
          int offset = -VLENX * (VLENY - 1);
          //svst1(pg2_yp, &buf[offset + VLENX*(2*ic) ], xtr);
          //svst1(pg2_yp, &buf[offset + VLENX*(2*ic+1)], xti);
          save_vec(pg2_yp, &buf[offset + VLENX * (2 * ic)], xtr);
          save_vec(pg2_yp, &buf[offset + VLENX * (2 * ic + 1)], xti);
        }
      }
    }
  }

  if (do_comm[2] > 0) {  // pack the z-boundary faces
    int idir = 2;

    int Nxyt = Nxv * Nyv * Nt;
    int ith, nth, isz, nsz;
    set_threadtask(ith, nth, isz, nsz, Nxyt);

    for (int ixyt = isz; ixyt < nsz; ++ixyt) {
      int ixy = ixyt % Nxy;
      int it = ixyt / Nxy;
      {
        int iz = 0;
        int site = ixy + Nxy * (iz + Nz * it);
        real_t *buf = &buf_zp[VLEN * NVC * ixyt];
        for (int ivc = 0; ivc < NVC; ++ivc) {
          svreal_t wt;
          load_vec(pg, wt, &v1[VLEN * (ivc + NVC * site)]);
          //svst1(pg, &buf[VLEN * ivc], wt);
          save_vec(pg, &buf[VLEN * ivc], wt);
        }
      }
      {
        int iz = Nz - 1;
        int site = ixy + Nxy * (iz + Nz * it);
        mult_staggered_dn1(pg, &buf_zm[VLEN * NVC * ixyt],
                           &up[VLEN * NDF * (site + Nstv * 2)],
                           &v1[VLEN * NVC * site]);
      }
    }
  }

  if (do_comm[3] > 0) {  // pack the t-boundary faces
    int idir = 3;

    int ith, nth, ist, nst;
    set_threadtask(ith, nth, ist, nst, Nxyz);

    for (int ixyz = ist; ixyz < nst; ++ixyz) {
      {
        int it = 0;
        int site = ixyz + Nxyz * it;
        real_t *buf = &buf_tp[VLEN * NVC * ixyz];
        for (int ivc = 0; ivc < NVC; ++ivc) {
          svreal_t wt;
          load_vec(pg, wt, &v1[VLEN * (ivc + NVC * site)]);
          //svst1(pg, &buf[VLEN * ivc], wt);
          save_vec(pg, &buf[VLEN * ivc], wt);
        }
      }
      {
        int it = Nt - 1;
        int site = ixyz + Nxyz * it;
        mult_staggered_dn1(pg, &buf_tm[VLEN * NVC * ixyz],
                           &up[VLEN * NDF * (site + Nstv * 3)],
                           &v1[VLEN * NVC * site]);
      }
    }
  }
}


//====================================================================
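//! mult_staggered_2: applies the remaining hopping contributions on the
//! boundary sites using the received buffers and accumulates
//! 0.5 * jd * (contribution) into v2.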
void BridgeQXS::mult_staggered_2(real_t *v2, real_t *up, real_t *v1,
                                 real_t *buf_xp, real_t *buf_xm,
                                 real_t *buf_yp, real_t *buf_ym,
                                 real_t *buf_zp, real_t *buf_zm,
                                 real_t *buf_tp, real_t *buf_tm,
                                 real_t qm, int jd,
                                 int *Nsize, int *do_comm)
{
  int Nxv = Nsize[0];
  int Nyv = Nsize[1];
  int Nz = Nsize[2];
  int Nt = Nsize[3];
  int Nstv = Nxv * Nyv * Nz * Nt;
  int Nst = Nstv * VLEN;

  int Nxy = Nxv * Nyv;
  int Nxyz = Nxv * Nyv * Nz;

  real_t fac = 0.5 * real_t(jd);

  svbool_t pg = set_predicate();
  svbool_t pg1_xp, pg2_xp, pg1_xm, pg2_xm;
  svbool_t pg1_yp, pg2_yp, pg1_ym, pg2_ym;
  set_predicate_xp(pg1_xp, pg2_xp);
  set_predicate_xm(pg1_xm, pg2_xm);
  set_predicate_yp(pg1_yp, pg2_yp);
  set_predicate_ym(pg1_ym, pg2_ym);
  svint_t svidx_xp, svidx_xm;
  set_index_xp(svidx_xp);
  set_index_xm(svidx_xm);

  int ith, nth, is, ns;
  set_threadtask(ith, nth, is, ns, Nstv);

  for (int site = is; site < ns; ++site) {
    int ix = site % Nxv;
    int iyzt = site / Nxv;
    int iy = iyzt % Nyv;
    int izt = site / Nxy;
    int iz = izt % Nz;
    int it = izt / Nz;
    int ixy = ix + Nxv * iy;
    int ixyz = ixy + Nxy * iz;

    svreal_t vt0, vt1, vt2, vt3, vt4, vt5;
    clear_vec(pg, vt0);
    clear_vec(pg, vt1);
    clear_vec(pg, vt2);
    clear_vec(pg, vt3);
    clear_vec(pg, vt4);
    clear_vec(pg, vt5);

    int opr_any = 0;

    // +x boundary: in-vector sites combined with the received buffer
    if ((ix == Nxv - 1) && (do_comm[0] > 0)) {
      real_t *u = &up[NDF * Nst * 0];
      real_t *buf = &buf_xp[VLENY * NVC * iyzt];

      set_index_xp(svidx_xp);
      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg1_xp, wt0, &v1[VLEN * (0 + NVC * site) + 1]);
      load_vec(pg1_xp, wt1, &v1[VLEN * (1 + NVC * site) + 1]);
      load_vec(pg1_xp, wt2, &v1[VLEN * (2 + NVC * site) + 1]);
      load_vec(pg1_xp, wt3, &v1[VLEN * (3 + NVC * site) + 1]);
      load_vec(pg1_xp, wt4, &v1[VLEN * (4 + NVC * site) + 1]);
      load_vec(pg1_xp, wt5, &v1[VLEN * (5 + NVC * site) + 1]);

      load_add_gather(pg2_xp, wt0, &buf[VLENY * 0], svidx_xp);
      load_add_gather(pg2_xp, wt1, &buf[VLENY * 1], svidx_xp);
      load_add_gather(pg2_xp, wt2, &buf[VLENY * 2], svidx_xp);
      load_add_gather(pg2_xp, wt3, &buf[VLENY * 3], svidx_xp);
      load_add_gather(pg2_xp, wt4, &buf[VLENY * 4], svidx_xp);
      load_add_gather(pg2_xp, wt5, &buf[VLENY * 5], svidx_xp);

      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      svreal_t xtr, xti;
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (0 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt0, xtr);
      add_vec(pg, vt1, xti);
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (2 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt2, xtr);
      add_vec(pg, vt3, xti);
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (4 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt4, xtr);
      add_vec(pg, vt5, xti);

      ++opr_any;
    }

    // -x boundary
    if ((ix == 0) && (do_comm[0] > 0)) {
      real_t *u = &up[NDF * Nst * 0];
      real_t *buf = &buf_xm[VLENY * NVC * iyzt];

      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg1_xm, wt0, &v1[VLEN * (0 + NVC * site) - 1]);
      load_vec(pg1_xm, wt1, &v1[VLEN * (1 + NVC * site) - 1]);
      load_vec(pg1_xm, wt2, &v1[VLEN * (2 + NVC * site) - 1]);
      load_vec(pg1_xm, wt3, &v1[VLEN * (3 + NVC * site) - 1]);
      load_vec(pg1_xm, wt4, &v1[VLEN * (4 + NVC * site) - 1]);
      load_vec(pg1_xm, wt5, &v1[VLEN * (5 + NVC * site) - 1]);

      set_index_xm(svidx_xm);
      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      svreal_t xtr, xti;
      load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (0 + NDF * site) - 1]);
      mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add_gather(pg2_xm, xtr, &buf[VLENY * 0], svidx_xm);
      load_add_gather(pg2_xm, xti, &buf[VLENY * 1], svidx_xm);
      sub_vec(pg, vt0, xtr);
      sub_vec(pg, vt1, xti);

      load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (NVC + NDF * site) - 1]);
      mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add_gather(pg2_xm, xtr, &buf[VLENY * 2], svidx_xm);
      load_add_gather(pg2_xm, xti, &buf[VLENY * 3], svidx_xm);
      sub_vec(pg, vt2, xtr);
      sub_vec(pg, vt3, xti);

      load_udag(pg1_xm, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (2 * NVC + NDF * site) - 1]);
      mult_udv(pg1_xm, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add_gather(pg2_xm, xtr, &buf[VLENY * 4], svidx_xm);
      load_add_gather(pg2_xm, xti, &buf[VLENY * 5], svidx_xm);
      sub_vec(pg, vt4, xtr);
      sub_vec(pg, vt5, xti);

      ++opr_any;
    }

    // +y boundary
    if ((iy == Nyv - 1) && (do_comm[1] > 0)) {
      real_t *u = &up[NDF * Nst * 1];
      int ixzt = ix + Nxv * izt;
      real_t *buf = &buf_yp[VLENX * NVC * ixzt];

      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg1_yp, wt0, &v1[VLEN * (0 + NVC * site) + VLENX]);
      load_vec(pg1_yp, wt1, &v1[VLEN * (1 + NVC * site) + VLENX]);
      load_vec(pg1_yp, wt2, &v1[VLEN * (2 + NVC * site) + VLENX]);
      load_vec(pg1_yp, wt3, &v1[VLEN * (3 + NVC * site) + VLENX]);
      load_vec(pg1_yp, wt4, &v1[VLEN * (4 + NVC * site) + VLENX]);
      load_vec(pg1_yp, wt5, &v1[VLEN * (5 + NVC * site) + VLENX]);

      int offset = -VLENX * (VLENY - 1);
      load_add(pg2_yp, wt0, &buf[offset + VLENX * 0]);
      load_add(pg2_yp, wt1, &buf[offset + VLENX * 1]);
      load_add(pg2_yp, wt2, &buf[offset + VLENX * 2]);
      load_add(pg2_yp, wt3, &buf[offset + VLENX * 3]);
      load_add(pg2_yp, wt4, &buf[offset + VLENX * 4]);
      load_add(pg2_yp, wt5, &buf[offset + VLENX * 5]);
      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      svreal_t xtr, xti;
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (0 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt0, xtr);
      add_vec(pg, vt1, xti);
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (2 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt2, xtr);
      add_vec(pg, vt3, xti);
      load_u(pg, ut0, ut1, ut2, ut3, ut4, ut5,
             &u[VLEN * (4 + NDF * site)]);
      mult_uv(pg, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
              wt0, wt1, wt2, wt3, wt4, wt5);
      add_vec(pg, vt4, xtr);
      add_vec(pg, vt5, xti);

      ++opr_any;
    }

    // -y boundary
    if ((iy == 0) && (do_comm[1] > 0)) {
      real_t *u = &up[NDF * Nst * 1];
      int ixzt = ix + Nxv * izt;
      real_t *buf = &buf_ym[VLENX * NVC * ixzt];

      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg1_ym, wt0, &v1[VLEN * (0 + NVC * site) - VLENX]);
      load_vec(pg1_ym, wt1, &v1[VLEN * (1 + NVC * site) - VLENX]);
      load_vec(pg1_ym, wt2, &v1[VLEN * (2 + NVC * site) - VLENX]);
      load_vec(pg1_ym, wt3, &v1[VLEN * (3 + NVC * site) - VLENX]);
      load_vec(pg1_ym, wt4, &v1[VLEN * (4 + NVC * site) - VLENX]);
      load_vec(pg1_ym, wt5, &v1[VLEN * (5 + NVC * site) - VLENX]);

      svreal_t ut0, ut1, ut2, ut3, ut4, ut5;
      svreal_t xtr, xti;
      load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (0 + NDF * site) - VLENX]);
      mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add(pg2_ym, xtr, &buf[VLENX * 0]);
      load_add(pg2_ym, xti, &buf[VLENX * 1]);
      sub_vec(pg, vt0, xtr);
      sub_vec(pg, vt1, xti);

      load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (NVC + NDF * site) - VLENX]);
      mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add(pg2_ym, xtr, &buf[VLENX * 2]);
      load_add(pg2_ym, xti, &buf[VLENX * 3]);
      sub_vec(pg, vt2, xtr);
      sub_vec(pg, vt3, xti);

      load_udag(pg1_ym, ut0, ut1, ut2, ut3, ut4, ut5,
                &u[VLEN * (2 * NVC + NDF * site) - VLENX]);
      mult_udv(pg1_ym, xtr, xti, ut0, ut1, ut2, ut3, ut4, ut5,
               wt0, wt1, wt2, wt3, wt4, wt5);
      load_add(pg2_ym, xtr, &buf[VLENX * 4]);
      load_add(pg2_ym, xti, &buf[VLENX * 5]);
      sub_vec(pg, vt4, xtr);
      sub_vec(pg, vt5, xti);

      ++opr_any;
    }

    // +z boundary
    if ((iz == Nz - 1) && (do_comm[2] > 0)) {
      int ixyt = ixy + Nxy * it;
      mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (site + Nstv * 2)],
                        &buf_zp[VLEN * NVC * ixyt]);
      ++opr_any;
    }

    // -z boundary
    if ((iz == 0) && (do_comm[2] > 0)) {
      int ixyt = ixy + Nxy * it;
      real_t *buf = &buf_zm[VLEN * NVC * ixyt];
      real_t *u = &up[NDF * Nst * 2];

      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg, wt0, &buf[VLEN * 0]);
      load_vec(pg, wt1, &buf[VLEN * 1]);
      load_vec(pg, wt2, &buf[VLEN * 2]);
      load_vec(pg, wt3, &buf[VLEN * 3]);
      load_vec(pg, wt4, &buf[VLEN * 4]);
      load_vec(pg, wt5, &buf[VLEN * 5]);

      sub_vec(pg, vt0, wt0);
      sub_vec(pg, vt1, wt1);
      sub_vec(pg, vt2, wt2);
      sub_vec(pg, vt3, wt3);
      sub_vec(pg, vt4, wt4);
      sub_vec(pg, vt5, wt5);

      ++opr_any;
    }

    // +t boundary
    if ((it == Nt - 1) && (do_comm[3] > 0)) {
      mult_staggered_up(pg, vt0, vt1, vt2, vt3, vt4, vt5,
                        &up[VLEN * NDF * (site + Nstv * 3)],
                        &buf_tp[VLEN * NVC * ixyz]);
      ++opr_any;
    }

    // -t boundary
    if ((it == 0) && (do_comm[3] > 0)) {
      real_t *buf = &buf_tm[VLEN * NVC * ixyz];

      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg, wt0, &buf[VLEN * 0]);
      load_vec(pg, wt1, &buf[VLEN * 1]);
      load_vec(pg, wt2, &buf[VLEN * 2]);
      load_vec(pg, wt3, &buf[VLEN * 3]);
      load_vec(pg, wt4, &buf[VLEN * 4]);
      load_vec(pg, wt5, &buf[VLEN * 5]);

      sub_vec(pg, vt0, wt0);
      sub_vec(pg, vt1, wt1);
      sub_vec(pg, vt2, wt2);
      sub_vec(pg, vt3, wt3);
      sub_vec(pg, vt4, wt4);
      sub_vec(pg, vt5, wt5);

      ++opr_any;
    }

    // accumulate fac * (boundary contributions) into v2
    if (opr_any > 0) {
      svreal_t wt0, wt1, wt2, wt3, wt4, wt5;
      load_vec(pg, wt0, &v2[VLEN * (0 + NVC * site)]);
      load_vec(pg, wt1, &v2[VLEN * (1 + NVC * site)]);
      load_vec(pg, wt2, &v2[VLEN * (2 + NVC * site)]);
      load_vec(pg, wt3, &v2[VLEN * (3 + NVC * site)]);
      load_vec(pg, wt4, &v2[VLEN * (4 + NVC * site)]);
      load_vec(pg, wt5, &v2[VLEN * (5 + NVC * site)]);

      scal_vec(pg, vt0, fac);
      scal_vec(pg, vt1, fac);
      scal_vec(pg, vt2, fac);
      scal_vec(pg, vt3, fac);
      scal_vec(pg, vt4, fac);
      scal_vec(pg, vt5, fac);

      add_vec(pg, vt0, wt0);
      add_vec(pg, vt1, wt1);
      add_vec(pg, vt2, wt2);
      add_vec(pg, vt3, wt3);
      add_vec(pg, vt4, wt4);
      add_vec(pg, vt5, wt5);

      //svst1(pg, &v2[VLEN * (0 + NVC * site)], vt0);
      //svst1(pg, &v2[VLEN * (1 + NVC * site)], vt1);
      //svst1(pg, &v2[VLEN * (2 + NVC * site)], vt2);
      //svst1(pg, &v2[VLEN * (3 + NVC * site)], vt3);
      //svst1(pg, &v2[VLEN * (4 + NVC * site)], vt4);
      //svst1(pg, &v2[VLEN * (5 + NVC * site)], vt5);
      save_vec(pg, &v2[VLEN * (0 + NVC * site)], vt0);
      save_vec(pg, &v2[VLEN * (1 + NVC * site)], vt1);
      save_vec(pg, &v2[VLEN * (2 + NVC * site)], vt2);
      save_vec(pg, &v2[VLEN * (3 + NVC * site)], vt3);
      save_vec(pg, &v2[VLEN * (4 + NVC * site)], vt4);
      save_vec(pg, &v2[VLEN * (5 + NVC * site)], vt5);
    }
  }
}


#endif
//============================================================END=====