Bridge++  Ver. 1.3.x
fopr_Wilson_impl_thread.cpp
Go to the documentation of this file.
1 
14 #include <string>
15 using std::string;
16 
17 #include "fopr_Wilson_impl.h"
18 
19 #include "gammaMatrixSet_Dirac.h"
20 #include "gammaMatrixSet_Chiral.h"
21 
22 #include "bridgeIO.h"
23 using Bridge::vout;
24 
25 #include "threadManager_OpenMP.h"
26 
27 
28 #if defined USE_GROUP_SU3
29 #include "fopr_Wilson_impl_SU3.inc"
30 #elif defined USE_GROUP_SU2
31 #include "fopr_Wilson_impl_SU2.inc"
32 #elif defined USE_GROUP_SU_N
33 #include "fopr_Wilson_impl_SU_N.inc"
34 #endif
35 
36 //====================================================================
38 {
40 
41  // The following setup corresponds to unifirm division of volume.
42  if (m_Nthread <= m_Nt) {
44  } else if (m_Nthread <= m_Nz * m_Nt) {
45  m_Ntask_t = m_Nt;
46  } else {
47  vout.crucial(m_vl, " Too large Nthread: %d\n", m_Nthread);
48  exit(EXIT_FAILURE);
49  }
51  if (m_Ntask_z * m_Ntask_t != m_Nthread) {
52  vout.crucial(m_vl, " Nz(%d) and Nt(%d) do not mach Nthread: %d\n",
53  m_Nz, m_Nt, m_Nthread);
54  exit(EXIT_FAILURE);
55  }
57  m_Mz = m_Nz / m_Ntask_z;
58  m_Mt = m_Nt / m_Ntask_t;
59 
60  // The following setup is not monotonic division, and requires
61  // barrier at the beginning and end of mult (D and gamma5).
62  // [H.Matsufuru 22 Oct 2013]
63  // if(m_Nthread >= 64){
64  // m_Ntask_z = 8;
65  // }else if(m_Nthread >= 16){
66  // m_Ntask_z = 4;
67  // }else if(m_Nthread >= 4){
68  // m_Ntask_z = 2;
69  // }else{
70  // m_Ntask_z = 1;
71  // }
72  // m_Ntask_t = m_Nthread/m_Ntask_z;
73  // m_Ntask = m_Ntask_t * m_Ntask_z;
74  // m_Mz = m_Nz/m_Ntask_z;
75  // m_Mt = m_Nt/m_Ntask_t;
76 
77  vout.general(m_vl, " Nthread = %d\n", m_Nthread);
78  vout.general(m_vl, " Ntask = %d\n", m_Ntask);
79  vout.general(m_vl, " Ntask_z = %d Ntask_t = %d\n", m_Ntask_z, m_Ntask_t);
80  vout.general(m_vl, " Mz = %d Mt = %d\n", m_Mz, m_Mt);
81 
82  // setup of arguments
83  int Nxy = m_Nx * m_Ny;
84  m_arg.resize(m_Ntask);
85  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
86  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
87  int itask = ith_z + m_Ntask_z * ith_t;
88 
89  m_arg[itask].isite = (ith_z * m_Mz + ith_t * (m_Nz * m_Mt)) * Nxy;
90 
91  m_arg[itask].kt0 = 0;
92  m_arg[itask].kt1 = 0;
93  m_arg[itask].kz0 = 0;
94  m_arg[itask].kz1 = 0;
95  if (ith_t == 0) m_arg[itask].kt0 = 1;
96  if (ith_z == 0) m_arg[itask].kz0 = 1;
97  if (ith_t == m_Ntask_t - 1) m_arg[itask].kt1 = 1;
98  if (ith_z == m_Ntask_z - 1) m_arg[itask].kz1 = 1;
99 
100  m_arg[itask].isite_cpx = itask * m_Mz * m_Mt * m_Ny;
101  m_arg[itask].isite_cpy = itask * m_Mz * m_Mt * m_Nx;
102  m_arg[itask].isite_cpz = ith_t * m_Mt * Nxy;
103  m_arg[itask].isite_cpt = ith_z * m_Mz * Nxy;
104  }
105  }
106 
107  // setup for async data transfer
108  int Nc = CommonParameters::Nc();
109  int Nd = CommonParameters::Nd();
110  int Nvcd2 = 2 * Nc * Nd / 2;
111 
112  std::vector<int> destid(m_Ntask);
113  std::vector<int> offset(m_Ntask);
114  std::vector<int> datasize(m_Ntask);
115  std::vector<int> offset_up(m_Ntask);
116  std::vector<int> offset_lw(m_Ntask);
117  std::vector<int> datasize_up(m_Ntask);
118  std::vector<int> datasize_lw(m_Ntask);
119 
120  int imu = 0;
121  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
122  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
123  int itask = ith_z + ith_t * m_Ntask_z;
124  int isite_cp = itask * m_Mz * m_Mt * m_Ny;
125  destid[itask] = itask;
126  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
127  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Ny;
128  }
129  }
130  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
131  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
132  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
133  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
134 
135  imu = 1;
136  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
137  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
138  int itask = ith_z + ith_t * m_Ntask_z;
139  int isite_cp = itask * m_Mz * m_Mt * m_Nx;
140  destid[itask] = itask;
141  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
142  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Nx;
143  }
144  }
145  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
146  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
147  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
148  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
149 
150  imu = 2;
151  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
152  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
153  int itask = ith_z + m_Ntask_z * ith_t;
154  destid[itask] = -1;
155  offset_up[itask] = 0;
156  offset_lw[itask] = 0;
157  datasize_up[itask] = 0;
158  datasize_lw[itask] = 0;
159  if (ith_z == 0) {
160  destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
161  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
162  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
163  }
164  if (ith_z == m_Ntask_z - 1) {
165  destid[itask] = ith_t * m_Ntask_z;
166  offset_up[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
167  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
168  }
169  }
170  }
171  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
172  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
173  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
174  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
175 
176  imu = 3;
177  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
178  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
179  int itask = ith_z + m_Ntask_z * ith_t;
180  destid[itask] = -1;
181  offset_up[itask] = 0;
182  offset_lw[itask] = 0;
183  datasize_up[itask] = 0;
184  datasize_lw[itask] = 0;
185  if (ith_t == 0) {
186  destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
187  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
188  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
189  }
190  if (ith_t == m_Ntask_t - 1) {
191  destid[itask] = ith_z;
192  offset_up[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
193  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
194  }
195  }
196  }
197  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
198  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
199  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
200  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
201 }
202 
203 
204 //====================================================================
206  int itask, double *v2, double fac, const double *v1)
207 {
208  int Nvcd = m_Nvc * m_Nd;
209  int Nvxy = Nvcd * m_Nx * m_Ny;
210 
211  int isite = m_arg[itask].isite;
212 
213  double *w2 = &v2[Nvcd * isite];
214  const double *w1 = &v1[Nvcd * isite];
215 
216  for (int it = 0; it < m_Mt; ++it) {
217  for (int iz = 0; iz < m_Mz; ++iz) {
218  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
219  int iv = ivxy + Nvxy * (iz + m_Nz * it);
220  w2[iv] = fac * w2[iv] + w1[iv];
221  }
222  }
223  }
224 }
225 
226 
227 //====================================================================
229  double *v2)
230 {
231  int Nvcd = m_Nvc * m_Nd;
232  int Nvxy = Nvcd * m_Nx * m_Ny;
233 
234  int isite = m_arg[itask].isite;
235  double *w2 = &v2[Nvcd * isite];
236 
237  for (int it = 0; it < m_Mt; ++it) {
238  for (int iz = 0; iz < m_Mz; ++iz) {
239  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
240  int iv = ivxy + Nvxy * (iz + m_Nz * it);
241  w2[iv] = 0.0;
242  }
243  }
244  }
245 }
246 
247 
248 //====================================================================
250  int itask, double *vcp1, const double *v1)
251 {
252  int Nvc2 = 2 * m_Nvc;
253  int Nvcd = m_Nvc * m_Nd;
254  int Nvcd2 = Nvcd / 2;
255 
256  int id1 = 0;
257  int id2 = m_Nvc;
258  int id3 = m_Nvc * 2;
259  int id4 = m_Nvc * 3;
260 
261  int idir = 0;
262  double bc2 = m_boundary2[idir];
263 
264  int isite = m_arg[itask].isite;
265  int isite_cp = m_arg[itask].isite_cpx;
266 
267  // double* w2 = &vcp1[Nvcd2*isite_cp];
268  double *w2
269  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
270  const double *w1 = &v1[Nvcd * isite];
271 
272  int ix = 0;
273 
274  for (int it = 0; it < m_Mt; ++it) {
275  for (int iz = 0; iz < m_Mz; ++iz) {
276  for (int iy = 0; iy < m_Ny; ++iy) {
277  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
278  int is2 = iy + m_Ny * (iz + m_Mz * it);
279  int in = Nvcd * is;
280  int ix1 = Nvc2 * is2;
281  int ix2 = ix1 + m_Nvc;
282 
283  for (int ic = 0; ic < m_Nc; ++ic) {
284  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
285  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
286  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
287  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
288  }
289  }
290  }
291  }
292 
293  m_bw_send[idir]->start_thread(itask);
294 }
295 
296 
297 //====================================================================
299  int itask, double *v2, const double *vcp2)
300 {
301  int Nvc2 = 2 * m_Nvc;
302  int Nvcd = m_Nvc * m_Nd;
303  int Nvcd2 = Nvcd / 2;
304 
305  int id1 = 0;
306  int id2 = m_Nvc;
307  int id3 = m_Nvc * 2;
308  int id4 = m_Nvc * 3;
309 
310  int idir = 0;
311 
312  double wt1r, wt1i, wt2r, wt2i;
313 
314  int isite = m_arg[itask].isite;
315  int isite_cp = m_arg[itask].isite_cpx;
316 
317  double *w2 = &v2[Nvcd * isite];
318  // double* w1 = &vcp2[Nvcd2*isite_cp];
319  const double *w1
320  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
321  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
322 
323  m_bw_recv[idir]->wait_thread(itask);
324 
325  int ix = m_Nx - 1;
326  for (int it = 0; it < m_Mt; ++it) {
327  for (int iz = 0; iz < m_Mz; ++iz) {
328  for (int iy = 0; iy < m_Ny; ++iy) {
329  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
330  int is2 = iy + m_Ny * (iz + m_Mz * it);
331  int iv = Nvcd * is;
332  int ig = m_Ndf * is;
333  int ix1 = Nvc2 * is2;
334  int ix2 = ix1 + m_Nvc;
335 
336  for (int ic = 0; ic < m_Nc; ++ic) {
337  int ic2 = ic * m_Nvc;
338 
339  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
340  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
341  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
342  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
343 
344  w2[2 * ic + id1 + iv] += wt1r;
345  w2[2 * ic + 1 + id1 + iv] += wt1i;
346  w2[2 * ic + id2 + iv] += wt2r;
347  w2[2 * ic + 1 + id2 + iv] += wt2i;
348  w2[2 * ic + id3 + iv] += wt2i;
349  w2[2 * ic + 1 + id3 + iv] += -wt2r;
350  w2[2 * ic + id4 + iv] += wt1i;
351  w2[2 * ic + 1 + id4 + iv] += -wt1r;
352  }
353  }
354  }
355  }
356 }
357 
358 
359 //====================================================================
361  int itask, double *v2, const double *v1)
362 {
363  int Nvcd = m_Nvc * m_Nd;
364 
365  int id1 = 0;
366  int id2 = m_Nvc;
367  int id3 = m_Nvc * 2;
368  int id4 = m_Nvc * 3;
369 
370  int idir = 0;
371 
372  double vt1[m_Nvc], vt2[m_Nvc];
373  double wt1r, wt1i, wt2r, wt2i;
374 
375  int isite = m_arg[itask].isite;
376 
377  double *w2 = &v2[Nvcd * isite];
378  const double *w1 = &v1[Nvcd * isite];
379  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
380 
381  for (int it = 0; it < m_Mt; ++it) {
382  for (int iz = 0; iz < m_Mz; ++iz) {
383  for (int iy = 0; iy < m_Ny; ++iy) {
384  for (int ix = 0; ix < m_Nx - 1; ++ix) {
385  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
386  int iv = Nvcd * is;
387  int in = Nvcd * (is + 1);
388  int ig = m_Ndf * is;
389 
390  for (int ic = 0; ic < m_Nc; ++ic) {
391  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
392  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
393  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
394  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
395  }
396 
397  for (int ic = 0; ic < m_Nc; ++ic) {
398  int ic2 = ic * m_Nvc;
399 
400  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
401  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
402  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
403  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
404 
405  w2[2 * ic + id1 + iv] += wt1r;
406  w2[2 * ic + 1 + id1 + iv] += wt1i;
407  w2[2 * ic + id2 + iv] += wt2r;
408  w2[2 * ic + 1 + id2 + iv] += wt2i;
409  w2[2 * ic + id3 + iv] += wt2i;
410  w2[2 * ic + 1 + id3 + iv] += -wt2r;
411  w2[2 * ic + id4 + iv] += wt1i;
412  w2[2 * ic + 1 + id4 + iv] += -wt1r;
413  }
414  }
415  }
416  }
417  }
418 }
419 
420 
421 //====================================================================
423  int itask, double *vcp1, const double *v1)
424 {
425  int Nvc2 = 2 * m_Nvc;
426  int Nvcd = m_Nvc * m_Nd;
427  int Nvcd2 = Nvcd / 2;
428 
429  int id1 = 0;
430  int id2 = m_Nvc;
431  int id3 = m_Nvc * 2;
432  int id4 = m_Nvc * 3;
433 
434  int idir = 0;
435 
436  int isite = m_arg[itask].isite;
437  int isite_cp = m_arg[itask].isite_cpx;
438 
439  // double* w2 = &vcp1[Nvcd2*isite_cp];
440  double *w2
441  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
442  const double *w1 = &v1[Nvcd * isite];
443  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
444 
445  double vt1[m_Nvc], vt2[m_Nvc];
446 
447  int ix = m_Nx - 1;
448 
449  for (int it = 0; it < m_Mt; ++it) {
450  for (int iz = 0; iz < m_Mz; ++iz) {
451  for (int iy = 0; iy < m_Ny; ++iy) {
452  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
453  int is2 = iy + m_Ny * (iz + m_Mz * it);
454  int in = Nvcd * is;
455  int ig = m_Ndf * is;
456  int ix1 = Nvc2 * is2;
457  int ix2 = ix1 + m_Nvc;
458 
459  for (int ic = 0; ic < m_Nc; ++ic) {
460  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
461  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
462  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
463  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
464  }
465 
466  for (int ic = 0; ic < m_Nc; ++ic) {
467  int icr = 2 * ic;
468  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
469  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
470  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
471  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
472  }
473  }
474  }
475  }
476 
477  m_fw_send[idir]->start_thread(itask);
478 }
479 
480 
481 //====================================================================
483  int itask, double *v2, const double *vcp2)
484 {
485  int Nvc2 = 2 * m_Nvc;
486  int Nvcd = m_Nvc * m_Nd;
487  int Nvcd2 = Nvcd / 2;
488 
489  int id1 = 0;
490  int id2 = m_Nvc;
491  int id3 = m_Nvc * 2;
492  int id4 = m_Nvc * 3;
493 
494  int idir = 0;
495  double bc2 = m_boundary2[idir];
496 
497  double wt1r, wt1i, wt2r, wt2i;
498 
499  int isite = m_arg[itask].isite;
500  int isite_cp = m_arg[itask].isite_cpx;
501 
502  double *w2 = &v2[Nvcd * isite];
503  // double* w1 = &vcp2[Nvcd2*isite_cp];
504  const double *w1
505  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
506 
507  m_fw_recv[idir]->wait_thread(itask);
508 
509  int ix = 0;
510  for (int it = 0; it < m_Mt; ++it) {
511  for (int iz = 0; iz < m_Mz; ++iz) {
512  for (int iy = 0; iy < m_Ny; ++iy) {
513  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
514  int is2 = iy + m_Ny * (iz + m_Mz * it);
515  int iv = Nvcd * is;
516  int ix1 = Nvc2 * is2;
517  int ix2 = ix1 + m_Nvc;
518 
519  for (int ic = 0; ic < m_Nc; ++ic) {
520  int icr = 2 * ic;
521  int ici = 2 * ic + 1;
522  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
523  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
524  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
525  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
526  w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
527  w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
528  w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
529  w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
530  }
531  }
532  }
533  }
534 }
535 
536 
537 //====================================================================
539  int itask, double *v2, const double *v1)
540 {
541  int Nvcd = m_Nvc * m_Nd;
542 
543  int id1 = 0;
544  int id2 = m_Nvc;
545  int id3 = m_Nvc * 2;
546  int id4 = m_Nvc * 3;
547 
548  int idir = 0;
549 
550  double vt1[m_Nvc], vt2[m_Nvc];
551  double wt1r, wt1i, wt2r, wt2i;
552 
553  int isite = m_arg[itask].isite;
554 
555  double *w2 = &v2[Nvcd * isite];
556  const double *w1 = &v1[Nvcd * isite];
557  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
558 
559  for (int it = 0; it < m_Mt; ++it) {
560  for (int iz = 0; iz < m_Mz; ++iz) {
561  for (int iy = 0; iy < m_Ny; ++iy) {
562  for (int ix = 1; ix < m_Nx; ++ix) {
563  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
564  int iv = Nvcd * is;
565  int in = Nvcd * (is - 1);
566  int ig = m_Ndf * (is - 1);
567 
568  for (int ic = 0; ic < m_Nc; ++ic) {
569  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
570  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
571  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
572  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
573  }
574 
575  for (int ic = 0; ic < m_Nc; ++ic) {
576  int ic2 = 2 * ic;
577 
578  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
579  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
580  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
581  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
582 
583  w2[2 * ic + id1 + iv] += wt1r;
584  w2[2 * ic + 1 + id1 + iv] += wt1i;
585  w2[2 * ic + id2 + iv] += wt2r;
586  w2[2 * ic + 1 + id2 + iv] += wt2i;
587  w2[2 * ic + id3 + iv] += -wt2i;
588  w2[2 * ic + 1 + id3 + iv] += +wt2r;
589  w2[2 * ic + id4 + iv] += -wt1i;
590  w2[2 * ic + 1 + id4 + iv] += +wt1r;
591  }
592  }
593  }
594  }
595  }
596 }
597 
598 
599 //====================================================================
601  int itask, double *vcp1, const double *v1)
602 {
603  int Nvc2 = 2 * m_Nvc;
604  int Nvcd = m_Nvc * m_Nd;
605  int Nvcd2 = Nvcd / 2;
606 
607  int id1 = 0;
608  int id2 = m_Nvc;
609  int id3 = m_Nvc * 2;
610  int id4 = m_Nvc * 3;
611 
612  int isite = m_arg[itask].isite;
613  int isite_cp = m_arg[itask].isite_cpy;
614 
615  int idir = 1;
616  double bc2 = m_boundary2[idir];
617 
618  // double* w2 = &vcp1[Nvcd2*isite_cp];
619  double *w2
620  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
621  const double *w1 = &v1[Nvcd * isite];
622 
623  int iy = 0;
624 
625  for (int it = 0; it < m_Mt; ++it) {
626  for (int iz = 0; iz < m_Mz; ++iz) {
627  for (int ix = 0; ix < m_Nx; ++ix) {
628  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
629  int is2 = ix + m_Nx * (iz + m_Mz * it);
630  int in = Nvcd * is;
631  int ix1 = Nvc2 * is2;
632  int ix2 = ix1 + m_Nvc;
633 
634  for (int ic = 0; ic < m_Nc; ++ic) {
635  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
636  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
637  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
638  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
639  }
640  }
641  }
642  }
643 
644  m_bw_send[idir]->start_thread(itask);
645 }
646 
647 
648 //====================================================================
650  int itask, double *v2, const double *vcp2)
651 {
652  int Nvc2 = 2 * m_Nvc;
653  int Nvcd = m_Nvc * m_Nd;
654  int Nvcd2 = Nvcd / 2;
655 
656  int id1 = 0;
657  int id2 = m_Nvc;
658  int id3 = m_Nvc * 2;
659  int id4 = m_Nvc * 3;
660 
661  int idir = 1;
662 
663  double wt1r, wt1i, wt2r, wt2i;
664 
665  int isite = m_arg[itask].isite;
666  int isite_cp = m_arg[itask].isite_cpy;
667 
668  double *w2 = &v2[Nvcd * isite];
669  // double* w1 = &vcp2[Nvcd2*isite_cp];
670  const double *w1
671  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
672  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
673 
674  m_bw_recv[idir]->wait_thread(itask);
675 
676  int iy = m_Ny - 1;
677  for (int it = 0; it < m_Mt; ++it) {
678  for (int iz = 0; iz < m_Mz; ++iz) {
679  for (int ix = 0; ix < m_Nx; ++ix) {
680  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
681  int is2 = ix + m_Nx * (iz + m_Mz * it);
682  int iv = Nvcd * is;
683  int ig = m_Ndf * is;
684  int ix1 = Nvc2 * is2;
685  int ix2 = ix1 + m_Nvc;
686 
687  for (int ic = 0; ic < m_Nc; ++ic) {
688  int ic2 = ic * m_Nvc;
689 
690  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
691  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
692  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
693  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
694 
695  w2[2 * ic + id1 + iv] += wt1r;
696  w2[2 * ic + 1 + id1 + iv] += wt1i;
697  w2[2 * ic + id2 + iv] += wt2r;
698  w2[2 * ic + 1 + id2 + iv] += wt2i;
699  w2[2 * ic + id3 + iv] += -wt2r;
700  w2[2 * ic + 1 + id3 + iv] += -wt2i;
701  w2[2 * ic + id4 + iv] += wt1r;
702  w2[2 * ic + 1 + id4 + iv] += wt1i;
703  }
704  }
705  }
706  }
707 }
708 
709 
710 //====================================================================
712  int itask, double *v2, const double *v1)
713 {
714  int Nvcd = m_Nvc * m_Nd;
715 
716  int id1 = 0;
717  int id2 = m_Nvc;
718  int id3 = m_Nvc * 2;
719  int id4 = m_Nvc * 3;
720 
721  int idir = 1;
722 
723  double vt1[m_Nvc], vt2[m_Nvc];
724  double wt1r, wt1i, wt2r, wt2i;
725 
726  int isite = m_arg[itask].isite;
727 
728  double *w2 = &v2[Nvcd * isite];
729  const double *w1 = &v1[Nvcd * isite];
730  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
731 
732  for (int it = 0; it < m_Mt; ++it) {
733  for (int iz = 0; iz < m_Mz; ++iz) {
734  for (int iy = 0; iy < m_Ny - 1; ++iy) {
735  for (int ix = 0; ix < m_Nx; ++ix) {
736  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
737  int iv = Nvcd * is;
738  int in = Nvcd * (is + m_Nx);
739  int ig = m_Ndf * is;
740 
741  for (int ic = 0; ic < m_Nc; ++ic) {
742  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
743  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
744  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
745  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
746  }
747 
748  for (int ic = 0; ic < m_Nc; ++ic) {
749  int ic2 = ic * m_Nvc;
750 
751  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
752  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
753  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
754  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
755 
756  w2[2 * ic + id1 + iv] += wt1r;
757  w2[2 * ic + 1 + id1 + iv] += wt1i;
758  w2[2 * ic + id2 + iv] += wt2r;
759  w2[2 * ic + 1 + id2 + iv] += wt2i;
760  w2[2 * ic + id3 + iv] += -wt2r;
761  w2[2 * ic + 1 + id3 + iv] += -wt2i;
762  w2[2 * ic + id4 + iv] += wt1r;
763  w2[2 * ic + 1 + id4 + iv] += wt1i;
764  }
765  }
766  }
767  }
768  }
769 }
770 
771 
772 //====================================================================
774  int itask, double *vcp1, const double *v1)
775 {
776  int Nvc2 = 2 * m_Nvc;
777  int Nvcd = m_Nvc * m_Nd;
778  int Nvcd2 = Nvcd / 2;
779 
780  int id1 = 0;
781  int id2 = m_Nvc;
782  int id3 = m_Nvc * 2;
783  int id4 = m_Nvc * 3;
784 
785  int idir = 1;
786 
787  int isite = m_arg[itask].isite;
788  int isite_cp = m_arg[itask].isite_cpy;
789 
790  // double* w2 = &vcp1[Nvcd2*isite_cp];
791  double *w2
792  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
793 
794  const double *w1 = &v1[Nvcd * isite];
795  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
796 
797  double vt1[m_Nvc], vt2[m_Nvc];
798 
799  int iy = m_Ny - 1;
800 
801  for (int it = 0; it < m_Mt; ++it) {
802  for (int iz = 0; iz < m_Mz; ++iz) {
803  for (int ix = 0; ix < m_Nx; ++ix) {
804  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
805  int is2 = ix + m_Nx * (iz + m_Mz * it);
806  int in = Nvcd * is;
807  int ig = m_Ndf * is;
808  int ix1 = Nvc2 * is2;
809  int ix2 = ix1 + m_Nvc;
810 
811  for (int ic = 0; ic < m_Nc; ++ic) {
812  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
813  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
814  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
815  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
816  }
817 
818  for (int ic = 0; ic < m_Nc; ++ic) {
819  int icr = 2 * ic;
820  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
821  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
822  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
823  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
824  }
825  }
826  }
827  }
828 
829  m_fw_send[idir]->start_thread(itask);
830 }
831 
832 
833 //====================================================================
835  int itask, double *v2, const double *vcp2)
836 {
837  int Nvc2 = 2 * m_Nvc;
838  int Nvcd = m_Nvc * m_Nd;
839  int Nvcd2 = Nvcd / 2;
840 
841  int id1 = 0;
842  int id2 = m_Nvc;
843  int id3 = m_Nvc * 2;
844  int id4 = m_Nvc * 3;
845 
846  int idir = 1;
847  double bc2 = m_boundary2[idir];
848 
849  double wt1r, wt1i, wt2r, wt2i;
850 
851  int isite = m_arg[itask].isite;
852  int isite_cp = m_arg[itask].isite_cpy;
853 
854  double *w2 = &v2[Nvcd * isite];
855  // double* w1 = &vcp2[Nvcd2*isite_cp];
856  const double *w1
857  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
858 
859  m_fw_recv[idir]->wait_thread(itask);
860 
861  int iy = 0;
862  for (int it = 0; it < m_Mt; ++it) {
863  for (int iz = 0; iz < m_Mz; ++iz) {
864  for (int ix = 0; ix < m_Nx; ++ix) {
865  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
866  int is2 = ix + m_Nx * (iz + m_Mz * it);
867  int iv = Nvcd * is;
868  int ix1 = Nvc2 * is2;
869  int ix2 = ix1 + m_Nvc;
870 
871  for (int ic = 0; ic < m_Nc; ++ic) {
872  int icr = 2 * ic;
873  int ici = 2 * ic + 1;
874  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
875  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
876  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
877  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
878  w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
879  w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
880  w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
881  w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
882  }
883  }
884  }
885  }
886 }
887 
888 
889 //====================================================================
891  int itask, double *v2, const double *v1)
892 {
893  int Nvcd = m_Nvc * m_Nd;
894 
895  int id1 = 0;
896  int id2 = m_Nvc;
897  int id3 = m_Nvc * 2;
898  int id4 = m_Nvc * 3;
899 
900  int idir = 1;
901 
902  double vt1[m_Nvc], vt2[m_Nvc];
903  double wt1r, wt1i, wt2r, wt2i;
904 
905  int isite = m_arg[itask].isite;
906 
907  double *w2 = &v2[Nvcd * isite];
908  const double *w1 = &v1[Nvcd * isite];
909  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
910 
911  for (int it = 0; it < m_Mt; ++it) {
912  for (int iz = 0; iz < m_Mz; ++iz) {
913  for (int iy = 1; iy < m_Ny; ++iy) {
914  for (int ix = 0; ix < m_Nx; ++ix) {
915  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
916  int iv = Nvcd * is;
917  int in = Nvcd * (is - m_Nx);
918  int ig = m_Ndf * (is - m_Nx);
919 
920  for (int ic = 0; ic < m_Nc; ++ic) {
921  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
922  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
923  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
924  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
925  }
926 
927  for (int ic = 0; ic < m_Nc; ++ic) {
928  int ic2 = 2 * ic;
929  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
930  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
931  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
932  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
933 
934  w2[ic2 + id1 + iv] += wt1r;
935  w2[ic2 + 1 + id1 + iv] += wt1i;
936  w2[ic2 + id2 + iv] += wt2r;
937  w2[ic2 + 1 + id2 + iv] += wt2i;
938  w2[ic2 + id3 + iv] += wt2r;
939  w2[ic2 + 1 + id3 + iv] += wt2i;
940  w2[ic2 + id4 + iv] += -wt1r;
941  w2[ic2 + 1 + id4 + iv] += -wt1i;
942  }
943  }
944  }
945  }
946  }
947 }
948 
949 
950 //====================================================================
952  int itask, double *vcp1, const double *v1)
953 {
954  int Nvc2 = 2 * m_Nvc;
955  int Nvcd = m_Nvc * m_Nd;
956  int Nvcd2 = Nvcd / 2;
957 
958  int id1 = 0;
959  int id2 = m_Nvc;
960  int id3 = m_Nvc * 2;
961  int id4 = m_Nvc * 3;
962 
963  int isite = m_arg[itask].isite;
964  int isite_cp = m_arg[itask].isite_cpz;
965 
966  int idir = 2;
967  double bc2 = m_boundary2[idir];
968 
969  // double* w2 = &vcp1[Nvcd2*isite_cp];
970  double *w2
971  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
972  const double *w1 = &v1[Nvcd * isite];
973 
974  if (m_arg[itask].kz0 == 1) {
975  int Nxy = m_Nx * m_Ny;
976  int iz = 0;
977  for (int it = 0; it < m_Mt; ++it) {
978  for (int ixy = 0; ixy < Nxy; ++ixy) {
979  int is = ixy + Nxy * (iz + m_Nz * it);
980  int is2 = ixy + Nxy * it;
981 
982  int in = Nvcd * is;
983  int ix1 = Nvc2 * is2;
984  int ix2 = ix1 + m_Nvc;
985 
986  for (int ic = 0; ic < m_Nc; ++ic) {
987  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
988  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
989  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
990  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
991  }
992  }
993  }
994  }
995 
996  m_bw_send[idir]->start_thread(itask);
997 }
998 
999 
1000 //====================================================================
1002  int itask, double *v2, const double *vcp2)
1003 {
1004  int Nvc2 = 2 * m_Nvc;
1005  int Nvcd = m_Nvc * m_Nd;
1006  int Nvcd2 = Nvcd / 2;
1007 
1008  int id1 = 0;
1009  int id2 = m_Nvc;
1010  int id3 = m_Nvc * 2;
1011  int id4 = m_Nvc * 3;
1012 
1013  int idir = 2;
1014 
1015  double wt1r, wt1i, wt2r, wt2i;
1016 
1017  int isite = m_arg[itask].isite;
1018  int isite_cp = m_arg[itask].isite_cpz;
1019 
1020  double *w2 = &v2[Nvcd * isite];
1021  // double* w1 = &vcp2[Nvcd2*isite_cp];
1022  const double *w1
1023  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1024  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1025 
1026  m_bw_recv[idir]->wait_thread(itask);
1027 
1028  if (m_arg[itask].kz1 == 1) {
1029  int Nxy = m_Nx * m_Ny;
1030  int iz = m_Mz - 1;
1031  for (int it = 0; it < m_Mt; ++it) {
1032  for (int ixy = 0; ixy < Nxy; ++ixy) {
1033  int is = ixy + Nxy * (iz + m_Nz * it);
1034  int is2 = ixy + Nxy * it;
1035  int iv = Nvcd * is;
1036  int ig = m_Ndf * is;
1037  int ix1 = Nvc2 * is2;
1038  int ix2 = ix1 + m_Nvc;
1039 
1040  for (int ic = 0; ic < m_Nc; ++ic) {
1041  int ic2 = ic * m_Nvc;
1042 
1043  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1044  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1045  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1046  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1047 
1048  w2[2 * ic + id1 + iv] += wt1r;
1049  w2[2 * ic + 1 + id1 + iv] += wt1i;
1050  w2[2 * ic + id2 + iv] += wt2r;
1051  w2[2 * ic + 1 + id2 + iv] += wt2i;
1052  w2[2 * ic + id3 + iv] += wt1i;
1053  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1054  w2[2 * ic + id4 + iv] += -wt2i;
1055  w2[2 * ic + 1 + id4 + iv] += wt2r;
1056  }
1057  }
1058  }
1059  }
1060 }
1061 
1062 
1063 //====================================================================
1065  int itask, double *v2, const double *v1)
1066 {
1067  int Nvcd = m_Nvc * m_Nd;
1068 
1069  int id1 = 0;
1070  int id2 = m_Nvc;
1071  int id3 = m_Nvc * 2;
1072  int id4 = m_Nvc * 3;
1073 
1074  int idir = 2;
1075 
1076  double vt1[m_Nvc], vt2[m_Nvc];
1077  double wt1r, wt1i, wt2r, wt2i;
1078 
1079  int isite = m_arg[itask].isite;
1080 
1081  double *w2 = &v2[Nvcd * isite];
1082  const double *w1 = &v1[Nvcd * isite];
1083  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1084 
1085  int kz1 = m_arg[itask].kz1;
1086  int Nxy = m_Nx * m_Ny;
1087 
1088  for (int it = 0; it < m_Mt; ++it) {
1089  for (int iz = 0; iz < m_Mz - kz1; ++iz) {
1090  for (int ixy = 0; ixy < Nxy; ++ixy) {
1091  int is = ixy + Nxy * (iz + m_Nz * it);
1092  int iv = Nvcd * is;
1093  int in = Nvcd * (is + Nxy);
1094  int ig = m_Ndf * is;
1095 
1096  for (int ic = 0; ic < m_Nc; ++ic) {
1097  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1098  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1099  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1100  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1101  }
1102 
1103  for (int ic = 0; ic < m_Nc; ++ic) {
1104  int ic2 = ic * m_Nvc;
1105 
1106  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1107  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1108  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1109  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1110 
1111  w2[2 * ic + id1 + iv] += wt1r;
1112  w2[2 * ic + 1 + id1 + iv] += wt1i;
1113  w2[2 * ic + id2 + iv] += wt2r;
1114  w2[2 * ic + 1 + id2 + iv] += wt2i;
1115  w2[2 * ic + id3 + iv] += wt1i;
1116  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1117  w2[2 * ic + id4 + iv] += -wt2i;
1118  w2[2 * ic + 1 + id4 + iv] += wt2r;
1119  }
1120  }
1121  }
1122  }
1123 }
1124 
1125 
1126 //====================================================================
1128  int itask, double *vcp1, const double *v1)
1129 {
1130  int Nvc2 = 2 * m_Nvc;
1131  int Nvcd = m_Nvc * m_Nd;
1132  int Nvcd2 = Nvcd / 2;
1133 
1134  int id1 = 0;
1135  int id2 = m_Nvc;
1136  int id3 = m_Nvc * 2;
1137  int id4 = m_Nvc * 3;
1138 
1139  int idir = 2;
1140 
1141  int isite = m_arg[itask].isite;
1142  int isite_cp = m_arg[itask].isite_cpz;
1143 
1144  // double* w2 = &vcp1[Nvcd2*isite_cp];
1145  double *w2
1146  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1147  const double *w1 = &v1[Nvcd * isite];
1148  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1149 
1150  double vt1[m_Nvc], vt2[m_Nvc];
1151 
1152  if (m_arg[itask].kz1 == 1) {
1153  int Nxy = m_Nx * m_Ny;
1154  int iz = m_Mz - 1;
1155  for (int it = 0; it < m_Mt; ++it) {
1156  for (int ixy = 0; ixy < Nxy; ++ixy) {
1157  int is = ixy + Nxy * (iz + m_Nz * it);
1158  int is2 = ixy + Nxy * it;
1159  int in = Nvcd * is;
1160  int ig = m_Ndf * is;
1161  int ix1 = Nvc2 * is2;
1162  int ix2 = ix1 + m_Nvc;
1163 
1164  for (int ic = 0; ic < m_Nc; ++ic) {
1165  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1166  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1167  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1168  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1169  }
1170 
1171  for (int ic = 0; ic < m_Nc; ++ic) {
1172  int icr = 2 * ic;
1173  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1174  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1175  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1176  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1177  }
1178  }
1179  }
1180  }
1181 
1182  m_fw_send[idir]->start_thread(itask);
1183 }
1184 
1185 
1186 //====================================================================
1188  int itask, double *v2, const double *vcp2)
1189 {
1190  int Nvc2 = 2 * m_Nvc;
1191  int Nvcd = m_Nvc * m_Nd;
1192  int Nvcd2 = Nvcd / 2;
1193 
1194  int id1 = 0;
1195  int id2 = m_Nvc;
1196  int id3 = m_Nvc * 2;
1197  int id4 = m_Nvc * 3;
1198 
1199  int idir = 2;
1200  double bc2 = m_boundary2[idir];
1201 
1202  double wt1r, wt1i, wt2r, wt2i;
1203 
1204  int isite = m_arg[itask].isite;
1205  int isite_cp = m_arg[itask].isite_cpz;
1206 
1207  double *w2 = &v2[Nvcd * isite];
1208  // double* w1 = &vcp2[Nvcd2*isite_cp];
1209  const double *w1
1210  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1211 
1212  m_fw_recv[idir]->wait_thread(itask);
1213 
1214  if (m_arg[itask].kz0 == 1) {
1215  int Nxy = m_Nx * m_Ny;
1216 
1217  int iz = 0;
1218  for (int it = 0; it < m_Mt; ++it) {
1219  for (int ixy = 0; ixy < Nxy; ++ixy) {
1220  int is = ixy + Nxy * (iz + m_Nz * it);
1221  int is2 = ixy + Nxy * it;
1222  int iv = Nvcd * is;
1223  int ix1 = Nvc2 * is2;
1224  int ix2 = ix1 + m_Nvc;
1225 
1226  for (int ic = 0; ic < m_Nc; ++ic) {
1227  int icr = 2 * ic;
1228  int ici = 2 * ic + 1;
1229  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1230  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1231  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1232  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1233  w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1234  w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1235  w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1236  w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1237  }
1238  }
1239  }
1240  }
1241 }
1242 
1243 
1244 //====================================================================
1246  int itask, double *v2, const double *v1)
1247 {
1248  int Nvcd = m_Nvc * m_Nd;
1249 
1250  int id1 = 0;
1251  int id2 = m_Nvc;
1252  int id3 = m_Nvc * 2;
1253  int id4 = m_Nvc * 3;
1254 
1255  int idir = 2;
1256 
1257  double vt1[m_Nvc], vt2[m_Nvc];
1258  double wt1r, wt1i, wt2r, wt2i;
1259 
1260  int isite = m_arg[itask].isite;
1261 
1262  double *w2 = &v2[Nvcd * isite];
1263  const double *w1 = &v1[Nvcd * isite];
1264  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1265 
1266  int kz0 = m_arg[itask].kz0;
1267  int Nxy = m_Nx * m_Ny;
1268 
1269  for (int it = 0; it < m_Mt; ++it) {
1270  for (int iz = kz0; iz < m_Mz; ++iz) {
1271  for (int ixy = 0; ixy < Nxy; ++ixy) {
1272  int is = ixy + Nxy * (iz + m_Nz * it);
1273  int iv = Nvcd * is;
1274  int in = Nvcd * (is - Nxy);
1275  int ig = m_Ndf * (is - Nxy);
1276 
1277  for (int ic = 0; ic < m_Nc; ++ic) {
1278  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1279  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1280  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1281  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1282  }
1283 
1284  for (int ic = 0; ic < m_Nc; ++ic) {
1285  int ic2 = 2 * ic;
1286  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1287  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1288  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1289  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1290 
1291  w2[ic2 + id1 + iv] += wt1r;
1292  w2[ic2 + 1 + id1 + iv] += wt1i;
1293  w2[ic2 + id2 + iv] += wt2r;
1294  w2[ic2 + 1 + id2 + iv] += wt2i;
1295  w2[ic2 + id3 + iv] += -wt1i;
1296  w2[ic2 + 1 + id3 + iv] += wt1r;
1297  w2[ic2 + id4 + iv] += wt2i;
1298  w2[ic2 + 1 + id4 + iv] += -wt2r;
1299  }
1300  }
1301  }
1302  }
1303 }
1304 
1305 
1306 //====================================================================
1308  int itask, double *vcp1, const double *v1)
1309 {
1310  int Nvc2 = 2 * m_Nvc;
1311  int Nvcd = m_Nvc * m_Nd;
1312  int Nvcd2 = Nvcd / 2;
1313 
1314  int id1 = 0;
1315  int id2 = m_Nvc;
1316  int id3 = m_Nvc * 2;
1317  int id4 = m_Nvc * 3;
1318 
1319  int isite = m_arg[itask].isite;
1320  int isite_cp = m_arg[itask].isite_cpt;
1321 
1322  int idir = 3;
1323  double bc2 = m_boundary2[idir];
1324 
1325  // double* w2 = &vcp1[Nvcd2*isite_cp];
1326  double *w2
1327  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1328  const double *w1 = &v1[Nvcd * isite];
1329 
1330  if (m_arg[itask].kt0 == 1) {
1331  int Nxy = m_Nx * m_Ny;
1332  int it = 0;
1333  for (int iz = 0; iz < m_Mz; ++iz) {
1334  for (int ixy = 0; ixy < Nxy; ++ixy) {
1335  int is = ixy + Nxy * (iz + m_Nz * it);
1336  int is2 = ixy + Nxy * iz;
1337 
1338  int in = Nvcd * is;
1339  int ix1 = Nvc2 * is2;
1340  int ix2 = ix1 + m_Nvc;
1341 
1342  for (int ic = 0; ic < m_Nc; ++ic) {
1343  w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1344  w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1345  w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1346  w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1347  }
1348  }
1349  }
1350  }
1351 
1352  m_bw_send[idir]->start_thread(itask);
1353 }
1354 
1355 
1356 //====================================================================
1358  int itask, double *v2, const double *vcp2)
1359 {
1360  int Nvc2 = 2 * m_Nvc;
1361  int Nvcd = m_Nvc * m_Nd;
1362  int Nvcd2 = Nvcd / 2;
1363 
1364  int id1 = 0;
1365  int id2 = m_Nvc;
1366  int id3 = m_Nvc * 2;
1367  int id4 = m_Nvc * 3;
1368 
1369  int idir = 3;
1370 
1371  double wt1r, wt1i, wt2r, wt2i;
1372 
1373  int isite = m_arg[itask].isite;
1374  int isite_cp = m_arg[itask].isite_cpt;
1375 
1376  double *w2 = &v2[Nvcd * isite];
1377  // double* w1 = &vcp2[Nvcd2*isite_cp];
1378  const double *w1
1379  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1380  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1381 
1382  m_bw_recv[idir]->wait_thread(itask);
1383 
1384  if (m_arg[itask].kt1 == 1) {
1385  int Nxy = m_Nx * m_Ny;
1386  int it = m_Mt - 1;
1387  for (int iz = 0; iz < m_Mz; ++iz) {
1388  for (int ixy = 0; ixy < Nxy; ++ixy) {
1389  int is = ixy + Nxy * (iz + m_Nz * it);
1390  int is2 = ixy + Nxy * iz;
1391  int iv = Nvcd * is;
1392  int ig = m_Ndf * is;
1393  int ix1 = Nvc2 * is2;
1394  int ix2 = ix1 + m_Nvc;
1395 
1396  for (int ic = 0; ic < m_Nc; ++ic) {
1397  int ic2 = ic * m_Nvc;
1398 
1399  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1400  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1401  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1402  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1403 
1404  w2[2 * ic + id3 + iv] += wt1r;
1405  w2[2 * ic + 1 + id3 + iv] += wt1i;
1406  w2[2 * ic + id4 + iv] += wt2r;
1407  w2[2 * ic + 1 + id4 + iv] += wt2i;
1408  }
1409  }
1410  }
1411  }
1412 }
1413 
1414 
1415 //====================================================================
1417  int itask, double *v2, const double *v1)
1418 {
1419  int Nvcd = m_Nvc * m_Nd;
1420 
1421  int id1 = 0;
1422  int id2 = m_Nvc;
1423  int id3 = m_Nvc * 2;
1424  int id4 = m_Nvc * 3;
1425 
1426  int idir = 3;
1427 
1428  double vt1[m_Nvc], vt2[m_Nvc];
1429  double wt1r, wt1i, wt2r, wt2i;
1430 
1431  int isite = m_arg[itask].isite;
1432 
1433  double *w2 = &v2[Nvcd * isite];
1434  const double *w1 = &v1[Nvcd * isite];
1435  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1436 
1437  int kt1 = m_arg[itask].kt1;
1438  int Nxy = m_Nx * m_Ny;
1439  int Nxyz = Nxy * m_Nz;
1440 
1441  for (int it = 0; it < m_Mt - kt1; ++it) {
1442  for (int iz = 0; iz < m_Mz; ++iz) {
1443  for (int ixy = 0; ixy < Nxy; ++ixy) {
1444  int is = ixy + Nxy * (iz + m_Nz * it);
1445  int iv = Nvcd * is;
1446  int in = Nvcd * (is + Nxyz);
1447  int ig = m_Ndf * is;
1448 
1449  for (int ic = 0; ic < m_Nc; ++ic) {
1450  vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1451  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1452  vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1453  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1454  }
1455 
1456  for (int ic = 0; ic < m_Nc; ++ic) {
1457  int ic2 = ic * m_Nvc;
1458 
1459  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1460  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1461  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1462  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1463 
1464  w2[2 * ic + id3 + iv] += wt1r;
1465  w2[2 * ic + 1 + id3 + iv] += wt1i;
1466  w2[2 * ic + id4 + iv] += wt2r;
1467  w2[2 * ic + 1 + id4 + iv] += wt2i;
1468  }
1469  }
1470  }
1471  }
1472 }
1473 
1474 
1475 //====================================================================
1477  int itask, double *vcp1, const double *v1)
1478 {
1479  int Nvc2 = 2 * m_Nvc;
1480  int Nvcd = m_Nvc * m_Nd;
1481  int Nvcd2 = Nvcd / 2;
1482 
1483  int id1 = 0;
1484  int id2 = m_Nvc;
1485  int id3 = m_Nvc * 2;
1486  int id4 = m_Nvc * 3;
1487 
1488  int idir = 3;
1489 
1490  int isite = m_arg[itask].isite;
1491  int isite_cp = m_arg[itask].isite_cpt;
1492 
1493  // double* w2 = &vcp1[Nvcd2*isite_cp];
1494  double *w2
1495  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1496  const double *w1 = &v1[Nvcd * isite];
1497  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1498 
1499  double vt1[m_Nvc], vt2[m_Nvc];
1500 
1501  if (m_arg[itask].kt1 == 1) {
1502  int Nxy = m_Nx * m_Ny;
1503  int it = m_Mt - 1;
1504  for (int iz = 0; iz < m_Mz; ++iz) {
1505  for (int ixy = 0; ixy < Nxy; ++ixy) {
1506  int is = ixy + Nxy * (iz + m_Nz * it);
1507  int is2 = ixy + Nxy * iz;
1508  int in = Nvcd * is;
1509  int ig = m_Ndf * is;
1510  int ix1 = Nvc2 * is2;
1511  int ix2 = ix1 + m_Nvc;
1512 
1513  for (int ic = 0; ic < m_Nc; ++ic) {
1514  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1515  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1516  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1517  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1518  }
1519 
1520  for (int ic = 0; ic < m_Nc; ++ic) {
1521  int icr = 2 * ic;
1522  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1523  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1524  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1525  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1526  }
1527  }
1528  }
1529  }
1530 
1531  m_fw_send[idir]->start_thread(itask);
1532 }
1533 
1534 
1535 //====================================================================
1537  int itask, double *v2, const double *vcp2)
1538 {
1539  int Nvc2 = 2 * m_Nvc;
1540  int Nvcd = m_Nvc * m_Nd;
1541  int Nvcd2 = Nvcd / 2;
1542 
1543  int id1 = 0;
1544  int id2 = m_Nvc;
1545  int id3 = m_Nvc * 2;
1546  int id4 = m_Nvc * 3;
1547 
1548  int idir = 3;
1549  double bc2 = m_boundary2[idir];
1550 
1551  double wt1r, wt1i, wt2r, wt2i;
1552 
1553  int isite = m_arg[itask].isite;
1554  int isite_cp = m_arg[itask].isite_cpt;
1555 
1556  double *w2 = &v2[Nvcd * isite];
1557  // double* w1 = &vcp2[Nvcd2*isite_cp];
1558  const double *w1
1559  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1560 
1561  m_fw_recv[idir]->wait_thread(itask);
1562 
1563  if (m_arg[itask].kt0 == 1) {
1564  int Nxy = m_Nx * m_Ny;
1565  int it = 0;
1566  for (int iz = 0; iz < m_Mz; ++iz) {
1567  for (int ixy = 0; ixy < Nxy; ++ixy) {
1568  int is = ixy + Nxy * (iz + m_Nz * it);
1569  int is2 = ixy + Nxy * iz;
1570  int iv = Nvcd * is;
1571  int ix1 = Nvc2 * is2;
1572  int ix2 = ix1 + m_Nvc;
1573 
1574  for (int ic = 0; ic < m_Nc; ++ic) {
1575  int icr = 2 * ic;
1576  int ici = 2 * ic + 1;
1577  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1578  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1579  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1580  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1581  }
1582  }
1583  }
1584  }
1585 }
1586 
1587 
1588 //====================================================================
1590  int itask, double *v2, const double *v1)
1591 {
1592  int Nvcd = m_Nvc * m_Nd;
1593 
1594  int id1 = 0;
1595  int id2 = m_Nvc;
1596  int id3 = m_Nvc * 2;
1597  int id4 = m_Nvc * 3;
1598 
1599  int idir = 3;
1600 
1601  double vt1[m_Nvc], vt2[m_Nvc];
1602  double wt1r, wt1i, wt2r, wt2i;
1603 
1604  int isite = m_arg[itask].isite;
1605 
1606  double *w2 = &v2[Nvcd * isite];
1607  const double *w1 = &v1[Nvcd * isite];
1608  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1609 
1610  int kt0 = m_arg[itask].kt0;
1611  int Nxy = m_Nx * m_Ny;
1612  int Nxyz = Nxy * m_Nz;
1613 
1614  for (int it = kt0; it < m_Mt; ++it) {
1615  for (int iz = 0; iz < m_Mz; ++iz) {
1616  for (int ixy = 0; ixy < Nxy; ++ixy) {
1617  int is = ixy + Nxy * (iz + m_Nz * it);
1618  int iv = Nvcd * is;
1619  int in = Nvcd * (is - Nxyz);
1620  int ig = m_Ndf * (is - Nxyz);
1621 
1622  for (int ic = 0; ic < m_Nc; ++ic) {
1623  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1624  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1625  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1626  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1627  }
1628 
1629  for (int ic = 0; ic < m_Nc; ++ic) {
1630  int ic2 = 2 * ic;
1631  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1632  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1633  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1634  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1635 
1636  w2[ic2 + id1 + iv] += wt1r;
1637  w2[ic2 + 1 + id1 + iv] += wt1i;
1638  w2[ic2 + id2 + iv] += wt2r;
1639  w2[ic2 + 1 + id2 + iv] += wt2i;
1640  }
1641  }
1642  }
1643  }
1644 }
1645 
1646 
1647 //====================================================================
1649  int itask, double *vcp1, const double *v1)
1650 {
1651  int Nvc2 = 2 * m_Nvc;
1652  int Nvcd = m_Nvc * m_Nd;
1653  int Nvcd2 = Nvcd / 2;
1654 
1655  int id1 = 0;
1656  int id2 = m_Nvc;
1657  int id3 = m_Nvc * 2;
1658  int id4 = m_Nvc * 3;
1659 
1660  int isite = m_arg[itask].isite;
1661  int isite_cp = m_arg[itask].isite_cpt;
1662 
1663  int idir = 3;
1664  double bc2 = m_boundary2[idir];
1665 
1666  // double* w2 = &vcp1[Nvcd2*isite_cp];
1667  double *w2
1668  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1669  const double *w1 = &v1[Nvcd * isite];
1670 
1671  if (m_arg[itask].kt0 == 1) {
1672  int Nxy = m_Nx * m_Ny;
1673  int it = 0;
1674  for (int iz = 0; iz < m_Mz; ++iz) {
1675  for (int ixy = 0; ixy < Nxy; ++ixy) {
1676  int is = ixy + Nxy * (iz + m_Nz * it);
1677  int is2 = ixy + Nxy * iz;
1678 
1679  int in = Nvcd * is;
1680  int ix1 = Nvc2 * is2;
1681  int ix2 = ix1 + m_Nvc;
1682 
1683  for (int ic = 0; ic < m_Nc; ++ic) {
1684  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1685  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1686  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1687  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
1688  }
1689  }
1690  }
1691  }
1692 
1693  m_bw_send[idir]->start_thread(itask);
1694 }
1695 
1696 
1697 //====================================================================
1699  int itask, double *v2, const double *vcp2)
1700 {
1701  int Nvc2 = 2 * m_Nvc;
1702  int Nvcd = m_Nvc * m_Nd;
1703  int Nvcd2 = Nvcd / 2;
1704 
1705  int id1 = 0;
1706  int id2 = m_Nvc;
1707  int id3 = m_Nvc * 2;
1708  int id4 = m_Nvc * 3;
1709 
1710  int idir = 3;
1711 
1712  double wt1r, wt1i, wt2r, wt2i;
1713 
1714  int isite = m_arg[itask].isite;
1715  int isite_cp = m_arg[itask].isite_cpt;
1716 
1717  double *w2 = &v2[Nvcd * isite];
1718  // double* w1 = &vcp2[Nvcd2*isite_cp];
1719  const double *w1
1720  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1721  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1722 
1723  m_bw_recv[idir]->wait_thread(itask);
1724 
1725  if (m_arg[itask].kt1 == 1) {
1726  int Nxy = m_Nx * m_Ny;
1727  int it = m_Mt - 1;
1728  for (int iz = 0; iz < m_Mz; ++iz) {
1729  for (int ixy = 0; ixy < Nxy; ++ixy) {
1730  int is = ixy + Nxy * (iz + m_Nz * it);
1731  int is2 = ixy + Nxy * iz;
1732  int iv = Nvcd * is;
1733  int ig = m_Ndf * is;
1734  int ix1 = Nvc2 * is2;
1735  int ix2 = ix1 + m_Nvc;
1736 
1737  for (int ic = 0; ic < m_Nc; ++ic) {
1738  int ic2 = ic * m_Nvc;
1739 
1740  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1741  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1742  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1743  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1744 
1745  w2[2 * ic + id1 + iv] += wt1r;
1746  w2[2 * ic + 1 + id1 + iv] += wt1i;
1747  w2[2 * ic + id2 + iv] += wt2r;
1748  w2[2 * ic + 1 + id2 + iv] += wt2i;
1749  w2[2 * ic + id3 + iv] += wt1r;
1750  w2[2 * ic + 1 + id3 + iv] += wt1i;
1751  w2[2 * ic + id4 + iv] += wt2r;
1752  w2[2 * ic + 1 + id4 + iv] += wt2i;
1753  }
1754  }
1755  }
1756  }
1757 }
1758 
1759 
1760 //====================================================================
1762  int itask, double *v2, const double *v1)
1763 {
1764  int Nvcd = m_Nvc * m_Nd;
1765 
1766  int id1 = 0;
1767  int id2 = m_Nvc;
1768  int id3 = m_Nvc * 2;
1769  int id4 = m_Nvc * 3;
1770 
1771  int idir = 3;
1772 
1773  double vt1[m_Nvc], vt2[m_Nvc];
1774  double wt1r, wt1i, wt2r, wt2i;
1775 
1776  int isite = m_arg[itask].isite;
1777 
1778  double *w2 = &v2[Nvcd * isite];
1779  const double *w1 = &v1[Nvcd * isite];
1780  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1781 
1782  int kt1 = m_arg[itask].kt1;
1783  int Nxy = m_Nx * m_Ny;
1784  int Nxyz = Nxy * m_Nz;
1785 
1786  for (int it = 0; it < m_Mt - kt1; ++it) {
1787  for (int iz = 0; iz < m_Mz; ++iz) {
1788  for (int ixy = 0; ixy < Nxy; ++ixy) {
1789  int is = ixy + Nxy * (iz + m_Nz * it);
1790  int iv = Nvcd * is;
1791  int in = Nvcd * (is + Nxyz);
1792  int ig = m_Ndf * is;
1793 
1794  for (int ic = 0; ic < m_Nc; ++ic) {
1795  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1796  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1797  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1798  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1799  }
1800 
1801  for (int ic = 0; ic < m_Nc; ++ic) {
1802  int ic2 = ic * m_Nvc;
1803 
1804  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1805  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1806  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1807  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1808 
1809  w2[2 * ic + id1 + iv] += wt1r;
1810  w2[2 * ic + 1 + id1 + iv] += wt1i;
1811  w2[2 * ic + id2 + iv] += wt2r;
1812  w2[2 * ic + 1 + id2 + iv] += wt2i;
1813  w2[2 * ic + id3 + iv] += wt1r;
1814  w2[2 * ic + 1 + id3 + iv] += wt1i;
1815  w2[2 * ic + id4 + iv] += wt2r;
1816  w2[2 * ic + 1 + id4 + iv] += wt2i;
1817  }
1818  }
1819  }
1820  }
1821 }
1822 
1823 
1824 //====================================================================
1826  int itask, double *vcp1, const double *v1)
1827 {
1828  int Nvc2 = 2 * m_Nvc;
1829  int Nvcd = m_Nvc * m_Nd;
1830  int Nvcd2 = Nvcd / 2;
1831 
1832  int id1 = 0;
1833  int id2 = m_Nvc;
1834  int id3 = m_Nvc * 2;
1835  int id4 = m_Nvc * 3;
1836 
1837  int idir = 3;
1838 
1839  int isite = m_arg[itask].isite;
1840  int isite_cp = m_arg[itask].isite_cpt;
1841 
1842  // double* w2 = &vcp1[Nvcd2*isite_cp];
1843  double *w2
1844  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1845  const double *w1 = &v1[Nvcd * isite];
1846  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1847 
1848  double vt1[m_Nvc], vt2[m_Nvc];
1849 
1850  if (m_arg[itask].kt1 == 1) {
1851  int Nxy = m_Nx * m_Ny;
1852  int it = m_Mt - 1;
1853  for (int iz = 0; iz < m_Mz; ++iz) {
1854  for (int ixy = 0; ixy < Nxy; ++ixy) {
1855  int is = ixy + Nxy * (iz + m_Nz * it);
1856  int is2 = ixy + Nxy * iz;
1857  int in = Nvcd * is;
1858  int ig = m_Ndf * is;
1859  int ix1 = Nvc2 * is2;
1860  int ix2 = ix1 + m_Nvc;
1861 
1862  for (int ic = 0; ic < m_Nc; ++ic) {
1863  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1864  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1865  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1866  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1867  }
1868 
1869  for (int ic = 0; ic < m_Nc; ++ic) {
1870  int icr = 2 * ic;
1871  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1872  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1873  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1874  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1875  }
1876  }
1877  }
1878  }
1879 
1880  m_fw_send[idir]->start_thread(itask);
1881 }
1882 
1883 
1884 //====================================================================
1886  int itask, double *v2, const double *vcp2)
1887 {
1888  int Nvc2 = 2 * m_Nvc;
1889  int Nvcd = m_Nvc * m_Nd;
1890  int Nvcd2 = Nvcd / 2;
1891 
1892  int id1 = 0;
1893  int id2 = m_Nvc;
1894  int id3 = m_Nvc * 2;
1895  int id4 = m_Nvc * 3;
1896 
1897  int idir = 3;
1898  double bc2 = m_boundary2[idir];
1899 
1900  double wt1r, wt1i, wt2r, wt2i;
1901 
1902  int isite = m_arg[itask].isite;
1903  int isite_cp = m_arg[itask].isite_cpt;
1904 
1905  double *w2 = &v2[Nvcd * isite];
1906  // double* w1 = &vcp2[Nvcd2*isite_cp];
1907  const double *w1
1908  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1909 
1910  m_fw_recv[idir]->wait_thread(itask);
1911 
1912  if (m_arg[itask].kt0 == 1) {
1913  int Nxy = m_Nx * m_Ny;
1914  int it = 0;
1915  for (int iz = 0; iz < m_Mz; ++iz) {
1916  for (int ixy = 0; ixy < Nxy; ++ixy) {
1917  int is = ixy + Nxy * (iz + m_Nz * it);
1918  int is2 = ixy + Nxy * iz;
1919  int iv = Nvcd * is;
1920  int ix1 = Nvc2 * is2;
1921  int ix2 = ix1 + m_Nvc;
1922 
1923  for (int ic = 0; ic < m_Nc; ++ic) {
1924  int icr = 2 * ic;
1925  int ici = 2 * ic + 1;
1926  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1927  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1928  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1929  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1930  w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1931  w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1932  w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1933  w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
1934  }
1935  }
1936  }
1937  }
1938 }
1939 
1940 
1941 //====================================================================
1943  int itask, double *v2, const double *v1)
1944 {
1945  int Nvcd = m_Nvc * m_Nd;
1946 
1947  int id1 = 0;
1948  int id2 = m_Nvc;
1949  int id3 = m_Nvc * 2;
1950  int id4 = m_Nvc * 3;
1951 
1952  int idir = 3;
1953 
1954  double vt1[m_Nvc], vt2[m_Nvc];
1955  double wt1r, wt1i, wt2r, wt2i;
1956 
1957  int isite = m_arg[itask].isite;
1958 
1959  double *w2 = &v2[Nvcd * isite];
1960  const double *w1 = &v1[Nvcd * isite];
1961  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1962 
1963  int kt0 = m_arg[itask].kt0;
1964  int Nxy = m_Nx * m_Ny;
1965  int Nxyz = Nxy * m_Nz;
1966 
1967  for (int it = kt0; it < m_Mt; ++it) {
1968  for (int iz = 0; iz < m_Mz; ++iz) {
1969  for (int ixy = 0; ixy < Nxy; ++ixy) {
1970  int is = ixy + Nxy * (iz + m_Nz * it);
1971  int iv = Nvcd * is;
1972  int in = Nvcd * (is - Nxyz);
1973  int ig = m_Ndf * (is - Nxyz);
1974 
1975  for (int ic = 0; ic < m_Nc; ++ic) {
1976  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1977  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1978  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1979  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1980  }
1981 
1982  for (int ic = 0; ic < m_Nc; ++ic) {
1983  int ic2 = 2 * ic;
1984  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1985  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1986  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1987  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1988 
1989  w2[ic2 + id1 + iv] += wt1r;
1990  w2[ic2 + 1 + id1 + iv] += wt1i;
1991  w2[ic2 + id2 + iv] += wt2r;
1992  w2[ic2 + 1 + id2 + iv] += wt2i;
1993  w2[ic2 + id3 + iv] -= wt1r;
1994  w2[ic2 + 1 + id3 + iv] -= wt1i;
1995  w2[ic2 + id4 + iv] -= wt2r;
1996  w2[ic2 + 1 + id4 + iv] -= wt2i;
1997  }
1998  }
1999  }
2000  }
2001 }
2002 
2003 
2004 //====================================================================
2006  int itask, double *v2, const double *v1)
2007 {
2008  int Nvcd = m_Nvc * m_Nd;
2009  int Nxy = m_Nx * m_Ny;
2010 
2011  int id1 = 0;
2012  int id2 = m_Nvc;
2013  int id3 = m_Nvc * 2;
2014  int id4 = m_Nvc * 3;
2015 
2016  int isite = m_arg[itask].isite;
2017  double *w2 = &v2[Nvcd * isite];
2018  const double *w1 = &v1[Nvcd * isite];
2019 
2020  for (int it = 0; it < m_Mt; ++it) {
2021  for (int iz = 0; iz < m_Mz; ++iz) {
2022  for (int ixy = 0; ixy < Nxy; ++ixy) {
2023  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2024  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2025  w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2026  w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2027  w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2028  w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
2029  }
2030  }
2031  }
2032  }
2033 }
2034 
2035 
2036 //====================================================================
2038  int itask, double *v2, const double *v1)
2039 {
2040  int Nvcd = m_Nvc * m_Nd;
2041  int Nxy = m_Nx * m_Ny;
2042 
2043  int id1 = 0;
2044  int id2 = m_Nvc;
2045  int id3 = m_Nvc * 2;
2046  int id4 = m_Nvc * 3;
2047 
2048  int isite = m_arg[itask].isite;
2049  double *w2 = &v2[Nvcd * isite];
2050  const double *w1 = &v1[Nvcd * isite];
2051 
2052  for (int it = 0; it < m_Mt; ++it) {
2053  for (int iz = 0; iz < m_Mz; ++iz) {
2054  for (int ixy = 0; ixy < Nxy; ++ixy) {
2055  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2056  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2057  w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2058  w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2059  w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2060  w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
2061  }
2062  }
2063  }
2064  }
2065 }
2066 
2067 
2068 //====================================================================
2069 //============================================================END=====
BridgeIO vout
Definition: bridgeIO.cpp:278
void mult_zp2_thread(int, double *, const double *)
void mult_yp2_thread(int, double *, const double *)
void mult_tm1_dirac_thread(int, double *, const double *)
void gm5_dirac_thread(int, double *, const double *)
void general(const char *format,...)
Definition: bridgeIO.cpp:65
std::vector< Channel * > m_bw_recv
void mult_ym2_thread(int, double *, const double *)
void mult_xp2_thread(int, double *, const double *)
void mult_yp1_thread(int, double *, const double *)
void mult_tm1_chiral_thread(int, double *, const double *)
void mult_ym1_thread(int, double *, const double *)
void mult_xp1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_recv
std::vector< Channel * > m_fw_send
void mult_xm2_thread(int, double *, const double *)
void mult_tp1_dirac_thread(int, double *, const double *)
std::vector< mult_arg > m_arg
void mult_ymb_thread(int, double *, const double *)
void mult_zpb_thread(int, double *, const double *)
void mult_xm1_thread(int, double *, const double *)
void daypx_thread(int, double *, double, const double *)
void mult_tpb_chiral_thread(int, double *, const double *)
void mult_tp1_chiral_thread(int, double *, const double *)
std::vector< Channel * > m_bw_send
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
void gm5_chiral_thread(int, double *, const double *)
void mult_tp2_chiral_thread(int, double *, const double *)
void mult_zp1_thread(int, double *, const double *)
void mult_ypb_thread(int, double *, const double *)
void crucial(const char *format,...)
Definition: bridgeIO.cpp:48
void mult_zmb_thread(int, double *, const double *)
void mult_tmb_dirac_thread(int, double *, const double *)
void mult_xmb_thread(int, double *, const double *)
void mult_xpb_thread(int, double *, const double *)
void mult_zm2_thread(int, double *, const double *)
void mult_tm2_dirac_thread(int, double *, const double *)
void mult_tpb_dirac_thread(int, double *, const double *)
void mult_tp2_dirac_thread(int, double *, const double *)
void mult_zm1_thread(int, double *, const double *)
void mult_tm2_chiral_thread(int, double *, const double *)
void mult_tmb_chiral_thread(int, double *, const double *)