Bridge++  Version 1.4.4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fopr_Wilson_impl_thread.cpp
Go to the documentation of this file.
1 
14 #include "fopr_Wilson_impl.h"
15 
16 #include "IO/bridgeIO.h"
17 using Bridge::vout;
18 
20 
21 namespace Imp_BGQ {
22 #if defined USE_GROUP_SU3
23 #include "fopr_Wilson_impl_SU3.inc"
24 #elif defined USE_GROUP_SU2
25 #include "fopr_Wilson_impl_SU2.inc"
26 #elif defined USE_GROUP_SU_N
27 #include "fopr_Wilson_impl_SU_N.inc"
28 #endif
29 
30 //====================================================================
32  {
34 
35  // The following setup corresponds to unifirm division of volume.
36  if (m_Nthread <= m_Nt) {
38  } else if (m_Nthread <= m_Nz * m_Nt) {
39  m_Ntask_t = m_Nt;
40  } else {
41  vout.crucial(m_vl, "Error at %s: Too large Nthread=%d\n", class_name.c_str(), m_Nthread);
42  exit(EXIT_FAILURE);
43  }
45  if (m_Ntask_z * m_Ntask_t != m_Nthread) {
46  vout.crucial(m_vl, "Error at %s: Nz(%d) * Nt(%d) \neq Nthread(%d)\n",
47  class_name.c_str(), m_Nz, m_Nt, m_Nthread);
48  exit(EXIT_FAILURE);
49  }
51  m_Mz = m_Nz / m_Ntask_z;
52  m_Mt = m_Nt / m_Ntask_t;
53 
54  // The following setup is not monotonic division, and requires
55  // barrier at the beginning and end of mult (D and gamma5).
56  // [H.Matsufuru 22 Oct 2013]
57  // if(m_Nthread >= 64){
58  // m_Ntask_z = 8;
59  // }else if(m_Nthread >= 16){
60  // m_Ntask_z = 4;
61  // }else if(m_Nthread >= 4){
62  // m_Ntask_z = 2;
63  // }else{
64  // m_Ntask_z = 1;
65  // }
66  // m_Ntask_t = m_Nthread/m_Ntask_z;
67  // m_Ntask = m_Ntask_t * m_Ntask_z;
68  // m_Mz = m_Nz/m_Ntask_z;
69  // m_Mt = m_Nt/m_Ntask_t;
70 
71  vout.general(m_vl, " Nthread = %d\n", m_Nthread);
72  vout.general(m_vl, " Ntask = %d\n", m_Ntask);
73  vout.general(m_vl, " Ntask_z = %d Ntask_t = %d\n", m_Ntask_z, m_Ntask_t);
74  vout.general(m_vl, " Mz = %d Mt = %d\n", m_Mz, m_Mt);
75 
76  // setup of arguments
77  int Nxy = m_Nx * m_Ny;
78  m_arg.resize(m_Ntask);
79  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
80  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
81  int itask = ith_z + m_Ntask_z * ith_t;
82 
83  m_arg[itask].isite = (ith_z * m_Mz + ith_t * (m_Nz * m_Mt)) * Nxy;
84 
85  m_arg[itask].kt0 = 0;
86  m_arg[itask].kt1 = 0;
87  m_arg[itask].kz0 = 0;
88  m_arg[itask].kz1 = 0;
89  if (ith_t == 0) m_arg[itask].kt0 = 1;
90  if (ith_z == 0) m_arg[itask].kz0 = 1;
91  if (ith_t == m_Ntask_t - 1) m_arg[itask].kt1 = 1;
92  if (ith_z == m_Ntask_z - 1) m_arg[itask].kz1 = 1;
93 
94  m_arg[itask].isite_cpx = itask * m_Mz * m_Mt * m_Ny;
95  m_arg[itask].isite_cpy = itask * m_Mz * m_Mt * m_Nx;
96  m_arg[itask].isite_cpz = ith_t * m_Mt * Nxy;
97  m_arg[itask].isite_cpt = ith_z * m_Mz * Nxy;
98  }
99  }
100 
101  // setup for async data transfer
102  int Nc = CommonParameters::Nc();
103  int Nd = CommonParameters::Nd();
104  int Nvcd2 = 2 * Nc * Nd / 2;
105 
106  std::vector<int> destid(m_Ntask);
107  std::vector<int> offset(m_Ntask);
108  std::vector<int> datasize(m_Ntask);
109  std::vector<int> offset_up(m_Ntask);
110  std::vector<int> offset_lw(m_Ntask);
111  std::vector<int> datasize_up(m_Ntask);
112  std::vector<int> datasize_lw(m_Ntask);
113 
114  int imu = 0;
115  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
116  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
117  int itask = ith_z + ith_t * m_Ntask_z;
118  int isite_cp = itask * m_Mz * m_Mt * m_Ny;
119  destid[itask] = itask;
120  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
121  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Ny;
122  }
123  }
124  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
125  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
126  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
127  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
128 
129  imu = 1;
130  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
131  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
132  int itask = ith_z + ith_t * m_Ntask_z;
133  int isite_cp = itask * m_Mz * m_Mt * m_Nx;
134  destid[itask] = itask;
135  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
136  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Nx;
137  }
138  }
139  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
140  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
141  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
142  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
143 
144  imu = 2;
145  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
146  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
147  int itask = ith_z + m_Ntask_z * ith_t;
148  destid[itask] = -1;
149  offset_up[itask] = 0;
150  offset_lw[itask] = 0;
151  datasize_up[itask] = 0;
152  datasize_lw[itask] = 0;
153  if (ith_z == 0) {
154  destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
155  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
156  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
157  }
158  if (ith_z == m_Ntask_z - 1) {
159  destid[itask] = ith_t * m_Ntask_z;
160  offset_up[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
161  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
162  }
163  }
164  }
165  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
166  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
167  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
168  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
169 
170  imu = 3;
171  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
172  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
173  int itask = ith_z + m_Ntask_z * ith_t;
174  destid[itask] = -1;
175  offset_up[itask] = 0;
176  offset_lw[itask] = 0;
177  datasize_up[itask] = 0;
178  datasize_lw[itask] = 0;
179  if (ith_t == 0) {
180  destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
181  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
182  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
183  }
184  if (ith_t == m_Ntask_t - 1) {
185  destid[itask] = ith_z;
186  offset_up[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
187  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
188  }
189  }
190  }
191  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
192  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
193  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
194  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
195  }
196 
197 
198 //====================================================================
200  int itask, double *v2, double fac, const double *v1)
201  {
202  int Nvcd = m_Nvc * m_Nd;
203  int Nvxy = Nvcd * m_Nx * m_Ny;
204 
205  int isite = m_arg[itask].isite;
206 
207  double *w2 = &v2[Nvcd * isite];
208  const double *w1 = &v1[Nvcd * isite];
209 
210  for (int it = 0; it < m_Mt; ++it) {
211  for (int iz = 0; iz < m_Mz; ++iz) {
212  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
213  int iv = ivxy + Nvxy * (iz + m_Nz * it);
214  w2[iv] = fac * w2[iv] + w1[iv];
215  }
216  }
217  }
218  }
219 
220 
221 //====================================================================
223  double *v2)
224  {
225  int Nvcd = m_Nvc * m_Nd;
226  int Nvxy = Nvcd * m_Nx * m_Ny;
227 
228  int isite = m_arg[itask].isite;
229  double *w2 = &v2[Nvcd * isite];
230 
231  for (int it = 0; it < m_Mt; ++it) {
232  for (int iz = 0; iz < m_Mz; ++iz) {
233  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
234  int iv = ivxy + Nvxy * (iz + m_Nz * it);
235  w2[iv] = 0.0;
236  }
237  }
238  }
239  }
240 
241 
242 //====================================================================
244  int itask, double *vcp1, const double *v1)
245  {
246  int Nvc2 = 2 * m_Nvc;
247  int Nvcd = m_Nvc * m_Nd;
248  int Nvcd2 = Nvcd / 2;
249 
250  int id1 = 0;
251  int id2 = m_Nvc;
252  int id3 = m_Nvc * 2;
253  int id4 = m_Nvc * 3;
254 
255  int idir = 0;
256  double bc2 = m_boundary2[idir];
257 
258  int isite = m_arg[itask].isite;
259  int isite_cp = m_arg[itask].isite_cpx;
260 
261  // double* w2 = &vcp1[Nvcd2*isite_cp];
262  double *w2
263  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
264  const double *w1 = &v1[Nvcd * isite];
265 
266  int ix = 0;
267 
268  for (int it = 0; it < m_Mt; ++it) {
269  for (int iz = 0; iz < m_Mz; ++iz) {
270  for (int iy = 0; iy < m_Ny; ++iy) {
271  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
272  int is2 = iy + m_Ny * (iz + m_Mz * it);
273  int in = Nvcd * is;
274  int ix1 = Nvc2 * is2;
275  int ix2 = ix1 + m_Nvc;
276 
277  for (int ic = 0; ic < m_Nc; ++ic) {
278  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
279  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
280  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
281  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
282  }
283  }
284  }
285  }
286 
287  m_bw_send[idir]->start_thread(itask);
288  }
289 
290 
291 //====================================================================
293  int itask, double *v2, const double *vcp2)
294  {
295  int Nvc2 = 2 * m_Nvc;
296  int Nvcd = m_Nvc * m_Nd;
297  int Nvcd2 = Nvcd / 2;
298 
299  int id1 = 0;
300  int id2 = m_Nvc;
301  int id3 = m_Nvc * 2;
302  int id4 = m_Nvc * 3;
303 
304  int idir = 0;
305 
306  double wt1r, wt1i, wt2r, wt2i;
307 
308  int isite = m_arg[itask].isite;
309  int isite_cp = m_arg[itask].isite_cpx;
310 
311  double *w2 = &v2[Nvcd * isite];
312  // double* w1 = &vcp2[Nvcd2*isite_cp];
313  const double *w1
314  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
315  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
316 
317  m_bw_recv[idir]->wait_thread(itask);
318 
319  int ix = m_Nx - 1;
320  for (int it = 0; it < m_Mt; ++it) {
321  for (int iz = 0; iz < m_Mz; ++iz) {
322  for (int iy = 0; iy < m_Ny; ++iy) {
323  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
324  int is2 = iy + m_Ny * (iz + m_Mz * it);
325  int iv = Nvcd * is;
326  int ig = m_Ndf * is;
327  int ix1 = Nvc2 * is2;
328  int ix2 = ix1 + m_Nvc;
329 
330  for (int ic = 0; ic < m_Nc; ++ic) {
331  int ic2 = ic * m_Nvc;
332 
333  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
334  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
335  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
336  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
337 
338  w2[2 * ic + id1 + iv] += wt1r;
339  w2[2 * ic + 1 + id1 + iv] += wt1i;
340  w2[2 * ic + id2 + iv] += wt2r;
341  w2[2 * ic + 1 + id2 + iv] += wt2i;
342  w2[2 * ic + id3 + iv] += wt2i;
343  w2[2 * ic + 1 + id3 + iv] += -wt2r;
344  w2[2 * ic + id4 + iv] += wt1i;
345  w2[2 * ic + 1 + id4 + iv] += -wt1r;
346  }
347  }
348  }
349  }
350  }
351 
352 
353 //====================================================================
355  int itask, double *v2, const double *v1)
356  {
357  int Nvcd = m_Nvc * m_Nd;
358 
359  int id1 = 0;
360  int id2 = m_Nvc;
361  int id3 = m_Nvc * 2;
362  int id4 = m_Nvc * 3;
363 
364  int idir = 0;
365 
366  double vt1[m_Nvc], vt2[m_Nvc];
367  double wt1r, wt1i, wt2r, wt2i;
368 
369  int isite = m_arg[itask].isite;
370 
371  double *w2 = &v2[Nvcd * isite];
372  const double *w1 = &v1[Nvcd * isite];
373  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
374 
375  for (int it = 0; it < m_Mt; ++it) {
376  for (int iz = 0; iz < m_Mz; ++iz) {
377  for (int iy = 0; iy < m_Ny; ++iy) {
378  for (int ix = 0; ix < m_Nx - 1; ++ix) {
379  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
380  int iv = Nvcd * is;
381  int in = Nvcd * (is + 1);
382  int ig = m_Ndf * is;
383 
384  for (int ic = 0; ic < m_Nc; ++ic) {
385  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
386  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
387  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
388  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
389  }
390 
391  for (int ic = 0; ic < m_Nc; ++ic) {
392  int ic2 = ic * m_Nvc;
393 
394  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
395  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
396  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
397  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
398 
399  w2[2 * ic + id1 + iv] += wt1r;
400  w2[2 * ic + 1 + id1 + iv] += wt1i;
401  w2[2 * ic + id2 + iv] += wt2r;
402  w2[2 * ic + 1 + id2 + iv] += wt2i;
403  w2[2 * ic + id3 + iv] += wt2i;
404  w2[2 * ic + 1 + id3 + iv] += -wt2r;
405  w2[2 * ic + id4 + iv] += wt1i;
406  w2[2 * ic + 1 + id4 + iv] += -wt1r;
407  }
408  }
409  }
410  }
411  }
412  }
413 
414 
415 //====================================================================
417  int itask, double *vcp1, const double *v1)
418  {
419  int Nvc2 = 2 * m_Nvc;
420  int Nvcd = m_Nvc * m_Nd;
421  int Nvcd2 = Nvcd / 2;
422 
423  int id1 = 0;
424  int id2 = m_Nvc;
425  int id3 = m_Nvc * 2;
426  int id4 = m_Nvc * 3;
427 
428  int idir = 0;
429 
430  int isite = m_arg[itask].isite;
431  int isite_cp = m_arg[itask].isite_cpx;
432 
433  // double* w2 = &vcp1[Nvcd2*isite_cp];
434  double *w2
435  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
436  const double *w1 = &v1[Nvcd * isite];
437  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
438 
439  double vt1[m_Nvc], vt2[m_Nvc];
440 
441  int ix = m_Nx - 1;
442 
443  for (int it = 0; it < m_Mt; ++it) {
444  for (int iz = 0; iz < m_Mz; ++iz) {
445  for (int iy = 0; iy < m_Ny; ++iy) {
446  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
447  int is2 = iy + m_Ny * (iz + m_Mz * it);
448  int in = Nvcd * is;
449  int ig = m_Ndf * is;
450  int ix1 = Nvc2 * is2;
451  int ix2 = ix1 + m_Nvc;
452 
453  for (int ic = 0; ic < m_Nc; ++ic) {
454  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
455  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
456  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
457  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
458  }
459 
460  for (int ic = 0; ic < m_Nc; ++ic) {
461  int icr = 2 * ic;
462  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
463  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
464  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
465  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
466  }
467  }
468  }
469  }
470 
471  m_fw_send[idir]->start_thread(itask);
472  }
473 
474 
475 //====================================================================
477  int itask, double *v2, const double *vcp2)
478  {
479  int Nvc2 = 2 * m_Nvc;
480  int Nvcd = m_Nvc * m_Nd;
481  int Nvcd2 = Nvcd / 2;
482 
483  int id1 = 0;
484  int id2 = m_Nvc;
485  int id3 = m_Nvc * 2;
486  int id4 = m_Nvc * 3;
487 
488  int idir = 0;
489  double bc2 = m_boundary2[idir];
490 
491  double wt1r, wt1i, wt2r, wt2i;
492 
493  int isite = m_arg[itask].isite;
494  int isite_cp = m_arg[itask].isite_cpx;
495 
496  double *w2 = &v2[Nvcd * isite];
497  // double* w1 = &vcp2[Nvcd2*isite_cp];
498  const double *w1
499  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
500 
501  m_fw_recv[idir]->wait_thread(itask);
502 
503  int ix = 0;
504  for (int it = 0; it < m_Mt; ++it) {
505  for (int iz = 0; iz < m_Mz; ++iz) {
506  for (int iy = 0; iy < m_Ny; ++iy) {
507  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
508  int is2 = iy + m_Ny * (iz + m_Mz * it);
509  int iv = Nvcd * is;
510  int ix1 = Nvc2 * is2;
511  int ix2 = ix1 + m_Nvc;
512 
513  for (int ic = 0; ic < m_Nc; ++ic) {
514  int icr = 2 * ic;
515  int ici = 2 * ic + 1;
516  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
517  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
518  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
519  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
520  w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
521  w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
522  w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
523  w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
524  }
525  }
526  }
527  }
528  }
529 
530 
531 //====================================================================
533  int itask, double *v2, const double *v1)
534  {
535  int Nvcd = m_Nvc * m_Nd;
536 
537  int id1 = 0;
538  int id2 = m_Nvc;
539  int id3 = m_Nvc * 2;
540  int id4 = m_Nvc * 3;
541 
542  int idir = 0;
543 
544  double vt1[m_Nvc], vt2[m_Nvc];
545  double wt1r, wt1i, wt2r, wt2i;
546 
547  int isite = m_arg[itask].isite;
548 
549  double *w2 = &v2[Nvcd * isite];
550  const double *w1 = &v1[Nvcd * isite];
551  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
552 
553  for (int it = 0; it < m_Mt; ++it) {
554  for (int iz = 0; iz < m_Mz; ++iz) {
555  for (int iy = 0; iy < m_Ny; ++iy) {
556  for (int ix = 1; ix < m_Nx; ++ix) {
557  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
558  int iv = Nvcd * is;
559  int in = Nvcd * (is - 1);
560  int ig = m_Ndf * (is - 1);
561 
562  for (int ic = 0; ic < m_Nc; ++ic) {
563  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
564  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
565  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
566  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
567  }
568 
569  for (int ic = 0; ic < m_Nc; ++ic) {
570  int ic2 = 2 * ic;
571 
572  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
573  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
574  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
575  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
576 
577  w2[2 * ic + id1 + iv] += wt1r;
578  w2[2 * ic + 1 + id1 + iv] += wt1i;
579  w2[2 * ic + id2 + iv] += wt2r;
580  w2[2 * ic + 1 + id2 + iv] += wt2i;
581  w2[2 * ic + id3 + iv] += -wt2i;
582  w2[2 * ic + 1 + id3 + iv] += +wt2r;
583  w2[2 * ic + id4 + iv] += -wt1i;
584  w2[2 * ic + 1 + id4 + iv] += +wt1r;
585  }
586  }
587  }
588  }
589  }
590  }
591 
592 
593 //====================================================================
595  int itask, double *vcp1, const double *v1)
596  {
597  int Nvc2 = 2 * m_Nvc;
598  int Nvcd = m_Nvc * m_Nd;
599  int Nvcd2 = Nvcd / 2;
600 
601  int id1 = 0;
602  int id2 = m_Nvc;
603  int id3 = m_Nvc * 2;
604  int id4 = m_Nvc * 3;
605 
606  int isite = m_arg[itask].isite;
607  int isite_cp = m_arg[itask].isite_cpy;
608 
609  int idir = 1;
610  double bc2 = m_boundary2[idir];
611 
612  // double* w2 = &vcp1[Nvcd2*isite_cp];
613  double *w2
614  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
615  const double *w1 = &v1[Nvcd * isite];
616 
617  int iy = 0;
618 
619  for (int it = 0; it < m_Mt; ++it) {
620  for (int iz = 0; iz < m_Mz; ++iz) {
621  for (int ix = 0; ix < m_Nx; ++ix) {
622  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
623  int is2 = ix + m_Nx * (iz + m_Mz * it);
624  int in = Nvcd * is;
625  int ix1 = Nvc2 * is2;
626  int ix2 = ix1 + m_Nvc;
627 
628  for (int ic = 0; ic < m_Nc; ++ic) {
629  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
630  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
631  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
632  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
633  }
634  }
635  }
636  }
637 
638  m_bw_send[idir]->start_thread(itask);
639  }
640 
641 
642 //====================================================================
644  int itask, double *v2, const double *vcp2)
645  {
646  int Nvc2 = 2 * m_Nvc;
647  int Nvcd = m_Nvc * m_Nd;
648  int Nvcd2 = Nvcd / 2;
649 
650  int id1 = 0;
651  int id2 = m_Nvc;
652  int id3 = m_Nvc * 2;
653  int id4 = m_Nvc * 3;
654 
655  int idir = 1;
656 
657  double wt1r, wt1i, wt2r, wt2i;
658 
659  int isite = m_arg[itask].isite;
660  int isite_cp = m_arg[itask].isite_cpy;
661 
662  double *w2 = &v2[Nvcd * isite];
663  // double* w1 = &vcp2[Nvcd2*isite_cp];
664  const double *w1
665  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
666  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
667 
668  m_bw_recv[idir]->wait_thread(itask);
669 
670  int iy = m_Ny - 1;
671  for (int it = 0; it < m_Mt; ++it) {
672  for (int iz = 0; iz < m_Mz; ++iz) {
673  for (int ix = 0; ix < m_Nx; ++ix) {
674  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
675  int is2 = ix + m_Nx * (iz + m_Mz * it);
676  int iv = Nvcd * is;
677  int ig = m_Ndf * is;
678  int ix1 = Nvc2 * is2;
679  int ix2 = ix1 + m_Nvc;
680 
681  for (int ic = 0; ic < m_Nc; ++ic) {
682  int ic2 = ic * m_Nvc;
683 
684  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
685  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
686  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
687  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
688 
689  w2[2 * ic + id1 + iv] += wt1r;
690  w2[2 * ic + 1 + id1 + iv] += wt1i;
691  w2[2 * ic + id2 + iv] += wt2r;
692  w2[2 * ic + 1 + id2 + iv] += wt2i;
693  w2[2 * ic + id3 + iv] += -wt2r;
694  w2[2 * ic + 1 + id3 + iv] += -wt2i;
695  w2[2 * ic + id4 + iv] += wt1r;
696  w2[2 * ic + 1 + id4 + iv] += wt1i;
697  }
698  }
699  }
700  }
701  }
702 
703 
704 //====================================================================
706  int itask, double *v2, const double *v1)
707  {
708  int Nvcd = m_Nvc * m_Nd;
709 
710  int id1 = 0;
711  int id2 = m_Nvc;
712  int id3 = m_Nvc * 2;
713  int id4 = m_Nvc * 3;
714 
715  int idir = 1;
716 
717  double vt1[m_Nvc], vt2[m_Nvc];
718  double wt1r, wt1i, wt2r, wt2i;
719 
720  int isite = m_arg[itask].isite;
721 
722  double *w2 = &v2[Nvcd * isite];
723  const double *w1 = &v1[Nvcd * isite];
724  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
725 
726  for (int it = 0; it < m_Mt; ++it) {
727  for (int iz = 0; iz < m_Mz; ++iz) {
728  for (int iy = 0; iy < m_Ny - 1; ++iy) {
729  for (int ix = 0; ix < m_Nx; ++ix) {
730  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
731  int iv = Nvcd * is;
732  int in = Nvcd * (is + m_Nx);
733  int ig = m_Ndf * is;
734 
735  for (int ic = 0; ic < m_Nc; ++ic) {
736  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
737  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
738  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
739  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
740  }
741 
742  for (int ic = 0; ic < m_Nc; ++ic) {
743  int ic2 = ic * m_Nvc;
744 
745  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
746  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
747  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
748  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
749 
750  w2[2 * ic + id1 + iv] += wt1r;
751  w2[2 * ic + 1 + id1 + iv] += wt1i;
752  w2[2 * ic + id2 + iv] += wt2r;
753  w2[2 * ic + 1 + id2 + iv] += wt2i;
754  w2[2 * ic + id3 + iv] += -wt2r;
755  w2[2 * ic + 1 + id3 + iv] += -wt2i;
756  w2[2 * ic + id4 + iv] += wt1r;
757  w2[2 * ic + 1 + id4 + iv] += wt1i;
758  }
759  }
760  }
761  }
762  }
763  }
764 
765 
766 //====================================================================
768  int itask, double *vcp1, const double *v1)
769  {
770  int Nvc2 = 2 * m_Nvc;
771  int Nvcd = m_Nvc * m_Nd;
772  int Nvcd2 = Nvcd / 2;
773 
774  int id1 = 0;
775  int id2 = m_Nvc;
776  int id3 = m_Nvc * 2;
777  int id4 = m_Nvc * 3;
778 
779  int idir = 1;
780 
781  int isite = m_arg[itask].isite;
782  int isite_cp = m_arg[itask].isite_cpy;
783 
784  // double* w2 = &vcp1[Nvcd2*isite_cp];
785  double *w2
786  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
787 
788  const double *w1 = &v1[Nvcd * isite];
789  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
790 
791  double vt1[m_Nvc], vt2[m_Nvc];
792 
793  int iy = m_Ny - 1;
794 
795  for (int it = 0; it < m_Mt; ++it) {
796  for (int iz = 0; iz < m_Mz; ++iz) {
797  for (int ix = 0; ix < m_Nx; ++ix) {
798  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
799  int is2 = ix + m_Nx * (iz + m_Mz * it);
800  int in = Nvcd * is;
801  int ig = m_Ndf * is;
802  int ix1 = Nvc2 * is2;
803  int ix2 = ix1 + m_Nvc;
804 
805  for (int ic = 0; ic < m_Nc; ++ic) {
806  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
807  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
808  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
809  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
810  }
811 
812  for (int ic = 0; ic < m_Nc; ++ic) {
813  int icr = 2 * ic;
814  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
815  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
816  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
817  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
818  }
819  }
820  }
821  }
822 
823  m_fw_send[idir]->start_thread(itask);
824  }
825 
826 
827 //====================================================================
829  int itask, double *v2, const double *vcp2)
830  {
831  int Nvc2 = 2 * m_Nvc;
832  int Nvcd = m_Nvc * m_Nd;
833  int Nvcd2 = Nvcd / 2;
834 
835  int id1 = 0;
836  int id2 = m_Nvc;
837  int id3 = m_Nvc * 2;
838  int id4 = m_Nvc * 3;
839 
840  int idir = 1;
841  double bc2 = m_boundary2[idir];
842 
843  double wt1r, wt1i, wt2r, wt2i;
844 
845  int isite = m_arg[itask].isite;
846  int isite_cp = m_arg[itask].isite_cpy;
847 
848  double *w2 = &v2[Nvcd * isite];
849  // double* w1 = &vcp2[Nvcd2*isite_cp];
850  const double *w1
851  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
852 
853  m_fw_recv[idir]->wait_thread(itask);
854 
855  int iy = 0;
856  for (int it = 0; it < m_Mt; ++it) {
857  for (int iz = 0; iz < m_Mz; ++iz) {
858  for (int ix = 0; ix < m_Nx; ++ix) {
859  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
860  int is2 = ix + m_Nx * (iz + m_Mz * it);
861  int iv = Nvcd * is;
862  int ix1 = Nvc2 * is2;
863  int ix2 = ix1 + m_Nvc;
864 
865  for (int ic = 0; ic < m_Nc; ++ic) {
866  int icr = 2 * ic;
867  int ici = 2 * ic + 1;
868  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
869  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
870  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
871  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
872  w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
873  w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
874  w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
875  w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
876  }
877  }
878  }
879  }
880  }
881 
882 
883 //====================================================================
885  int itask, double *v2, const double *v1)
886  {
887  int Nvcd = m_Nvc * m_Nd;
888 
889  int id1 = 0;
890  int id2 = m_Nvc;
891  int id3 = m_Nvc * 2;
892  int id4 = m_Nvc * 3;
893 
894  int idir = 1;
895 
896  double vt1[m_Nvc], vt2[m_Nvc];
897  double wt1r, wt1i, wt2r, wt2i;
898 
899  int isite = m_arg[itask].isite;
900 
901  double *w2 = &v2[Nvcd * isite];
902  const double *w1 = &v1[Nvcd * isite];
903  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
904 
905  for (int it = 0; it < m_Mt; ++it) {
906  for (int iz = 0; iz < m_Mz; ++iz) {
907  for (int iy = 1; iy < m_Ny; ++iy) {
908  for (int ix = 0; ix < m_Nx; ++ix) {
909  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
910  int iv = Nvcd * is;
911  int in = Nvcd * (is - m_Nx);
912  int ig = m_Ndf * (is - m_Nx);
913 
914  for (int ic = 0; ic < m_Nc; ++ic) {
915  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
916  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
917  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
918  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
919  }
920 
921  for (int ic = 0; ic < m_Nc; ++ic) {
922  int ic2 = 2 * ic;
923  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
924  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
925  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
926  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
927 
928  w2[ic2 + id1 + iv] += wt1r;
929  w2[ic2 + 1 + id1 + iv] += wt1i;
930  w2[ic2 + id2 + iv] += wt2r;
931  w2[ic2 + 1 + id2 + iv] += wt2i;
932  w2[ic2 + id3 + iv] += wt2r;
933  w2[ic2 + 1 + id3 + iv] += wt2i;
934  w2[ic2 + id4 + iv] += -wt1r;
935  w2[ic2 + 1 + id4 + iv] += -wt1i;
936  }
937  }
938  }
939  }
940  }
941  }
942 
943 
944 //====================================================================
946  int itask, double *vcp1, const double *v1)
947  {
948  int Nvc2 = 2 * m_Nvc;
949  int Nvcd = m_Nvc * m_Nd;
950  int Nvcd2 = Nvcd / 2;
951 
952  int id1 = 0;
953  int id2 = m_Nvc;
954  int id3 = m_Nvc * 2;
955  int id4 = m_Nvc * 3;
956 
957  int isite = m_arg[itask].isite;
958  int isite_cp = m_arg[itask].isite_cpz;
959 
960  int idir = 2;
961  double bc2 = m_boundary2[idir];
962 
963  // double* w2 = &vcp1[Nvcd2*isite_cp];
964  double *w2
965  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
966  const double *w1 = &v1[Nvcd * isite];
967 
968  if (m_arg[itask].kz0 == 1) {
969  int Nxy = m_Nx * m_Ny;
970  int iz = 0;
971  for (int it = 0; it < m_Mt; ++it) {
972  for (int ixy = 0; ixy < Nxy; ++ixy) {
973  int is = ixy + Nxy * (iz + m_Nz * it);
974  int is2 = ixy + Nxy * it;
975 
976  int in = Nvcd * is;
977  int ix1 = Nvc2 * is2;
978  int ix2 = ix1 + m_Nvc;
979 
980  for (int ic = 0; ic < m_Nc; ++ic) {
981  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
982  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
983  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
984  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
985  }
986  }
987  }
988  }
989 
990  m_bw_send[idir]->start_thread(itask);
991  }
992 
993 
994 //====================================================================
996  int itask, double *v2, const double *vcp2)
997  {
998  int Nvc2 = 2 * m_Nvc;
999  int Nvcd = m_Nvc * m_Nd;
1000  int Nvcd2 = Nvcd / 2;
1001 
1002  int id1 = 0;
1003  int id2 = m_Nvc;
1004  int id3 = m_Nvc * 2;
1005  int id4 = m_Nvc * 3;
1006 
1007  int idir = 2;
1008 
1009  double wt1r, wt1i, wt2r, wt2i;
1010 
1011  int isite = m_arg[itask].isite;
1012  int isite_cp = m_arg[itask].isite_cpz;
1013 
1014  double *w2 = &v2[Nvcd * isite];
1015  // double* w1 = &vcp2[Nvcd2*isite_cp];
1016  const double *w1
1017  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1018  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1019 
1020  m_bw_recv[idir]->wait_thread(itask);
1021 
1022  if (m_arg[itask].kz1 == 1) {
1023  int Nxy = m_Nx * m_Ny;
1024  int iz = m_Mz - 1;
1025  for (int it = 0; it < m_Mt; ++it) {
1026  for (int ixy = 0; ixy < Nxy; ++ixy) {
1027  int is = ixy + Nxy * (iz + m_Nz * it);
1028  int is2 = ixy + Nxy * it;
1029  int iv = Nvcd * is;
1030  int ig = m_Ndf * is;
1031  int ix1 = Nvc2 * is2;
1032  int ix2 = ix1 + m_Nvc;
1033 
1034  for (int ic = 0; ic < m_Nc; ++ic) {
1035  int ic2 = ic * m_Nvc;
1036 
1037  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1038  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1039  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1040  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1041 
1042  w2[2 * ic + id1 + iv] += wt1r;
1043  w2[2 * ic + 1 + id1 + iv] += wt1i;
1044  w2[2 * ic + id2 + iv] += wt2r;
1045  w2[2 * ic + 1 + id2 + iv] += wt2i;
1046  w2[2 * ic + id3 + iv] += wt1i;
1047  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1048  w2[2 * ic + id4 + iv] += -wt2i;
1049  w2[2 * ic + 1 + id4 + iv] += wt2r;
1050  }
1051  }
1052  }
1053  }
1054  }
1055 
1056 
1057 //====================================================================
1059  int itask, double *v2, const double *v1)
1060  {
1061  int Nvcd = m_Nvc * m_Nd;
1062 
1063  int id1 = 0;
1064  int id2 = m_Nvc;
1065  int id3 = m_Nvc * 2;
1066  int id4 = m_Nvc * 3;
1067 
1068  int idir = 2;
1069 
1070  double vt1[m_Nvc], vt2[m_Nvc];
1071  double wt1r, wt1i, wt2r, wt2i;
1072 
1073  int isite = m_arg[itask].isite;
1074 
1075  double *w2 = &v2[Nvcd * isite];
1076  const double *w1 = &v1[Nvcd * isite];
1077  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1078 
1079  int kz1 = m_arg[itask].kz1;
1080  int Nxy = m_Nx * m_Ny;
1081 
1082  for (int it = 0; it < m_Mt; ++it) {
1083  for (int iz = 0; iz < m_Mz - kz1; ++iz) {
1084  for (int ixy = 0; ixy < Nxy; ++ixy) {
1085  int is = ixy + Nxy * (iz + m_Nz * it);
1086  int iv = Nvcd * is;
1087  int in = Nvcd * (is + Nxy);
1088  int ig = m_Ndf * is;
1089 
1090  for (int ic = 0; ic < m_Nc; ++ic) {
1091  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1092  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1093  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1094  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1095  }
1096 
1097  for (int ic = 0; ic < m_Nc; ++ic) {
1098  int ic2 = ic * m_Nvc;
1099 
1100  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1101  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1102  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1103  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1104 
1105  w2[2 * ic + id1 + iv] += wt1r;
1106  w2[2 * ic + 1 + id1 + iv] += wt1i;
1107  w2[2 * ic + id2 + iv] += wt2r;
1108  w2[2 * ic + 1 + id2 + iv] += wt2i;
1109  w2[2 * ic + id3 + iv] += wt1i;
1110  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1111  w2[2 * ic + id4 + iv] += -wt2i;
1112  w2[2 * ic + 1 + id4 + iv] += wt2r;
1113  }
1114  }
1115  }
1116  }
1117  }
1118 
1119 
1120 //====================================================================
1122  int itask, double *vcp1, const double *v1)
1123  {
1124  int Nvc2 = 2 * m_Nvc;
1125  int Nvcd = m_Nvc * m_Nd;
1126  int Nvcd2 = Nvcd / 2;
1127 
1128  int id1 = 0;
1129  int id2 = m_Nvc;
1130  int id3 = m_Nvc * 2;
1131  int id4 = m_Nvc * 3;
1132 
1133  int idir = 2;
1134 
1135  int isite = m_arg[itask].isite;
1136  int isite_cp = m_arg[itask].isite_cpz;
1137 
1138  // double* w2 = &vcp1[Nvcd2*isite_cp];
1139  double *w2
1140  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1141  const double *w1 = &v1[Nvcd * isite];
1142  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1143 
1144  double vt1[m_Nvc], vt2[m_Nvc];
1145 
1146  if (m_arg[itask].kz1 == 1) {
1147  int Nxy = m_Nx * m_Ny;
1148  int iz = m_Mz - 1;
1149  for (int it = 0; it < m_Mt; ++it) {
1150  for (int ixy = 0; ixy < Nxy; ++ixy) {
1151  int is = ixy + Nxy * (iz + m_Nz * it);
1152  int is2 = ixy + Nxy * it;
1153  int in = Nvcd * is;
1154  int ig = m_Ndf * is;
1155  int ix1 = Nvc2 * is2;
1156  int ix2 = ix1 + m_Nvc;
1157 
1158  for (int ic = 0; ic < m_Nc; ++ic) {
1159  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1160  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1161  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1162  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1163  }
1164 
1165  for (int ic = 0; ic < m_Nc; ++ic) {
1166  int icr = 2 * ic;
1167  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1168  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1169  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1170  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1171  }
1172  }
1173  }
1174  }
1175 
1176  m_fw_send[idir]->start_thread(itask);
1177  }
1178 
1179 
1180 //====================================================================
1182  int itask, double *v2, const double *vcp2)
1183  {
1184  int Nvc2 = 2 * m_Nvc;
1185  int Nvcd = m_Nvc * m_Nd;
1186  int Nvcd2 = Nvcd / 2;
1187 
1188  int id1 = 0;
1189  int id2 = m_Nvc;
1190  int id3 = m_Nvc * 2;
1191  int id4 = m_Nvc * 3;
1192 
1193  int idir = 2;
1194  double bc2 = m_boundary2[idir];
1195 
1196  double wt1r, wt1i, wt2r, wt2i;
1197 
1198  int isite = m_arg[itask].isite;
1199  int isite_cp = m_arg[itask].isite_cpz;
1200 
1201  double *w2 = &v2[Nvcd * isite];
1202  // double* w1 = &vcp2[Nvcd2*isite_cp];
1203  const double *w1
1204  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1205 
1206  m_fw_recv[idir]->wait_thread(itask);
1207 
1208  if (m_arg[itask].kz0 == 1) {
1209  int Nxy = m_Nx * m_Ny;
1210 
1211  int iz = 0;
1212  for (int it = 0; it < m_Mt; ++it) {
1213  for (int ixy = 0; ixy < Nxy; ++ixy) {
1214  int is = ixy + Nxy * (iz + m_Nz * it);
1215  int is2 = ixy + Nxy * it;
1216  int iv = Nvcd * is;
1217  int ix1 = Nvc2 * is2;
1218  int ix2 = ix1 + m_Nvc;
1219 
1220  for (int ic = 0; ic < m_Nc; ++ic) {
1221  int icr = 2 * ic;
1222  int ici = 2 * ic + 1;
1223  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1224  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1225  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1226  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1227  w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1228  w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1229  w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1230  w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1231  }
1232  }
1233  }
1234  }
1235  }
1236 
1237 
1238 //====================================================================
1240  int itask, double *v2, const double *v1)
1241  {
1242  int Nvcd = m_Nvc * m_Nd;
1243 
1244  int id1 = 0;
1245  int id2 = m_Nvc;
1246  int id3 = m_Nvc * 2;
1247  int id4 = m_Nvc * 3;
1248 
1249  int idir = 2;
1250 
1251  double vt1[m_Nvc], vt2[m_Nvc];
1252  double wt1r, wt1i, wt2r, wt2i;
1253 
1254  int isite = m_arg[itask].isite;
1255 
1256  double *w2 = &v2[Nvcd * isite];
1257  const double *w1 = &v1[Nvcd * isite];
1258  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1259 
1260  int kz0 = m_arg[itask].kz0;
1261  int Nxy = m_Nx * m_Ny;
1262 
1263  for (int it = 0; it < m_Mt; ++it) {
1264  for (int iz = kz0; iz < m_Mz; ++iz) {
1265  for (int ixy = 0; ixy < Nxy; ++ixy) {
1266  int is = ixy + Nxy * (iz + m_Nz * it);
1267  int iv = Nvcd * is;
1268  int in = Nvcd * (is - Nxy);
1269  int ig = m_Ndf * (is - Nxy);
1270 
1271  for (int ic = 0; ic < m_Nc; ++ic) {
1272  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1273  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1274  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1275  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1276  }
1277 
1278  for (int ic = 0; ic < m_Nc; ++ic) {
1279  int ic2 = 2 * ic;
1280  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1281  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1282  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1283  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1284 
1285  w2[ic2 + id1 + iv] += wt1r;
1286  w2[ic2 + 1 + id1 + iv] += wt1i;
1287  w2[ic2 + id2 + iv] += wt2r;
1288  w2[ic2 + 1 + id2 + iv] += wt2i;
1289  w2[ic2 + id3 + iv] += -wt1i;
1290  w2[ic2 + 1 + id3 + iv] += wt1r;
1291  w2[ic2 + id4 + iv] += wt2i;
1292  w2[ic2 + 1 + id4 + iv] += -wt2r;
1293  }
1294  }
1295  }
1296  }
1297  }
1298 
1299 
1300 //====================================================================
1302  int itask, double *vcp1, const double *v1)
1303  {
1304  int Nvc2 = 2 * m_Nvc;
1305  int Nvcd = m_Nvc * m_Nd;
1306  int Nvcd2 = Nvcd / 2;
1307 
1308  int id1 = 0;
1309  int id2 = m_Nvc;
1310  int id3 = m_Nvc * 2;
1311  int id4 = m_Nvc * 3;
1312 
1313  int isite = m_arg[itask].isite;
1314  int isite_cp = m_arg[itask].isite_cpt;
1315 
1316  int idir = 3;
1317  double bc2 = m_boundary2[idir];
1318 
1319  // double* w2 = &vcp1[Nvcd2*isite_cp];
1320  double *w2
1321  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1322  const double *w1 = &v1[Nvcd * isite];
1323 
1324  if (m_arg[itask].kt0 == 1) {
1325  int Nxy = m_Nx * m_Ny;
1326  int it = 0;
1327  for (int iz = 0; iz < m_Mz; ++iz) {
1328  for (int ixy = 0; ixy < Nxy; ++ixy) {
1329  int is = ixy + Nxy * (iz + m_Nz * it);
1330  int is2 = ixy + Nxy * iz;
1331 
1332  int in = Nvcd * is;
1333  int ix1 = Nvc2 * is2;
1334  int ix2 = ix1 + m_Nvc;
1335 
1336  for (int ic = 0; ic < m_Nc; ++ic) {
1337  w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1338  w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1339  w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1340  w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1341  }
1342  }
1343  }
1344  }
1345 
1346  m_bw_send[idir]->start_thread(itask);
1347  }
1348 
1349 
1350 //====================================================================
1352  int itask, double *v2, const double *vcp2)
1353  {
1354  int Nvc2 = 2 * m_Nvc;
1355  int Nvcd = m_Nvc * m_Nd;
1356  int Nvcd2 = Nvcd / 2;
1357 
1358  int id1 = 0;
1359  int id2 = m_Nvc;
1360  int id3 = m_Nvc * 2;
1361  int id4 = m_Nvc * 3;
1362 
1363  int idir = 3;
1364 
1365  double wt1r, wt1i, wt2r, wt2i;
1366 
1367  int isite = m_arg[itask].isite;
1368  int isite_cp = m_arg[itask].isite_cpt;
1369 
1370  double *w2 = &v2[Nvcd * isite];
1371  // double* w1 = &vcp2[Nvcd2*isite_cp];
1372  const double *w1
1373  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1374  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1375 
1376  m_bw_recv[idir]->wait_thread(itask);
1377 
1378  if (m_arg[itask].kt1 == 1) {
1379  int Nxy = m_Nx * m_Ny;
1380  int it = m_Mt - 1;
1381  for (int iz = 0; iz < m_Mz; ++iz) {
1382  for (int ixy = 0; ixy < Nxy; ++ixy) {
1383  int is = ixy + Nxy * (iz + m_Nz * it);
1384  int is2 = ixy + Nxy * iz;
1385  int iv = Nvcd * is;
1386  int ig = m_Ndf * is;
1387  int ix1 = Nvc2 * is2;
1388  int ix2 = ix1 + m_Nvc;
1389 
1390  for (int ic = 0; ic < m_Nc; ++ic) {
1391  int ic2 = ic * m_Nvc;
1392 
1393  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1394  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1395  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1396  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1397 
1398  w2[2 * ic + id3 + iv] += wt1r;
1399  w2[2 * ic + 1 + id3 + iv] += wt1i;
1400  w2[2 * ic + id4 + iv] += wt2r;
1401  w2[2 * ic + 1 + id4 + iv] += wt2i;
1402  }
1403  }
1404  }
1405  }
1406  }
1407 
1408 
1409 //====================================================================
1411  int itask, double *v2, const double *v1)
1412  {
1413  int Nvcd = m_Nvc * m_Nd;
1414 
1415  int id1 = 0;
1416  int id2 = m_Nvc;
1417  int id3 = m_Nvc * 2;
1418  int id4 = m_Nvc * 3;
1419 
1420  int idir = 3;
1421 
1422  double vt1[m_Nvc], vt2[m_Nvc];
1423  double wt1r, wt1i, wt2r, wt2i;
1424 
1425  int isite = m_arg[itask].isite;
1426 
1427  double *w2 = &v2[Nvcd * isite];
1428  const double *w1 = &v1[Nvcd * isite];
1429  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1430 
1431  int kt1 = m_arg[itask].kt1;
1432  int Nxy = m_Nx * m_Ny;
1433  int Nxyz = Nxy * m_Nz;
1434 
1435  for (int it = 0; it < m_Mt - kt1; ++it) {
1436  for (int iz = 0; iz < m_Mz; ++iz) {
1437  for (int ixy = 0; ixy < Nxy; ++ixy) {
1438  int is = ixy + Nxy * (iz + m_Nz * it);
1439  int iv = Nvcd * is;
1440  int in = Nvcd * (is + Nxyz);
1441  int ig = m_Ndf * is;
1442 
1443  for (int ic = 0; ic < m_Nc; ++ic) {
1444  vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1445  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1446  vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1447  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1448  }
1449 
1450  for (int ic = 0; ic < m_Nc; ++ic) {
1451  int ic2 = ic * m_Nvc;
1452 
1453  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1454  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1455  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1456  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1457 
1458  w2[2 * ic + id3 + iv] += wt1r;
1459  w2[2 * ic + 1 + id3 + iv] += wt1i;
1460  w2[2 * ic + id4 + iv] += wt2r;
1461  w2[2 * ic + 1 + id4 + iv] += wt2i;
1462  }
1463  }
1464  }
1465  }
1466  }
1467 
1468 
1469 //====================================================================
1471  int itask, double *vcp1, const double *v1)
1472  {
1473  int Nvc2 = 2 * m_Nvc;
1474  int Nvcd = m_Nvc * m_Nd;
1475  int Nvcd2 = Nvcd / 2;
1476 
1477  int id1 = 0;
1478  int id2 = m_Nvc;
1479  int id3 = m_Nvc * 2;
1480  int id4 = m_Nvc * 3;
1481 
1482  int idir = 3;
1483 
1484  int isite = m_arg[itask].isite;
1485  int isite_cp = m_arg[itask].isite_cpt;
1486 
1487  // double* w2 = &vcp1[Nvcd2*isite_cp];
1488  double *w2
1489  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1490  const double *w1 = &v1[Nvcd * isite];
1491  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1492 
1493  double vt1[m_Nvc], vt2[m_Nvc];
1494 
1495  if (m_arg[itask].kt1 == 1) {
1496  int Nxy = m_Nx * m_Ny;
1497  int it = m_Mt - 1;
1498  for (int iz = 0; iz < m_Mz; ++iz) {
1499  for (int ixy = 0; ixy < Nxy; ++ixy) {
1500  int is = ixy + Nxy * (iz + m_Nz * it);
1501  int is2 = ixy + Nxy * iz;
1502  int in = Nvcd * is;
1503  int ig = m_Ndf * is;
1504  int ix1 = Nvc2 * is2;
1505  int ix2 = ix1 + m_Nvc;
1506 
1507  for (int ic = 0; ic < m_Nc; ++ic) {
1508  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1509  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1510  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1511  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1512  }
1513 
1514  for (int ic = 0; ic < m_Nc; ++ic) {
1515  int icr = 2 * ic;
1516  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1517  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1518  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1519  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1520  }
1521  }
1522  }
1523  }
1524 
1525  m_fw_send[idir]->start_thread(itask);
1526  }
1527 
1528 
1529 //====================================================================
1531  int itask, double *v2, const double *vcp2)
1532  {
1533  int Nvc2 = 2 * m_Nvc;
1534  int Nvcd = m_Nvc * m_Nd;
1535  int Nvcd2 = Nvcd / 2;
1536 
1537  int id1 = 0;
1538  int id2 = m_Nvc;
1539  int id3 = m_Nvc * 2;
1540  int id4 = m_Nvc * 3;
1541 
1542  int idir = 3;
1543  double bc2 = m_boundary2[idir];
1544 
1545  double wt1r, wt1i, wt2r, wt2i;
1546 
1547  int isite = m_arg[itask].isite;
1548  int isite_cp = m_arg[itask].isite_cpt;
1549 
1550  double *w2 = &v2[Nvcd * isite];
1551  // double* w1 = &vcp2[Nvcd2*isite_cp];
1552  const double *w1
1553  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1554 
1555  m_fw_recv[idir]->wait_thread(itask);
1556 
1557  if (m_arg[itask].kt0 == 1) {
1558  int Nxy = m_Nx * m_Ny;
1559  int it = 0;
1560  for (int iz = 0; iz < m_Mz; ++iz) {
1561  for (int ixy = 0; ixy < Nxy; ++ixy) {
1562  int is = ixy + Nxy * (iz + m_Nz * it);
1563  int is2 = ixy + Nxy * iz;
1564  int iv = Nvcd * is;
1565  int ix1 = Nvc2 * is2;
1566  int ix2 = ix1 + m_Nvc;
1567 
1568  for (int ic = 0; ic < m_Nc; ++ic) {
1569  int icr = 2 * ic;
1570  int ici = 2 * ic + 1;
1571  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1572  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1573  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1574  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1575  }
1576  }
1577  }
1578  }
1579  }
1580 
1581 
1582 //====================================================================
1584  int itask, double *v2, const double *v1)
1585  {
1586  int Nvcd = m_Nvc * m_Nd;
1587 
1588  int id1 = 0;
1589  int id2 = m_Nvc;
1590  int id3 = m_Nvc * 2;
1591  int id4 = m_Nvc * 3;
1592 
1593  int idir = 3;
1594 
1595  double vt1[m_Nvc], vt2[m_Nvc];
1596  double wt1r, wt1i, wt2r, wt2i;
1597 
1598  int isite = m_arg[itask].isite;
1599 
1600  double *w2 = &v2[Nvcd * isite];
1601  const double *w1 = &v1[Nvcd * isite];
1602  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1603 
1604  int kt0 = m_arg[itask].kt0;
1605  int Nxy = m_Nx * m_Ny;
1606  int Nxyz = Nxy * m_Nz;
1607 
1608  for (int it = kt0; it < m_Mt; ++it) {
1609  for (int iz = 0; iz < m_Mz; ++iz) {
1610  for (int ixy = 0; ixy < Nxy; ++ixy) {
1611  int is = ixy + Nxy * (iz + m_Nz * it);
1612  int iv = Nvcd * is;
1613  int in = Nvcd * (is - Nxyz);
1614  int ig = m_Ndf * (is - Nxyz);
1615 
1616  for (int ic = 0; ic < m_Nc; ++ic) {
1617  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1618  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1619  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1620  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1621  }
1622 
1623  for (int ic = 0; ic < m_Nc; ++ic) {
1624  int ic2 = 2 * ic;
1625  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1626  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1627  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1628  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1629 
1630  w2[ic2 + id1 + iv] += wt1r;
1631  w2[ic2 + 1 + id1 + iv] += wt1i;
1632  w2[ic2 + id2 + iv] += wt2r;
1633  w2[ic2 + 1 + id2 + iv] += wt2i;
1634  }
1635  }
1636  }
1637  }
1638  }
1639 
1640 
1641 //====================================================================
1643  int itask, double *vcp1, const double *v1)
1644  {
1645  int Nvc2 = 2 * m_Nvc;
1646  int Nvcd = m_Nvc * m_Nd;
1647  int Nvcd2 = Nvcd / 2;
1648 
1649  int id1 = 0;
1650  int id2 = m_Nvc;
1651  int id3 = m_Nvc * 2;
1652  int id4 = m_Nvc * 3;
1653 
1654  int isite = m_arg[itask].isite;
1655  int isite_cp = m_arg[itask].isite_cpt;
1656 
1657  int idir = 3;
1658  double bc2 = m_boundary2[idir];
1659 
1660  // double* w2 = &vcp1[Nvcd2*isite_cp];
1661  double *w2
1662  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1663  const double *w1 = &v1[Nvcd * isite];
1664 
1665  if (m_arg[itask].kt0 == 1) {
1666  int Nxy = m_Nx * m_Ny;
1667  int it = 0;
1668  for (int iz = 0; iz < m_Mz; ++iz) {
1669  for (int ixy = 0; ixy < Nxy; ++ixy) {
1670  int is = ixy + Nxy * (iz + m_Nz * it);
1671  int is2 = ixy + Nxy * iz;
1672 
1673  int in = Nvcd * is;
1674  int ix1 = Nvc2 * is2;
1675  int ix2 = ix1 + m_Nvc;
1676 
1677  for (int ic = 0; ic < m_Nc; ++ic) {
1678  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1679  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1680  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1681  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
1682  }
1683  }
1684  }
1685  }
1686 
1687  m_bw_send[idir]->start_thread(itask);
1688  }
1689 
1690 
1691 //====================================================================
1693  int itask, double *v2, const double *vcp2)
1694  {
1695  int Nvc2 = 2 * m_Nvc;
1696  int Nvcd = m_Nvc * m_Nd;
1697  int Nvcd2 = Nvcd / 2;
1698 
1699  int id1 = 0;
1700  int id2 = m_Nvc;
1701  int id3 = m_Nvc * 2;
1702  int id4 = m_Nvc * 3;
1703 
1704  int idir = 3;
1705 
1706  double wt1r, wt1i, wt2r, wt2i;
1707 
1708  int isite = m_arg[itask].isite;
1709  int isite_cp = m_arg[itask].isite_cpt;
1710 
1711  double *w2 = &v2[Nvcd * isite];
1712  // double* w1 = &vcp2[Nvcd2*isite_cp];
1713  const double *w1
1714  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1715  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1716 
1717  m_bw_recv[idir]->wait_thread(itask);
1718 
1719  if (m_arg[itask].kt1 == 1) {
1720  int Nxy = m_Nx * m_Ny;
1721  int it = m_Mt - 1;
1722  for (int iz = 0; iz < m_Mz; ++iz) {
1723  for (int ixy = 0; ixy < Nxy; ++ixy) {
1724  int is = ixy + Nxy * (iz + m_Nz * it);
1725  int is2 = ixy + Nxy * iz;
1726  int iv = Nvcd * is;
1727  int ig = m_Ndf * is;
1728  int ix1 = Nvc2 * is2;
1729  int ix2 = ix1 + m_Nvc;
1730 
1731  for (int ic = 0; ic < m_Nc; ++ic) {
1732  int ic2 = ic * m_Nvc;
1733 
1734  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1735  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1736  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1737  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1738 
1739  w2[2 * ic + id1 + iv] += wt1r;
1740  w2[2 * ic + 1 + id1 + iv] += wt1i;
1741  w2[2 * ic + id2 + iv] += wt2r;
1742  w2[2 * ic + 1 + id2 + iv] += wt2i;
1743  w2[2 * ic + id3 + iv] += wt1r;
1744  w2[2 * ic + 1 + id3 + iv] += wt1i;
1745  w2[2 * ic + id4 + iv] += wt2r;
1746  w2[2 * ic + 1 + id4 + iv] += wt2i;
1747  }
1748  }
1749  }
1750  }
1751  }
1752 
1753 
1754 //====================================================================
1756  int itask, double *v2, const double *v1)
1757  {
1758  int Nvcd = m_Nvc * m_Nd;
1759 
1760  int id1 = 0;
1761  int id2 = m_Nvc;
1762  int id3 = m_Nvc * 2;
1763  int id4 = m_Nvc * 3;
1764 
1765  int idir = 3;
1766 
1767  double vt1[m_Nvc], vt2[m_Nvc];
1768  double wt1r, wt1i, wt2r, wt2i;
1769 
1770  int isite = m_arg[itask].isite;
1771 
1772  double *w2 = &v2[Nvcd * isite];
1773  const double *w1 = &v1[Nvcd * isite];
1774  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1775 
1776  int kt1 = m_arg[itask].kt1;
1777  int Nxy = m_Nx * m_Ny;
1778  int Nxyz = Nxy * m_Nz;
1779 
1780  for (int it = 0; it < m_Mt - kt1; ++it) {
1781  for (int iz = 0; iz < m_Mz; ++iz) {
1782  for (int ixy = 0; ixy < Nxy; ++ixy) {
1783  int is = ixy + Nxy * (iz + m_Nz * it);
1784  int iv = Nvcd * is;
1785  int in = Nvcd * (is + Nxyz);
1786  int ig = m_Ndf * is;
1787 
1788  for (int ic = 0; ic < m_Nc; ++ic) {
1789  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1790  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1791  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1792  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1793  }
1794 
1795  for (int ic = 0; ic < m_Nc; ++ic) {
1796  int ic2 = ic * m_Nvc;
1797 
1798  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1799  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1800  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1801  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1802 
1803  w2[2 * ic + id1 + iv] += wt1r;
1804  w2[2 * ic + 1 + id1 + iv] += wt1i;
1805  w2[2 * ic + id2 + iv] += wt2r;
1806  w2[2 * ic + 1 + id2 + iv] += wt2i;
1807  w2[2 * ic + id3 + iv] += wt1r;
1808  w2[2 * ic + 1 + id3 + iv] += wt1i;
1809  w2[2 * ic + id4 + iv] += wt2r;
1810  w2[2 * ic + 1 + id4 + iv] += wt2i;
1811  }
1812  }
1813  }
1814  }
1815  }
1816 
1817 
1818 //====================================================================
1820  int itask, double *vcp1, const double *v1)
1821  {
1822  int Nvc2 = 2 * m_Nvc;
1823  int Nvcd = m_Nvc * m_Nd;
1824  int Nvcd2 = Nvcd / 2;
1825 
1826  int id1 = 0;
1827  int id2 = m_Nvc;
1828  int id3 = m_Nvc * 2;
1829  int id4 = m_Nvc * 3;
1830 
1831  int idir = 3;
1832 
1833  int isite = m_arg[itask].isite;
1834  int isite_cp = m_arg[itask].isite_cpt;
1835 
1836  // double* w2 = &vcp1[Nvcd2*isite_cp];
1837  double *w2
1838  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1839  const double *w1 = &v1[Nvcd * isite];
1840  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1841 
1842  double vt1[m_Nvc], vt2[m_Nvc];
1843 
1844  if (m_arg[itask].kt1 == 1) {
1845  int Nxy = m_Nx * m_Ny;
1846  int it = m_Mt - 1;
1847  for (int iz = 0; iz < m_Mz; ++iz) {
1848  for (int ixy = 0; ixy < Nxy; ++ixy) {
1849  int is = ixy + Nxy * (iz + m_Nz * it);
1850  int is2 = ixy + Nxy * iz;
1851  int in = Nvcd * is;
1852  int ig = m_Ndf * is;
1853  int ix1 = Nvc2 * is2;
1854  int ix2 = ix1 + m_Nvc;
1855 
1856  for (int ic = 0; ic < m_Nc; ++ic) {
1857  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1858  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1859  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1860  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1861  }
1862 
1863  for (int ic = 0; ic < m_Nc; ++ic) {
1864  int icr = 2 * ic;
1865  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1866  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1867  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1868  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1869  }
1870  }
1871  }
1872  }
1873 
1874  m_fw_send[idir]->start_thread(itask);
1875  }
1876 
1877 
1878 //====================================================================
1880  int itask, double *v2, const double *vcp2)
1881  {
1882  int Nvc2 = 2 * m_Nvc;
1883  int Nvcd = m_Nvc * m_Nd;
1884  int Nvcd2 = Nvcd / 2;
1885 
1886  int id1 = 0;
1887  int id2 = m_Nvc;
1888  int id3 = m_Nvc * 2;
1889  int id4 = m_Nvc * 3;
1890 
1891  int idir = 3;
1892  double bc2 = m_boundary2[idir];
1893 
1894  double wt1r, wt1i, wt2r, wt2i;
1895 
1896  int isite = m_arg[itask].isite;
1897  int isite_cp = m_arg[itask].isite_cpt;
1898 
1899  double *w2 = &v2[Nvcd * isite];
1900  // double* w1 = &vcp2[Nvcd2*isite_cp];
1901  const double *w1
1902  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1903 
1904  m_fw_recv[idir]->wait_thread(itask);
1905 
1906  if (m_arg[itask].kt0 == 1) {
1907  int Nxy = m_Nx * m_Ny;
1908  int it = 0;
1909  for (int iz = 0; iz < m_Mz; ++iz) {
1910  for (int ixy = 0; ixy < Nxy; ++ixy) {
1911  int is = ixy + Nxy * (iz + m_Nz * it);
1912  int is2 = ixy + Nxy * iz;
1913  int iv = Nvcd * is;
1914  int ix1 = Nvc2 * is2;
1915  int ix2 = ix1 + m_Nvc;
1916 
1917  for (int ic = 0; ic < m_Nc; ++ic) {
1918  int icr = 2 * ic;
1919  int ici = 2 * ic + 1;
1920  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1921  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1922  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1923  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1924  w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1925  w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1926  w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1927  w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
1928  }
1929  }
1930  }
1931  }
1932  }
1933 
1934 
1935 //====================================================================
1937  int itask, double *v2, const double *v1)
1938  {
1939  int Nvcd = m_Nvc * m_Nd;
1940 
1941  int id1 = 0;
1942  int id2 = m_Nvc;
1943  int id3 = m_Nvc * 2;
1944  int id4 = m_Nvc * 3;
1945 
1946  int idir = 3;
1947 
1948  double vt1[m_Nvc], vt2[m_Nvc];
1949  double wt1r, wt1i, wt2r, wt2i;
1950 
1951  int isite = m_arg[itask].isite;
1952 
1953  double *w2 = &v2[Nvcd * isite];
1954  const double *w1 = &v1[Nvcd * isite];
1955  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1956 
1957  int kt0 = m_arg[itask].kt0;
1958  int Nxy = m_Nx * m_Ny;
1959  int Nxyz = Nxy * m_Nz;
1960 
1961  for (int it = kt0; it < m_Mt; ++it) {
1962  for (int iz = 0; iz < m_Mz; ++iz) {
1963  for (int ixy = 0; ixy < Nxy; ++ixy) {
1964  int is = ixy + Nxy * (iz + m_Nz * it);
1965  int iv = Nvcd * is;
1966  int in = Nvcd * (is - Nxyz);
1967  int ig = m_Ndf * (is - Nxyz);
1968 
1969  for (int ic = 0; ic < m_Nc; ++ic) {
1970  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1971  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1972  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1973  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1974  }
1975 
1976  for (int ic = 0; ic < m_Nc; ++ic) {
1977  int ic2 = 2 * ic;
1978  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1979  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1980  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1981  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1982 
1983  w2[ic2 + id1 + iv] += wt1r;
1984  w2[ic2 + 1 + id1 + iv] += wt1i;
1985  w2[ic2 + id2 + iv] += wt2r;
1986  w2[ic2 + 1 + id2 + iv] += wt2i;
1987  w2[ic2 + id3 + iv] -= wt1r;
1988  w2[ic2 + 1 + id3 + iv] -= wt1i;
1989  w2[ic2 + id4 + iv] -= wt2r;
1990  w2[ic2 + 1 + id4 + iv] -= wt2i;
1991  }
1992  }
1993  }
1994  }
1995  }
1996 
1997 
1998 //====================================================================
2000  int itask, double *v2, const double *v1)
2001  {
2002  int Nvcd = m_Nvc * m_Nd;
2003  int Nxy = m_Nx * m_Ny;
2004 
2005  int id1 = 0;
2006  int id2 = m_Nvc;
2007  int id3 = m_Nvc * 2;
2008  int id4 = m_Nvc * 3;
2009 
2010  int isite = m_arg[itask].isite;
2011  double *w2 = &v2[Nvcd * isite];
2012  const double *w1 = &v1[Nvcd * isite];
2013 
2014  for (int it = 0; it < m_Mt; ++it) {
2015  for (int iz = 0; iz < m_Mz; ++iz) {
2016  for (int ixy = 0; ixy < Nxy; ++ixy) {
2017  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2018  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2019  w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2020  w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2021  w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2022  w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
2023  }
2024  }
2025  }
2026  }
2027  }
2028 
2029 
2030 //====================================================================
2032  int itask, double *v2, const double *v1)
2033  {
2034  int Nvcd = m_Nvc * m_Nd;
2035  int Nxy = m_Nx * m_Ny;
2036 
2037  int id1 = 0;
2038  int id2 = m_Nvc;
2039  int id3 = m_Nvc * 2;
2040  int id4 = m_Nvc * 3;
2041 
2042  int isite = m_arg[itask].isite;
2043  double *w2 = &v2[Nvcd * isite];
2044  const double *w1 = &v1[Nvcd * isite];
2045 
2046  for (int it = 0; it < m_Mt; ++it) {
2047  for (int iz = 0; iz < m_Mz; ++iz) {
2048  for (int ixy = 0; ixy < Nxy; ++ixy) {
2049  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2050  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2051  w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2052  w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2053  w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2054  w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
2055  }
2056  }
2057  }
2058  }
2059  }
2060 
2061 
2062 //====================================================================
2063 }
2064 //============================================================END=====
void clear_thread(int, double *)
BridgeIO vout
Definition: bridgeIO.cpp:495
const double * ptr(const int jin, const int site, const int jex) const
Definition: field.h:142
void mult_ym1_thread(int, double *, const double *)
void mult_yp2_thread(int, double *, const double *)
void mult_xm1_thread(int, double *, const double *)
std::vector< Channel * > m_bw_recv
void mult_zm1_thread(int, double *, const double *)
static const std::string class_name
std::vector< Channel * > m_bw_send
void general(const char *format,...)
Definition: bridgeIO.cpp:195
void gm5_dirac_thread(int, double *, const double *)
std::vector< mult_arg > m_arg
void mult_tp2_chiral_thread(int, double *, const double *)
void mult_tp2_dirac_thread(int, double *, const double *)
void mult_ymb_thread(int, double *, const double *)
void mult_ypb_thread(int, double *, const double *)
void mult_zm2_thread(int, double *, const double *)
void mult_tmb_dirac_thread(int, double *, const double *)
void mult_tp1_dirac_thread(int, double *, const double *)
void gm5_chiral_thread(int, double *, const double *)
void mult_tmb_chiral_thread(int, double *, const double *)
std::vector< double > m_boundary2
b.c. for each node.
void mult_yp1_thread(int, double *, const double *)
Bridge::VerboseLevel m_vl
void mult_xp1_thread(int, double *, const double *)
void mult_zp1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_recv
void mult_tm1_dirac_thread(int, double *, const double *)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
const Field_G * m_U
gauge configuration.
void mult_ym2_thread(int, double *, const double *)
void crucial(const char *format,...)
Definition: bridgeIO.cpp:178
void mult_tp1_chiral_thread(int, double *, const double *)
void mult_xpb_thread(int, double *, const double *)
void daypx_thread(int, double *, double, const double *)
void mult_tpb_chiral_thread(int, double *, const double *)
std::vector< Channel * > m_fw_send
void mult_xp2_thread(int, double *, const double *)
void mult_zpb_thread(int, double *, const double *)
void mult_zp2_thread(int, double *, const double *)
void mult_tm2_dirac_thread(int, double *, const double *)
void mult_xmb_thread(int, double *, const double *)
void mult_tm2_chiral_thread(int, double *, const double *)
void mult_tm1_chiral_thread(int, double *, const double *)
void mult_tpb_dirac_thread(int, double *, const double *)
void mult_xm2_thread(int, double *, const double *)
void mult_zmb_thread(int, double *, const double *)