Bridge++  Version 1.4.4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
fopr_WilsonGeneral_impl_thread.cpp
Go to the documentation of this file.
1 
15 
17 
18 namespace Imp_BGQ {
19 #if defined USE_GROUP_SU3
20 #include "fopr_Wilson_impl_SU3.inc"
21 #elif defined USE_GROUP_SU2
22 #include "fopr_Wilson_impl_SU2.inc"
23 #elif defined USE_GROUP_SU_N
24 #include "fopr_Wilson_impl_SU_N.inc"
25 #endif
26 
27 //====================================================================
29  {
31 
32  // The following setup corresponds to uniform division of volume.
33  if (m_Nthread <= m_Nt) {
35  } else if (m_Nthread <= m_Nz * m_Nt) {
36  m_Ntask_t = m_Nt;
37  } else {
38  vout.crucial(m_vl, "Error at %s: Too large Nthread: %d\n",
39  class_name.c_str(), m_Nthread);
40  exit(EXIT_FAILURE);
41  }
43  if (m_Ntask_z * m_Ntask_t != m_Nthread) {
44  vout.crucial(m_vl, "Error at %s: Nz(%d) * Nt(%d) \neq Nthread(%d)\n",
45  class_name.c_str(), m_Nz, m_Nt, m_Nthread);
46  exit(EXIT_FAILURE);
47  }
49  m_Mz = m_Nz / m_Ntask_z;
50  m_Mt = m_Nt / m_Ntask_t;
51 
52  // The following setup is not monotonic division, and requires
53  // barrier at the beginning and end of mult (D and gamma5).
54  // [H.Matsufuru 22 Oct 2013]
55  // if(m_Nthread >= 64){
56  // m_Ntask_z = 8;
57  // }else if(m_Nthread >= 16){
58  // m_Ntask_z = 4;
59  // }else if(m_Nthread >= 4){
60  // m_Ntask_z = 2;
61  // }else{
62  // m_Ntask_z = 1;
63  // }
64  // m_Ntask_t = m_Nthread/m_Ntask_z;
65  // m_Ntask = m_Ntask_t * m_Ntask_z;
66  // m_Mz = m_Nz/m_Ntask_z;
67  // m_Mt = m_Nt/m_Ntask_t;
68 
69  vout.general(m_vl, " Nthread = %d\n", m_Nthread);
70  vout.general(m_vl, " Ntask = %d\n", m_Ntask);
71  vout.general(m_vl, " Ntask_z = %d Ntask_t = %d\n", m_Ntask_z, m_Ntask_t);
72  vout.general(m_vl, " Mz = %d Mt = %d\n", m_Mz, m_Mt);
73 
74  // setup of arguments
75  int Nxy = m_Nx * m_Ny;
76  m_arg.resize(m_Ntask);
77  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
78  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
79  int itask = ith_z + m_Ntask_z * ith_t;
80 
81  m_arg[itask].isite = (ith_z * m_Mz + ith_t * (m_Nz * m_Mt)) * Nxy;
82 
83  m_arg[itask].kt0 = 0;
84  m_arg[itask].kt1 = 0;
85  m_arg[itask].kz0 = 0;
86  m_arg[itask].kz1 = 0;
87  if (ith_t == 0) m_arg[itask].kt0 = 1;
88  if (ith_z == 0) m_arg[itask].kz0 = 1;
89  if (ith_t == m_Ntask_t - 1) m_arg[itask].kt1 = 1;
90  if (ith_z == m_Ntask_z - 1) m_arg[itask].kz1 = 1;
91 
92  m_arg[itask].isite_cpx = itask * m_Mz * m_Mt * m_Ny;
93  m_arg[itask].isite_cpy = itask * m_Mz * m_Mt * m_Nx;
94  m_arg[itask].isite_cpz = ith_t * m_Mt * Nxy;
95  m_arg[itask].isite_cpt = ith_z * m_Mz * Nxy;
96  }
97  }
98 
99  // setup for async data transfer
100  int Nc = CommonParameters::Nc();
101  int Nd = CommonParameters::Nd();
102  int Nvcd2 = 2 * Nc * Nd / 2;
103 
104  std::vector<int> destid(m_Ntask);
105  std::vector<int> offset(m_Ntask);
106  std::vector<int> datasize(m_Ntask);
107  std::vector<int> offset_up(m_Ntask);
108  std::vector<int> offset_lw(m_Ntask);
109  std::vector<int> datasize_up(m_Ntask);
110  std::vector<int> datasize_lw(m_Ntask);
111 
112  int imu = 0;
113  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
114  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
115  int itask = ith_z + ith_t * m_Ntask_z;
116  int isite_cp = itask * m_Mz * m_Mt * m_Ny;
117  destid[itask] = itask;
118  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
119  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Ny;
120  }
121  }
122  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
123  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
124  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
125  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
126 
127  imu = 1;
128  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
129  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
130  int itask = ith_z + ith_t * m_Ntask_z;
131  int isite_cp = itask * m_Mz * m_Mt * m_Nx;
132  destid[itask] = itask;
133  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
134  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Nx;
135  }
136  }
137  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
138  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
139  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
140  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
141 
142  imu = 2;
143  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
144  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
145  int itask = ith_z + m_Ntask_z * ith_t;
146  destid[itask] = -1;
147  offset_up[itask] = 0;
148  offset_lw[itask] = 0;
149  datasize_up[itask] = 0;
150  datasize_lw[itask] = 0;
151  if (ith_z == 0) {
152  destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
153  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
154  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
155  }
156  if (ith_z == m_Ntask_z - 1) {
157  destid[itask] = ith_t * m_Ntask_z;
158  offset_up[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
159  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
160  }
161  }
162  }
163  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
164  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
165  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
166  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
167 
168  imu = 3;
169  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
170  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
171  int itask = ith_z + m_Ntask_z * ith_t;
172  destid[itask] = -1;
173  offset_up[itask] = 0;
174  offset_lw[itask] = 0;
175  datasize_up[itask] = 0;
176  datasize_lw[itask] = 0;
177  if (ith_t == 0) {
178  destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
179  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
180  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
181  }
182  if (ith_t == m_Ntask_t - 1) {
183  destid[itask] = ith_z;
184  offset_up[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
185  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
186  }
187  }
188  }
189  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
190  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
191  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
192  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
193  }
194 
195 
196 //====================================================================
198  int itask, double *v2, double fac, const double *v1)
199  {
200  int Nvcd = m_Nvc * m_Nd;
201  int Nvxy = Nvcd * m_Nx * m_Ny;
202 
203  int isite = m_arg[itask].isite;
204 
205  const double *w1 = &v1[Nvcd * isite];
206  double *w2 = &v2[Nvcd * isite];
207 
208  for (int it = 0; it < m_Mt; ++it) {
209  for (int iz = 0; iz < m_Mz; ++iz) {
210  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
211  int iv = ivxy + Nvxy * (iz + m_Nz * it);
212  w2[iv] = fac * w2[iv] + w1[iv];
213  }
214  }
215  }
216  }
217 
218 
219 //====================================================================
221  double *v2)
222  {
223  int Nvcd = m_Nvc * m_Nd;
224  int Nvxy = Nvcd * m_Nx * m_Ny;
225 
226  int isite = m_arg[itask].isite;
227  double *w2 = &v2[Nvcd * isite];
228 
229  for (int it = 0; it < m_Mt; ++it) {
230  for (int iz = 0; iz < m_Mz; ++iz) {
231  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
232  int iv = ivxy + Nvxy * (iz + m_Nz * it);
233  w2[iv] = 0.0;
234  }
235  }
236  }
237  }
238 
239 
240 //====================================================================
242  int itask, double *vcp1, const double *v1)
243  {
244  int Nvc2 = 2 * m_Nvc;
245  int Nvcd = m_Nvc * m_Nd;
246  int Nvcd2 = Nvcd / 2;
247 
248  int id1 = 0;
249  int id2 = m_Nvc;
250  int id3 = m_Nvc * 2;
251  int id4 = m_Nvc * 3;
252 
253  int idir = 0;
254  double bc2 = m_boundary2[idir];
255 
256  int isite = m_arg[itask].isite;
257  int isite_cp = m_arg[itask].isite_cpx;
258 
259  // double* w2 = &vcp1[Nvcd2*isite_cp];
260  const double *w1 = &v1[Nvcd * isite];
261  double *w2
262  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
263 
264  int ix = 0;
265 
266  for (int it = 0; it < m_Mt; ++it) {
267  for (int iz = 0; iz < m_Mz; ++iz) {
268  for (int iy = 0; iy < m_Ny; ++iy) {
269  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
270  int is2 = iy + m_Ny * (iz + m_Mz * it);
271  int in = Nvcd * is;
272  int ix1 = Nvc2 * is2;
273  int ix2 = ix1 + m_Nvc;
274 
275  for (int ic = 0; ic < m_Nc; ++ic) {
276  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
277  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
278  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
279  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
280  }
281  }
282  }
283  }
284 
285  m_bw_send[idir]->start_thread(itask);
286  }
287 
288 
289 //====================================================================
291  int itask, double *v2, const double *vcp2)
292  {
293  int Nvc2 = 2 * m_Nvc;
294  int Nvcd = m_Nvc * m_Nd;
295  int Nvcd2 = Nvcd / 2;
296 
297  int id1 = 0;
298  int id2 = m_Nvc;
299  int id3 = m_Nvc * 2;
300  int id4 = m_Nvc * 3;
301 
302  int idir = 0;
303 
304  double wt1r, wt1i, wt2r, wt2i;
305 
306  int isite = m_arg[itask].isite;
307  int isite_cp = m_arg[itask].isite_cpx;
308 
309  double *w2 = &v2[Nvcd * isite];
310  // double* w1 = &vcp2[Nvcd2*isite_cp];
311  const double *w1
312  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
313  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
314 
315  m_bw_recv[idir]->wait_thread(itask);
316 
317  int ix = m_Nx - 1;
318  for (int it = 0; it < m_Mt; ++it) {
319  for (int iz = 0; iz < m_Mz; ++iz) {
320  for (int iy = 0; iy < m_Ny; ++iy) {
321  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
322  int is2 = iy + m_Ny * (iz + m_Mz * it);
323  int iv = Nvcd * is;
324  int ig = m_Ndf * is;
325  int ix1 = Nvc2 * is2;
326  int ix2 = ix1 + m_Nvc;
327 
328  for (int ic = 0; ic < m_Nc; ++ic) {
329  int ic2 = ic * m_Nvc;
330 
331  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
332  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
333  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
334  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
335 
336  w2[2 * ic + id1 + iv] += wt1r;
337  w2[2 * ic + 1 + id1 + iv] += wt1i;
338  w2[2 * ic + id2 + iv] += wt2r;
339  w2[2 * ic + 1 + id2 + iv] += wt2i;
340  w2[2 * ic + id3 + iv] += wt2i;
341  w2[2 * ic + 1 + id3 + iv] += -wt2r;
342  w2[2 * ic + id4 + iv] += wt1i;
343  w2[2 * ic + 1 + id4 + iv] += -wt1r;
344  }
345  }
346  }
347  }
348  }
349 
350 
351 //====================================================================
353  int itask, double *v2, const double *v1)
354  {
355  int Nvcd = m_Nvc * m_Nd;
356 
357  int id1 = 0;
358  int id2 = m_Nvc;
359  int id3 = m_Nvc * 2;
360  int id4 = m_Nvc * 3;
361 
362  int idir = 0;
363 
364  double vt1[m_Nvc], vt2[m_Nvc];
365  double wt1r, wt1i, wt2r, wt2i;
366 
367  int isite = m_arg[itask].isite;
368 
369  const double *w1 = &v1[Nvcd * isite];
370  double *w2 = &v2[Nvcd * isite];
371  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
372 
373  for (int it = 0; it < m_Mt; ++it) {
374  for (int iz = 0; iz < m_Mz; ++iz) {
375  for (int iy = 0; iy < m_Ny; ++iy) {
376  for (int ix = 0; ix < m_Nx - 1; ++ix) {
377  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
378  int iv = Nvcd * is;
379  int in = Nvcd * (is + 1);
380  int ig = m_Ndf * is;
381 
382  for (int ic = 0; ic < m_Nc; ++ic) {
383  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
384  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
385  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
386  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
387  }
388 
389  for (int ic = 0; ic < m_Nc; ++ic) {
390  int ic2 = ic * m_Nvc;
391 
392  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
393  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
394  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
395  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
396 
397  w2[2 * ic + id1 + iv] += wt1r;
398  w2[2 * ic + 1 + id1 + iv] += wt1i;
399  w2[2 * ic + id2 + iv] += wt2r;
400  w2[2 * ic + 1 + id2 + iv] += wt2i;
401  w2[2 * ic + id3 + iv] += wt2i;
402  w2[2 * ic + 1 + id3 + iv] += -wt2r;
403  w2[2 * ic + id4 + iv] += wt1i;
404  w2[2 * ic + 1 + id4 + iv] += -wt1r;
405  }
406  }
407  }
408  }
409  }
410  }
411 
412 
413 //====================================================================
415  int itask, double *vcp1, const double *v1)
416  {
417  int Nvc2 = 2 * m_Nvc;
418  int Nvcd = m_Nvc * m_Nd;
419  int Nvcd2 = Nvcd / 2;
420 
421  int id1 = 0;
422  int id2 = m_Nvc;
423  int id3 = m_Nvc * 2;
424  int id4 = m_Nvc * 3;
425 
426  int idir = 0;
427 
428  int isite = m_arg[itask].isite;
429  int isite_cp = m_arg[itask].isite_cpx;
430 
431  const double *w1 = &v1[Nvcd * isite];
432  // double* w2 = &vcp1[Nvcd2*isite_cp];
433  double *w2
434  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
435  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
436 
437  double vt1[m_Nvc], vt2[m_Nvc];
438 
439  int ix = m_Nx - 1;
440 
441  for (int it = 0; it < m_Mt; ++it) {
442  for (int iz = 0; iz < m_Mz; ++iz) {
443  for (int iy = 0; iy < m_Ny; ++iy) {
444  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
445  int is2 = iy + m_Ny * (iz + m_Mz * it);
446  int in = Nvcd * is;
447  int ig = m_Ndf * is;
448  int ix1 = Nvc2 * is2;
449  int ix2 = ix1 + m_Nvc;
450 
451  for (int ic = 0; ic < m_Nc; ++ic) {
452  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
453  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
454  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
455  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
456  }
457 
458  for (int ic = 0; ic < m_Nc; ++ic) {
459  int icr = 2 * ic;
460  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
461  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
462  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
463  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
464  }
465  }
466  }
467  }
468 
469  m_fw_send[idir]->start_thread(itask);
470  }
471 
472 
473 //====================================================================
475  int itask, double *v2, const double *vcp2)
476  {
477  int Nvc2 = 2 * m_Nvc;
478  int Nvcd = m_Nvc * m_Nd;
479  int Nvcd2 = Nvcd / 2;
480 
481  int id1 = 0;
482  int id2 = m_Nvc;
483  int id3 = m_Nvc * 2;
484  int id4 = m_Nvc * 3;
485 
486  int idir = 0;
487  double bc2 = m_boundary2[idir];
488 
489  double wt1r, wt1i, wt2r, wt2i;
490 
491  int isite = m_arg[itask].isite;
492  int isite_cp = m_arg[itask].isite_cpx;
493 
494  double *w2 = &v2[Nvcd * isite];
495  // double* w1 = &vcp2[Nvcd2*isite_cp];
496  const double *w1
497  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
498 
499  m_fw_recv[idir]->wait_thread(itask);
500 
501  int ix = 0;
502  for (int it = 0; it < m_Mt; ++it) {
503  for (int iz = 0; iz < m_Mz; ++iz) {
504  for (int iy = 0; iy < m_Ny; ++iy) {
505  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
506  int is2 = iy + m_Ny * (iz + m_Mz * it);
507  int iv = Nvcd * is;
508  int ix1 = Nvc2 * is2;
509  int ix2 = ix1 + m_Nvc;
510 
511  for (int ic = 0; ic < m_Nc; ++ic) {
512  int icr = 2 * ic;
513  int ici = 2 * ic + 1;
514  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
515  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
516  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
517  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
518  w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
519  w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
520  w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
521  w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
522  }
523  }
524  }
525  }
526  }
527 
528 
529 //====================================================================
531  int itask, double *v2, const double *v1)
532  {
533  int Nvcd = m_Nvc * m_Nd;
534 
535  int id1 = 0;
536  int id2 = m_Nvc;
537  int id3 = m_Nvc * 2;
538  int id4 = m_Nvc * 3;
539 
540  int idir = 0;
541 
542  double vt1[m_Nvc], vt2[m_Nvc];
543  double wt1r, wt1i, wt2r, wt2i;
544 
545  int isite = m_arg[itask].isite;
546 
547  const double *w1 = &v1[Nvcd * isite];
548  double *w2 = &v2[Nvcd * isite];
549  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
550 
551  for (int it = 0; it < m_Mt; ++it) {
552  for (int iz = 0; iz < m_Mz; ++iz) {
553  for (int iy = 0; iy < m_Ny; ++iy) {
554  for (int ix = 1; ix < m_Nx; ++ix) {
555  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
556  int iv = Nvcd * is;
557  int in = Nvcd * (is - 1);
558  int ig = m_Ndf * (is - 1);
559 
560  for (int ic = 0; ic < m_Nc; ++ic) {
561  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
562  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
563  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
564  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
565  }
566 
567  for (int ic = 0; ic < m_Nc; ++ic) {
568  int ic2 = 2 * ic;
569 
570  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
571  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
572  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
573  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
574 
575  w2[2 * ic + id1 + iv] += wt1r;
576  w2[2 * ic + 1 + id1 + iv] += wt1i;
577  w2[2 * ic + id2 + iv] += wt2r;
578  w2[2 * ic + 1 + id2 + iv] += wt2i;
579  w2[2 * ic + id3 + iv] += -wt2i;
580  w2[2 * ic + 1 + id3 + iv] += +wt2r;
581  w2[2 * ic + id4 + iv] += -wt1i;
582  w2[2 * ic + 1 + id4 + iv] += +wt1r;
583  }
584  }
585  }
586  }
587  }
588  }
589 
590 
591 //====================================================================
593  int itask, double *vcp1, const double *v1)
594  {
595  int Nvc2 = 2 * m_Nvc;
596  int Nvcd = m_Nvc * m_Nd;
597  int Nvcd2 = Nvcd / 2;
598 
599  int id1 = 0;
600  int id2 = m_Nvc;
601  int id3 = m_Nvc * 2;
602  int id4 = m_Nvc * 3;
603 
604  int isite = m_arg[itask].isite;
605  int isite_cp = m_arg[itask].isite_cpy;
606 
607  int idir = 1;
608  double bc2 = m_boundary2[idir];
609 
610  const double *w1 = &v1[Nvcd * isite];
611  // double* w2 = &vcp1[Nvcd2*isite_cp];
612  double *w2
613  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
614 
615 
616  int iy = 0;
617 
618  for (int it = 0; it < m_Mt; ++it) {
619  for (int iz = 0; iz < m_Mz; ++iz) {
620  for (int ix = 0; ix < m_Nx; ++ix) {
621  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
622  int is2 = ix + m_Nx * (iz + m_Mz * it);
623  int in = Nvcd * is;
624  int ix1 = Nvc2 * is2;
625  int ix2 = ix1 + m_Nvc;
626 
627  for (int ic = 0; ic < m_Nc; ++ic) {
628  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
629  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
630  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
631  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
632  }
633  }
634  }
635  }
636 
637  m_bw_send[idir]->start_thread(itask);
638  }
639 
640 
641 //====================================================================
643  int itask, double *v2, const double *vcp2)
644  {
645  int Nvc2 = 2 * m_Nvc;
646  int Nvcd = m_Nvc * m_Nd;
647  int Nvcd2 = Nvcd / 2;
648 
649  int id1 = 0;
650  int id2 = m_Nvc;
651  int id3 = m_Nvc * 2;
652  int id4 = m_Nvc * 3;
653 
654  int idir = 1;
655 
656  double wt1r, wt1i, wt2r, wt2i;
657 
658  int isite = m_arg[itask].isite;
659  int isite_cp = m_arg[itask].isite_cpy;
660 
661  double *w2 = &v2[Nvcd * isite];
662  // double* w1 = &vcp2[Nvcd2*isite_cp];
663  const double *w1
664  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
665  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
666 
667  m_bw_recv[idir]->wait_thread(itask);
668 
669  int iy = m_Ny - 1;
670  for (int it = 0; it < m_Mt; ++it) {
671  for (int iz = 0; iz < m_Mz; ++iz) {
672  for (int ix = 0; ix < m_Nx; ++ix) {
673  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
674  int is2 = ix + m_Nx * (iz + m_Mz * it);
675  int iv = Nvcd * is;
676  int ig = m_Ndf * is;
677  int ix1 = Nvc2 * is2;
678  int ix2 = ix1 + m_Nvc;
679 
680  for (int ic = 0; ic < m_Nc; ++ic) {
681  int ic2 = ic * m_Nvc;
682 
683  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
684  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
685  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
686  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
687 
688  w2[2 * ic + id1 + iv] += wt1r;
689  w2[2 * ic + 1 + id1 + iv] += wt1i;
690  w2[2 * ic + id2 + iv] += wt2r;
691  w2[2 * ic + 1 + id2 + iv] += wt2i;
692  w2[2 * ic + id3 + iv] += -wt2r;
693  w2[2 * ic + 1 + id3 + iv] += -wt2i;
694  w2[2 * ic + id4 + iv] += wt1r;
695  w2[2 * ic + 1 + id4 + iv] += wt1i;
696  }
697  }
698  }
699  }
700  }
701 
702 
703 //====================================================================
705  int itask, double *v2, const double *v1)
706  {
707  int Nvcd = m_Nvc * m_Nd;
708 
709  int id1 = 0;
710  int id2 = m_Nvc;
711  int id3 = m_Nvc * 2;
712  int id4 = m_Nvc * 3;
713 
714  int idir = 1;
715 
716  double vt1[m_Nvc], vt2[m_Nvc];
717  double wt1r, wt1i, wt2r, wt2i;
718 
719  int isite = m_arg[itask].isite;
720 
721  double *w2 = &v2[Nvcd * isite];
722  const double *w1 = &v1[Nvcd * isite];
723  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
724 
725  for (int it = 0; it < m_Mt; ++it) {
726  for (int iz = 0; iz < m_Mz; ++iz) {
727  for (int iy = 0; iy < m_Ny - 1; ++iy) {
728  for (int ix = 0; ix < m_Nx; ++ix) {
729  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
730  int iv = Nvcd * is;
731  int in = Nvcd * (is + m_Nx);
732  int ig = m_Ndf * is;
733 
734  for (int ic = 0; ic < m_Nc; ++ic) {
735  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
736  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
737  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
738  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
739  }
740 
741  for (int ic = 0; ic < m_Nc; ++ic) {
742  int ic2 = ic * m_Nvc;
743 
744  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
745  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
746  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
747  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
748 
749  w2[2 * ic + id1 + iv] += wt1r;
750  w2[2 * ic + 1 + id1 + iv] += wt1i;
751  w2[2 * ic + id2 + iv] += wt2r;
752  w2[2 * ic + 1 + id2 + iv] += wt2i;
753  w2[2 * ic + id3 + iv] += -wt2r;
754  w2[2 * ic + 1 + id3 + iv] += -wt2i;
755  w2[2 * ic + id4 + iv] += wt1r;
756  w2[2 * ic + 1 + id4 + iv] += wt1i;
757  }
758  }
759  }
760  }
761  }
762  }
763 
764 
765 //====================================================================
767  int itask, double *vcp1, const double *v1)
768  {
769  int Nvc2 = 2 * m_Nvc;
770  int Nvcd = m_Nvc * m_Nd;
771  int Nvcd2 = Nvcd / 2;
772 
773  int id1 = 0;
774  int id2 = m_Nvc;
775  int id3 = m_Nvc * 2;
776  int id4 = m_Nvc * 3;
777 
778  int idir = 1;
779 
780  int isite = m_arg[itask].isite;
781  int isite_cp = m_arg[itask].isite_cpy;
782 
783  // double* w2 = &vcp1[Nvcd2*isite_cp];
784  double *w2
785  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
786 
787  const double *w1 = &v1[Nvcd * isite];
788  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
789 
790  double vt1[m_Nvc], vt2[m_Nvc];
791 
792  int iy = m_Ny - 1;
793 
794  for (int it = 0; it < m_Mt; ++it) {
795  for (int iz = 0; iz < m_Mz; ++iz) {
796  for (int ix = 0; ix < m_Nx; ++ix) {
797  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
798  int is2 = ix + m_Nx * (iz + m_Mz * it);
799  int in = Nvcd * is;
800  int ig = m_Ndf * is;
801  int ix1 = Nvc2 * is2;
802  int ix2 = ix1 + m_Nvc;
803 
804  for (int ic = 0; ic < m_Nc; ++ic) {
805  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
806  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
807  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
808  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
809  }
810 
811  for (int ic = 0; ic < m_Nc; ++ic) {
812  int icr = 2 * ic;
813  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
814  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
815  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
816  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
817  }
818  }
819  }
820  }
821 
822  m_fw_send[idir]->start_thread(itask);
823  }
824 
825 
826 //====================================================================
828  int itask, double *v2, const double *vcp2)
829  {
830  int Nvc2 = 2 * m_Nvc;
831  int Nvcd = m_Nvc * m_Nd;
832  int Nvcd2 = Nvcd / 2;
833 
834  int id1 = 0;
835  int id2 = m_Nvc;
836  int id3 = m_Nvc * 2;
837  int id4 = m_Nvc * 3;
838 
839  int idir = 1;
840  double bc2 = m_boundary2[idir];
841 
842  double wt1r, wt1i, wt2r, wt2i;
843 
844  int isite = m_arg[itask].isite;
845  int isite_cp = m_arg[itask].isite_cpy;
846 
847  double *w2 = &v2[Nvcd * isite];
848  // double* w1 = &vcp2[Nvcd2*isite_cp];
849  const double *w1
850  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
851 
852  m_fw_recv[idir]->wait_thread(itask);
853 
854  int iy = 0;
855  for (int it = 0; it < m_Mt; ++it) {
856  for (int iz = 0; iz < m_Mz; ++iz) {
857  for (int ix = 0; ix < m_Nx; ++ix) {
858  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
859  int is2 = ix + m_Nx * (iz + m_Mz * it);
860  int iv = Nvcd * is;
861  int ix1 = Nvc2 * is2;
862  int ix2 = ix1 + m_Nvc;
863 
864  for (int ic = 0; ic < m_Nc; ++ic) {
865  int icr = 2 * ic;
866  int ici = 2 * ic + 1;
867  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
868  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
869  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
870  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
871  w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
872  w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
873  w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
874  w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
875  }
876  }
877  }
878  }
879  }
880 
881 
882 //====================================================================
884  int itask, double *v2, const double *v1)
885  {
886  int Nvcd = m_Nvc * m_Nd;
887 
888  int id1 = 0;
889  int id2 = m_Nvc;
890  int id3 = m_Nvc * 2;
891  int id4 = m_Nvc * 3;
892 
893  int idir = 1;
894 
895  double vt1[m_Nvc], vt2[m_Nvc];
896  double wt1r, wt1i, wt2r, wt2i;
897 
898  int isite = m_arg[itask].isite;
899 
900  double *w2 = &v2[Nvcd * isite];
901  const double *w1 = &v1[Nvcd * isite];
902  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
903 
904  for (int it = 0; it < m_Mt; ++it) {
905  for (int iz = 0; iz < m_Mz; ++iz) {
906  for (int iy = 1; iy < m_Ny; ++iy) {
907  for (int ix = 0; ix < m_Nx; ++ix) {
908  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
909  int iv = Nvcd * is;
910  int in = Nvcd * (is - m_Nx);
911  int ig = m_Ndf * (is - m_Nx);
912 
913  for (int ic = 0; ic < m_Nc; ++ic) {
914  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
915  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
916  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
917  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
918  }
919 
920  for (int ic = 0; ic < m_Nc; ++ic) {
921  int ic2 = 2 * ic;
922  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
923  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
924  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
925  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
926 
927  w2[ic2 + id1 + iv] += wt1r;
928  w2[ic2 + 1 + id1 + iv] += wt1i;
929  w2[ic2 + id2 + iv] += wt2r;
930  w2[ic2 + 1 + id2 + iv] += wt2i;
931  w2[ic2 + id3 + iv] += wt2r;
932  w2[ic2 + 1 + id3 + iv] += wt2i;
933  w2[ic2 + id4 + iv] += -wt1r;
934  w2[ic2 + 1 + id4 + iv] += -wt1i;
935  }
936  }
937  }
938  }
939  }
940  }
941 
942 
943 //====================================================================
945  int itask, double *vcp1, const double *v1)
946  {
947  int Nvc2 = 2 * m_Nvc;
948  int Nvcd = m_Nvc * m_Nd;
949  int Nvcd2 = Nvcd / 2;
950 
951  int id1 = 0;
952  int id2 = m_Nvc;
953  int id3 = m_Nvc * 2;
954  int id4 = m_Nvc * 3;
955 
956  int isite = m_arg[itask].isite;
957  int isite_cp = m_arg[itask].isite_cpz;
958 
959  int idir = 2;
960  double bc2 = m_boundary2[idir];
961 
962  // double* w2 = &vcp1[Nvcd2*isite_cp];
963  double *w2
964  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
965  const double *w1 = &v1[Nvcd * isite];
966 
967  if (m_arg[itask].kz0 == 1) {
968  int Nxy = m_Nx * m_Ny;
969  int iz = 0;
970  for (int it = 0; it < m_Mt; ++it) {
971  for (int ixy = 0; ixy < Nxy; ++ixy) {
972  int is = ixy + Nxy * (iz + m_Nz * it);
973  int is2 = ixy + Nxy * it;
974 
975  int in = Nvcd * is;
976  int ix1 = Nvc2 * is2;
977  int ix2 = ix1 + m_Nvc;
978 
979  for (int ic = 0; ic < m_Nc; ++ic) {
980  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
981  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
982  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
983  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
984  }
985  }
986  }
987  }
988 
989  m_bw_send[idir]->start_thread(itask);
990  }
991 
992 
993 //====================================================================
995  int itask, double *v2, const double *vcp2)
996  {
997  int Nvc2 = 2 * m_Nvc;
998  int Nvcd = m_Nvc * m_Nd;
999  int Nvcd2 = Nvcd / 2;
1000 
1001  int id1 = 0;
1002  int id2 = m_Nvc;
1003  int id3 = m_Nvc * 2;
1004  int id4 = m_Nvc * 3;
1005 
1006  int idir = 2;
1007 
1008  double wt1r, wt1i, wt2r, wt2i;
1009 
1010  int isite = m_arg[itask].isite;
1011  int isite_cp = m_arg[itask].isite_cpz;
1012 
1013  double *w2 = &v2[Nvcd * isite];
1014  // double* w1 = &vcp2[Nvcd2*isite_cp];
1015  const double *w1
1016  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1017  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1018 
1019  m_bw_recv[idir]->wait_thread(itask);
1020 
1021  if (m_arg[itask].kz1 == 1) {
1022  int Nxy = m_Nx * m_Ny;
1023  int iz = m_Mz - 1;
1024  for (int it = 0; it < m_Mt; ++it) {
1025  for (int ixy = 0; ixy < Nxy; ++ixy) {
1026  int is = ixy + Nxy * (iz + m_Nz * it);
1027  int is2 = ixy + Nxy * it;
1028  int iv = Nvcd * is;
1029  int ig = m_Ndf * is;
1030  int ix1 = Nvc2 * is2;
1031  int ix2 = ix1 + m_Nvc;
1032 
1033  for (int ic = 0; ic < m_Nc; ++ic) {
1034  int ic2 = ic * m_Nvc;
1035 
1036  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1037  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1038  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1039  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1040 
1041  w2[2 * ic + id1 + iv] += wt1r;
1042  w2[2 * ic + 1 + id1 + iv] += wt1i;
1043  w2[2 * ic + id2 + iv] += wt2r;
1044  w2[2 * ic + 1 + id2 + iv] += wt2i;
1045  w2[2 * ic + id3 + iv] += wt1i;
1046  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1047  w2[2 * ic + id4 + iv] += -wt2i;
1048  w2[2 * ic + 1 + id4 + iv] += wt2r;
1049  }
1050  }
1051  }
1052  }
1053  }
1054 
1055 
1056 //====================================================================
1058  int itask, double *v2, const double *v1)
1059  {
1060  int Nvcd = m_Nvc * m_Nd;
1061 
1062  int id1 = 0;
1063  int id2 = m_Nvc;
1064  int id3 = m_Nvc * 2;
1065  int id4 = m_Nvc * 3;
1066 
1067  int idir = 2;
1068 
1069  double vt1[m_Nvc], vt2[m_Nvc];
1070  double wt1r, wt1i, wt2r, wt2i;
1071 
1072  int isite = m_arg[itask].isite;
1073 
1074  double *w2 = &v2[Nvcd * isite];
1075  const double *w1 = &v1[Nvcd * isite];
1076  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1077 
1078  int kz1 = m_arg[itask].kz1;
1079  int Nxy = m_Nx * m_Ny;
1080 
1081  for (int it = 0; it < m_Mt; ++it) {
1082  for (int iz = 0; iz < m_Mz - kz1; ++iz) {
1083  for (int ixy = 0; ixy < Nxy; ++ixy) {
1084  int is = ixy + Nxy * (iz + m_Nz * it);
1085  int iv = Nvcd * is;
1086  int in = Nvcd * (is + Nxy);
1087  int ig = m_Ndf * is;
1088 
1089  for (int ic = 0; ic < m_Nc; ++ic) {
1090  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1091  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1092  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1093  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1094  }
1095 
1096  for (int ic = 0; ic < m_Nc; ++ic) {
1097  int ic2 = ic * m_Nvc;
1098 
1099  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1100  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1101  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1102  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1103 
1104  w2[2 * ic + id1 + iv] += wt1r;
1105  w2[2 * ic + 1 + id1 + iv] += wt1i;
1106  w2[2 * ic + id2 + iv] += wt2r;
1107  w2[2 * ic + 1 + id2 + iv] += wt2i;
1108  w2[2 * ic + id3 + iv] += wt1i;
1109  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1110  w2[2 * ic + id4 + iv] += -wt2i;
1111  w2[2 * ic + 1 + id4 + iv] += wt2r;
1112  }
1113  }
1114  }
1115  }
1116  }
1117 
1118 
1119 //====================================================================
1121  int itask, double *vcp1, const double *v1)
1122  {
1123  int Nvc2 = 2 * m_Nvc;
1124  int Nvcd = m_Nvc * m_Nd;
1125  int Nvcd2 = Nvcd / 2;
1126 
1127  int id1 = 0;
1128  int id2 = m_Nvc;
1129  int id3 = m_Nvc * 2;
1130  int id4 = m_Nvc * 3;
1131 
1132  int idir = 2;
1133 
1134  int isite = m_arg[itask].isite;
1135  int isite_cp = m_arg[itask].isite_cpz;
1136 
1137  // double* w2 = &vcp1[Nvcd2*isite_cp];
1138  double *w2
1139  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1140  const double *w1 = &v1[Nvcd * isite];
1141  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1142 
1143  double vt1[m_Nvc], vt2[m_Nvc];
1144 
1145  if (m_arg[itask].kz1 == 1) {
1146  int Nxy = m_Nx * m_Ny;
1147  int iz = m_Mz - 1;
1148  for (int it = 0; it < m_Mt; ++it) {
1149  for (int ixy = 0; ixy < Nxy; ++ixy) {
1150  int is = ixy + Nxy * (iz + m_Nz * it);
1151  int is2 = ixy + Nxy * it;
1152  int in = Nvcd * is;
1153  int ig = m_Ndf * is;
1154  int ix1 = Nvc2 * is2;
1155  int ix2 = ix1 + m_Nvc;
1156 
1157  for (int ic = 0; ic < m_Nc; ++ic) {
1158  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1159  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1160  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1161  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1162  }
1163 
1164  for (int ic = 0; ic < m_Nc; ++ic) {
1165  int icr = 2 * ic;
1166  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1167  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1168  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1169  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1170  }
1171  }
1172  }
1173  }
1174 
1175  m_fw_send[idir]->start_thread(itask);
1176  }
1177 
1178 
1179 //====================================================================
1181  int itask, double *v2, const double *vcp2)
1182  {
1183  int Nvc2 = 2 * m_Nvc;
1184  int Nvcd = m_Nvc * m_Nd;
1185  int Nvcd2 = Nvcd / 2;
1186 
1187  int id1 = 0;
1188  int id2 = m_Nvc;
1189  int id3 = m_Nvc * 2;
1190  int id4 = m_Nvc * 3;
1191 
1192  int idir = 2;
1193  double bc2 = m_boundary2[idir];
1194 
1195  double wt1r, wt1i, wt2r, wt2i;
1196 
1197  int isite = m_arg[itask].isite;
1198  int isite_cp = m_arg[itask].isite_cpz;
1199 
1200  double *w2 = &v2[Nvcd * isite];
1201  // double* w1 = &vcp2[Nvcd2*isite_cp];
1202  const double *w1
1203  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1204 
1205  m_fw_recv[idir]->wait_thread(itask);
1206 
1207  if (m_arg[itask].kz0 == 1) {
1208  int Nxy = m_Nx * m_Ny;
1209 
1210  int iz = 0;
1211  for (int it = 0; it < m_Mt; ++it) {
1212  for (int ixy = 0; ixy < Nxy; ++ixy) {
1213  int is = ixy + Nxy * (iz + m_Nz * it);
1214  int is2 = ixy + Nxy * it;
1215  int iv = Nvcd * is;
1216  int ix1 = Nvc2 * is2;
1217  int ix2 = ix1 + m_Nvc;
1218 
1219  for (int ic = 0; ic < m_Nc; ++ic) {
1220  int icr = 2 * ic;
1221  int ici = 2 * ic + 1;
1222  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1223  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1224  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1225  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1226  w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1227  w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1228  w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1229  w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1230  }
1231  }
1232  }
1233  }
1234  }
1235 
1236 
1237 //====================================================================
1239  int itask, double *v2, const double *v1)
1240  {
1241  int Nvcd = m_Nvc * m_Nd;
1242 
1243  int id1 = 0;
1244  int id2 = m_Nvc;
1245  int id3 = m_Nvc * 2;
1246  int id4 = m_Nvc * 3;
1247 
1248  int idir = 2;
1249 
1250  double vt1[m_Nvc], vt2[m_Nvc];
1251  double wt1r, wt1i, wt2r, wt2i;
1252 
1253  int isite = m_arg[itask].isite;
1254 
1255  double *w2 = &v2[Nvcd * isite];
1256  const double *w1 = &v1[Nvcd * isite];
1257  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1258 
1259  int kz0 = m_arg[itask].kz0;
1260  int Nxy = m_Nx * m_Ny;
1261 
1262  for (int it = 0; it < m_Mt; ++it) {
1263  for (int iz = kz0; iz < m_Mz; ++iz) {
1264  for (int ixy = 0; ixy < Nxy; ++ixy) {
1265  int is = ixy + Nxy * (iz + m_Nz * it);
1266  int iv = Nvcd * is;
1267  int in = Nvcd * (is - Nxy);
1268  int ig = m_Ndf * (is - Nxy);
1269 
1270  for (int ic = 0; ic < m_Nc; ++ic) {
1271  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1272  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1273  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1274  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1275  }
1276 
1277  for (int ic = 0; ic < m_Nc; ++ic) {
1278  int ic2 = 2 * ic;
1279  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1280  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1281  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1282  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1283 
1284  w2[ic2 + id1 + iv] += wt1r;
1285  w2[ic2 + 1 + id1 + iv] += wt1i;
1286  w2[ic2 + id2 + iv] += wt2r;
1287  w2[ic2 + 1 + id2 + iv] += wt2i;
1288  w2[ic2 + id3 + iv] += -wt1i;
1289  w2[ic2 + 1 + id3 + iv] += wt1r;
1290  w2[ic2 + id4 + iv] += wt2i;
1291  w2[ic2 + 1 + id4 + iv] += -wt2r;
1292  }
1293  }
1294  }
1295  }
1296  }
1297 
1298 
1299 //====================================================================
1301  int itask, double *vcp1, const double *v1)
1302  {
1303  int Nvc2 = 2 * m_Nvc;
1304  int Nvcd = m_Nvc * m_Nd;
1305  int Nvcd2 = Nvcd / 2;
1306 
1307  int id1 = 0;
1308  int id2 = m_Nvc;
1309  int id3 = m_Nvc * 2;
1310  int id4 = m_Nvc * 3;
1311 
1312  int isite = m_arg[itask].isite;
1313  int isite_cp = m_arg[itask].isite_cpt;
1314 
1315  int idir = 3;
1316  double bc2 = m_boundary2[idir];
1317 
1318  // double* w2 = &vcp1[Nvcd2*isite_cp];
1319  double *w2
1320  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1321  const double *w1 = &v1[Nvcd * isite];
1322 
1323  if (m_arg[itask].kt0 == 1) {
1324  int Nxy = m_Nx * m_Ny;
1325  int it = 0;
1326  for (int iz = 0; iz < m_Mz; ++iz) {
1327  for (int ixy = 0; ixy < Nxy; ++ixy) {
1328  int is = ixy + Nxy * (iz + m_Nz * it);
1329  int is2 = ixy + Nxy * iz;
1330 
1331  int in = Nvcd * is;
1332  int ix1 = Nvc2 * is2;
1333  int ix2 = ix1 + m_Nvc;
1334 
1335  for (int ic = 0; ic < m_Nc; ++ic) {
1336  w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1337  w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1338  w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1339  w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1340  }
1341  }
1342  }
1343  }
1344 
1345  m_bw_send[idir]->start_thread(itask);
1346  }
1347 
1348 
1349 //====================================================================
1351  int itask, double *v2, const double *vcp2)
1352  {
1353  int Nvc2 = 2 * m_Nvc;
1354  int Nvcd = m_Nvc * m_Nd;
1355  int Nvcd2 = Nvcd / 2;
1356 
1357  int id1 = 0;
1358  int id2 = m_Nvc;
1359  int id3 = m_Nvc * 2;
1360  int id4 = m_Nvc * 3;
1361 
1362  int idir = 3;
1363 
1364  double wt1r, wt1i, wt2r, wt2i;
1365 
1366  int isite = m_arg[itask].isite;
1367  int isite_cp = m_arg[itask].isite_cpt;
1368 
1369  double *w2 = &v2[Nvcd * isite];
1370  // double* w1 = &vcp2[Nvcd2*isite_cp];
1371  const double *w1
1372  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1373  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1374 
1375  m_bw_recv[idir]->wait_thread(itask);
1376 
1377  if (m_arg[itask].kt1 == 1) {
1378  int Nxy = m_Nx * m_Ny;
1379  int it = m_Mt - 1;
1380  for (int iz = 0; iz < m_Mz; ++iz) {
1381  for (int ixy = 0; ixy < Nxy; ++ixy) {
1382  int is = ixy + Nxy * (iz + m_Nz * it);
1383  int is2 = ixy + Nxy * iz;
1384  int iv = Nvcd * is;
1385  int ig = m_Ndf * is;
1386  int ix1 = Nvc2 * is2;
1387  int ix2 = ix1 + m_Nvc;
1388 
1389  for (int ic = 0; ic < m_Nc; ++ic) {
1390  int ic2 = ic * m_Nvc;
1391 
1392  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1393  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1394  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1395  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1396 
1397  w2[2 * ic + id3 + iv] += wt1r;
1398  w2[2 * ic + 1 + id3 + iv] += wt1i;
1399  w2[2 * ic + id4 + iv] += wt2r;
1400  w2[2 * ic + 1 + id4 + iv] += wt2i;
1401  }
1402  }
1403  }
1404  }
1405  }
1406 
1407 
1408 //====================================================================
1410  int itask, double *v2, const double *v1)
1411  {
1412  int Nvcd = m_Nvc * m_Nd;
1413 
1414  int id1 = 0;
1415  int id2 = m_Nvc;
1416  int id3 = m_Nvc * 2;
1417  int id4 = m_Nvc * 3;
1418 
1419  int idir = 3;
1420 
1421  double vt1[m_Nvc], vt2[m_Nvc];
1422  double wt1r, wt1i, wt2r, wt2i;
1423 
1424  int isite = m_arg[itask].isite;
1425 
1426  double *w2 = &v2[Nvcd * isite];
1427  const double *w1 = &v1[Nvcd * isite];
1428  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1429 
1430  int kt1 = m_arg[itask].kt1;
1431  int Nxy = m_Nx * m_Ny;
1432  int Nxyz = Nxy * m_Nz;
1433 
1434  for (int it = 0; it < m_Mt - kt1; ++it) {
1435  for (int iz = 0; iz < m_Mz; ++iz) {
1436  for (int ixy = 0; ixy < Nxy; ++ixy) {
1437  int is = ixy + Nxy * (iz + m_Nz * it);
1438  int iv = Nvcd * is;
1439  int in = Nvcd * (is + Nxyz);
1440  int ig = m_Ndf * is;
1441 
1442  for (int ic = 0; ic < m_Nc; ++ic) {
1443  vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1444  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1445  vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1446  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1447  }
1448 
1449  for (int ic = 0; ic < m_Nc; ++ic) {
1450  int ic2 = ic * m_Nvc;
1451 
1452  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1453  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1454  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1455  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1456 
1457  w2[2 * ic + id3 + iv] += wt1r;
1458  w2[2 * ic + 1 + id3 + iv] += wt1i;
1459  w2[2 * ic + id4 + iv] += wt2r;
1460  w2[2 * ic + 1 + id4 + iv] += wt2i;
1461  }
1462  }
1463  }
1464  }
1465  }
1466 
1467 
1468 //====================================================================
1470  int itask, double *vcp1, const double *v1)
1471  {
1472  int Nvc2 = 2 * m_Nvc;
1473  int Nvcd = m_Nvc * m_Nd;
1474  int Nvcd2 = Nvcd / 2;
1475 
1476  int id1 = 0;
1477  int id2 = m_Nvc;
1478  int id3 = m_Nvc * 2;
1479  int id4 = m_Nvc * 3;
1480 
1481  int idir = 3;
1482 
1483  int isite = m_arg[itask].isite;
1484  int isite_cp = m_arg[itask].isite_cpt;
1485 
1486  // double* w2 = &vcp1[Nvcd2*isite_cp];
1487  double *w2
1488  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1489  const double *w1 = &v1[Nvcd * isite];
1490  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1491 
1492  double vt1[m_Nvc], vt2[m_Nvc];
1493 
1494  if (m_arg[itask].kt1 == 1) {
1495  int Nxy = m_Nx * m_Ny;
1496  int it = m_Mt - 1;
1497  for (int iz = 0; iz < m_Mz; ++iz) {
1498  for (int ixy = 0; ixy < Nxy; ++ixy) {
1499  int is = ixy + Nxy * (iz + m_Nz * it);
1500  int is2 = ixy + Nxy * iz;
1501  int in = Nvcd * is;
1502  int ig = m_Ndf * is;
1503  int ix1 = Nvc2 * is2;
1504  int ix2 = ix1 + m_Nvc;
1505 
1506  for (int ic = 0; ic < m_Nc; ++ic) {
1507  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1508  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1509  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1510  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1511  }
1512 
1513  for (int ic = 0; ic < m_Nc; ++ic) {
1514  int icr = 2 * ic;
1515  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1516  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1517  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1518  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1519  }
1520  }
1521  }
1522  }
1523 
1524  m_fw_send[idir]->start_thread(itask);
1525  }
1526 
1527 
1528 //====================================================================
1530  int itask, double *v2, const double *vcp2)
1531  {
1532  int Nvc2 = 2 * m_Nvc;
1533  int Nvcd = m_Nvc * m_Nd;
1534  int Nvcd2 = Nvcd / 2;
1535 
1536  int id1 = 0;
1537  int id2 = m_Nvc;
1538  int id3 = m_Nvc * 2;
1539  int id4 = m_Nvc * 3;
1540 
1541  int idir = 3;
1542  double bc2 = m_boundary2[idir];
1543 
1544  double wt1r, wt1i, wt2r, wt2i;
1545 
1546  int isite = m_arg[itask].isite;
1547  int isite_cp = m_arg[itask].isite_cpt;
1548 
1549  double *w2 = &v2[Nvcd * isite];
1550  // double* w1 = &vcp2[Nvcd2*isite_cp];
1551  const double *w1
1552  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1553 
1554  m_fw_recv[idir]->wait_thread(itask);
1555 
1556  if (m_arg[itask].kt0 == 1) {
1557  int Nxy = m_Nx * m_Ny;
1558  int it = 0;
1559  for (int iz = 0; iz < m_Mz; ++iz) {
1560  for (int ixy = 0; ixy < Nxy; ++ixy) {
1561  int is = ixy + Nxy * (iz + m_Nz * it);
1562  int is2 = ixy + Nxy * iz;
1563  int iv = Nvcd * is;
1564  int ix1 = Nvc2 * is2;
1565  int ix2 = ix1 + m_Nvc;
1566 
1567  for (int ic = 0; ic < m_Nc; ++ic) {
1568  int icr = 2 * ic;
1569  int ici = 2 * ic + 1;
1570  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1571  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1572  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1573  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1574  }
1575  }
1576  }
1577  }
1578  }
1579 
1580 
1581 //====================================================================
1583  int itask, double *v2, const double *v1)
1584  {
1585  int Nvcd = m_Nvc * m_Nd;
1586 
1587  int id1 = 0;
1588  int id2 = m_Nvc;
1589  int id3 = m_Nvc * 2;
1590  int id4 = m_Nvc * 3;
1591 
1592  int idir = 3;
1593 
1594  double vt1[m_Nvc], vt2[m_Nvc];
1595  double wt1r, wt1i, wt2r, wt2i;
1596 
1597  int isite = m_arg[itask].isite;
1598 
1599  double *w2 = &v2[Nvcd * isite];
1600  const double *w1 = &v1[Nvcd * isite];
1601  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1602 
1603  int kt0 = m_arg[itask].kt0;
1604  int Nxy = m_Nx * m_Ny;
1605  int Nxyz = Nxy * m_Nz;
1606 
1607  for (int it = kt0; it < m_Mt; ++it) {
1608  for (int iz = 0; iz < m_Mz; ++iz) {
1609  for (int ixy = 0; ixy < Nxy; ++ixy) {
1610  int is = ixy + Nxy * (iz + m_Nz * it);
1611  int iv = Nvcd * is;
1612  int in = Nvcd * (is - Nxyz);
1613  int ig = m_Ndf * (is - Nxyz);
1614 
1615  for (int ic = 0; ic < m_Nc; ++ic) {
1616  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1617  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1618  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1619  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1620  }
1621 
1622  for (int ic = 0; ic < m_Nc; ++ic) {
1623  int ic2 = 2 * ic;
1624  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1625  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1626  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1627  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1628 
1629  w2[ic2 + id1 + iv] += wt1r;
1630  w2[ic2 + 1 + id1 + iv] += wt1i;
1631  w2[ic2 + id2 + iv] += wt2r;
1632  w2[ic2 + 1 + id2 + iv] += wt2i;
1633  }
1634  }
1635  }
1636  }
1637  }
1638 
1639 
1640 //====================================================================
1642  int itask, double *vcp1, const double *v1)
1643  {
1644  int Nvc2 = 2 * m_Nvc;
1645  int Nvcd = m_Nvc * m_Nd;
1646  int Nvcd2 = Nvcd / 2;
1647 
1648  int id1 = 0;
1649  int id2 = m_Nvc;
1650  int id3 = m_Nvc * 2;
1651  int id4 = m_Nvc * 3;
1652 
1653  int isite = m_arg[itask].isite;
1654  int isite_cp = m_arg[itask].isite_cpt;
1655 
1656  int idir = 3;
1657  double bc2 = m_boundary2[idir];
1658 
1659  // double* w2 = &vcp1[Nvcd2*isite_cp];
1660  double *w2
1661  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1662  const double *w1 = &v1[Nvcd * isite];
1663 
1664  if (m_arg[itask].kt0 == 1) {
1665  int Nxy = m_Nx * m_Ny;
1666  int it = 0;
1667  for (int iz = 0; iz < m_Mz; ++iz) {
1668  for (int ixy = 0; ixy < Nxy; ++ixy) {
1669  int is = ixy + Nxy * (iz + m_Nz * it);
1670  int is2 = ixy + Nxy * iz;
1671 
1672  int in = Nvcd * is;
1673  int ix1 = Nvc2 * is2;
1674  int ix2 = ix1 + m_Nvc;
1675 
1676  for (int ic = 0; ic < m_Nc; ++ic) {
1677  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1678  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1679  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1680  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
1681  }
1682  }
1683  }
1684  }
1685 
1686  m_bw_send[idir]->start_thread(itask);
1687  }
1688 
1689 
1690 //====================================================================
1692  int itask, double *v2, const double *vcp2)
1693  {
1694  int Nvc2 = 2 * m_Nvc;
1695  int Nvcd = m_Nvc * m_Nd;
1696  int Nvcd2 = Nvcd / 2;
1697 
1698  int id1 = 0;
1699  int id2 = m_Nvc;
1700  int id3 = m_Nvc * 2;
1701  int id4 = m_Nvc * 3;
1702 
1703  int idir = 3;
1704 
1705  double wt1r, wt1i, wt2r, wt2i;
1706 
1707  int isite = m_arg[itask].isite;
1708  int isite_cp = m_arg[itask].isite_cpt;
1709 
1710  double *w2 = &v2[Nvcd * isite];
1711  // double* w1 = &vcp2[Nvcd2*isite_cp];
1712  const double *w1
1713  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1714  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1715 
1716  m_bw_recv[idir]->wait_thread(itask);
1717 
1718  if (m_arg[itask].kt1 == 1) {
1719  int Nxy = m_Nx * m_Ny;
1720  int it = m_Mt - 1;
1721  for (int iz = 0; iz < m_Mz; ++iz) {
1722  for (int ixy = 0; ixy < Nxy; ++ixy) {
1723  int is = ixy + Nxy * (iz + m_Nz * it);
1724  int is2 = ixy + Nxy * iz;
1725  int iv = Nvcd * is;
1726  int ig = m_Ndf * is;
1727  int ix1 = Nvc2 * is2;
1728  int ix2 = ix1 + m_Nvc;
1729 
1730  for (int ic = 0; ic < m_Nc; ++ic) {
1731  int ic2 = ic * m_Nvc;
1732 
1733  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1734  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1735  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1736  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1737 
1738  w2[2 * ic + id1 + iv] += wt1r;
1739  w2[2 * ic + 1 + id1 + iv] += wt1i;
1740  w2[2 * ic + id2 + iv] += wt2r;
1741  w2[2 * ic + 1 + id2 + iv] += wt2i;
1742  w2[2 * ic + id3 + iv] += wt1r;
1743  w2[2 * ic + 1 + id3 + iv] += wt1i;
1744  w2[2 * ic + id4 + iv] += wt2r;
1745  w2[2 * ic + 1 + id4 + iv] += wt2i;
1746  }
1747  }
1748  }
1749  }
1750  }
1751 
1752 
1753 //====================================================================
1755  int itask, double *v2, const double *v1)
1756  {
1757  int Nvcd = m_Nvc * m_Nd;
1758 
1759  int id1 = 0;
1760  int id2 = m_Nvc;
1761  int id3 = m_Nvc * 2;
1762  int id4 = m_Nvc * 3;
1763 
1764  int idir = 3;
1765 
1766  double vt1[m_Nvc], vt2[m_Nvc];
1767  double wt1r, wt1i, wt2r, wt2i;
1768 
1769  int isite = m_arg[itask].isite;
1770 
1771  double *w2 = &v2[Nvcd * isite];
1772  const double *w1 = &v1[Nvcd * isite];
1773  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1774 
1775  int kt1 = m_arg[itask].kt1;
1776  int Nxy = m_Nx * m_Ny;
1777  int Nxyz = Nxy * m_Nz;
1778 
1779  for (int it = 0; it < m_Mt - kt1; ++it) {
1780  for (int iz = 0; iz < m_Mz; ++iz) {
1781  for (int ixy = 0; ixy < Nxy; ++ixy) {
1782  int is = ixy + Nxy * (iz + m_Nz * it);
1783  int iv = Nvcd * is;
1784  int in = Nvcd * (is + Nxyz);
1785  int ig = m_Ndf * is;
1786 
1787  for (int ic = 0; ic < m_Nc; ++ic) {
1788  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1789  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1790  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1791  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1792  }
1793 
1794  for (int ic = 0; ic < m_Nc; ++ic) {
1795  int ic2 = ic * m_Nvc;
1796 
1797  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1798  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1799  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1800  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1801 
1802  w2[2 * ic + id1 + iv] += wt1r;
1803  w2[2 * ic + 1 + id1 + iv] += wt1i;
1804  w2[2 * ic + id2 + iv] += wt2r;
1805  w2[2 * ic + 1 + id2 + iv] += wt2i;
1806  w2[2 * ic + id3 + iv] += wt1r;
1807  w2[2 * ic + 1 + id3 + iv] += wt1i;
1808  w2[2 * ic + id4 + iv] += wt2r;
1809  w2[2 * ic + 1 + id4 + iv] += wt2i;
1810  }
1811  }
1812  }
1813  }
1814  }
1815 
1816 
1817 //====================================================================
1819  int itask, double *vcp1, const double *v1)
1820  {
1821  int Nvc2 = 2 * m_Nvc;
1822  int Nvcd = m_Nvc * m_Nd;
1823  int Nvcd2 = Nvcd / 2;
1824 
1825  int id1 = 0;
1826  int id2 = m_Nvc;
1827  int id3 = m_Nvc * 2;
1828  int id4 = m_Nvc * 3;
1829 
1830  int idir = 3;
1831 
1832  int isite = m_arg[itask].isite;
1833  int isite_cp = m_arg[itask].isite_cpt;
1834 
1835  // double* w2 = &vcp1[Nvcd2*isite_cp];
1836  double *w2
1837  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1838  const double *w1 = &v1[Nvcd * isite];
1839  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1840 
1841  double vt1[m_Nvc], vt2[m_Nvc];
1842 
1843  if (m_arg[itask].kt1 == 1) {
1844  int Nxy = m_Nx * m_Ny;
1845  int it = m_Mt - 1;
1846  for (int iz = 0; iz < m_Mz; ++iz) {
1847  for (int ixy = 0; ixy < Nxy; ++ixy) {
1848  int is = ixy + Nxy * (iz + m_Nz * it);
1849  int is2 = ixy + Nxy * iz;
1850  int in = Nvcd * is;
1851  int ig = m_Ndf * is;
1852  int ix1 = Nvc2 * is2;
1853  int ix2 = ix1 + m_Nvc;
1854 
1855  for (int ic = 0; ic < m_Nc; ++ic) {
1856  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1857  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1858  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1859  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1860  }
1861 
1862  for (int ic = 0; ic < m_Nc; ++ic) {
1863  int icr = 2 * ic;
1864  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1865  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1866  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1867  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1868  }
1869  }
1870  }
1871  }
1872 
1873  m_fw_send[idir]->start_thread(itask);
1874  }
1875 
1876 
1877 //====================================================================
1879  int itask, double *v2, const double *vcp2)
1880  {
1881  int Nvc2 = 2 * m_Nvc;
1882  int Nvcd = m_Nvc * m_Nd;
1883  int Nvcd2 = Nvcd / 2;
1884 
1885  int id1 = 0;
1886  int id2 = m_Nvc;
1887  int id3 = m_Nvc * 2;
1888  int id4 = m_Nvc * 3;
1889 
1890  int idir = 3;
1891  double bc2 = m_boundary2[idir];
1892 
1893  double wt1r, wt1i, wt2r, wt2i;
1894 
1895  int isite = m_arg[itask].isite;
1896  int isite_cp = m_arg[itask].isite_cpt;
1897 
1898  double *w2 = &v2[Nvcd * isite];
1899  // double* w1 = &vcp2[Nvcd2*isite_cp];
1900  const double *w1
1901  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1902 
1903  m_fw_recv[idir]->wait_thread(itask);
1904 
1905  if (m_arg[itask].kt0 == 1) {
1906  int Nxy = m_Nx * m_Ny;
1907  int it = 0;
1908  for (int iz = 0; iz < m_Mz; ++iz) {
1909  for (int ixy = 0; ixy < Nxy; ++ixy) {
1910  int is = ixy + Nxy * (iz + m_Nz * it);
1911  int is2 = ixy + Nxy * iz;
1912  int iv = Nvcd * is;
1913  int ix1 = Nvc2 * is2;
1914  int ix2 = ix1 + m_Nvc;
1915 
1916  for (int ic = 0; ic < m_Nc; ++ic) {
1917  int icr = 2 * ic;
1918  int ici = 2 * ic + 1;
1919  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1920  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1921  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1922  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1923  w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1924  w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1925  w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1926  w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
1927  }
1928  }
1929  }
1930  }
1931  }
1932 
1933 
1934 //====================================================================
1936  int itask, double *v2, const double *v1)
1937  {
1938  int Nvcd = m_Nvc * m_Nd;
1939 
1940  int id1 = 0;
1941  int id2 = m_Nvc;
1942  int id3 = m_Nvc * 2;
1943  int id4 = m_Nvc * 3;
1944 
1945  int idir = 3;
1946 
1947  double vt1[m_Nvc], vt2[m_Nvc];
1948  double wt1r, wt1i, wt2r, wt2i;
1949 
1950  int isite = m_arg[itask].isite;
1951 
1952  double *w2 = &v2[Nvcd * isite];
1953  const double *w1 = &v1[Nvcd * isite];
1954  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1955 
1956  int kt0 = m_arg[itask].kt0;
1957  int Nxy = m_Nx * m_Ny;
1958  int Nxyz = Nxy * m_Nz;
1959 
1960  for (int it = kt0; it < m_Mt; ++it) {
1961  for (int iz = 0; iz < m_Mz; ++iz) {
1962  for (int ixy = 0; ixy < Nxy; ++ixy) {
1963  int is = ixy + Nxy * (iz + m_Nz * it);
1964  int iv = Nvcd * is;
1965  int in = Nvcd * (is - Nxyz);
1966  int ig = m_Ndf * (is - Nxyz);
1967 
1968  for (int ic = 0; ic < m_Nc; ++ic) {
1969  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1970  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1971  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1972  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1973  }
1974 
1975  for (int ic = 0; ic < m_Nc; ++ic) {
1976  int ic2 = 2 * ic;
1977  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1978  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1979  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1980  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1981 
1982  w2[ic2 + id1 + iv] += wt1r;
1983  w2[ic2 + 1 + id1 + iv] += wt1i;
1984  w2[ic2 + id2 + iv] += wt2r;
1985  w2[ic2 + 1 + id2 + iv] += wt2i;
1986  w2[ic2 + id3 + iv] -= wt1r;
1987  w2[ic2 + 1 + id3 + iv] -= wt1i;
1988  w2[ic2 + id4 + iv] -= wt2r;
1989  w2[ic2 + 1 + id4 + iv] -= wt2i;
1990  }
1991  }
1992  }
1993  }
1994  }
1995 
1996 
1997 //====================================================================
1999  int itask, double *v2, const double *v1)
2000  {
2001  int Nvcd = m_Nvc * m_Nd;
2002  int Nxy = m_Nx * m_Ny;
2003 
2004  int id1 = 0;
2005  int id2 = m_Nvc;
2006  int id3 = m_Nvc * 2;
2007  int id4 = m_Nvc * 3;
2008 
2009  int isite = m_arg[itask].isite;
2010  double *w2 = &v2[Nvcd * isite];
2011  const double *w1 = &v1[Nvcd * isite];
2012 
2013  for (int it = 0; it < m_Mt; ++it) {
2014  for (int iz = 0; iz < m_Mz; ++iz) {
2015  for (int ixy = 0; ixy < Nxy; ++ixy) {
2016  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2017  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2018  w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2019  w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2020  w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2021  w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
2022  }
2023  }
2024  }
2025  }
2026  }
2027 
2028 
2029 //====================================================================
2031  int itask, double *v2, const double *v1)
2032  {
2033  int Nvcd = m_Nvc * m_Nd;
2034  int Nxy = m_Nx * m_Ny;
2035 
2036  int id1 = 0;
2037  int id2 = m_Nvc;
2038  int id3 = m_Nvc * 2;
2039  int id4 = m_Nvc * 3;
2040 
2041  int isite = m_arg[itask].isite;
2042  double *w2 = &v2[Nvcd * isite];
2043  const double *w1 = &v1[Nvcd * isite];
2044 
2045  for (int it = 0; it < m_Mt; ++it) {
2046  for (int iz = 0; iz < m_Mz; ++iz) {
2047  for (int ixy = 0; ixy < Nxy; ++ixy) {
2048  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2049  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2050  w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2051  w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2052  w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2053  w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
2054  }
2055  }
2056  }
2057  }
2058  }
2059 
2060 
2061 //====================================================================
2062 }
2063 //============================================================END=====
void mult_zpb_thread(int, double *, const double *)
BridgeIO vout
Definition: bridgeIO.cpp:495
void mult_tp1_chiral_thread(int, double *, const double *)
const double * ptr(const int jin, const int site, const int jex) const
Definition: field.h:142
void mult_tm1_dirac_thread(int, double *, const double *)
const Field_G * m_U
gauge configuration.
void mult_yp1_thread(int, double *, const double *)
void mult_yp2_thread(int, double *, const double *)
void mult_zp2_thread(int, double *, const double *)
void general(const char *format,...)
Definition: bridgeIO.cpp:195
void mult_ym2_thread(int, double *, const double *)
void mult_tpb_chiral_thread(int, double *, const double *)
void gm5_chiral_thread(int, double *, const double *)
std::vector< Channel * > m_bw_recv
void mult_tpb_dirac_thread(int, double *, const double *)
void mult_tp2_dirac_thread(int, double *, const double *)
void mult_zm2_thread(int, double *, const double *)
void mult_xm2_thread(int, double *, const double *)
void mult_tm1_chiral_thread(int, double *, const double *)
void mult_tmb_dirac_thread(int, double *, const double *)
void mult_tm2_dirac_thread(int, double *, const double *)
void mult_tp1_dirac_thread(int, double *, const double *)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
void mult_zmb_thread(int, double *, const double *)
void crucial(const char *format,...)
Definition: bridgeIO.cpp:178
void mult_xmb_thread(int, double *, const double *)
void gm5_dirac_thread(int, double *, const double *)
std::vector< double > m_boundary2
b.c. for each node.
void daypx_thread(int, double *, double, const double *)
void mult_tmb_chiral_thread(int, double *, const double *)
void mult_ypb_thread(int, double *, const double *)
void mult_ymb_thread(int, double *, const double *)
void mult_xpb_thread(int, double *, const double *)
void mult_ym1_thread(int, double *, const double *)
void mult_xp1_thread(int, double *, const double *)
void mult_tm2_chiral_thread(int, double *, const double *)
static const std::string class_name
void mult_tp2_chiral_thread(int, double *, const double *)
void mult_zp1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_send
void mult_xm1_thread(int, double *, const double *)
void mult_xp2_thread(int, double *, const double *)
void mult_zm1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_recv
std::vector< Channel * > m_bw_send