Bridge++  Ver. 1.3.x
fopr_Wilson_General_impl_thread.cpp
Go to the documentation of this file.
1 
15 
16 #include "threadManager_OpenMP.h"
17 
18 
19 #if defined USE_GROUP_SU3
20 #include "fopr_Wilson_impl_SU3.inc"
21 #elif defined USE_GROUP_SU2
22 #include "fopr_Wilson_impl_SU2.inc"
23 #elif defined USE_GROUP_SU_N
24 #include "fopr_Wilson_impl_SU_N.inc"
25 #endif
26 
27 //====================================================================
29 {
31 
32  // The following setup corresponds to unifirm division of volume.
33  if (m_Nthread <= m_Nt) {
35  } else if (m_Nthread <= m_Nz * m_Nt) {
36  m_Ntask_t = m_Nt;
37  } else {
38  vout.crucial(m_vl, " Too large Nthread: %d\n", m_Nthread);
39  exit(EXIT_FAILURE);
40  }
42  if (m_Ntask_z * m_Ntask_t != m_Nthread) {
43  vout.crucial(m_vl, " Nz(%d) and Nt(%d) do not mach Nthread: %d\n",
44  m_Nz, m_Nt, m_Nthread);
45  exit(EXIT_FAILURE);
46  }
48  m_Mz = m_Nz / m_Ntask_z;
49  m_Mt = m_Nt / m_Ntask_t;
50 
51  // The following setup is not monotonic division, and requires
52  // barrier at the beginning and end of mult (D and gamma5).
53  // [H.Matsufuru 22 Oct 2013]
54  // if(m_Nthread >= 64){
55  // m_Ntask_z = 8;
56  // }else if(m_Nthread >= 16){
57  // m_Ntask_z = 4;
58  // }else if(m_Nthread >= 4){
59  // m_Ntask_z = 2;
60  // }else{
61  // m_Ntask_z = 1;
62  // }
63  // m_Ntask_t = m_Nthread/m_Ntask_z;
64  // m_Ntask = m_Ntask_t * m_Ntask_z;
65  // m_Mz = m_Nz/m_Ntask_z;
66  // m_Mt = m_Nt/m_Ntask_t;
67 
68  vout.general(m_vl, " Nthread = %d\n", m_Nthread);
69  vout.general(m_vl, " Ntask = %d\n", m_Ntask);
70  vout.general(m_vl, " Ntask_z = %d Ntask_t = %d\n", m_Ntask_z, m_Ntask_t);
71  vout.general(m_vl, " Mz = %d Mt = %d\n", m_Mz, m_Mt);
72 
73  // setup of arguments
74  int Nxy = m_Nx * m_Ny;
75  m_arg.resize(m_Ntask);
76  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
77  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
78  int itask = ith_z + m_Ntask_z * ith_t;
79 
80  m_arg[itask].isite = (ith_z * m_Mz + ith_t * (m_Nz * m_Mt)) * Nxy;
81 
82  m_arg[itask].kt0 = 0;
83  m_arg[itask].kt1 = 0;
84  m_arg[itask].kz0 = 0;
85  m_arg[itask].kz1 = 0;
86  if (ith_t == 0) m_arg[itask].kt0 = 1;
87  if (ith_z == 0) m_arg[itask].kz0 = 1;
88  if (ith_t == m_Ntask_t - 1) m_arg[itask].kt1 = 1;
89  if (ith_z == m_Ntask_z - 1) m_arg[itask].kz1 = 1;
90 
91  m_arg[itask].isite_cpx = itask * m_Mz * m_Mt * m_Ny;
92  m_arg[itask].isite_cpy = itask * m_Mz * m_Mt * m_Nx;
93  m_arg[itask].isite_cpz = ith_t * m_Mt * Nxy;
94  m_arg[itask].isite_cpt = ith_z * m_Mz * Nxy;
95  }
96  }
97 
98  // setup for async data transfer
99  int Nc = CommonParameters::Nc();
100  int Nd = CommonParameters::Nd();
101  int Nvcd2 = 2 * Nc * Nd / 2;
102 
103  std::vector<int> destid(m_Ntask);
104  std::vector<int> offset(m_Ntask);
105  std::vector<int> datasize(m_Ntask);
106  std::vector<int> offset_up(m_Ntask);
107  std::vector<int> offset_lw(m_Ntask);
108  std::vector<int> datasize_up(m_Ntask);
109  std::vector<int> datasize_lw(m_Ntask);
110 
111  int imu = 0;
112  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
113  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
114  int itask = ith_z + ith_t * m_Ntask_z;
115  int isite_cp = itask * m_Mz * m_Mt * m_Ny;
116  destid[itask] = itask;
117  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
118  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Ny;
119  }
120  }
121  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
122  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
123  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
124  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
125 
126  imu = 1;
127  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
128  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
129  int itask = ith_z + ith_t * m_Ntask_z;
130  int isite_cp = itask * m_Mz * m_Mt * m_Nx;
131  destid[itask] = itask;
132  offset[itask] = sizeof(double) * Nvcd2 * isite_cp;
133  datasize[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Mt * m_Nx;
134  }
135  }
136  m_bw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
137  m_fw_send[imu]->set_thread(m_Ntask, destid, offset, datasize);
138  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
139  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset, datasize);
140 
141  imu = 2;
142  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
143  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
144  int itask = ith_z + m_Ntask_z * ith_t;
145  destid[itask] = -1;
146  offset_up[itask] = 0;
147  offset_lw[itask] = 0;
148  datasize_up[itask] = 0;
149  datasize_lw[itask] = 0;
150  if (ith_z == 0) {
151  destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
152  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
153  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
154  }
155  if (ith_z == m_Ntask_z - 1) {
156  destid[itask] = ith_t * m_Ntask_z;
157  offset_up[itask] = sizeof(double) * Nvcd2 * ith_t * m_Mt * m_Nx * m_Ny;
158  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mt * m_Nx * m_Ny;
159  }
160  }
161  }
162  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
163  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
164  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
165  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
166 
167  imu = 3;
168  for (int ith_t = 0; ith_t < m_Ntask_t; ++ith_t) {
169  for (int ith_z = 0; ith_z < m_Ntask_z; ++ith_z) {
170  int itask = ith_z + m_Ntask_z * ith_t;
171  destid[itask] = -1;
172  offset_up[itask] = 0;
173  offset_lw[itask] = 0;
174  datasize_up[itask] = 0;
175  datasize_lw[itask] = 0;
176  if (ith_t == 0) {
177  destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
178  offset_lw[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
179  datasize_lw[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
180  }
181  if (ith_t == m_Ntask_t - 1) {
182  destid[itask] = ith_z;
183  offset_up[itask] = sizeof(double) * Nvcd2 * ith_z * m_Mz * m_Nx * m_Ny;
184  datasize_up[itask] = sizeof(double) * Nvcd2 * m_Mz * m_Nx * m_Ny;
185  }
186  }
187  }
188  m_bw_send[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
189  m_bw_recv[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
190  m_fw_send[imu]->set_thread(m_Ntask, destid, offset_up, datasize_up);
191  m_fw_recv[imu]->set_thread(m_Ntask, destid, offset_lw, datasize_lw);
192 }
193 
194 
195 //====================================================================
197  int itask, double *v2, double fac, const double *v1)
198 {
199  int Nvcd = m_Nvc * m_Nd;
200  int Nvxy = Nvcd * m_Nx * m_Ny;
201 
202  int isite = m_arg[itask].isite;
203 
204  const double *w1 = &v1[Nvcd * isite];
205  double *w2 = &v2[Nvcd * isite];
206 
207  for (int it = 0; it < m_Mt; ++it) {
208  for (int iz = 0; iz < m_Mz; ++iz) {
209  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
210  int iv = ivxy + Nvxy * (iz + m_Nz * it);
211  w2[iv] = fac * w2[iv] + w1[iv];
212  }
213  }
214  }
215 }
216 
217 
218 //====================================================================
220  double *v2)
221 {
222  int Nvcd = m_Nvc * m_Nd;
223  int Nvxy = Nvcd * m_Nx * m_Ny;
224 
225  int isite = m_arg[itask].isite;
226  double *w2 = &v2[Nvcd * isite];
227 
228  for (int it = 0; it < m_Mt; ++it) {
229  for (int iz = 0; iz < m_Mz; ++iz) {
230  for (int ivxy = 0; ivxy < Nvxy; ++ivxy) {
231  int iv = ivxy + Nvxy * (iz + m_Nz * it);
232  w2[iv] = 0.0;
233  }
234  }
235  }
236 }
237 
238 
239 //====================================================================
241  int itask, double *vcp1, const double *v1)
242 {
243  int Nvc2 = 2 * m_Nvc;
244  int Nvcd = m_Nvc * m_Nd;
245  int Nvcd2 = Nvcd / 2;
246 
247  int id1 = 0;
248  int id2 = m_Nvc;
249  int id3 = m_Nvc * 2;
250  int id4 = m_Nvc * 3;
251 
252  int idir = 0;
253  double bc2 = m_boundary2[idir];
254 
255  int isite = m_arg[itask].isite;
256  int isite_cp = m_arg[itask].isite_cpx;
257 
258  // double* w2 = &vcp1[Nvcd2*isite_cp];
259  const double *w1 = &v1[Nvcd * isite];
260  double *w2
261  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
262 
263  int ix = 0;
264 
265  for (int it = 0; it < m_Mt; ++it) {
266  for (int iz = 0; iz < m_Mz; ++iz) {
267  for (int iy = 0; iy < m_Ny; ++iy) {
268  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
269  int is2 = iy + m_Ny * (iz + m_Mz * it);
270  int in = Nvcd * is;
271  int ix1 = Nvc2 * is2;
272  int ix2 = ix1 + m_Nvc;
273 
274  for (int ic = 0; ic < m_Nc; ++ic) {
275  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
276  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
277  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
278  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
279  }
280  }
281  }
282  }
283 
284  m_bw_send[idir]->start_thread(itask);
285 }
286 
287 
288 //====================================================================
290  int itask, double *v2, const double *vcp2)
291 {
292  int Nvc2 = 2 * m_Nvc;
293  int Nvcd = m_Nvc * m_Nd;
294  int Nvcd2 = Nvcd / 2;
295 
296  int id1 = 0;
297  int id2 = m_Nvc;
298  int id3 = m_Nvc * 2;
299  int id4 = m_Nvc * 3;
300 
301  int idir = 0;
302 
303  double wt1r, wt1i, wt2r, wt2i;
304 
305  int isite = m_arg[itask].isite;
306  int isite_cp = m_arg[itask].isite_cpx;
307 
308  double *w2 = &v2[Nvcd * isite];
309  // double* w1 = &vcp2[Nvcd2*isite_cp];
310  const double *w1
311  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
312  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
313 
314  m_bw_recv[idir]->wait_thread(itask);
315 
316  int ix = m_Nx - 1;
317  for (int it = 0; it < m_Mt; ++it) {
318  for (int iz = 0; iz < m_Mz; ++iz) {
319  for (int iy = 0; iy < m_Ny; ++iy) {
320  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
321  int is2 = iy + m_Ny * (iz + m_Mz * it);
322  int iv = Nvcd * is;
323  int ig = m_Ndf * is;
324  int ix1 = Nvc2 * is2;
325  int ix2 = ix1 + m_Nvc;
326 
327  for (int ic = 0; ic < m_Nc; ++ic) {
328  int ic2 = ic * m_Nvc;
329 
330  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
331  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
332  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
333  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
334 
335  w2[2 * ic + id1 + iv] += wt1r;
336  w2[2 * ic + 1 + id1 + iv] += wt1i;
337  w2[2 * ic + id2 + iv] += wt2r;
338  w2[2 * ic + 1 + id2 + iv] += wt2i;
339  w2[2 * ic + id3 + iv] += wt2i;
340  w2[2 * ic + 1 + id3 + iv] += -wt2r;
341  w2[2 * ic + id4 + iv] += wt1i;
342  w2[2 * ic + 1 + id4 + iv] += -wt1r;
343  }
344  }
345  }
346  }
347 }
348 
349 
350 //====================================================================
352  int itask, double *v2, const double *v1)
353 {
354  int Nvcd = m_Nvc * m_Nd;
355 
356  int id1 = 0;
357  int id2 = m_Nvc;
358  int id3 = m_Nvc * 2;
359  int id4 = m_Nvc * 3;
360 
361  int idir = 0;
362 
363  double vt1[m_Nvc], vt2[m_Nvc];
364  double wt1r, wt1i, wt2r, wt2i;
365 
366  int isite = m_arg[itask].isite;
367 
368  const double *w1 = &v1[Nvcd * isite];
369  double *w2 = &v2[Nvcd * isite];
370  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
371 
372  for (int it = 0; it < m_Mt; ++it) {
373  for (int iz = 0; iz < m_Mz; ++iz) {
374  for (int iy = 0; iy < m_Ny; ++iy) {
375  for (int ix = 0; ix < m_Nx - 1; ++ix) {
376  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
377  int iv = Nvcd * is;
378  int in = Nvcd * (is + 1);
379  int ig = m_Ndf * is;
380 
381  for (int ic = 0; ic < m_Nc; ++ic) {
382  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
383  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
384  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
385  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
386  }
387 
388  for (int ic = 0; ic < m_Nc; ++ic) {
389  int ic2 = ic * m_Nvc;
390 
391  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
392  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
393  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
394  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
395 
396  w2[2 * ic + id1 + iv] += wt1r;
397  w2[2 * ic + 1 + id1 + iv] += wt1i;
398  w2[2 * ic + id2 + iv] += wt2r;
399  w2[2 * ic + 1 + id2 + iv] += wt2i;
400  w2[2 * ic + id3 + iv] += wt2i;
401  w2[2 * ic + 1 + id3 + iv] += -wt2r;
402  w2[2 * ic + id4 + iv] += wt1i;
403  w2[2 * ic + 1 + id4 + iv] += -wt1r;
404  }
405  }
406  }
407  }
408  }
409 }
410 
411 
412 //====================================================================
414  int itask, double *vcp1, const double *v1)
415 {
416  int Nvc2 = 2 * m_Nvc;
417  int Nvcd = m_Nvc * m_Nd;
418  int Nvcd2 = Nvcd / 2;
419 
420  int id1 = 0;
421  int id2 = m_Nvc;
422  int id3 = m_Nvc * 2;
423  int id4 = m_Nvc * 3;
424 
425  int idir = 0;
426 
427  int isite = m_arg[itask].isite;
428  int isite_cp = m_arg[itask].isite_cpx;
429 
430  const double *w1 = &v1[Nvcd * isite];
431  // double* w2 = &vcp1[Nvcd2*isite_cp];
432  double *w2
433  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
434  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
435 
436  double vt1[m_Nvc], vt2[m_Nvc];
437 
438  int ix = m_Nx - 1;
439 
440  for (int it = 0; it < m_Mt; ++it) {
441  for (int iz = 0; iz < m_Mz; ++iz) {
442  for (int iy = 0; iy < m_Ny; ++iy) {
443  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
444  int is2 = iy + m_Ny * (iz + m_Mz * it);
445  int in = Nvcd * is;
446  int ig = m_Ndf * is;
447  int ix1 = Nvc2 * is2;
448  int ix2 = ix1 + m_Nvc;
449 
450  for (int ic = 0; ic < m_Nc; ++ic) {
451  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
452  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
453  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
454  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
455  }
456 
457  for (int ic = 0; ic < m_Nc; ++ic) {
458  int icr = 2 * ic;
459  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
460  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
461  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
462  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
463  }
464  }
465  }
466  }
467 
468  m_fw_send[idir]->start_thread(itask);
469 }
470 
471 
472 //====================================================================
474  int itask, double *v2, const double *vcp2)
475 {
476  int Nvc2 = 2 * m_Nvc;
477  int Nvcd = m_Nvc * m_Nd;
478  int Nvcd2 = Nvcd / 2;
479 
480  int id1 = 0;
481  int id2 = m_Nvc;
482  int id3 = m_Nvc * 2;
483  int id4 = m_Nvc * 3;
484 
485  int idir = 0;
486  double bc2 = m_boundary2[idir];
487 
488  double wt1r, wt1i, wt2r, wt2i;
489 
490  int isite = m_arg[itask].isite;
491  int isite_cp = m_arg[itask].isite_cpx;
492 
493  double *w2 = &v2[Nvcd * isite];
494  // double* w1 = &vcp2[Nvcd2*isite_cp];
495  const double *w1
496  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
497 
498  m_fw_recv[idir]->wait_thread(itask);
499 
500  int ix = 0;
501  for (int it = 0; it < m_Mt; ++it) {
502  for (int iz = 0; iz < m_Mz; ++iz) {
503  for (int iy = 0; iy < m_Ny; ++iy) {
504  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
505  int is2 = iy + m_Ny * (iz + m_Mz * it);
506  int iv = Nvcd * is;
507  int ix1 = Nvc2 * is2;
508  int ix2 = ix1 + m_Nvc;
509 
510  for (int ic = 0; ic < m_Nc; ++ic) {
511  int icr = 2 * ic;
512  int ici = 2 * ic + 1;
513  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
514  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
515  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
516  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
517  w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
518  w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
519  w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
520  w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
521  }
522  }
523  }
524  }
525 }
526 
527 
528 //====================================================================
530  int itask, double *v2, const double *v1)
531 {
532  int Nvcd = m_Nvc * m_Nd;
533 
534  int id1 = 0;
535  int id2 = m_Nvc;
536  int id3 = m_Nvc * 2;
537  int id4 = m_Nvc * 3;
538 
539  int idir = 0;
540 
541  double vt1[m_Nvc], vt2[m_Nvc];
542  double wt1r, wt1i, wt2r, wt2i;
543 
544  int isite = m_arg[itask].isite;
545 
546  const double *w1 = &v1[Nvcd * isite];
547  double *w2 = &v2[Nvcd * isite];
548  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
549 
550  for (int it = 0; it < m_Mt; ++it) {
551  for (int iz = 0; iz < m_Mz; ++iz) {
552  for (int iy = 0; iy < m_Ny; ++iy) {
553  for (int ix = 1; ix < m_Nx; ++ix) {
554  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
555  int iv = Nvcd * is;
556  int in = Nvcd * (is - 1);
557  int ig = m_Ndf * (is - 1);
558 
559  for (int ic = 0; ic < m_Nc; ++ic) {
560  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
561  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
562  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
563  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
564  }
565 
566  for (int ic = 0; ic < m_Nc; ++ic) {
567  int ic2 = 2 * ic;
568 
569  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
570  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
571  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
572  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
573 
574  w2[2 * ic + id1 + iv] += wt1r;
575  w2[2 * ic + 1 + id1 + iv] += wt1i;
576  w2[2 * ic + id2 + iv] += wt2r;
577  w2[2 * ic + 1 + id2 + iv] += wt2i;
578  w2[2 * ic + id3 + iv] += -wt2i;
579  w2[2 * ic + 1 + id3 + iv] += +wt2r;
580  w2[2 * ic + id4 + iv] += -wt1i;
581  w2[2 * ic + 1 + id4 + iv] += +wt1r;
582  }
583  }
584  }
585  }
586  }
587 }
588 
589 
590 //====================================================================
592  int itask, double *vcp1, const double *v1)
593 {
594  int Nvc2 = 2 * m_Nvc;
595  int Nvcd = m_Nvc * m_Nd;
596  int Nvcd2 = Nvcd / 2;
597 
598  int id1 = 0;
599  int id2 = m_Nvc;
600  int id3 = m_Nvc * 2;
601  int id4 = m_Nvc * 3;
602 
603  int isite = m_arg[itask].isite;
604  int isite_cp = m_arg[itask].isite_cpy;
605 
606  int idir = 1;
607  double bc2 = m_boundary2[idir];
608 
609  const double *w1 = &v1[Nvcd * isite];
610  // double* w2 = &vcp1[Nvcd2*isite_cp];
611  double *w2
612  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
613 
614 
615  int iy = 0;
616 
617  for (int it = 0; it < m_Mt; ++it) {
618  for (int iz = 0; iz < m_Mz; ++iz) {
619  for (int ix = 0; ix < m_Nx; ++ix) {
620  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
621  int is2 = ix + m_Nx * (iz + m_Mz * it);
622  int in = Nvcd * is;
623  int ix1 = Nvc2 * is2;
624  int ix2 = ix1 + m_Nvc;
625 
626  for (int ic = 0; ic < m_Nc; ++ic) {
627  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
628  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
629  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
630  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
631  }
632  }
633  }
634  }
635 
636  m_bw_send[idir]->start_thread(itask);
637 }
638 
639 
640 //====================================================================
642  int itask, double *v2, const double *vcp2)
643 {
644  int Nvc2 = 2 * m_Nvc;
645  int Nvcd = m_Nvc * m_Nd;
646  int Nvcd2 = Nvcd / 2;
647 
648  int id1 = 0;
649  int id2 = m_Nvc;
650  int id3 = m_Nvc * 2;
651  int id4 = m_Nvc * 3;
652 
653  int idir = 1;
654 
655  double wt1r, wt1i, wt2r, wt2i;
656 
657  int isite = m_arg[itask].isite;
658  int isite_cp = m_arg[itask].isite_cpy;
659 
660  double *w2 = &v2[Nvcd * isite];
661  // double* w1 = &vcp2[Nvcd2*isite_cp];
662  const double *w1
663  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
664  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
665 
666  m_bw_recv[idir]->wait_thread(itask);
667 
668  int iy = m_Ny - 1;
669  for (int it = 0; it < m_Mt; ++it) {
670  for (int iz = 0; iz < m_Mz; ++iz) {
671  for (int ix = 0; ix < m_Nx; ++ix) {
672  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
673  int is2 = ix + m_Nx * (iz + m_Mz * it);
674  int iv = Nvcd * is;
675  int ig = m_Ndf * is;
676  int ix1 = Nvc2 * is2;
677  int ix2 = ix1 + m_Nvc;
678 
679  for (int ic = 0; ic < m_Nc; ++ic) {
680  int ic2 = ic * m_Nvc;
681 
682  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
683  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
684  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
685  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
686 
687  w2[2 * ic + id1 + iv] += wt1r;
688  w2[2 * ic + 1 + id1 + iv] += wt1i;
689  w2[2 * ic + id2 + iv] += wt2r;
690  w2[2 * ic + 1 + id2 + iv] += wt2i;
691  w2[2 * ic + id3 + iv] += -wt2r;
692  w2[2 * ic + 1 + id3 + iv] += -wt2i;
693  w2[2 * ic + id4 + iv] += wt1r;
694  w2[2 * ic + 1 + id4 + iv] += wt1i;
695  }
696  }
697  }
698  }
699 }
700 
701 
702 //====================================================================
704  int itask, double *v2, const double *v1)
705 {
706  int Nvcd = m_Nvc * m_Nd;
707 
708  int id1 = 0;
709  int id2 = m_Nvc;
710  int id3 = m_Nvc * 2;
711  int id4 = m_Nvc * 3;
712 
713  int idir = 1;
714 
715  double vt1[m_Nvc], vt2[m_Nvc];
716  double wt1r, wt1i, wt2r, wt2i;
717 
718  int isite = m_arg[itask].isite;
719 
720  double *w2 = &v2[Nvcd * isite];
721  const double *w1 = &v1[Nvcd * isite];
722  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
723 
724  for (int it = 0; it < m_Mt; ++it) {
725  for (int iz = 0; iz < m_Mz; ++iz) {
726  for (int iy = 0; iy < m_Ny - 1; ++iy) {
727  for (int ix = 0; ix < m_Nx; ++ix) {
728  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
729  int iv = Nvcd * is;
730  int in = Nvcd * (is + m_Nx);
731  int ig = m_Ndf * is;
732 
733  for (int ic = 0; ic < m_Nc; ++ic) {
734  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
735  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
736  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
737  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
738  }
739 
740  for (int ic = 0; ic < m_Nc; ++ic) {
741  int ic2 = ic * m_Nvc;
742 
743  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
744  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
745  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
746  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
747 
748  w2[2 * ic + id1 + iv] += wt1r;
749  w2[2 * ic + 1 + id1 + iv] += wt1i;
750  w2[2 * ic + id2 + iv] += wt2r;
751  w2[2 * ic + 1 + id2 + iv] += wt2i;
752  w2[2 * ic + id3 + iv] += -wt2r;
753  w2[2 * ic + 1 + id3 + iv] += -wt2i;
754  w2[2 * ic + id4 + iv] += wt1r;
755  w2[2 * ic + 1 + id4 + iv] += wt1i;
756  }
757  }
758  }
759  }
760  }
761 }
762 
763 
764 //====================================================================
766  int itask, double *vcp1, const double *v1)
767 {
768  int Nvc2 = 2 * m_Nvc;
769  int Nvcd = m_Nvc * m_Nd;
770  int Nvcd2 = Nvcd / 2;
771 
772  int id1 = 0;
773  int id2 = m_Nvc;
774  int id3 = m_Nvc * 2;
775  int id4 = m_Nvc * 3;
776 
777  int idir = 1;
778 
779  int isite = m_arg[itask].isite;
780  int isite_cp = m_arg[itask].isite_cpy;
781 
782  // double* w2 = &vcp1[Nvcd2*isite_cp];
783  double *w2
784  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
785 
786  const double *w1 = &v1[Nvcd * isite];
787  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
788 
789  double vt1[m_Nvc], vt2[m_Nvc];
790 
791  int iy = m_Ny - 1;
792 
793  for (int it = 0; it < m_Mt; ++it) {
794  for (int iz = 0; iz < m_Mz; ++iz) {
795  for (int ix = 0; ix < m_Nx; ++ix) {
796  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
797  int is2 = ix + m_Nx * (iz + m_Mz * it);
798  int in = Nvcd * is;
799  int ig = m_Ndf * is;
800  int ix1 = Nvc2 * is2;
801  int ix2 = ix1 + m_Nvc;
802 
803  for (int ic = 0; ic < m_Nc; ++ic) {
804  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
805  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
806  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
807  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
808  }
809 
810  for (int ic = 0; ic < m_Nc; ++ic) {
811  int icr = 2 * ic;
812  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
813  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
814  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
815  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
816  }
817  }
818  }
819  }
820 
821  m_fw_send[idir]->start_thread(itask);
822 }
823 
824 
825 //====================================================================
827  int itask, double *v2, const double *vcp2)
828 {
829  int Nvc2 = 2 * m_Nvc;
830  int Nvcd = m_Nvc * m_Nd;
831  int Nvcd2 = Nvcd / 2;
832 
833  int id1 = 0;
834  int id2 = m_Nvc;
835  int id3 = m_Nvc * 2;
836  int id4 = m_Nvc * 3;
837 
838  int idir = 1;
839  double bc2 = m_boundary2[idir];
840 
841  double wt1r, wt1i, wt2r, wt2i;
842 
843  int isite = m_arg[itask].isite;
844  int isite_cp = m_arg[itask].isite_cpy;
845 
846  double *w2 = &v2[Nvcd * isite];
847  // double* w1 = &vcp2[Nvcd2*isite_cp];
848  const double *w1
849  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
850 
851  m_fw_recv[idir]->wait_thread(itask);
852 
853  int iy = 0;
854  for (int it = 0; it < m_Mt; ++it) {
855  for (int iz = 0; iz < m_Mz; ++iz) {
856  for (int ix = 0; ix < m_Nx; ++ix) {
857  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
858  int is2 = ix + m_Nx * (iz + m_Mz * it);
859  int iv = Nvcd * is;
860  int ix1 = Nvc2 * is2;
861  int ix2 = ix1 + m_Nvc;
862 
863  for (int ic = 0; ic < m_Nc; ++ic) {
864  int icr = 2 * ic;
865  int ici = 2 * ic + 1;
866  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
867  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
868  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
869  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
870  w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
871  w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
872  w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
873  w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
874  }
875  }
876  }
877  }
878 }
879 
880 
881 //====================================================================
883  int itask, double *v2, const double *v1)
884 {
885  int Nvcd = m_Nvc * m_Nd;
886 
887  int id1 = 0;
888  int id2 = m_Nvc;
889  int id3 = m_Nvc * 2;
890  int id4 = m_Nvc * 3;
891 
892  int idir = 1;
893 
894  double vt1[m_Nvc], vt2[m_Nvc];
895  double wt1r, wt1i, wt2r, wt2i;
896 
897  int isite = m_arg[itask].isite;
898 
899  double *w2 = &v2[Nvcd * isite];
900  const double *w1 = &v1[Nvcd * isite];
901  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
902 
903  for (int it = 0; it < m_Mt; ++it) {
904  for (int iz = 0; iz < m_Mz; ++iz) {
905  for (int iy = 1; iy < m_Ny; ++iy) {
906  for (int ix = 0; ix < m_Nx; ++ix) {
907  int is = ix + m_Nx * (iy + m_Ny * (iz + m_Nz * it));
908  int iv = Nvcd * is;
909  int in = Nvcd * (is - m_Nx);
910  int ig = m_Ndf * (is - m_Nx);
911 
912  for (int ic = 0; ic < m_Nc; ++ic) {
913  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
914  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
915  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
916  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
917  }
918 
919  for (int ic = 0; ic < m_Nc; ++ic) {
920  int ic2 = 2 * ic;
921  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
922  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
923  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
924  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
925 
926  w2[ic2 + id1 + iv] += wt1r;
927  w2[ic2 + 1 + id1 + iv] += wt1i;
928  w2[ic2 + id2 + iv] += wt2r;
929  w2[ic2 + 1 + id2 + iv] += wt2i;
930  w2[ic2 + id3 + iv] += wt2r;
931  w2[ic2 + 1 + id3 + iv] += wt2i;
932  w2[ic2 + id4 + iv] += -wt1r;
933  w2[ic2 + 1 + id4 + iv] += -wt1i;
934  }
935  }
936  }
937  }
938  }
939 }
940 
941 
942 //====================================================================
944  int itask, double *vcp1, const double *v1)
945 {
946  int Nvc2 = 2 * m_Nvc;
947  int Nvcd = m_Nvc * m_Nd;
948  int Nvcd2 = Nvcd / 2;
949 
950  int id1 = 0;
951  int id2 = m_Nvc;
952  int id3 = m_Nvc * 2;
953  int id4 = m_Nvc * 3;
954 
955  int isite = m_arg[itask].isite;
956  int isite_cp = m_arg[itask].isite_cpz;
957 
958  int idir = 2;
959  double bc2 = m_boundary2[idir];
960 
961  // double* w2 = &vcp1[Nvcd2*isite_cp];
962  double *w2
963  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
964  const double *w1 = &v1[Nvcd * isite];
965 
966  if (m_arg[itask].kz0 == 1) {
967  int Nxy = m_Nx * m_Ny;
968  int iz = 0;
969  for (int it = 0; it < m_Mt; ++it) {
970  for (int ixy = 0; ixy < Nxy; ++ixy) {
971  int is = ixy + Nxy * (iz + m_Nz * it);
972  int is2 = ixy + Nxy * it;
973 
974  int in = Nvcd * is;
975  int ix1 = Nvc2 * is2;
976  int ix2 = ix1 + m_Nvc;
977 
978  for (int ic = 0; ic < m_Nc; ++ic) {
979  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
980  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
981  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
982  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
983  }
984  }
985  }
986  }
987 
988  m_bw_send[idir]->start_thread(itask);
989 }
990 
991 
992 //====================================================================
994  int itask, double *v2, const double *vcp2)
995 {
996  int Nvc2 = 2 * m_Nvc;
997  int Nvcd = m_Nvc * m_Nd;
998  int Nvcd2 = Nvcd / 2;
999 
1000  int id1 = 0;
1001  int id2 = m_Nvc;
1002  int id3 = m_Nvc * 2;
1003  int id4 = m_Nvc * 3;
1004 
1005  int idir = 2;
1006 
1007  double wt1r, wt1i, wt2r, wt2i;
1008 
1009  int isite = m_arg[itask].isite;
1010  int isite_cp = m_arg[itask].isite_cpz;
1011 
1012  double *w2 = &v2[Nvcd * isite];
1013  // double* w1 = &vcp2[Nvcd2*isite_cp];
1014  const double *w1
1015  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1016  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1017 
1018  m_bw_recv[idir]->wait_thread(itask);
1019 
1020  if (m_arg[itask].kz1 == 1) {
1021  int Nxy = m_Nx * m_Ny;
1022  int iz = m_Mz - 1;
1023  for (int it = 0; it < m_Mt; ++it) {
1024  for (int ixy = 0; ixy < Nxy; ++ixy) {
1025  int is = ixy + Nxy * (iz + m_Nz * it);
1026  int is2 = ixy + Nxy * it;
1027  int iv = Nvcd * is;
1028  int ig = m_Ndf * is;
1029  int ix1 = Nvc2 * is2;
1030  int ix2 = ix1 + m_Nvc;
1031 
1032  for (int ic = 0; ic < m_Nc; ++ic) {
1033  int ic2 = ic * m_Nvc;
1034 
1035  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1036  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1037  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1038  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1039 
1040  w2[2 * ic + id1 + iv] += wt1r;
1041  w2[2 * ic + 1 + id1 + iv] += wt1i;
1042  w2[2 * ic + id2 + iv] += wt2r;
1043  w2[2 * ic + 1 + id2 + iv] += wt2i;
1044  w2[2 * ic + id3 + iv] += wt1i;
1045  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1046  w2[2 * ic + id4 + iv] += -wt2i;
1047  w2[2 * ic + 1 + id4 + iv] += wt2r;
1048  }
1049  }
1050  }
1051  }
1052 }
1053 
1054 
1055 //====================================================================
1057  int itask, double *v2, const double *v1)
1058 {
1059  int Nvcd = m_Nvc * m_Nd;
1060 
1061  int id1 = 0;
1062  int id2 = m_Nvc;
1063  int id3 = m_Nvc * 2;
1064  int id4 = m_Nvc * 3;
1065 
1066  int idir = 2;
1067 
1068  double vt1[m_Nvc], vt2[m_Nvc];
1069  double wt1r, wt1i, wt2r, wt2i;
1070 
1071  int isite = m_arg[itask].isite;
1072 
1073  double *w2 = &v2[Nvcd * isite];
1074  const double *w1 = &v1[Nvcd * isite];
1075  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1076 
1077  int kz1 = m_arg[itask].kz1;
1078  int Nxy = m_Nx * m_Ny;
1079 
1080  for (int it = 0; it < m_Mt; ++it) {
1081  for (int iz = 0; iz < m_Mz - kz1; ++iz) {
1082  for (int ixy = 0; ixy < Nxy; ++ixy) {
1083  int is = ixy + Nxy * (iz + m_Nz * it);
1084  int iv = Nvcd * is;
1085  int in = Nvcd * (is + Nxy);
1086  int ig = m_Ndf * is;
1087 
1088  for (int ic = 0; ic < m_Nc; ++ic) {
1089  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1090  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1091  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1092  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1093  }
1094 
1095  for (int ic = 0; ic < m_Nc; ++ic) {
1096  int ic2 = ic * m_Nvc;
1097 
1098  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1099  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1100  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1101  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1102 
1103  w2[2 * ic + id1 + iv] += wt1r;
1104  w2[2 * ic + 1 + id1 + iv] += wt1i;
1105  w2[2 * ic + id2 + iv] += wt2r;
1106  w2[2 * ic + 1 + id2 + iv] += wt2i;
1107  w2[2 * ic + id3 + iv] += wt1i;
1108  w2[2 * ic + 1 + id3 + iv] += -wt1r;
1109  w2[2 * ic + id4 + iv] += -wt2i;
1110  w2[2 * ic + 1 + id4 + iv] += wt2r;
1111  }
1112  }
1113  }
1114  }
1115 }
1116 
1117 
1118 //====================================================================
1120  int itask, double *vcp1, const double *v1)
1121 {
1122  int Nvc2 = 2 * m_Nvc;
1123  int Nvcd = m_Nvc * m_Nd;
1124  int Nvcd2 = Nvcd / 2;
1125 
1126  int id1 = 0;
1127  int id2 = m_Nvc;
1128  int id3 = m_Nvc * 2;
1129  int id4 = m_Nvc * 3;
1130 
1131  int idir = 2;
1132 
1133  int isite = m_arg[itask].isite;
1134  int isite_cp = m_arg[itask].isite_cpz;
1135 
1136  // double* w2 = &vcp1[Nvcd2*isite_cp];
1137  double *w2
1138  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1139  const double *w1 = &v1[Nvcd * isite];
1140  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1141 
1142  double vt1[m_Nvc], vt2[m_Nvc];
1143 
1144  if (m_arg[itask].kz1 == 1) {
1145  int Nxy = m_Nx * m_Ny;
1146  int iz = m_Mz - 1;
1147  for (int it = 0; it < m_Mt; ++it) {
1148  for (int ixy = 0; ixy < Nxy; ++ixy) {
1149  int is = ixy + Nxy * (iz + m_Nz * it);
1150  int is2 = ixy + Nxy * it;
1151  int in = Nvcd * is;
1152  int ig = m_Ndf * is;
1153  int ix1 = Nvc2 * is2;
1154  int ix2 = ix1 + m_Nvc;
1155 
1156  for (int ic = 0; ic < m_Nc; ++ic) {
1157  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1158  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1159  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1160  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1161  }
1162 
1163  for (int ic = 0; ic < m_Nc; ++ic) {
1164  int icr = 2 * ic;
1165  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1166  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1167  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1168  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1169  }
1170  }
1171  }
1172  }
1173 
1174  m_fw_send[idir]->start_thread(itask);
1175 }
1176 
1177 
1178 //====================================================================
1180  int itask, double *v2, const double *vcp2)
1181 {
1182  int Nvc2 = 2 * m_Nvc;
1183  int Nvcd = m_Nvc * m_Nd;
1184  int Nvcd2 = Nvcd / 2;
1185 
1186  int id1 = 0;
1187  int id2 = m_Nvc;
1188  int id3 = m_Nvc * 2;
1189  int id4 = m_Nvc * 3;
1190 
1191  int idir = 2;
1192  double bc2 = m_boundary2[idir];
1193 
1194  double wt1r, wt1i, wt2r, wt2i;
1195 
1196  int isite = m_arg[itask].isite;
1197  int isite_cp = m_arg[itask].isite_cpz;
1198 
1199  double *w2 = &v2[Nvcd * isite];
1200  // double* w1 = &vcp2[Nvcd2*isite_cp];
1201  const double *w1
1202  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1203 
1204  m_fw_recv[idir]->wait_thread(itask);
1205 
1206  if (m_arg[itask].kz0 == 1) {
1207  int Nxy = m_Nx * m_Ny;
1208 
1209  int iz = 0;
1210  for (int it = 0; it < m_Mt; ++it) {
1211  for (int ixy = 0; ixy < Nxy; ++ixy) {
1212  int is = ixy + Nxy * (iz + m_Nz * it);
1213  int is2 = ixy + Nxy * it;
1214  int iv = Nvcd * is;
1215  int ix1 = Nvc2 * is2;
1216  int ix2 = ix1 + m_Nvc;
1217 
1218  for (int ic = 0; ic < m_Nc; ++ic) {
1219  int icr = 2 * ic;
1220  int ici = 2 * ic + 1;
1221  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1222  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1223  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1224  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1225  w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1226  w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1227  w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1228  w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1229  }
1230  }
1231  }
1232  }
1233 }
1234 
1235 
1236 //====================================================================
1238  int itask, double *v2, const double *v1)
1239 {
1240  int Nvcd = m_Nvc * m_Nd;
1241 
1242  int id1 = 0;
1243  int id2 = m_Nvc;
1244  int id3 = m_Nvc * 2;
1245  int id4 = m_Nvc * 3;
1246 
1247  int idir = 2;
1248 
1249  double vt1[m_Nvc], vt2[m_Nvc];
1250  double wt1r, wt1i, wt2r, wt2i;
1251 
1252  int isite = m_arg[itask].isite;
1253 
1254  double *w2 = &v2[Nvcd * isite];
1255  const double *w1 = &v1[Nvcd * isite];
1256  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1257 
1258  int kz0 = m_arg[itask].kz0;
1259  int Nxy = m_Nx * m_Ny;
1260 
1261  for (int it = 0; it < m_Mt; ++it) {
1262  for (int iz = kz0; iz < m_Mz; ++iz) {
1263  for (int ixy = 0; ixy < Nxy; ++ixy) {
1264  int is = ixy + Nxy * (iz + m_Nz * it);
1265  int iv = Nvcd * is;
1266  int in = Nvcd * (is - Nxy);
1267  int ig = m_Ndf * (is - Nxy);
1268 
1269  for (int ic = 0; ic < m_Nc; ++ic) {
1270  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1271  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1272  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1273  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1274  }
1275 
1276  for (int ic = 0; ic < m_Nc; ++ic) {
1277  int ic2 = 2 * ic;
1278  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1279  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1280  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1281  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1282 
1283  w2[ic2 + id1 + iv] += wt1r;
1284  w2[ic2 + 1 + id1 + iv] += wt1i;
1285  w2[ic2 + id2 + iv] += wt2r;
1286  w2[ic2 + 1 + id2 + iv] += wt2i;
1287  w2[ic2 + id3 + iv] += -wt1i;
1288  w2[ic2 + 1 + id3 + iv] += wt1r;
1289  w2[ic2 + id4 + iv] += wt2i;
1290  w2[ic2 + 1 + id4 + iv] += -wt2r;
1291  }
1292  }
1293  }
1294  }
1295 }
1296 
1297 
1298 //====================================================================
1300  int itask, double *vcp1, const double *v1)
1301 {
1302  int Nvc2 = 2 * m_Nvc;
1303  int Nvcd = m_Nvc * m_Nd;
1304  int Nvcd2 = Nvcd / 2;
1305 
1306  int id1 = 0;
1307  int id2 = m_Nvc;
1308  int id3 = m_Nvc * 2;
1309  int id4 = m_Nvc * 3;
1310 
1311  int isite = m_arg[itask].isite;
1312  int isite_cp = m_arg[itask].isite_cpt;
1313 
1314  int idir = 3;
1315  double bc2 = m_boundary2[idir];
1316 
1317  // double* w2 = &vcp1[Nvcd2*isite_cp];
1318  double *w2
1319  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1320  const double *w1 = &v1[Nvcd * isite];
1321 
1322  if (m_arg[itask].kt0 == 1) {
1323  int Nxy = m_Nx * m_Ny;
1324  int it = 0;
1325  for (int iz = 0; iz < m_Mz; ++iz) {
1326  for (int ixy = 0; ixy < Nxy; ++ixy) {
1327  int is = ixy + Nxy * (iz + m_Nz * it);
1328  int is2 = ixy + Nxy * iz;
1329 
1330  int in = Nvcd * is;
1331  int ix1 = Nvc2 * is2;
1332  int ix2 = ix1 + m_Nvc;
1333 
1334  for (int ic = 0; ic < m_Nc; ++ic) {
1335  w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1336  w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1337  w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1338  w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1339  }
1340  }
1341  }
1342  }
1343 
1344  m_bw_send[idir]->start_thread(itask);
1345 }
1346 
1347 
1348 //====================================================================
1350  int itask, double *v2, const double *vcp2)
1351 {
1352  int Nvc2 = 2 * m_Nvc;
1353  int Nvcd = m_Nvc * m_Nd;
1354  int Nvcd2 = Nvcd / 2;
1355 
1356  int id1 = 0;
1357  int id2 = m_Nvc;
1358  int id3 = m_Nvc * 2;
1359  int id4 = m_Nvc * 3;
1360 
1361  int idir = 3;
1362 
1363  double wt1r, wt1i, wt2r, wt2i;
1364 
1365  int isite = m_arg[itask].isite;
1366  int isite_cp = m_arg[itask].isite_cpt;
1367 
1368  double *w2 = &v2[Nvcd * isite];
1369  // double* w1 = &vcp2[Nvcd2*isite_cp];
1370  const double *w1
1371  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1372  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1373 
1374  m_bw_recv[idir]->wait_thread(itask);
1375 
1376  if (m_arg[itask].kt1 == 1) {
1377  int Nxy = m_Nx * m_Ny;
1378  int it = m_Mt - 1;
1379  for (int iz = 0; iz < m_Mz; ++iz) {
1380  for (int ixy = 0; ixy < Nxy; ++ixy) {
1381  int is = ixy + Nxy * (iz + m_Nz * it);
1382  int is2 = ixy + Nxy * iz;
1383  int iv = Nvcd * is;
1384  int ig = m_Ndf * is;
1385  int ix1 = Nvc2 * is2;
1386  int ix2 = ix1 + m_Nvc;
1387 
1388  for (int ic = 0; ic < m_Nc; ++ic) {
1389  int ic2 = ic * m_Nvc;
1390 
1391  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1392  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1393  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1394  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1395 
1396  w2[2 * ic + id3 + iv] += wt1r;
1397  w2[2 * ic + 1 + id3 + iv] += wt1i;
1398  w2[2 * ic + id4 + iv] += wt2r;
1399  w2[2 * ic + 1 + id4 + iv] += wt2i;
1400  }
1401  }
1402  }
1403  }
1404 }
1405 
1406 
1407 //====================================================================
1409  int itask, double *v2, const double *v1)
1410 {
1411  int Nvcd = m_Nvc * m_Nd;
1412 
1413  int id1 = 0;
1414  int id2 = m_Nvc;
1415  int id3 = m_Nvc * 2;
1416  int id4 = m_Nvc * 3;
1417 
1418  int idir = 3;
1419 
1420  double vt1[m_Nvc], vt2[m_Nvc];
1421  double wt1r, wt1i, wt2r, wt2i;
1422 
1423  int isite = m_arg[itask].isite;
1424 
1425  double *w2 = &v2[Nvcd * isite];
1426  const double *w1 = &v1[Nvcd * isite];
1427  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1428 
1429  int kt1 = m_arg[itask].kt1;
1430  int Nxy = m_Nx * m_Ny;
1431  int Nxyz = Nxy * m_Nz;
1432 
1433  for (int it = 0; it < m_Mt - kt1; ++it) {
1434  for (int iz = 0; iz < m_Mz; ++iz) {
1435  for (int ixy = 0; ixy < Nxy; ++ixy) {
1436  int is = ixy + Nxy * (iz + m_Nz * it);
1437  int iv = Nvcd * is;
1438  int in = Nvcd * (is + Nxyz);
1439  int ig = m_Ndf * is;
1440 
1441  for (int ic = 0; ic < m_Nc; ++ic) {
1442  vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1443  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1444  vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1445  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1446  }
1447 
1448  for (int ic = 0; ic < m_Nc; ++ic) {
1449  int ic2 = ic * m_Nvc;
1450 
1451  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1452  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1453  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1454  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1455 
1456  w2[2 * ic + id3 + iv] += wt1r;
1457  w2[2 * ic + 1 + id3 + iv] += wt1i;
1458  w2[2 * ic + id4 + iv] += wt2r;
1459  w2[2 * ic + 1 + id4 + iv] += wt2i;
1460  }
1461  }
1462  }
1463  }
1464 }
1465 
1466 
1467 //====================================================================
1469  int itask, double *vcp1, const double *v1)
1470 {
1471  int Nvc2 = 2 * m_Nvc;
1472  int Nvcd = m_Nvc * m_Nd;
1473  int Nvcd2 = Nvcd / 2;
1474 
1475  int id1 = 0;
1476  int id2 = m_Nvc;
1477  int id3 = m_Nvc * 2;
1478  int id4 = m_Nvc * 3;
1479 
1480  int idir = 3;
1481 
1482  int isite = m_arg[itask].isite;
1483  int isite_cp = m_arg[itask].isite_cpt;
1484 
1485  // double* w2 = &vcp1[Nvcd2*isite_cp];
1486  double *w2
1487  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1488  const double *w1 = &v1[Nvcd * isite];
1489  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1490 
1491  double vt1[m_Nvc], vt2[m_Nvc];
1492 
1493  if (m_arg[itask].kt1 == 1) {
1494  int Nxy = m_Nx * m_Ny;
1495  int it = m_Mt - 1;
1496  for (int iz = 0; iz < m_Mz; ++iz) {
1497  for (int ixy = 0; ixy < Nxy; ++ixy) {
1498  int is = ixy + Nxy * (iz + m_Nz * it);
1499  int is2 = ixy + Nxy * iz;
1500  int in = Nvcd * is;
1501  int ig = m_Ndf * is;
1502  int ix1 = Nvc2 * is2;
1503  int ix2 = ix1 + m_Nvc;
1504 
1505  for (int ic = 0; ic < m_Nc; ++ic) {
1506  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1507  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1508  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1509  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1510  }
1511 
1512  for (int ic = 0; ic < m_Nc; ++ic) {
1513  int icr = 2 * ic;
1514  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1515  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1516  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1517  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1518  }
1519  }
1520  }
1521  }
1522 
1523  m_fw_send[idir]->start_thread(itask);
1524 }
1525 
1526 
1527 //====================================================================
1529  int itask, double *v2, const double *vcp2)
1530 {
1531  int Nvc2 = 2 * m_Nvc;
1532  int Nvcd = m_Nvc * m_Nd;
1533  int Nvcd2 = Nvcd / 2;
1534 
1535  int id1 = 0;
1536  int id2 = m_Nvc;
1537  int id3 = m_Nvc * 2;
1538  int id4 = m_Nvc * 3;
1539 
1540  int idir = 3;
1541  double bc2 = m_boundary2[idir];
1542 
1543  double wt1r, wt1i, wt2r, wt2i;
1544 
1545  int isite = m_arg[itask].isite;
1546  int isite_cp = m_arg[itask].isite_cpt;
1547 
1548  double *w2 = &v2[Nvcd * isite];
1549  // double* w1 = &vcp2[Nvcd2*isite_cp];
1550  const double *w1
1551  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1552 
1553  m_fw_recv[idir]->wait_thread(itask);
1554 
1555  if (m_arg[itask].kt0 == 1) {
1556  int Nxy = m_Nx * m_Ny;
1557  int it = 0;
1558  for (int iz = 0; iz < m_Mz; ++iz) {
1559  for (int ixy = 0; ixy < Nxy; ++ixy) {
1560  int is = ixy + Nxy * (iz + m_Nz * it);
1561  int is2 = ixy + Nxy * iz;
1562  int iv = Nvcd * is;
1563  int ix1 = Nvc2 * is2;
1564  int ix2 = ix1 + m_Nvc;
1565 
1566  for (int ic = 0; ic < m_Nc; ++ic) {
1567  int icr = 2 * ic;
1568  int ici = 2 * ic + 1;
1569  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1570  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1571  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1572  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1573  }
1574  }
1575  }
1576  }
1577 }
1578 
1579 
1580 //====================================================================
1582  int itask, double *v2, const double *v1)
1583 {
1584  int Nvcd = m_Nvc * m_Nd;
1585 
1586  int id1 = 0;
1587  int id2 = m_Nvc;
1588  int id3 = m_Nvc * 2;
1589  int id4 = m_Nvc * 3;
1590 
1591  int idir = 3;
1592 
1593  double vt1[m_Nvc], vt2[m_Nvc];
1594  double wt1r, wt1i, wt2r, wt2i;
1595 
1596  int isite = m_arg[itask].isite;
1597 
1598  double *w2 = &v2[Nvcd * isite];
1599  const double *w1 = &v1[Nvcd * isite];
1600  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1601 
1602  int kt0 = m_arg[itask].kt0;
1603  int Nxy = m_Nx * m_Ny;
1604  int Nxyz = Nxy * m_Nz;
1605 
1606  for (int it = kt0; it < m_Mt; ++it) {
1607  for (int iz = 0; iz < m_Mz; ++iz) {
1608  for (int ixy = 0; ixy < Nxy; ++ixy) {
1609  int is = ixy + Nxy * (iz + m_Nz * it);
1610  int iv = Nvcd * is;
1611  int in = Nvcd * (is - Nxyz);
1612  int ig = m_Ndf * (is - Nxyz);
1613 
1614  for (int ic = 0; ic < m_Nc; ++ic) {
1615  vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1616  vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1617  vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1618  vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1619  }
1620 
1621  for (int ic = 0; ic < m_Nc; ++ic) {
1622  int ic2 = 2 * ic;
1623  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1624  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1625  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1626  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1627 
1628  w2[ic2 + id1 + iv] += wt1r;
1629  w2[ic2 + 1 + id1 + iv] += wt1i;
1630  w2[ic2 + id2 + iv] += wt2r;
1631  w2[ic2 + 1 + id2 + iv] += wt2i;
1632  }
1633  }
1634  }
1635  }
1636 }
1637 
1638 
1639 //====================================================================
1641  int itask, double *vcp1, const double *v1)
1642 {
1643  int Nvc2 = 2 * m_Nvc;
1644  int Nvcd = m_Nvc * m_Nd;
1645  int Nvcd2 = Nvcd / 2;
1646 
1647  int id1 = 0;
1648  int id2 = m_Nvc;
1649  int id3 = m_Nvc * 2;
1650  int id4 = m_Nvc * 3;
1651 
1652  int isite = m_arg[itask].isite;
1653  int isite_cp = m_arg[itask].isite_cpt;
1654 
1655  int idir = 3;
1656  double bc2 = m_boundary2[idir];
1657 
1658  // double* w2 = &vcp1[Nvcd2*isite_cp];
1659  double *w2
1660  = (double *)m_bw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1661  const double *w1 = &v1[Nvcd * isite];
1662 
1663  if (m_arg[itask].kt0 == 1) {
1664  int Nxy = m_Nx * m_Ny;
1665  int it = 0;
1666  for (int iz = 0; iz < m_Mz; ++iz) {
1667  for (int ixy = 0; ixy < Nxy; ++ixy) {
1668  int is = ixy + Nxy * (iz + m_Nz * it);
1669  int is2 = ixy + Nxy * iz;
1670 
1671  int in = Nvcd * is;
1672  int ix1 = Nvc2 * is2;
1673  int ix2 = ix1 + m_Nvc;
1674 
1675  for (int ic = 0; ic < m_Nc; ++ic) {
1676  w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1677  w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1678  w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1679  w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
1680  }
1681  }
1682  }
1683  }
1684 
1685  m_bw_send[idir]->start_thread(itask);
1686 }
1687 
1688 
1689 //====================================================================
1691  int itask, double *v2, const double *vcp2)
1692 {
1693  int Nvc2 = 2 * m_Nvc;
1694  int Nvcd = m_Nvc * m_Nd;
1695  int Nvcd2 = Nvcd / 2;
1696 
1697  int id1 = 0;
1698  int id2 = m_Nvc;
1699  int id3 = m_Nvc * 2;
1700  int id4 = m_Nvc * 3;
1701 
1702  int idir = 3;
1703 
1704  double wt1r, wt1i, wt2r, wt2i;
1705 
1706  int isite = m_arg[itask].isite;
1707  int isite_cp = m_arg[itask].isite_cpt;
1708 
1709  double *w2 = &v2[Nvcd * isite];
1710  // double* w1 = &vcp2[Nvcd2*isite_cp];
1711  const double *w1
1712  = (double *)m_bw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1713  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1714 
1715  m_bw_recv[idir]->wait_thread(itask);
1716 
1717  if (m_arg[itask].kt1 == 1) {
1718  int Nxy = m_Nx * m_Ny;
1719  int it = m_Mt - 1;
1720  for (int iz = 0; iz < m_Mz; ++iz) {
1721  for (int ixy = 0; ixy < Nxy; ++ixy) {
1722  int is = ixy + Nxy * (iz + m_Nz * it);
1723  int is2 = ixy + Nxy * iz;
1724  int iv = Nvcd * is;
1725  int ig = m_Ndf * is;
1726  int ix1 = Nvc2 * is2;
1727  int ix2 = ix1 + m_Nvc;
1728 
1729  for (int ic = 0; ic < m_Nc; ++ic) {
1730  int ic2 = ic * m_Nvc;
1731 
1732  wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1733  wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1734  wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1735  wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1736 
1737  w2[2 * ic + id1 + iv] += wt1r;
1738  w2[2 * ic + 1 + id1 + iv] += wt1i;
1739  w2[2 * ic + id2 + iv] += wt2r;
1740  w2[2 * ic + 1 + id2 + iv] += wt2i;
1741  w2[2 * ic + id3 + iv] += wt1r;
1742  w2[2 * ic + 1 + id3 + iv] += wt1i;
1743  w2[2 * ic + id4 + iv] += wt2r;
1744  w2[2 * ic + 1 + id4 + iv] += wt2i;
1745  }
1746  }
1747  }
1748  }
1749 }
1750 
1751 
1752 //====================================================================
1754  int itask, double *v2, const double *v1)
1755 {
1756  int Nvcd = m_Nvc * m_Nd;
1757 
1758  int id1 = 0;
1759  int id2 = m_Nvc;
1760  int id3 = m_Nvc * 2;
1761  int id4 = m_Nvc * 3;
1762 
1763  int idir = 3;
1764 
1765  double vt1[m_Nvc], vt2[m_Nvc];
1766  double wt1r, wt1i, wt2r, wt2i;
1767 
1768  int isite = m_arg[itask].isite;
1769 
1770  double *w2 = &v2[Nvcd * isite];
1771  const double *w1 = &v1[Nvcd * isite];
1772  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1773 
1774  int kt1 = m_arg[itask].kt1;
1775  int Nxy = m_Nx * m_Ny;
1776  int Nxyz = Nxy * m_Nz;
1777 
1778  for (int it = 0; it < m_Mt - kt1; ++it) {
1779  for (int iz = 0; iz < m_Mz; ++iz) {
1780  for (int ixy = 0; ixy < Nxy; ++ixy) {
1781  int is = ixy + Nxy * (iz + m_Nz * it);
1782  int iv = Nvcd * is;
1783  int in = Nvcd * (is + Nxyz);
1784  int ig = m_Ndf * is;
1785 
1786  for (int ic = 0; ic < m_Nc; ++ic) {
1787  vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1788  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1789  vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1790  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1791  }
1792 
1793  for (int ic = 0; ic < m_Nc; ++ic) {
1794  int ic2 = ic * m_Nvc;
1795 
1796  wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1797  wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1798  wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1799  wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1800 
1801  w2[2 * ic + id1 + iv] += wt1r;
1802  w2[2 * ic + 1 + id1 + iv] += wt1i;
1803  w2[2 * ic + id2 + iv] += wt2r;
1804  w2[2 * ic + 1 + id2 + iv] += wt2i;
1805  w2[2 * ic + id3 + iv] += wt1r;
1806  w2[2 * ic + 1 + id3 + iv] += wt1i;
1807  w2[2 * ic + id4 + iv] += wt2r;
1808  w2[2 * ic + 1 + id4 + iv] += wt2i;
1809  }
1810  }
1811  }
1812  }
1813 }
1814 
1815 
1816 //====================================================================
1818  int itask, double *vcp1, const double *v1)
1819 {
1820  int Nvc2 = 2 * m_Nvc;
1821  int Nvcd = m_Nvc * m_Nd;
1822  int Nvcd2 = Nvcd / 2;
1823 
1824  int id1 = 0;
1825  int id2 = m_Nvc;
1826  int id3 = m_Nvc * 2;
1827  int id4 = m_Nvc * 3;
1828 
1829  int idir = 3;
1830 
1831  int isite = m_arg[itask].isite;
1832  int isite_cp = m_arg[itask].isite_cpt;
1833 
1834  // double* w2 = &vcp1[Nvcd2*isite_cp];
1835  double *w2
1836  = (double *)m_fw_send[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1837  const double *w1 = &v1[Nvcd * isite];
1838  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1839 
1840  double vt1[m_Nvc], vt2[m_Nvc];
1841 
1842  if (m_arg[itask].kt1 == 1) {
1843  int Nxy = m_Nx * m_Ny;
1844  int it = m_Mt - 1;
1845  for (int iz = 0; iz < m_Mz; ++iz) {
1846  for (int ixy = 0; ixy < Nxy; ++ixy) {
1847  int is = ixy + Nxy * (iz + m_Nz * it);
1848  int is2 = ixy + Nxy * iz;
1849  int in = Nvcd * is;
1850  int ig = m_Ndf * is;
1851  int ix1 = Nvc2 * is2;
1852  int ix2 = ix1 + m_Nvc;
1853 
1854  for (int ic = 0; ic < m_Nc; ++ic) {
1855  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1856  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1857  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1858  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1859  }
1860 
1861  for (int ic = 0; ic < m_Nc; ++ic) {
1862  int icr = 2 * ic;
1863  w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1864  w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1865  w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1866  w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1867  }
1868  }
1869  }
1870  }
1871 
1872  m_fw_send[idir]->start_thread(itask);
1873 }
1874 
1875 
1876 //====================================================================
1878  int itask, double *v2, const double *vcp2)
1879 {
1880  int Nvc2 = 2 * m_Nvc;
1881  int Nvcd = m_Nvc * m_Nd;
1882  int Nvcd2 = Nvcd / 2;
1883 
1884  int id1 = 0;
1885  int id2 = m_Nvc;
1886  int id3 = m_Nvc * 2;
1887  int id4 = m_Nvc * 3;
1888 
1889  int idir = 3;
1890  double bc2 = m_boundary2[idir];
1891 
1892  double wt1r, wt1i, wt2r, wt2i;
1893 
1894  int isite = m_arg[itask].isite;
1895  int isite_cp = m_arg[itask].isite_cpt;
1896 
1897  double *w2 = &v2[Nvcd * isite];
1898  // double* w1 = &vcp2[Nvcd2*isite_cp];
1899  const double *w1
1900  = (double *)m_fw_recv[idir]->ptr(sizeof(double) * Nvcd2 * isite_cp);
1901 
1902  m_fw_recv[idir]->wait_thread(itask);
1903 
1904  if (m_arg[itask].kt0 == 1) {
1905  int Nxy = m_Nx * m_Ny;
1906  int it = 0;
1907  for (int iz = 0; iz < m_Mz; ++iz) {
1908  for (int ixy = 0; ixy < Nxy; ++ixy) {
1909  int is = ixy + Nxy * (iz + m_Nz * it);
1910  int is2 = ixy + Nxy * iz;
1911  int iv = Nvcd * is;
1912  int ix1 = Nvc2 * is2;
1913  int ix2 = ix1 + m_Nvc;
1914 
1915  for (int ic = 0; ic < m_Nc; ++ic) {
1916  int icr = 2 * ic;
1917  int ici = 2 * ic + 1;
1918  w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1919  w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1920  w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1921  w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1922  w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1923  w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1924  w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1925  w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
1926  }
1927  }
1928  }
1929  }
1930 }
1931 
1932 
1933 //====================================================================
1935  int itask, double *v2, const double *v1)
1936 {
1937  int Nvcd = m_Nvc * m_Nd;
1938 
1939  int id1 = 0;
1940  int id2 = m_Nvc;
1941  int id3 = m_Nvc * 2;
1942  int id4 = m_Nvc * 3;
1943 
1944  int idir = 3;
1945 
1946  double vt1[m_Nvc], vt2[m_Nvc];
1947  double wt1r, wt1i, wt2r, wt2i;
1948 
1949  int isite = m_arg[itask].isite;
1950 
1951  double *w2 = &v2[Nvcd * isite];
1952  const double *w1 = &v1[Nvcd * isite];
1953  const double *u = m_U->ptr(m_Ndf * (isite + idir * m_Nvol));
1954 
1955  int kt0 = m_arg[itask].kt0;
1956  int Nxy = m_Nx * m_Ny;
1957  int Nxyz = Nxy * m_Nz;
1958 
1959  for (int it = kt0; it < m_Mt; ++it) {
1960  for (int iz = 0; iz < m_Mz; ++iz) {
1961  for (int ixy = 0; ixy < Nxy; ++ixy) {
1962  int is = ixy + Nxy * (iz + m_Nz * it);
1963  int iv = Nvcd * is;
1964  int in = Nvcd * (is - Nxyz);
1965  int ig = m_Ndf * (is - Nxyz);
1966 
1967  for (int ic = 0; ic < m_Nc; ++ic) {
1968  vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1969  vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1970  vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1971  vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1972  }
1973 
1974  for (int ic = 0; ic < m_Nc; ++ic) {
1975  int ic2 = 2 * ic;
1976  wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1977  wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1978  wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1979  wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1980 
1981  w2[ic2 + id1 + iv] += wt1r;
1982  w2[ic2 + 1 + id1 + iv] += wt1i;
1983  w2[ic2 + id2 + iv] += wt2r;
1984  w2[ic2 + 1 + id2 + iv] += wt2i;
1985  w2[ic2 + id3 + iv] -= wt1r;
1986  w2[ic2 + 1 + id3 + iv] -= wt1i;
1987  w2[ic2 + id4 + iv] -= wt2r;
1988  w2[ic2 + 1 + id4 + iv] -= wt2i;
1989  }
1990  }
1991  }
1992  }
1993 }
1994 
1995 
1996 //====================================================================
1998  int itask, double *v2, const double *v1)
1999 {
2000  int Nvcd = m_Nvc * m_Nd;
2001  int Nxy = m_Nx * m_Ny;
2002 
2003  int id1 = 0;
2004  int id2 = m_Nvc;
2005  int id3 = m_Nvc * 2;
2006  int id4 = m_Nvc * 3;
2007 
2008  int isite = m_arg[itask].isite;
2009  double *w2 = &v2[Nvcd * isite];
2010  const double *w1 = &v1[Nvcd * isite];
2011 
2012  for (int it = 0; it < m_Mt; ++it) {
2013  for (int iz = 0; iz < m_Mz; ++iz) {
2014  for (int ixy = 0; ixy < Nxy; ++ixy) {
2015  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2016  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2017  w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2018  w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2019  w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2020  w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
2021  }
2022  }
2023  }
2024  }
2025 }
2026 
2027 
2028 //====================================================================
2030  int itask, double *v2, const double *v1)
2031 {
2032  int Nvcd = m_Nvc * m_Nd;
2033  int Nxy = m_Nx * m_Ny;
2034 
2035  int id1 = 0;
2036  int id2 = m_Nvc;
2037  int id3 = m_Nvc * 2;
2038  int id4 = m_Nvc * 3;
2039 
2040  int isite = m_arg[itask].isite;
2041  double *w2 = &v2[Nvcd * isite];
2042  const double *w1 = &v1[Nvcd * isite];
2043 
2044  for (int it = 0; it < m_Mt; ++it) {
2045  for (int iz = 0; iz < m_Mz; ++iz) {
2046  for (int ixy = 0; ixy < Nxy; ++ixy) {
2047  int iv = Nvcd * (ixy + Nxy * (iz + m_Nz * it));
2048  for (int ivc = 0; ivc < m_Nvc; ++ivc) {
2049  w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2050  w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2051  w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2052  w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
2053  }
2054  }
2055  }
2056  }
2057 }
2058 
2059 
2060 //====================================================================
2061 //============================================================END=====
BridgeIO vout
Definition: bridgeIO.cpp:278
void general(const char *format,...)
Definition: bridgeIO.cpp:65
void daypx_thread(int, double *, double, const double *)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
void crucial(const char *format,...)
Definition: bridgeIO.cpp:48