22 #if defined USE_GROUP_SU3
23 #include "fopr_Wilson_impl_SU3.inc"
24 #elif defined USE_GROUP_SU2
25 #include "fopr_Wilson_impl_SU2.inc"
26 #elif defined USE_GROUP_SU_N
27 #include "fopr_Wilson_impl_SU_N.inc"
// Visit every task of the m_Ntask_t x m_Ntask_z task grid.
63 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
64 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index; ith_z is the fastest-running coordinate.
65 int itask = ith_z + m_Ntask_z * ith_t;
// Mark tasks that occupy the first (k?0) or last (k?1) slot of the task
// grid in t and in z. The z/t kernels test these flags (== 1) and use
// them to shorten their iz/it loop ranges.
// NOTE(review): the lines between 65 and 73 are not part of this excerpt;
// presumably the flags are cleared to 0 there -- confirm.
73 if (ith_t == 0)
m_arg[itask].kt0 = 1;
74 if (ith_z == 0)
m_arg[itask].kz0 = 1;
75 if (ith_t == m_Ntask_t - 1)
m_arg[itask].kt1 = 1;
76 if (ith_z == m_Ntask_z - 1)
m_arg[itask].kz1 = 1;
// Site offsets, in units of sites, that the z- and t-direction kernels
// read back as isite_cp when addressing the send/receive buffers.
// The z offset advances with ith_t and the t offset with ith_z.
// NOTE(review): Nxy2 is declared outside this excerpt -- presumably the
// number of sites in one xy-plane of the half lattice; confirm.
80 m_arg[itask].isite_cpz = ith_t *
m_Mt * Nxy2;
81 m_arg[itask].isite_cpt = ith_z *
m_Mz * Nxy2;
// Nvcd2 evaluates to Nc * Nd (the factor 2 and the division by 2 cancel
// exactly in integer arithmetic). It is the per-site multiplier used in
// every byte offset and byte size computed below.
// NOTE(review): presumably the number of doubles per projected
// (half-spinor) site -- confirm against the declarations of Nc and Nd.
88 int Nvcd2 = 2 * Nc * Nd / 2;
// Per-task bookkeeping arrays, one entry per task (m_Ntask entries each).
// destid/offset/datasize are filled by the loops that follow.
90 std::vector<int> destid(
m_Ntask);
91 std::vector<int> offset(
m_Ntask);
92 std::vector<int> datasize(
m_Ntask);
// The *_up / *_lw pairs are filled separately by the z- and t-direction
// loops below.
// NOTE(review): "up"/"lw" presumably stand for upper/lower -- confirm.
93 std::vector<int> offset_up(
m_Ntask);
94 std::vector<int> offset_lw(
m_Ntask);
95 std::vector<int> datasize_up(
m_Ntask);
96 std::vector<int> datasize_lw(
m_Ntask);
// Buffer layout with m_Mz * m_Mt * (m_Ny / 2) sites per task.
// NOTE(review): the m_Ny / 2 site count suggests this is the x-direction
// boundary of the even-odd lattice -- confirm against the omitted lines.
99 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
100 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// NOTE(review): itask is declared on a line omitted from this excerpt
// (between 100 and 102).
// First site of this task's slice: slices are equal-sized and packed in
// itask order.
102 int isite_cp = itask *
m_Mz *
m_Mt * (m_Ny / 2);
// Each task is its own destination here.
103 destid[itask] = itask;
// Offset and size are in bytes (note the sizeof(double) factor).
// NOTE(review): the product has type std::size_t and is narrowed into an
// int element; verify that it cannot exceed INT_MAX for large lattices.
104 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
105 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * (m_Ny / 2);
// Buffer layout with m_Mz * m_Mt * m_Nx2 sites per task.
// NOTE(review): presumably the y-direction boundary -- confirm against the
// omitted lines.
114 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
115 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// NOTE(review): itask and isite_cp are declared on lines omitted from this
// excerpt (between 115 and 118).
// Each task is its own destination here.
118 destid[itask] = itask;
// Offset and size are in bytes (note the sizeof(double) factor).
// NOTE(review): the std::size_t product is narrowed into an int element;
// verify that it cannot exceed INT_MAX for large lattices.
119 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
120 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * m_Nx2;
// Buffer layout whose slices are indexed by ith_t and hold
// m_Mt * m_Nx2 * m_Ny sites each.
// NOTE(review): presumably the z-direction boundary -- confirm against the
// omitted lines.
129 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
130 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index; ith_z is the fastest-running coordinate.
131 int itask = ith_z + m_Ntask_z * ith_t;
// Default: this task contributes nothing to either half of the buffer.
133 offset_up[itask] = 0;
134 offset_lw[itask] = 0;
135 datasize_up[itask] = 0;
136 datasize_lw[itask] = 0;
// NOTE(review): source line 137 is omitted from this excerpt; presumably
// it is an `if (ith_z == 0) {` guard mirroring the visible
// `ith_z == m_Ntask_z - 1` guard below -- confirm.
// Destination is the task at the last z slot of the same ith_t row.
138 destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
// Byte offset and byte size of the slice belonging to row ith_t.
139 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx2 * m_Ny;
140 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx2 * m_Ny;
// Tasks at the last z slot fill the "up" entries with the same slice
// geometry.
// NOTE(review): source line 143 is omitted; it presumably sets destid for
// this case -- confirm.
142 if (ith_z == m_Ntask_z - 1) {
144 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx2 * m_Ny;
145 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx2 * m_Ny;
// Buffer layout whose slices are indexed by ith_z and hold
// m_Mz * m_Nx2 * m_Ny sites each.
// NOTE(review): presumably the t-direction boundary -- confirm against the
// omitted lines.
155 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
156 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index; ith_z is the fastest-running coordinate.
157 int itask = ith_z + m_Ntask_z * ith_t;
// Default: this task contributes nothing to either half of the buffer.
159 offset_up[itask] = 0;
160 offset_lw[itask] = 0;
161 datasize_up[itask] = 0;
162 datasize_lw[itask] = 0;
// NOTE(review): source line 163 is omitted from this excerpt; presumably
// it is an `if (ith_t == 0) {` guard mirroring the visible
// `ith_t == m_Ntask_t - 1` guard below -- confirm.
// Destination is the task at the last t slot of the same ith_z column.
164 destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
// Byte offset and byte size of the slice belonging to column ith_z.
165 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx2 * m_Ny;
166 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx2 * m_Ny;
// Tasks at the last t slot target the task at t slot 0 of the same
// column (flattened index ith_z) and fill the "up" entries with the same
// slice geometry.
168 if (ith_t == m_Ntask_t - 1) {
169 destid[itask] = ith_z;
170 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx2 * m_Ny;
171 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx2 * m_Ny;
// Tail of a parameter list: w is the field scaled in place, fac the scale
// factor. NOTE(review): the function name and any leading parameters are on
// lines omitted from this excerpt.
184 double *w,
double fac)
// First site owned by this task.
189 int isite =
m_arg[itask].isite;
// wp addresses the task's first site; Nvcd is the per-site stride in doubles.
190 double *wp = &w[Nvcd * isite];
// Walk the task-local t and z extents (m_Mt x m_Mz). The t stride uses the
// full m_Nz, so consecutive `it` values skip z rows not visited by this loop.
192 for (
int it = 0; it <
m_Mt; ++it) {
193 for (
int iz = 0; iz <
m_Mz; ++iz) {
// NOTE(review): Nvxy is declared outside this excerpt -- presumably the
// number of doubles in one xy-plane; confirm.
194 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
195 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
// In-place multiplication by fac.
196 wp[iv] = fac * wp[iv];
210 int isite =
m_arg[itask].isite;
211 double *wp = &v[Nvcd * isite];
213 for (
int it = 0; it <
m_Mt; ++it) {
214 for (
int iz = 0; iz <
m_Mz; ++iz) {
215 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
216 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
226 int itask,
double *vcp1,
const double *v1,
int ieo)
228 int Nvc2 = 2 *
m_Nvc;
230 int Nvcd2 = Nvcd / 2;
239 int isite =
m_arg[itask].isite;
240 int isite_cp =
m_arg[itask].isite_cpx;
241 int iyzt0 = isite /
m_Nx2;
245 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
246 const double *w1 = &v1[Nvcd * isite];
253 for (
int it = 0; it <
m_Mt; ++it) {
254 for (
int iz = 0; iz <
m_Mz; ++iz) {
255 for (
int iy = 0; iy <
m_Ny; ++iy) {
256 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
257 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
259 int is = ix +
m_Nx2 * iyzt;
262 int ix1 = Nvc2 * ibf;
263 int ix2 = ix1 +
m_Nvc;
265 for (
int ic = 0; ic <
m_Nc; ++ic) {
266 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
267 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
268 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
269 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
283 int itask,
double *v2,
const double *vcp2,
int ieo)
285 int Nvc2 = 2 *
m_Nvc;
287 int Nvcd2 = Nvcd / 2;
296 double wt1r, wt1i, wt2r, wt2i;
298 int isite =
m_arg[itask].isite;
299 int isite_cp =
m_arg[itask].isite_cpx;
300 int iyzt0 = isite /
m_Nx2;
302 double *w2 = &v2[Nvcd * isite];
305 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
312 for (
int it = 0; it <
m_Mt; ++it) {
313 for (
int iz = 0; iz <
m_Mz; ++iz) {
314 for (
int iy = 0; iy <
m_Ny; ++iy) {
315 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
316 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
319 int is = ix +
m_Nx2 * iyzt;
322 int ix1 = Nvc2 * ibf;
323 int ix2 = ix1 +
m_Nvc;
325 for (
int ic = 0; ic <
m_Nc; ++ic) {
326 int ic2 = ic *
m_Nvc;
327 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
328 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
329 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
330 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
331 w2[2 * ic + id1 + iv] += wt1r;
332 w2[2 * ic + 1 + id1 + iv] += wt1i;
333 w2[2 * ic + id2 + iv] += wt2r;
334 w2[2 * ic + 1 + id2 + iv] += wt2i;
335 w2[2 * ic + id3 + iv] += wt2i;
336 w2[2 * ic + 1 + id3 + iv] += -wt2r;
337 w2[2 * ic + id4 + iv] += wt1i;
338 w2[2 * ic + 1 + id4 + iv] += -wt1r;
350 int itask,
double *v2,
const double *v1,
int ieo)
362 double wt1r, wt1i, wt2r, wt2i;
364 int isite =
m_arg[itask].isite;
365 int iyzt0 = isite /
m_Nx2;
367 double *w2 = &v2[Nvcd * isite];
368 const double *w1 = &v1[Nvcd * isite];
371 for (
int it = 0; it <
m_Mt; ++it) {
372 for (
int iz = 0; iz <
m_Mz; ++iz) {
373 for (
int iy = 0; iy <
m_Ny; ++iy) {
374 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
375 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
376 for (
int ix = 0; ix <
m_Nx2 - Leo; ++ix) {
377 int is = ix +
m_Nx2 * iyzt;
379 int in = Nvcd * (is + Leo);
382 for (
int ic = 0; ic <
m_Nc; ++ic) {
383 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
384 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
385 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
386 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
389 for (
int ic = 0; ic <
m_Nc; ++ic) {
390 int ic2 = ic *
m_Nvc;
392 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
393 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
394 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
395 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
397 w2[2 * ic + id1 + iv] += wt1r;
398 w2[2 * ic + 1 + id1 + iv] += wt1i;
399 w2[2 * ic + id2 + iv] += wt2r;
400 w2[2 * ic + 1 + id2 + iv] += wt2i;
401 w2[2 * ic + id3 + iv] += wt2i;
402 w2[2 * ic + 1 + id3 + iv] += -wt2r;
403 w2[2 * ic + id4 + iv] += wt1i;
404 w2[2 * ic + 1 + id4 + iv] += -wt1r;
415 int itask,
double *vcp1,
const double *v1,
int ieo)
417 int Nvc2 = 2 *
m_Nvc;
419 int Nvcd2 = Nvcd / 2;
428 int isite =
m_arg[itask].isite;
429 int isite_cp =
m_arg[itask].isite_cpx;
430 int iyzt0 = isite /
m_Nx2;
434 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
435 const double *w1 = &v1[Nvcd * isite];
443 for (
int it = 0; it <
m_Mt; ++it) {
444 for (
int iz = 0; iz <
m_Mz; ++iz) {
445 for (
int iy = 0; iy <
m_Ny; ++iy) {
446 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
447 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
449 int is = ix +
m_Nx2 * iyzt;
453 int ix1 = Nvc2 * ibf;
454 int ix2 = ix1 +
m_Nvc;
456 for (
int ic = 0; ic <
m_Nc; ++ic) {
457 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
458 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
459 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
460 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
463 for (
int ic = 0; ic <
m_Nc; ++ic) {
465 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
466 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
467 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
468 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
482 int itask,
double *v2,
const double *vcp2,
int ieo)
484 int Nvc2 = 2 *
m_Nvc;
486 int Nvcd2 = Nvcd / 2;
496 double wt1r, wt1i, wt2r, wt2i;
498 int isite =
m_arg[itask].isite;
499 int isite_cp =
m_arg[itask].isite_cpx;
500 int iyzt0 = isite /
m_Nx2;
502 double *w2 = &v2[Nvcd * isite];
505 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
511 for (
int it = 0; it <
m_Mt; ++it) {
512 for (
int iz = 0; iz <
m_Mz; ++iz) {
513 for (
int iy = 0; iy <
m_Ny; ++iy) {
514 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
515 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
517 int is = ix +
m_Nx2 * iyzt;
520 int ix1 = Nvc2 * ibf;
521 int ix2 = ix1 +
m_Nvc;
523 for (
int ic = 0; ic <
m_Nc; ++ic) {
525 int ici = 2 * ic + 1;
526 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
527 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
528 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
529 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
530 w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
531 w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
532 w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
533 w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
545 int itask,
double *v2,
const double *v1,
int ieo)
557 double wt1r, wt1i, wt2r, wt2i;
559 int isite =
m_arg[itask].isite;
560 int iyzt0 = isite /
m_Nx2;
562 double *w2 = &v2[Nvcd * isite];
563 const double *w1 = &v1[Nvcd * isite];
566 for (
int it = 0; it <
m_Mt; ++it) {
567 for (
int iz = 0; iz <
m_Mz; ++iz) {
568 for (
int iy = 0; iy <
m_Ny; ++iy) {
569 int iyzt = iy + m_Ny * (iz +
m_Nz * it);
570 int Leo = ieo + (1 - 2 * ieo) *
m_Leo[iyzt0 + iyzt];
572 for (
int ix = Meo; ix <
m_Nx2; ++ix) {
573 int is = ix + m_Nx2 * iyzt;
575 int in = Nvcd * (is -
Meo);
578 for (
int ic = 0; ic <
m_Nc; ++ic) {
579 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
580 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
581 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
582 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
585 for (
int ic = 0; ic <
m_Nc; ++ic) {
588 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
589 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
590 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
591 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
593 w2[2 * ic + id1 + iv] += wt1r;
594 w2[2 * ic + 1 + id1 + iv] += wt1i;
595 w2[2 * ic + id2 + iv] += wt2r;
596 w2[2 * ic + 1 + id2 + iv] += wt2i;
597 w2[2 * ic + id3 + iv] += -wt2i;
598 w2[2 * ic + 1 + id3 + iv] += +wt2r;
599 w2[2 * ic + id4 + iv] += -wt1i;
600 w2[2 * ic + 1 + id4 + iv] += +wt1r;
611 int itask,
double *vcp1,
const double *v1,
int ieo)
613 int Nvc2 = 2 *
m_Nvc;
615 int Nvcd2 = Nvcd / 2;
624 int isite =
m_arg[itask].isite;
625 int isite_cp =
m_arg[itask].isite_cpy;
629 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
630 const double *w1 = &v1[Nvcd * isite];
636 for (
int it = 0; it <
m_Mt; ++it) {
637 for (
int iz = 0; iz <
m_Mz; ++iz) {
638 for (
int ix = 0; ix <
m_Nx2; ++ix) {
639 int is = ix + m_Nx2 * (iy +
m_Ny * (iz +
m_Nz * it));
640 int is2 = ix + m_Nx2 * (iz + m_Mz * it);
642 int ix1 = Nvc2 * is2;
643 int ix2 = ix1 +
m_Nvc;
645 for (
int ic = 0; ic <
m_Nc; ++ic) {
646 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
647 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
648 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
649 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
661 int itask,
double *v2,
const double *vcp2,
int ieo)
663 int Nvc2 = 2 *
m_Nvc;
665 int Nvcd2 = Nvcd / 2;
674 double wt1r, wt1i, wt2r, wt2i;
676 int isite =
m_arg[itask].isite;
677 int isite_cp =
m_arg[itask].isite_cpy;
679 double *w2 = &v2[Nvcd * isite];
682 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
688 for (
int it = 0; it <
m_Mt; ++it) {
689 for (
int iz = 0; iz <
m_Mz; ++iz) {
690 for (
int ix = 0; ix <
m_Nx2; ++ix) {
691 int is = ix + m_Nx2 * (iy +
m_Ny * (iz +
m_Nz * it));
692 int is2 = ix + m_Nx2 * (iz + m_Mz * it);
695 int ix1 = Nvc2 * is2;
696 int ix2 = ix1 +
m_Nvc;
698 for (
int ic = 0; ic <
m_Nc; ++ic) {
699 int ic2 = ic *
m_Nvc;
701 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
702 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
703 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
704 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
706 w2[2 * ic + id1 + iv] += wt1r;
707 w2[2 * ic + 1 + id1 + iv] += wt1i;
708 w2[2 * ic + id2 + iv] += wt2r;
709 w2[2 * ic + 1 + id2 + iv] += wt2i;
710 w2[2 * ic + id3 + iv] += -wt2r;
711 w2[2 * ic + 1 + id3 + iv] += -wt2i;
712 w2[2 * ic + id4 + iv] += wt1r;
713 w2[2 * ic + 1 + id4 + iv] += wt1i;
723 int itask,
double *v2,
const double *v1,
int ieo)
735 double wt1r, wt1i, wt2r, wt2i;
737 int isite =
m_arg[itask].isite;
739 double *w2 = &v2[Nvcd * isite];
740 const double *w1 = &v1[Nvcd * isite];
743 for (
int it = 0; it <
m_Mt; ++it) {
744 for (
int iz = 0; iz <
m_Mz; ++iz) {
745 for (
int iy = 0; iy <
m_Ny - 1; ++iy) {
746 for (
int ix = 0; ix <
m_Nx2; ++ix) {
747 int is = ix + m_Nx2 * (iy + m_Ny * (iz +
m_Nz * it));
749 int in = Nvcd * (is +
m_Nx2);
752 for (
int ic = 0; ic <
m_Nc; ++ic) {
753 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
754 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
755 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
756 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
759 for (
int ic = 0; ic <
m_Nc; ++ic) {
760 int ic2 = ic *
m_Nvc;
762 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
763 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
764 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
765 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
767 w2[2 * ic + id1 + iv] += wt1r;
768 w2[2 * ic + 1 + id1 + iv] += wt1i;
769 w2[2 * ic + id2 + iv] += wt2r;
770 w2[2 * ic + 1 + id2 + iv] += wt2i;
771 w2[2 * ic + id3 + iv] += -wt2r;
772 w2[2 * ic + 1 + id3 + iv] += -wt2i;
773 w2[2 * ic + id4 + iv] += wt1r;
774 w2[2 * ic + 1 + id4 + iv] += wt1i;
785 int itask,
double *vcp1,
const double *v1,
int ieo)
787 int Nvc2 = 2 *
m_Nvc;
789 int Nvcd2 = Nvcd / 2;
798 int isite =
m_arg[itask].isite;
799 int isite_cp =
m_arg[itask].isite_cpy;
803 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
804 const double *w1 = &v1[Nvcd * isite];
811 for (
int it = 0; it <
m_Mt; ++it) {
812 for (
int iz = 0; iz <
m_Mz; ++iz) {
813 for (
int ix = 0; ix <
m_Nx2; ++ix) {
814 int is = ix + m_Nx2 * (iy +
m_Ny * (iz +
m_Nz * it));
815 int is2 = ix + m_Nx2 * (iz + m_Mz * it);
818 int ix1 = Nvc2 * is2;
819 int ix2 = ix1 +
m_Nvc;
821 for (
int ic = 0; ic <
m_Nc; ++ic) {
822 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
823 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
824 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
825 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
828 for (
int ic = 0; ic <
m_Nc; ++ic) {
830 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
831 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
832 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
833 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
845 int itask,
double *v2,
const double *vcp2,
int ieo)
847 int Nvc2 = 2 *
m_Nvc;
849 int Nvcd2 = Nvcd / 2;
859 double wt1r, wt1i, wt2r, wt2i;
861 int isite =
m_arg[itask].isite;
862 int isite_cp =
m_arg[itask].isite_cpy;
864 double *w2 = &v2[Nvcd * isite];
867 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
872 for (
int it = 0; it <
m_Mt; ++it) {
873 for (
int iz = 0; iz <
m_Mz; ++iz) {
874 for (
int ix = 0; ix <
m_Nx2; ++ix) {
875 int is = ix + m_Nx2 * (iy +
m_Ny * (iz +
m_Nz * it));
876 int is2 = ix + m_Nx2 * (iz + m_Mz * it);
878 int ix1 = Nvc2 * is2;
879 int ix2 = ix1 +
m_Nvc;
881 for (
int ic = 0; ic <
m_Nc; ++ic) {
883 int ici = 2 * ic + 1;
884 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
885 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
886 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
887 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
888 w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
889 w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
890 w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
891 w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
901 int itask,
double *v2,
const double *v1,
int ieo)
913 double wt1r, wt1i, wt2r, wt2i;
915 int isite =
m_arg[itask].isite;
917 double *w2 = &v2[Nvcd * isite];
918 const double *w1 = &v1[Nvcd * isite];
921 for (
int it = 0; it <
m_Mt; ++it) {
922 for (
int iz = 0; iz <
m_Mz; ++iz) {
923 for (
int iy = 1; iy <
m_Ny; ++iy) {
924 for (
int ix = 0; ix <
m_Nx2; ++ix) {
925 int is = ix + m_Nx2 * (iy + m_Ny * (iz +
m_Nz * it));
927 int in = Nvcd * (is -
m_Nx2);
930 for (
int ic = 0; ic <
m_Nc; ++ic) {
931 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
932 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
933 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
934 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
937 for (
int ic = 0; ic <
m_Nc; ++ic) {
939 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
940 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
941 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
942 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
944 w2[ic2 + id1 + iv] += wt1r;
945 w2[ic2 + 1 + id1 + iv] += wt1i;
946 w2[ic2 + id2 + iv] += wt2r;
947 w2[ic2 + 1 + id2 + iv] += wt2i;
948 w2[ic2 + id3 + iv] += wt2r;
949 w2[ic2 + 1 + id3 + iv] += wt2i;
950 w2[ic2 + id4 + iv] += -wt1r;
951 w2[ic2 + 1 + id4 + iv] += -wt1i;
962 int itask,
double *vcp1,
const double *v1,
int ieo)
964 int Nvc2 = 2 *
m_Nvc;
966 int Nvcd2 = Nvcd / 2;
975 int isite =
m_arg[itask].isite;
976 int isite_cp =
m_arg[itask].isite_cpz;
980 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
981 const double *w1 = &v1[Nvcd * isite];
985 if (
m_arg[itask].kz0 == 1) {
988 for (
int it = 0; it <
m_Mt; ++it) {
989 for (
int ixy = 0; ixy < Nxy; ++ixy) {
990 int is = ixy + Nxy * (iz +
m_Nz * it);
991 int is2 = ixy + Nxy * it;
994 int ix1 = Nvc2 * is2;
995 int ix2 = ix1 +
m_Nvc;
997 for (
int ic = 0; ic <
m_Nc; ++ic) {
998 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
999 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
1000 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
1001 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
1013 int itask,
double *v2,
const double *vcp2,
int ieo)
1015 int Nvc2 = 2 *
m_Nvc;
1017 int Nvcd2 = Nvcd / 2;
1021 int id3 =
m_Nvc * 2;
1022 int id4 =
m_Nvc * 3;
1026 double wt1r, wt1i, wt2r, wt2i;
1028 int isite =
m_arg[itask].isite;
1029 int isite_cp =
m_arg[itask].isite_cpz;
1031 double *w2 = &v2[Nvcd * isite];
1034 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1039 if (
m_arg[itask].kz1 == 1) {
1042 for (
int it = 0; it <
m_Mt; ++it) {
1043 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1044 int is = ixy + Nxy * (iz +
m_Nz * it);
1045 int is2 = ixy + Nxy * it;
1047 int ig =
m_Ndf * is;
1048 int ix1 = Nvc2 * is2;
1049 int ix2 = ix1 +
m_Nvc;
1051 for (
int ic = 0; ic <
m_Nc; ++ic) {
1052 int ic2 = ic *
m_Nvc;
1054 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1055 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1056 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1057 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1059 w2[2 * ic + id1 + iv] += wt1r;
1060 w2[2 * ic + 1 + id1 + iv] += wt1i;
1061 w2[2 * ic + id2 + iv] += wt2r;
1062 w2[2 * ic + 1 + id2 + iv] += wt2i;
1063 w2[2 * ic + id3 + iv] += wt1i;
1064 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1065 w2[2 * ic + id4 + iv] += -wt2i;
1066 w2[2 * ic + 1 + id4 + iv] += wt2r;
1076 int itask,
double *v2,
const double *v1,
int ieo)
1082 int id3 =
m_Nvc * 2;
1083 int id4 =
m_Nvc * 3;
1088 double wt1r, wt1i, wt2r, wt2i;
1090 int isite =
m_arg[itask].isite;
1092 double *w2 = &v2[Nvcd * isite];
1093 const double *w1 = &v1[Nvcd * isite];
1096 int kz1 =
m_arg[itask].kz1;
1099 for (
int it = 0; it <
m_Mt; ++it) {
1100 for (
int iz = 0; iz <
m_Mz - kz1; ++iz) {
1101 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1102 int is = ixy + Nxy * (iz +
m_Nz * it);
1104 int in = Nvcd * (is + Nxy);
1105 int ig =
m_Ndf * is;
1107 for (
int ic = 0; ic <
m_Nc; ++ic) {
1108 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1109 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1110 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1111 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1114 for (
int ic = 0; ic <
m_Nc; ++ic) {
1115 int ic2 = ic *
m_Nvc;
1117 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1118 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1119 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1120 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1122 w2[2 * ic + id1 + iv] += wt1r;
1123 w2[2 * ic + 1 + id1 + iv] += wt1i;
1124 w2[2 * ic + id2 + iv] += wt2r;
1125 w2[2 * ic + 1 + id2 + iv] += wt2i;
1126 w2[2 * ic + id3 + iv] += wt1i;
1127 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1128 w2[2 * ic + id4 + iv] += -wt2i;
1129 w2[2 * ic + 1 + id4 + iv] += wt2r;
1139 int itask,
double *vcp1,
const double *v1,
int ieo)
1141 int Nvc2 = 2 *
m_Nvc;
1143 int Nvcd2 = Nvcd / 2;
1147 int id3 =
m_Nvc * 2;
1148 int id4 =
m_Nvc * 3;
1152 int isite =
m_arg[itask].isite;
1153 int isite_cp =
m_arg[itask].isite_cpz;
1157 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1158 const double *w1 = &v1[Nvcd * isite];
1163 if (
m_arg[itask].kz1 == 1) {
1166 for (
int it = 0; it <
m_Mt; ++it) {
1167 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1168 int is = ixy + Nxy * (iz +
m_Nz * it);
1169 int is2 = ixy + Nxy * it;
1171 int ig =
m_Ndf * is;
1172 int ix1 = Nvc2 * is2;
1173 int ix2 = ix1 +
m_Nvc;
1175 for (
int ic = 0; ic <
m_Nc; ++ic) {
1176 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1177 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1178 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1179 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1182 for (
int ic = 0; ic <
m_Nc; ++ic) {
1184 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1185 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1186 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1187 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1199 int itask,
double *v2,
const double *vcp2,
int ieo)
1201 int Nvc2 = 2 *
m_Nvc;
1203 int Nvcd2 = Nvcd / 2;
1207 int id3 =
m_Nvc * 2;
1208 int id4 =
m_Nvc * 3;
1213 double wt1r, wt1i, wt2r, wt2i;
1215 int isite =
m_arg[itask].isite;
1216 int isite_cp =
m_arg[itask].isite_cpz;
1218 double *w2 = &v2[Nvcd * isite];
1221 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1225 if (
m_arg[itask].kz0 == 1) {
1229 for (
int it = 0; it <
m_Mt; ++it) {
1230 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1231 int is = ixy + Nxy * (iz +
m_Nz * it);
1232 int is2 = ixy + Nxy * it;
1234 int ix1 = Nvc2 * is2;
1235 int ix2 = ix1 +
m_Nvc;
1237 for (
int ic = 0; ic <
m_Nc; ++ic) {
1239 int ici = 2 * ic + 1;
1240 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1241 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1242 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1243 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1244 w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1245 w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1246 w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1247 w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1257 int itask,
double *v2,
const double *v1,
int ieo)
1263 int id3 =
m_Nvc * 2;
1264 int id4 =
m_Nvc * 3;
1269 double wt1r, wt1i, wt2r, wt2i;
1271 int isite =
m_arg[itask].isite;
1273 double *w2 = &v2[Nvcd * isite];
1274 const double *w1 = &v1[Nvcd * isite];
1277 int kz0 =
m_arg[itask].kz0;
1280 for (
int it = 0; it <
m_Mt; ++it) {
1281 for (
int iz = kz0; iz <
m_Mz; ++iz) {
1282 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1283 int is = ixy + Nxy * (iz +
m_Nz * it);
1285 int in = Nvcd * (is - Nxy);
1286 int ig =
m_Ndf * (is - Nxy);
1288 for (
int ic = 0; ic <
m_Nc; ++ic) {
1289 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1290 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1291 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1292 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1295 for (
int ic = 0; ic <
m_Nc; ++ic) {
1297 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1298 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1299 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1300 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1302 w2[ic2 + id1 + iv] += wt1r;
1303 w2[ic2 + 1 + id1 + iv] += wt1i;
1304 w2[ic2 + id2 + iv] += wt2r;
1305 w2[ic2 + 1 + id2 + iv] += wt2i;
1306 w2[ic2 + id3 + iv] += -wt1i;
1307 w2[ic2 + 1 + id3 + iv] += wt1r;
1308 w2[ic2 + id4 + iv] += wt2i;
1309 w2[ic2 + 1 + id4 + iv] += -wt2r;
1319 int itask,
double *vcp1,
const double *v1,
int ieo)
1321 int Nvc2 = 2 *
m_Nvc;
1323 int Nvcd2 = Nvcd / 2;
1327 int id3 =
m_Nvc * 2;
1328 int id4 =
m_Nvc * 3;
1332 int isite =
m_arg[itask].isite;
1333 int isite_cp =
m_arg[itask].isite_cpt;
1337 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1338 const double *w1 = &v1[Nvcd * isite];
1342 if (
m_arg[itask].kt0 == 1) {
1345 for (
int iz = 0; iz <
m_Mz; ++iz) {
1346 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1347 int is = ixy + Nxy * (iz +
m_Nz * it);
1348 int is2 = ixy + Nxy * iz;
1351 int ix1 = Nvc2 * is2;
1352 int ix2 = ix1 +
m_Nvc;
1354 for (
int ic = 0; ic <
m_Nc; ++ic) {
1355 w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1356 w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1357 w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1358 w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1370 int itask,
double *v2,
const double *vcp2,
int ieo)
1372 int Nvc2 = 2 *
m_Nvc;
1374 int Nvcd2 = Nvcd / 2;
1378 int id3 =
m_Nvc * 2;
1379 int id4 =
m_Nvc * 3;
1383 double wt1r, wt1i, wt2r, wt2i;
1385 int isite =
m_arg[itask].isite;
1386 int isite_cp =
m_arg[itask].isite_cpt;
1388 double *w2 = &v2[Nvcd * isite];
1391 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1396 if (
m_arg[itask].kt1 == 1) {
1399 for (
int iz = 0; iz <
m_Mz; ++iz) {
1400 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1401 int is = ixy + Nxy * (iz +
m_Nz * it);
1402 int is2 = ixy + Nxy * iz;
1404 int ig =
m_Ndf * is;
1405 int ix1 = Nvc2 * is2;
1406 int ix2 = ix1 +
m_Nvc;
1408 for (
int ic = 0; ic <
m_Nc; ++ic) {
1409 int ic2 = ic *
m_Nvc;
1411 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1412 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1413 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1414 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1416 w2[2 * ic + id3 + iv] += wt1r;
1417 w2[2 * ic + 1 + id3 + iv] += wt1i;
1418 w2[2 * ic + id4 + iv] += wt2r;
1419 w2[2 * ic + 1 + id4 + iv] += wt2i;
1429 int itask,
double *v2,
const double *v1,
int ieo)
1435 int id3 =
m_Nvc * 2;
1436 int id4 =
m_Nvc * 3;
1441 double wt1r, wt1i, wt2r, wt2i;
1443 int isite =
m_arg[itask].isite;
1445 double *w2 = &v2[Nvcd * isite];
1446 const double *w1 = &v1[Nvcd * isite];
1449 int kt1 =
m_arg[itask].kt1;
1451 int Nxyz = Nxy *
m_Nz;
1453 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1454 for (
int iz = 0; iz <
m_Mz; ++iz) {
1455 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1456 int is = ixy + Nxy * (iz + m_Nz * it);
1458 int in = Nvcd * (is + Nxyz);
1459 int ig =
m_Ndf * is;
1461 for (
int ic = 0; ic <
m_Nc; ++ic) {
1462 vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1463 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1464 vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1465 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1468 for (
int ic = 0; ic <
m_Nc; ++ic) {
1469 int ic2 = ic *
m_Nvc;
1471 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1472 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1473 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1474 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1476 w2[2 * ic + id3 + iv] += wt1r;
1477 w2[2 * ic + 1 + id3 + iv] += wt1i;
1478 w2[2 * ic + id4 + iv] += wt2r;
1479 w2[2 * ic + 1 + id4 + iv] += wt2i;
1489 int itask,
double *vcp1,
const double *v1,
int ieo)
1491 int Nvc2 = 2 *
m_Nvc;
1493 int Nvcd2 = Nvcd / 2;
1497 int id3 =
m_Nvc * 2;
1498 int id4 =
m_Nvc * 3;
1502 int isite =
m_arg[itask].isite;
1503 int isite_cp =
m_arg[itask].isite_cpt;
1507 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1508 const double *w1 = &v1[Nvcd * isite];
1513 if (
m_arg[itask].kt1 == 1) {
1516 for (
int iz = 0; iz <
m_Mz; ++iz) {
1517 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1518 int is = ixy + Nxy * (iz +
m_Nz * it);
1519 int is2 = ixy + Nxy * iz;
1521 int ig =
m_Ndf * is;
1522 int ix1 = Nvc2 * is2;
1523 int ix2 = ix1 +
m_Nvc;
1525 for (
int ic = 0; ic <
m_Nc; ++ic) {
1526 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1527 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1528 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1529 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1532 for (
int ic = 0; ic <
m_Nc; ++ic) {
1534 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1535 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1536 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1537 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1549 int itask,
double *v2,
const double *vcp2,
int ieo)
1551 int Nvc2 = 2 *
m_Nvc;
1553 int Nvcd2 = Nvcd / 2;
1557 int id3 =
m_Nvc * 2;
1558 int id4 =
m_Nvc * 3;
1563 double wt1r, wt1i, wt2r, wt2i;
1565 int isite =
m_arg[itask].isite;
1566 int isite_cp =
m_arg[itask].isite_cpt;
1568 double *w2 = &v2[Nvcd * isite];
1571 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1575 if (
m_arg[itask].kt0 == 1) {
1578 for (
int iz = 0; iz <
m_Mz; ++iz) {
1579 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1580 int is = ixy + Nxy * (iz +
m_Nz * it);
1581 int is2 = ixy + Nxy * iz;
1583 int ix1 = Nvc2 * is2;
1584 int ix2 = ix1 +
m_Nvc;
1586 for (
int ic = 0; ic <
m_Nc; ++ic) {
1588 int ici = 2 * ic + 1;
1589 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1590 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1591 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1592 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1602 int itask,
double *v2,
const double *v1,
int ieo)
1608 int id3 =
m_Nvc * 2;
1609 int id4 =
m_Nvc * 3;
1614 double wt1r, wt1i, wt2r, wt2i;
1616 int isite =
m_arg[itask].isite;
1618 double *w2 = &v2[Nvcd * isite];
1619 const double *w1 = &v1[Nvcd * isite];
1622 int kt0 =
m_arg[itask].kt0;
1624 int Nxyz = Nxy *
m_Nz;
1626 for (
int it = kt0; it <
m_Mt; ++it) {
1627 for (
int iz = 0; iz <
m_Mz; ++iz) {
1628 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1629 int is = ixy + Nxy * (iz + m_Nz * it);
1631 int in = Nvcd * (is - Nxyz);
1632 int ig =
m_Ndf * (is - Nxyz);
1634 for (
int ic = 0; ic <
m_Nc; ++ic) {
1635 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1636 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1637 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1638 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1641 for (
int ic = 0; ic <
m_Nc; ++ic) {
1643 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1644 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1645 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1646 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1648 w2[ic2 + id1 + iv] += wt1r;
1649 w2[ic2 + 1 + id1 + iv] += wt1i;
1650 w2[ic2 + id2 + iv] += wt2r;
1651 w2[ic2 + 1 + id2 + iv] += wt2i;
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tp1_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: tasks with kt0 == 1 combine the id1+id3 and id2+id4
// components of v1, scale them by bc2 and store the two results into the
// buffer obtained from m_bw_send[idir].
// Nvcd, Nxy, id1, id2, idir, bc2, it, in and the declaration of w2 are on
// lines that are not part of this listing.
1661 int itask,
// vcp1 is not referenced in the visible lines; the destination comes from
// m_bw_send[idir] instead.
double *vcp1,
const double *v1,
// ieo is not referenced in any of the visible lines.
int ieo)
// Number of values written per site: two blocks of m_Nvc.
1663 int Nvc2 = 2 *
m_Nvc;
1665 int Nvcd2 = Nvcd / 2;
1669 int id3 =
m_Nvc * 2;
1670 int id4 =
m_Nvc * 3;
1674 int isite =
m_arg[itask].isite;
// isite_cpt is assigned in the task setup as ith_z * m_Mz * Nxy2.
1675 int isite_cp =
m_arg[itask].isite_cpt;
// Right-hand side of the destination pointer: the send buffer at this task's
// byte offset (the left-hand side is on a line not shown here).
1679 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1680 const double *w1 = &v1[Nvcd * isite];
// Only tasks flagged kt0 == 1 (ith_t == 0 in the task setup) fill the buffer.
1684 if (
m_arg[itask].kt0 == 1) {
1687 for (
int iz = 0; iz <
m_Mz; ++iz) {
1688 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// `is` indexes the full task block, `is2` the (iz, ixy) slice in the buffer.
1689 int is = ixy + Nxy * (iz +
m_Nz * it);
1690 int is2 = ixy + Nxy * iz;
// Buffer offsets of the first and second stored block for this site.
1693 int ix1 = Nvc2 * is2;
1694 int ix2 = ix1 +
m_Nvc;
// Real (2*ic) and imaginary (2*ic+1) entries are handled separately.
1696 for (
int ic = 0; ic <
m_Nc; ++ic) {
1697 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1698 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1699 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1700 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tp2_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: tasks with kt1 == 1 read two blocks per site from the
// buffer obtained from m_bw_recv[idir], apply mult_uv_r/_i with u, and add
// the results to all four component blocks (id1..id4) of v2.
// Nvcd, Nxy, id1, id2, idir, it, iv, u and the declaration of w1 are on
// lines that are not part of this listing.
1712 int itask,
double *v2,
// vcp2 is not referenced in the visible lines; the source comes from
// m_bw_recv[idir] instead.
const double *vcp2,
// ieo is not referenced in any of the visible lines.
int ieo)
1714 int Nvc2 = 2 *
m_Nvc;
1716 int Nvcd2 = Nvcd / 2;
1720 int id3 =
m_Nvc * 2;
1721 int id4 =
m_Nvc * 3;
1725 double wt1r, wt1i, wt2r, wt2i;
1727 int isite =
m_arg[itask].isite;
// isite_cpt is assigned in the task setup as ith_z * m_Mz * Nxy2.
1728 int isite_cp =
m_arg[itask].isite_cpt;
1730 double *w2 = &v2[Nvcd * isite];
// Right-hand side of the source pointer: the receive buffer at this task's
// byte offset (the left-hand side is on a line not shown here).
1733 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt1 == 1 (ith_t == m_Ntask_t - 1 in the task setup)
// consume the buffer.
1738 if (
m_arg[itask].kt1 == 1) {
1741 for (
int iz = 0; iz <
m_Mz; ++iz) {
1742 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// `is` indexes the full task block, `is2` the (iz, ixy) slice in the buffer.
1743 int is = ixy + Nxy * (iz +
m_Nz * it);
1744 int is2 = ixy + Nxy * iz;
// u is indexed at the local site `is`.
1746 int ig =
m_Ndf * is;
1747 int ix1 = Nvc2 * is2;
1748 int ix2 = ix1 +
m_Nvc;
1750 for (
int ic = 0; ic <
m_Nc; ++ic) {
// u is stepped by m_Nvc per ic.
1751 int ic2 = ic *
m_Nvc;
1753 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1754 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1755 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1756 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
// The same products are added to blocks id1/id2 and again to id3/id4.
1758 w2[2 * ic + id1 + iv] += wt1r;
1759 w2[2 * ic + 1 + id1 + iv] += wt1i;
1760 w2[2 * ic + id2 + iv] += wt2r;
1761 w2[2 * ic + 1 + id2 + iv] += wt2i;
1762 w2[2 * ic + id3 + iv] += wt1r;
1763 w2[2 * ic + 1 + id3 + iv] += wt1i;
1764 w2[2 * ic + id4 + iv] += wt2r;
1765 w2[2 * ic + 1 + id4 + iv] += wt2i;
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tpb_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: for every site handled here, read the site one Nxyz
// stride above it, form id1+id3 and id2+id4, apply mult_uv_r/_i with u taken
// at the local site, and add the results to all four blocks of v2.
// Nvcd, Nxy, id1, id2, iv, vt1, vt2 and u are declared on lines that are not
// part of this listing.
1775 int itask,
double *v2,
const double *v1,
// ieo is not referenced in any of the visible lines.
int ieo)
1781 int id3 =
m_Nvc * 2;
1782 int id4 =
m_Nvc * 3;
1787 double wt1r, wt1i, wt2r, wt2i;
// Both vectors are addressed relative to this task's first site.
1789 int isite =
m_arg[itask].isite;
1791 double *w2 = &v2[Nvcd * isite];
1792 const double *w1 = &v1[Nvcd * isite];
// kt1 is set to 1 by the task setup when ith_t == m_Ntask_t - 1.
1795 int kt1 =
m_arg[itask].kt1;
// Index distance between consecutive `it` values (see the formula for `is`).
1797 int Nxyz = Nxy *
m_Nz;
// Stopping at m_Mt - kt1 skips the last slice for tasks with kt1 == 1.
// NOTE(review): presumably that slice is served by a companion kernel that
// reads a receive buffer -- confirm.
1799 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1800 for (
int iz = 0; iz <
m_Mz; ++iz) {
1801 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1802 int is = ixy + Nxy * (iz + m_Nz * it);
// Neighbour at it + 1 for the vector; u is taken at the local site.
1804 int in = Nvcd * (is + Nxyz);
1805 int ig =
m_Ndf * is;
// Build the two temporaries from block sums of the neighbour.
1807 for (
int ic = 0; ic <
m_Nc; ++ic) {
1808 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1809 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1810 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1811 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1814 for (
int ic = 0; ic <
m_Nc; ++ic) {
// u is stepped by m_Nvc per ic.
1815 int ic2 = ic *
m_Nvc;
1817 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1818 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1819 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1820 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
// The same products are added to blocks id1/id2 and again to id3/id4.
1822 w2[2 * ic + id1 + iv] += wt1r;
1823 w2[2 * ic + 1 + id1 + iv] += wt1i;
1824 w2[2 * ic + id2 + iv] += wt2r;
1825 w2[2 * ic + 1 + id2 + iv] += wt2i;
1826 w2[2 * ic + id3 + iv] += wt1r;
1827 w2[2 * ic + 1 + id3 + iv] += wt1i;
1828 w2[2 * ic + id4 + iv] += wt2r;
1829 w2[2 * ic + 1 + id4 + iv] += wt2i;
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tm1_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: tasks with kt1 == 1 form id1-id3 and id2-id4 from v1,
// apply mult_udagv_r/_i with u, and store the two results per site into the
// buffer obtained from m_fw_send[idir].
// Nvcd, Nxy, id1, id2, idir, it, in, icr, vt1, vt2, u and the declaration of
// w2 are on lines that are not part of this listing.
1839 int itask,
// vcp1 is not referenced in the visible lines; the destination comes from
// m_fw_send[idir] instead.
double *vcp1,
const double *v1,
// ieo is not referenced in any of the visible lines.
int ieo)
// Number of values written per site: two blocks of m_Nvc.
1841 int Nvc2 = 2 *
m_Nvc;
1843 int Nvcd2 = Nvcd / 2;
1847 int id3 =
m_Nvc * 2;
1848 int id4 =
m_Nvc * 3;
1852 int isite =
m_arg[itask].isite;
// isite_cpt is assigned in the task setup as ith_z * m_Mz * Nxy2.
1853 int isite_cp =
m_arg[itask].isite_cpt;
// Right-hand side of the destination pointer: the send buffer at this task's
// byte offset (the left-hand side is on a line not shown here).
1857 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1858 const double *w1 = &v1[Nvcd * isite];
// Only tasks flagged kt1 == 1 (ith_t == m_Ntask_t - 1 in the task setup)
// fill the buffer.
1863 if (
m_arg[itask].kt1 == 1) {
1866 for (
int iz = 0; iz <
m_Mz; ++iz) {
1867 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// `is` indexes the full task block, `is2` the (iz, ixy) slice in the buffer.
1868 int is = ixy + Nxy * (iz +
m_Nz * it);
1869 int is2 = ixy + Nxy * iz;
// u is indexed at the local site `is`.
1871 int ig =
m_Ndf * is;
1872 int ix1 = Nvc2 * is2;
1873 int ix2 = ix1 +
m_Nvc;
// Build the two temporaries from block differences.
1875 for (
int ic = 0; ic <
m_Nc; ++ic) {
1876 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1877 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1878 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1879 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
// Results are stored (assigned, not accumulated) into the buffer.
1882 for (
int ic = 0; ic <
m_Nc; ++ic) {
1884 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1885 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1886 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1887 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tm2_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: tasks with kt0 == 1 read two blocks per site from the
// buffer obtained from m_fw_recv[idir], scale them by bc2, add them to the
// id1/id2 blocks of v2 and subtract them from the id3/id4 blocks.
// Nvcd, Nxy, id1, id2, idir, bc2, it, iv, icr and the declaration of w1 are
// on lines that are not part of this listing.
1899 int itask,
double *v2,
// vcp2 is not referenced in the visible lines; the source comes from
// m_fw_recv[idir] instead.
const double *vcp2,
// ieo is not referenced in any of the visible lines.
int ieo)
1901 int Nvc2 = 2 *
m_Nvc;
1903 int Nvcd2 = Nvcd / 2;
1907 int id3 =
m_Nvc * 2;
1908 int id4 =
m_Nvc * 3;
// These temporaries are not used in any of the visible lines.
1913 double wt1r, wt1i, wt2r, wt2i;
1915 int isite =
m_arg[itask].isite;
// isite_cpt is assigned in the task setup as ith_z * m_Mz * Nxy2.
1916 int isite_cp =
m_arg[itask].isite_cpt;
1918 double *w2 = &v2[Nvcd * isite];
// Right-hand side of the source pointer: the receive buffer at this task's
// byte offset (the left-hand side is on a line not shown here).
1921 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt0 == 1 (ith_t == 0 in the task setup) consume the buffer.
1925 if (
m_arg[itask].kt0 == 1) {
1928 for (
int iz = 0; iz <
m_Mz; ++iz) {
1929 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// `is` indexes the full task block, `is2` the (iz, ixy) slice in the buffer.
1930 int is = ixy + Nxy * (iz +
m_Nz * it);
1931 int is2 = ixy + Nxy * iz;
1933 int ix1 = Nvc2 * is2;
1934 int ix2 = ix1 +
m_Nvc;
1936 for (
int ic = 0; ic <
m_Nc; ++ic) {
// ici addresses the entry following 2 * ic (the odd slot of the pair).
1938 int ici = 2 * ic + 1;
// Blocks id1/id2 receive +bc2 * buffer, blocks id3/id4 receive -bc2 * buffer.
1939 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1940 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1941 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1942 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1943 w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1944 w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1945 w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1946 w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably mult_tmb_chiral_thread, judging by the signature
// index at the end of the listing -- confirm against the full file.
// Visible behavior: for every site handled here, read the site one Nxyz
// stride below it, form id1-id3 and id2-id4, apply mult_udagv_r/_i with u
// taken at that same neighbour, add the results to the id1/id2 blocks of v2
// and subtract them from the id3/id4 blocks.
// Nvcd, Nxy, id1, id2, iv, ic2, vt1, vt2 and u are declared on lines that are
// not part of this listing.
1956 int itask,
double *v2,
const double *v1,
// ieo is not referenced in any of the visible lines.
int ieo)
1962 int id3 =
m_Nvc * 2;
1963 int id4 =
m_Nvc * 3;
1968 double wt1r, wt1i, wt2r, wt2i;
// Both vectors are addressed relative to this task's first site.
1970 int isite =
m_arg[itask].isite;
1972 double *w2 = &v2[Nvcd * isite];
1973 const double *w1 = &v1[Nvcd * isite];
// kt0 is set to 1 by the task setup when ith_t == 0.
1976 int kt0 =
m_arg[itask].kt0;
// Index distance between consecutive `it` values (see the formula for `is`).
1978 int Nxyz = Nxy *
m_Nz;
// Starting at it = kt0 skips the it = 0 slice for tasks with kt0 == 1.
// NOTE(review): presumably that slice is served by a companion kernel that
// reads a receive buffer -- confirm.
1980 for (
int it = kt0; it <
m_Mt; ++it) {
1981 for (
int iz = 0; iz <
m_Mz; ++iz) {
1982 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1983 int is = ixy + Nxy * (iz + m_Nz * it);
// Offsets of the neighbour at it - 1, for the vector (in) and for u (ig).
1985 int in = Nvcd * (is - Nxyz);
1986 int ig =
m_Ndf * (is - Nxyz);
// Build the two temporaries from block differences of the neighbour.
1988 for (
int ic = 0; ic <
m_Nc; ++ic) {
1989 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1990 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1991 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1992 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
1995 for (
int ic = 0; ic <
m_Nc; ++ic) {
1997 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1998 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1999 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
2000 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
// Blocks id1/id2 receive +product, blocks id3/id4 receive -product.
2002 w2[ic2 + id1 + iv] += wt1r;
2003 w2[ic2 + 1 + id1 + iv] += wt1i;
2004 w2[ic2 + id2 + iv] += wt2r;
2005 w2[ic2 + 1 + id2 + iv] += wt2i;
2006 w2[ic2 + id3 + iv] -= wt1r;
2007 w2[ic2 + 1 + id3 + iv] -= wt1i;
2008 w2[ic2 + id4 + iv] -= wt2r;
2009 w2[ic2 + 1 + id4 + iv] -= wt2i;
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably gm5_dirac_thread, judging by the signature index
// at the end of the listing -- confirm against the full file.
// Visible behavior: for every site of the task's block, copy v1 into v2 with
// the component blocks exchanged pairwise: id1 <-> id3 and id2 <-> id4.
// Nvcd, Nxy, id1 and id2 are declared on lines that are not part of this
// listing.
2019 int itask,
double *v2,
const double *v1)
2026 int id3 =
m_Nvc * 2;
2027 int id4 =
m_Nvc * 3;
// Both vectors are addressed relative to this task's first site.
2029 int isite =
m_arg[itask].isite;
2030 double *w2 = &v2[Nvcd * isite];
2031 const double *w1 = &v1[Nvcd * isite];
// Unlike the bulk hopping kernels, the whole it range is traversed.
2033 for (
int it = 0; it <
m_Mt; ++it) {
2034 for (
int iz = 0; iz <
m_Mz; ++iz) {
2035 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Start of this site's Nvcd values.
2036 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
2037 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
// Output block k is taken from the input block two positions away.
2038 w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2039 w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2040 w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2041 w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
// Fragment of a per-task kernel, beginning at its parameter list; the line
// that names the function is absent from this listing.
// NOTE(review): presumably gm5_chiral_thread, judging by the signature index
// at the end of the listing -- confirm against the full file.
// Visible behavior: for every site of the task's block, copy the id1/id2
// blocks of v1 into v2 unchanged and the id3/id4 blocks with inverted sign.
// Nvcd, Nxy, id1 and id2 are declared on lines that are not part of this
// listing.
2051 int itask,
double *v2,
const double *v1)
2058 int id3 =
m_Nvc * 2;
2059 int id4 =
m_Nvc * 3;
// Both vectors are addressed relative to this task's first site.
2061 int isite =
m_arg[itask].isite;
2062 double *w2 = &v2[Nvcd * isite];
2063 const double *w1 = &v1[Nvcd * isite];
// The whole it range is traversed.
2065 for (
int it = 0; it <
m_Mt; ++it) {
2066 for (
int iz = 0; iz <
m_Mz; ++iz) {
2067 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Start of this site's Nvcd values.
2068 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
2069 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
// First two blocks pass through; last two blocks are negated.
2070 w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2071 w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2072 w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2073 w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
// Fragment of a per-task kernel; its signature and the declarations of Nvcd,
// Nxy, id1, id2, itask and v1 are on lines that are not part of this listing.
// NOTE(review): looks like an in-place variant of the block-exchange kernel
// (id1 <-> id3, id2 <-> id4); its name is not visible here -- confirm.
// Visible behavior: for every site of the task's block, swap the id1 block
// with the id3 block and the id2 block with the id4 block inside v1 itself.
2090 int id3 =
m_Nvc * 2;
2091 int id4 =
m_Nvc * 3;
// The vector is addressed relative to this task's first site and is modified
// through w1.
2093 int isite =
m_arg[itask].isite;
2094 double *w1 = &v1[Nvcd * isite];
2096 for (
int it = 0; it <
m_Mt; ++it) {
2097 for (
int iz = 0; iz <
m_Mz; ++iz) {
2098 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Start of this site's Nvcd values.
2099 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
2100 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
// Save the id1/id2 entries before they are overwritten, then finish the swap.
2101 double wt1 = w1[ivc + id1 + iv];
2102 double wt2 = w1[ivc + id2 + iv];
2103 w1[ivc + id1 + iv] = w1[ivc + id3 + iv];
2104 w1[ivc + id2 + iv] = w1[ivc + id4 + iv];
2105 w1[ivc + id3 + iv] = wt1;
2106 w1[ivc + id4 + iv] = wt2;
// Fragment of a per-task kernel; its signature and the declarations of Nvcd,
// Nxy, itask and v1 are on lines that are not part of this listing.
// NOTE(review): looks like an in-place variant of the sign-flip kernel; its
// name is not visible here -- confirm.
// Visible behavior: for every site of the task's block, negate the id3 and
// id4 blocks of v1 in place; the other entries are not touched by the
// visible lines.
2123 int id3 =
m_Nvc * 2;
2124 int id4 =
m_Nvc * 3;
// The vector is addressed relative to this task's first site and is modified
// through w1.
2126 int isite =
m_arg[itask].isite;
2127 double *w1 = &v1[Nvcd * isite];
2129 for (
int it = 0; it <
m_Mt; ++it) {
2130 for (
int iz = 0; iz <
m_Mz; ++iz) {
2131 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Start of this site's Nvcd values.
2132 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
2133 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
2134 w1[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2135 w1[ivc + id4 + iv] = -w1[ivc + id4 + iv];
std::vector< Channel * > m_fw_send
void mult_tm1_chiral_thread(int, double *, const double *, int)
void mult_tm1_dirac_thread(int, double *, const double *, int)
std::vector< double > m_boundary2
b.c. for each node.
const double * ptr(const int jin, const int site, const int jex) const
void mult_yp2_thread(int, double *, const double *, int)
void general(const char *format,...)
static const std::string class_name
void mult_tp2_dirac_thread(int, double *, const double *, int)
std::vector< mult_arg > m_arg
void mult_ypb_thread(int, double *, const double *, int)
void gm5_chiral_thread(int, double *, const double *)
void mult_yp1_thread(int, double *, const double *, int)
void Meo(Field &, const Field &, const int ieo)
void mult_tm2_chiral_thread(int, double *, const double *, int)
void mult_ym2_thread(int, double *, const double *, int)
void scal_thread(int, double *, double)
void mult_xm1_thread(int, double *, const double *, int)
void mult_zp2_thread(int, double *, const double *, int)
void mult_tp1_dirac_thread(int, double *, const double *, int)
void gm5_dirac_thread(int, double *, const double *)
void mult_xp2_thread(int, double *, const double *, int)
void mult_zp1_thread(int, double *, const double *, int)
Bridge::VerboseLevel m_vl
void mult_tpb_dirac_thread(int, double *, const double *, int)
void mult_zmb_thread(int, double *, const double *, int)
void mult_tp1_chiral_thread(int, double *, const double *, int)
void mult_zpb_thread(int, double *, const double *, int)
void mult_tm2_dirac_thread(int, double *, const double *, int)
void mult_xmb_thread(int, double *, const double *, int)
void mult_tpb_chiral_thread(int, double *, const double *, int)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
Field_G * m_U
dummy: pointing m_Ueo.
void crucial(const char *format,...)
std::vector< Channel * > m_bw_recv
void mult_tp2_chiral_thread(int, double *, const double *, int)
void mult_ymb_thread(int, double *, const double *, int)
void clear_thread(int, double *)
void mult_tmb_dirac_thread(int, double *, const double *, int)
void mult_ym1_thread(int, double *, const double *, int)
void mult_xm2_thread(int, double *, const double *, int)
void mult_zm2_thread(int, double *, const double *, int)
void mult_xpb_thread(int, double *, const double *, int)
void mult_zm1_thread(int, double *, const double *, int)
std::vector< Channel * > m_fw_recv
void mult_tmb_chiral_thread(int, double *, const double *, int)
void mult_xp1_thread(int, double *, const double *, int)
std::vector< Channel * > m_bw_send