22 #if defined USE_GROUP_SU3
23 #include "fopr_Wilson_impl_SU3.inc"
24 #elif defined USE_GROUP_SU2
25 #include "fopr_Wilson_impl_SU2.inc"
26 #elif defined USE_GROUP_SU_N
27 #include "fopr_Wilson_impl_SU_N.inc"
79 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
80 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
81 int itask = ith_z + m_Ntask_z * ith_t;
89 if (ith_t == 0)
m_arg[itask].kt0 = 1;
90 if (ith_z == 0)
m_arg[itask].kz0 = 1;
91 if (ith_t == m_Ntask_t - 1)
m_arg[itask].kt1 = 1;
92 if (ith_z == m_Ntask_z - 1)
m_arg[itask].kz1 = 1;
96 m_arg[itask].isite_cpz = ith_t *
m_Mt * Nxy;
97 m_arg[itask].isite_cpt = ith_z *
m_Mz * Nxy;
104 int Nvcd2 = 2 * Nc * Nd / 2;
106 std::vector<int> destid(
m_Ntask);
107 std::vector<int> offset(
m_Ntask);
108 std::vector<int> datasize(
m_Ntask);
109 std::vector<int> offset_up(
m_Ntask);
110 std::vector<int> offset_lw(
m_Ntask);
111 std::vector<int> datasize_up(
m_Ntask);
112 std::vector<int> datasize_lw(
m_Ntask);
115 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
116 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
119 destid[itask] = itask;
120 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
121 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * m_Ny;
130 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
131 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
134 destid[itask] = itask;
135 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
136 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * m_Nx;
145 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
146 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
147 int itask = ith_z + m_Ntask_z * ith_t;
149 offset_up[itask] = 0;
150 offset_lw[itask] = 0;
151 datasize_up[itask] = 0;
152 datasize_lw[itask] = 0;
154 destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
155 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx * m_Ny;
156 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx * m_Ny;
158 if (ith_z == m_Ntask_z - 1) {
160 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx * m_Ny;
161 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx * m_Ny;
171 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
172 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
173 int itask = ith_z + m_Ntask_z * ith_t;
175 offset_up[itask] = 0;
176 offset_lw[itask] = 0;
177 datasize_up[itask] = 0;
178 datasize_lw[itask] = 0;
180 destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
181 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx * m_Ny;
182 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx * m_Ny;
184 if (ith_t == m_Ntask_t - 1) {
185 destid[itask] = ith_z;
186 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx * m_Ny;
187 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx * m_Ny;
200 int itask,
double *v2,
double fac,
const double *v1)
205 int isite =
m_arg[itask].isite;
207 double *w2 = &v2[Nvcd * isite];
208 const double *w1 = &v1[Nvcd * isite];
210 for (
int it = 0; it <
m_Mt; ++it) {
211 for (
int iz = 0; iz <
m_Mz; ++iz) {
212 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
213 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
214 w2[iv] = fac * w2[iv] + w1[iv];
228 int isite =
m_arg[itask].isite;
229 double *w2 = &v2[Nvcd * isite];
231 for (
int it = 0; it <
m_Mt; ++it) {
232 for (
int iz = 0; iz <
m_Mz; ++iz) {
233 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
234 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
244 int itask,
double *vcp1,
const double *v1)
246 int Nvc2 = 2 *
m_Nvc;
248 int Nvcd2 = Nvcd / 2;
258 int isite =
m_arg[itask].isite;
259 int isite_cp =
m_arg[itask].isite_cpx;
263 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
264 const double *w1 = &v1[Nvcd * isite];
268 for (
int it = 0; it <
m_Mt; ++it) {
269 for (
int iz = 0; iz <
m_Mz; ++iz) {
270 for (
int iy = 0; iy <
m_Ny; ++iy) {
271 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
272 int is2 = iy + m_Ny * (iz + m_Mz * it);
274 int ix1 = Nvc2 * is2;
275 int ix2 = ix1 +
m_Nvc;
277 for (
int ic = 0; ic <
m_Nc; ++ic) {
278 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
279 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
280 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
281 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
293 int itask,
double *v2,
const double *vcp2)
295 int Nvc2 = 2 *
m_Nvc;
297 int Nvcd2 = Nvcd / 2;
306 double wt1r, wt1i, wt2r, wt2i;
308 int isite =
m_arg[itask].isite;
309 int isite_cp =
m_arg[itask].isite_cpx;
311 double *w2 = &v2[Nvcd * isite];
314 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
320 for (
int it = 0; it <
m_Mt; ++it) {
321 for (
int iz = 0; iz <
m_Mz; ++iz) {
322 for (
int iy = 0; iy <
m_Ny; ++iy) {
323 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
324 int is2 = iy + m_Ny * (iz + m_Mz * it);
327 int ix1 = Nvc2 * is2;
328 int ix2 = ix1 +
m_Nvc;
330 for (
int ic = 0; ic <
m_Nc; ++ic) {
331 int ic2 = ic *
m_Nvc;
333 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
334 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
335 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
336 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
338 w2[2 * ic + id1 + iv] += wt1r;
339 w2[2 * ic + 1 + id1 + iv] += wt1i;
340 w2[2 * ic + id2 + iv] += wt2r;
341 w2[2 * ic + 1 + id2 + iv] += wt2i;
342 w2[2 * ic + id3 + iv] += wt2i;
343 w2[2 * ic + 1 + id3 + iv] += -wt2r;
344 w2[2 * ic + id4 + iv] += wt1i;
345 w2[2 * ic + 1 + id4 + iv] += -wt1r;
355 int itask,
double *v2,
const double *v1)
367 double wt1r, wt1i, wt2r, wt2i;
369 int isite =
m_arg[itask].isite;
371 double *w2 = &v2[Nvcd * isite];
372 const double *w1 = &v1[Nvcd * isite];
375 for (
int it = 0; it <
m_Mt; ++it) {
376 for (
int iz = 0; iz <
m_Mz; ++iz) {
377 for (
int iy = 0; iy <
m_Ny; ++iy) {
378 for (
int ix = 0; ix <
m_Nx - 1; ++ix) {
379 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
381 int in = Nvcd * (is + 1);
384 for (
int ic = 0; ic <
m_Nc; ++ic) {
385 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
386 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
387 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
388 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
391 for (
int ic = 0; ic <
m_Nc; ++ic) {
392 int ic2 = ic *
m_Nvc;
394 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
395 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
396 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
397 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
399 w2[2 * ic + id1 + iv] += wt1r;
400 w2[2 * ic + 1 + id1 + iv] += wt1i;
401 w2[2 * ic + id2 + iv] += wt2r;
402 w2[2 * ic + 1 + id2 + iv] += wt2i;
403 w2[2 * ic + id3 + iv] += wt2i;
404 w2[2 * ic + 1 + id3 + iv] += -wt2r;
405 w2[2 * ic + id4 + iv] += wt1i;
406 w2[2 * ic + 1 + id4 + iv] += -wt1r;
417 int itask,
double *vcp1,
const double *v1)
419 int Nvc2 = 2 *
m_Nvc;
421 int Nvcd2 = Nvcd / 2;
430 int isite =
m_arg[itask].isite;
431 int isite_cp =
m_arg[itask].isite_cpx;
435 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
436 const double *w1 = &v1[Nvcd * isite];
443 for (
int it = 0; it <
m_Mt; ++it) {
444 for (
int iz = 0; iz <
m_Mz; ++iz) {
445 for (
int iy = 0; iy <
m_Ny; ++iy) {
446 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
447 int is2 = iy + m_Ny * (iz + m_Mz * it);
450 int ix1 = Nvc2 * is2;
451 int ix2 = ix1 +
m_Nvc;
453 for (
int ic = 0; ic <
m_Nc; ++ic) {
454 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
455 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
456 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
457 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
460 for (
int ic = 0; ic <
m_Nc; ++ic) {
462 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
463 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
464 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
465 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
477 int itask,
double *v2,
const double *vcp2)
479 int Nvc2 = 2 *
m_Nvc;
481 int Nvcd2 = Nvcd / 2;
491 double wt1r, wt1i, wt2r, wt2i;
493 int isite =
m_arg[itask].isite;
494 int isite_cp =
m_arg[itask].isite_cpx;
496 double *w2 = &v2[Nvcd * isite];
499 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
504 for (
int it = 0; it <
m_Mt; ++it) {
505 for (
int iz = 0; iz <
m_Mz; ++iz) {
506 for (
int iy = 0; iy <
m_Ny; ++iy) {
507 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
508 int is2 = iy + m_Ny * (iz + m_Mz * it);
510 int ix1 = Nvc2 * is2;
511 int ix2 = ix1 +
m_Nvc;
513 for (
int ic = 0; ic <
m_Nc; ++ic) {
515 int ici = 2 * ic + 1;
516 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
517 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
518 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
519 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
520 w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
521 w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
522 w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
523 w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
533 int itask,
double *v2,
const double *v1)
545 double wt1r, wt1i, wt2r, wt2i;
547 int isite =
m_arg[itask].isite;
549 double *w2 = &v2[Nvcd * isite];
550 const double *w1 = &v1[Nvcd * isite];
553 for (
int it = 0; it <
m_Mt; ++it) {
554 for (
int iz = 0; iz <
m_Mz; ++iz) {
555 for (
int iy = 0; iy <
m_Ny; ++iy) {
556 for (
int ix = 1; ix <
m_Nx; ++ix) {
557 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
559 int in = Nvcd * (is - 1);
560 int ig =
m_Ndf * (is - 1);
562 for (
int ic = 0; ic <
m_Nc; ++ic) {
563 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
564 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
565 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
566 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
569 for (
int ic = 0; ic <
m_Nc; ++ic) {
572 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
573 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
574 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
575 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
577 w2[2 * ic + id1 + iv] += wt1r;
578 w2[2 * ic + 1 + id1 + iv] += wt1i;
579 w2[2 * ic + id2 + iv] += wt2r;
580 w2[2 * ic + 1 + id2 + iv] += wt2i;
581 w2[2 * ic + id3 + iv] += -wt2i;
582 w2[2 * ic + 1 + id3 + iv] += +wt2r;
583 w2[2 * ic + id4 + iv] += -wt1i;
584 w2[2 * ic + 1 + id4 + iv] += +wt1r;
595 int itask,
double *vcp1,
const double *v1)
597 int Nvc2 = 2 *
m_Nvc;
599 int Nvcd2 = Nvcd / 2;
606 int isite =
m_arg[itask].isite;
607 int isite_cp =
m_arg[itask].isite_cpy;
614 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
615 const double *w1 = &v1[Nvcd * isite];
619 for (
int it = 0; it <
m_Mt; ++it) {
620 for (
int iz = 0; iz <
m_Mz; ++iz) {
621 for (
int ix = 0; ix <
m_Nx; ++ix) {
622 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
623 int is2 = ix + m_Nx * (iz + m_Mz * it);
625 int ix1 = Nvc2 * is2;
626 int ix2 = ix1 +
m_Nvc;
628 for (
int ic = 0; ic <
m_Nc; ++ic) {
629 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
630 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
631 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
632 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
644 int itask,
double *v2,
const double *vcp2)
646 int Nvc2 = 2 *
m_Nvc;
648 int Nvcd2 = Nvcd / 2;
657 double wt1r, wt1i, wt2r, wt2i;
659 int isite =
m_arg[itask].isite;
660 int isite_cp =
m_arg[itask].isite_cpy;
662 double *w2 = &v2[Nvcd * isite];
665 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
671 for (
int it = 0; it <
m_Mt; ++it) {
672 for (
int iz = 0; iz <
m_Mz; ++iz) {
673 for (
int ix = 0; ix <
m_Nx; ++ix) {
674 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
675 int is2 = ix + m_Nx * (iz + m_Mz * it);
678 int ix1 = Nvc2 * is2;
679 int ix2 = ix1 +
m_Nvc;
681 for (
int ic = 0; ic <
m_Nc; ++ic) {
682 int ic2 = ic *
m_Nvc;
684 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
685 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
686 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
687 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
689 w2[2 * ic + id1 + iv] += wt1r;
690 w2[2 * ic + 1 + id1 + iv] += wt1i;
691 w2[2 * ic + id2 + iv] += wt2r;
692 w2[2 * ic + 1 + id2 + iv] += wt2i;
693 w2[2 * ic + id3 + iv] += -wt2r;
694 w2[2 * ic + 1 + id3 + iv] += -wt2i;
695 w2[2 * ic + id4 + iv] += wt1r;
696 w2[2 * ic + 1 + id4 + iv] += wt1i;
706 int itask,
double *v2,
const double *v1)
718 double wt1r, wt1i, wt2r, wt2i;
720 int isite =
m_arg[itask].isite;
722 double *w2 = &v2[Nvcd * isite];
723 const double *w1 = &v1[Nvcd * isite];
726 for (
int it = 0; it <
m_Mt; ++it) {
727 for (
int iz = 0; iz <
m_Mz; ++iz) {
728 for (
int iy = 0; iy <
m_Ny - 1; ++iy) {
729 for (
int ix = 0; ix <
m_Nx; ++ix) {
730 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
732 int in = Nvcd * (is +
m_Nx);
735 for (
int ic = 0; ic <
m_Nc; ++ic) {
736 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
737 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
738 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
739 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
742 for (
int ic = 0; ic <
m_Nc; ++ic) {
743 int ic2 = ic *
m_Nvc;
745 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
746 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
747 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
748 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
750 w2[2 * ic + id1 + iv] += wt1r;
751 w2[2 * ic + 1 + id1 + iv] += wt1i;
752 w2[2 * ic + id2 + iv] += wt2r;
753 w2[2 * ic + 1 + id2 + iv] += wt2i;
754 w2[2 * ic + id3 + iv] += -wt2r;
755 w2[2 * ic + 1 + id3 + iv] += -wt2i;
756 w2[2 * ic + id4 + iv] += wt1r;
757 w2[2 * ic + 1 + id4 + iv] += wt1i;
768 int itask,
double *vcp1,
const double *v1)
770 int Nvc2 = 2 *
m_Nvc;
772 int Nvcd2 = Nvcd / 2;
781 int isite =
m_arg[itask].isite;
782 int isite_cp =
m_arg[itask].isite_cpy;
786 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
788 const double *w1 = &v1[Nvcd * isite];
795 for (
int it = 0; it <
m_Mt; ++it) {
796 for (
int iz = 0; iz <
m_Mz; ++iz) {
797 for (
int ix = 0; ix <
m_Nx; ++ix) {
798 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
799 int is2 = ix + m_Nx * (iz + m_Mz * it);
802 int ix1 = Nvc2 * is2;
803 int ix2 = ix1 +
m_Nvc;
805 for (
int ic = 0; ic <
m_Nc; ++ic) {
806 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
807 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
808 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
809 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
812 for (
int ic = 0; ic <
m_Nc; ++ic) {
814 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
815 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
816 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
817 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
829 int itask,
double *v2,
const double *vcp2)
831 int Nvc2 = 2 *
m_Nvc;
833 int Nvcd2 = Nvcd / 2;
843 double wt1r, wt1i, wt2r, wt2i;
845 int isite =
m_arg[itask].isite;
846 int isite_cp =
m_arg[itask].isite_cpy;
848 double *w2 = &v2[Nvcd * isite];
851 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
856 for (
int it = 0; it <
m_Mt; ++it) {
857 for (
int iz = 0; iz <
m_Mz; ++iz) {
858 for (
int ix = 0; ix <
m_Nx; ++ix) {
859 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
860 int is2 = ix + m_Nx * (iz + m_Mz * it);
862 int ix1 = Nvc2 * is2;
863 int ix2 = ix1 +
m_Nvc;
865 for (
int ic = 0; ic <
m_Nc; ++ic) {
867 int ici = 2 * ic + 1;
868 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
869 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
870 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
871 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
872 w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
873 w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
874 w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
875 w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
885 int itask,
double *v2,
const double *v1)
897 double wt1r, wt1i, wt2r, wt2i;
899 int isite =
m_arg[itask].isite;
901 double *w2 = &v2[Nvcd * isite];
902 const double *w1 = &v1[Nvcd * isite];
905 for (
int it = 0; it <
m_Mt; ++it) {
906 for (
int iz = 0; iz <
m_Mz; ++iz) {
907 for (
int iy = 1; iy <
m_Ny; ++iy) {
908 for (
int ix = 0; ix <
m_Nx; ++ix) {
909 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
911 int in = Nvcd * (is -
m_Nx);
914 for (
int ic = 0; ic <
m_Nc; ++ic) {
915 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
916 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
917 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
918 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
921 for (
int ic = 0; ic <
m_Nc; ++ic) {
923 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
924 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
925 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
926 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
928 w2[ic2 + id1 + iv] += wt1r;
929 w2[ic2 + 1 + id1 + iv] += wt1i;
930 w2[ic2 + id2 + iv] += wt2r;
931 w2[ic2 + 1 + id2 + iv] += wt2i;
932 w2[ic2 + id3 + iv] += wt2r;
933 w2[ic2 + 1 + id3 + iv] += wt2i;
934 w2[ic2 + id4 + iv] += -wt1r;
935 w2[ic2 + 1 + id4 + iv] += -wt1i;
946 int itask,
double *vcp1,
const double *v1)
948 int Nvc2 = 2 *
m_Nvc;
950 int Nvcd2 = Nvcd / 2;
957 int isite =
m_arg[itask].isite;
958 int isite_cp =
m_arg[itask].isite_cpz;
965 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
966 const double *w1 = &v1[Nvcd * isite];
968 if (
m_arg[itask].kz0 == 1) {
971 for (
int it = 0; it <
m_Mt; ++it) {
972 for (
int ixy = 0; ixy < Nxy; ++ixy) {
973 int is = ixy + Nxy * (iz +
m_Nz * it);
974 int is2 = ixy + Nxy * it;
977 int ix1 = Nvc2 * is2;
978 int ix2 = ix1 +
m_Nvc;
980 for (
int ic = 0; ic <
m_Nc; ++ic) {
981 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
982 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
983 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
984 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
996 int itask,
double *v2,
const double *vcp2)
998 int Nvc2 = 2 *
m_Nvc;
1000 int Nvcd2 = Nvcd / 2;
1004 int id3 =
m_Nvc * 2;
1005 int id4 =
m_Nvc * 3;
1009 double wt1r, wt1i, wt2r, wt2i;
1011 int isite =
m_arg[itask].isite;
1012 int isite_cp =
m_arg[itask].isite_cpz;
1014 double *w2 = &v2[Nvcd * isite];
1017 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1022 if (
m_arg[itask].kz1 == 1) {
1025 for (
int it = 0; it <
m_Mt; ++it) {
1026 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1027 int is = ixy + Nxy * (iz +
m_Nz * it);
1028 int is2 = ixy + Nxy * it;
1030 int ig =
m_Ndf * is;
1031 int ix1 = Nvc2 * is2;
1032 int ix2 = ix1 +
m_Nvc;
1034 for (
int ic = 0; ic <
m_Nc; ++ic) {
1035 int ic2 = ic *
m_Nvc;
1037 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1038 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1039 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1040 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1042 w2[2 * ic + id1 + iv] += wt1r;
1043 w2[2 * ic + 1 + id1 + iv] += wt1i;
1044 w2[2 * ic + id2 + iv] += wt2r;
1045 w2[2 * ic + 1 + id2 + iv] += wt2i;
1046 w2[2 * ic + id3 + iv] += wt1i;
1047 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1048 w2[2 * ic + id4 + iv] += -wt2i;
1049 w2[2 * ic + 1 + id4 + iv] += wt2r;
1059 int itask,
double *v2,
const double *v1)
1065 int id3 =
m_Nvc * 2;
1066 int id4 =
m_Nvc * 3;
1071 double wt1r, wt1i, wt2r, wt2i;
1073 int isite =
m_arg[itask].isite;
1075 double *w2 = &v2[Nvcd * isite];
1076 const double *w1 = &v1[Nvcd * isite];
1079 int kz1 =
m_arg[itask].kz1;
1082 for (
int it = 0; it <
m_Mt; ++it) {
1083 for (
int iz = 0; iz <
m_Mz - kz1; ++iz) {
1084 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1085 int is = ixy + Nxy * (iz +
m_Nz * it);
1087 int in = Nvcd * (is + Nxy);
1088 int ig =
m_Ndf * is;
1090 for (
int ic = 0; ic <
m_Nc; ++ic) {
1091 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1092 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1093 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1094 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1097 for (
int ic = 0; ic <
m_Nc; ++ic) {
1098 int ic2 = ic *
m_Nvc;
1100 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1101 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1102 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1103 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1105 w2[2 * ic + id1 + iv] += wt1r;
1106 w2[2 * ic + 1 + id1 + iv] += wt1i;
1107 w2[2 * ic + id2 + iv] += wt2r;
1108 w2[2 * ic + 1 + id2 + iv] += wt2i;
1109 w2[2 * ic + id3 + iv] += wt1i;
1110 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1111 w2[2 * ic + id4 + iv] += -wt2i;
1112 w2[2 * ic + 1 + id4 + iv] += wt2r;
1122 int itask,
double *vcp1,
const double *v1)
1124 int Nvc2 = 2 *
m_Nvc;
1126 int Nvcd2 = Nvcd / 2;
1130 int id3 =
m_Nvc * 2;
1131 int id4 =
m_Nvc * 3;
1135 int isite =
m_arg[itask].isite;
1136 int isite_cp =
m_arg[itask].isite_cpz;
1140 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1141 const double *w1 = &v1[Nvcd * isite];
1146 if (
m_arg[itask].kz1 == 1) {
1149 for (
int it = 0; it <
m_Mt; ++it) {
1150 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1151 int is = ixy + Nxy * (iz +
m_Nz * it);
1152 int is2 = ixy + Nxy * it;
1154 int ig =
m_Ndf * is;
1155 int ix1 = Nvc2 * is2;
1156 int ix2 = ix1 +
m_Nvc;
1158 for (
int ic = 0; ic <
m_Nc; ++ic) {
1159 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1160 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1161 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1162 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1165 for (
int ic = 0; ic <
m_Nc; ++ic) {
1167 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1168 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1169 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1170 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1182 int itask,
double *v2,
const double *vcp2)
1184 int Nvc2 = 2 *
m_Nvc;
1186 int Nvcd2 = Nvcd / 2;
1190 int id3 =
m_Nvc * 2;
1191 int id4 =
m_Nvc * 3;
1196 double wt1r, wt1i, wt2r, wt2i;
1198 int isite =
m_arg[itask].isite;
1199 int isite_cp =
m_arg[itask].isite_cpz;
1201 double *w2 = &v2[Nvcd * isite];
1204 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1208 if (
m_arg[itask].kz0 == 1) {
1212 for (
int it = 0; it <
m_Mt; ++it) {
1213 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1214 int is = ixy + Nxy * (iz +
m_Nz * it);
1215 int is2 = ixy + Nxy * it;
1217 int ix1 = Nvc2 * is2;
1218 int ix2 = ix1 +
m_Nvc;
1220 for (
int ic = 0; ic <
m_Nc; ++ic) {
1222 int ici = 2 * ic + 1;
1223 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1224 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1225 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1226 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1227 w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1228 w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1229 w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1230 w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1240 int itask,
double *v2,
const double *v1)
1246 int id3 =
m_Nvc * 2;
1247 int id4 =
m_Nvc * 3;
1252 double wt1r, wt1i, wt2r, wt2i;
1254 int isite =
m_arg[itask].isite;
1256 double *w2 = &v2[Nvcd * isite];
1257 const double *w1 = &v1[Nvcd * isite];
1260 int kz0 =
m_arg[itask].kz0;
1263 for (
int it = 0; it <
m_Mt; ++it) {
1264 for (
int iz = kz0; iz <
m_Mz; ++iz) {
1265 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1266 int is = ixy + Nxy * (iz +
m_Nz * it);
1268 int in = Nvcd * (is - Nxy);
1269 int ig =
m_Ndf * (is - Nxy);
1271 for (
int ic = 0; ic <
m_Nc; ++ic) {
1272 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1273 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1274 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1275 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1278 for (
int ic = 0; ic <
m_Nc; ++ic) {
1280 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1281 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1282 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1283 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1285 w2[ic2 + id1 + iv] += wt1r;
1286 w2[ic2 + 1 + id1 + iv] += wt1i;
1287 w2[ic2 + id2 + iv] += wt2r;
1288 w2[ic2 + 1 + id2 + iv] += wt2i;
1289 w2[ic2 + id3 + iv] += -wt1i;
1290 w2[ic2 + 1 + id3 + iv] += wt1r;
1291 w2[ic2 + id4 + iv] += wt2i;
1292 w2[ic2 + 1 + id4 + iv] += -wt2r;
1302 int itask,
double *vcp1,
const double *v1)
1304 int Nvc2 = 2 *
m_Nvc;
1306 int Nvcd2 = Nvcd / 2;
1310 int id3 =
m_Nvc * 2;
1311 int id4 =
m_Nvc * 3;
1313 int isite =
m_arg[itask].isite;
1314 int isite_cp =
m_arg[itask].isite_cpt;
1321 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1322 const double *w1 = &v1[Nvcd * isite];
1324 if (
m_arg[itask].kt0 == 1) {
1327 for (
int iz = 0; iz <
m_Mz; ++iz) {
1328 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1329 int is = ixy + Nxy * (iz +
m_Nz * it);
1330 int is2 = ixy + Nxy * iz;
1333 int ix1 = Nvc2 * is2;
1334 int ix2 = ix1 +
m_Nvc;
1336 for (
int ic = 0; ic <
m_Nc; ++ic) {
1337 w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1338 w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1339 w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1340 w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1352 int itask,
double *v2,
const double *vcp2)
1354 int Nvc2 = 2 *
m_Nvc;
1356 int Nvcd2 = Nvcd / 2;
1360 int id3 =
m_Nvc * 2;
1361 int id4 =
m_Nvc * 3;
1365 double wt1r, wt1i, wt2r, wt2i;
1367 int isite =
m_arg[itask].isite;
1368 int isite_cp =
m_arg[itask].isite_cpt;
1370 double *w2 = &v2[Nvcd * isite];
1373 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1378 if (
m_arg[itask].kt1 == 1) {
1381 for (
int iz = 0; iz <
m_Mz; ++iz) {
1382 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1383 int is = ixy + Nxy * (iz +
m_Nz * it);
1384 int is2 = ixy + Nxy * iz;
1386 int ig =
m_Ndf * is;
1387 int ix1 = Nvc2 * is2;
1388 int ix2 = ix1 +
m_Nvc;
1390 for (
int ic = 0; ic <
m_Nc; ++ic) {
1391 int ic2 = ic *
m_Nvc;
1393 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1394 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1395 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1396 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1398 w2[2 * ic + id3 + iv] += wt1r;
1399 w2[2 * ic + 1 + id3 + iv] += wt1i;
1400 w2[2 * ic + id4 + iv] += wt2r;
1401 w2[2 * ic + 1 + id4 + iv] += wt2i;
1411 int itask,
double *v2,
const double *v1)
1417 int id3 =
m_Nvc * 2;
1418 int id4 =
m_Nvc * 3;
1423 double wt1r, wt1i, wt2r, wt2i;
1425 int isite =
m_arg[itask].isite;
1427 double *w2 = &v2[Nvcd * isite];
1428 const double *w1 = &v1[Nvcd * isite];
1431 int kt1 =
m_arg[itask].kt1;
1433 int Nxyz = Nxy *
m_Nz;
1435 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1436 for (
int iz = 0; iz <
m_Mz; ++iz) {
1437 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1438 int is = ixy + Nxy * (iz + m_Nz * it);
1440 int in = Nvcd * (is + Nxyz);
1441 int ig =
m_Ndf * is;
1443 for (
int ic = 0; ic <
m_Nc; ++ic) {
1444 vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1445 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1446 vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1447 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1450 for (
int ic = 0; ic <
m_Nc; ++ic) {
1451 int ic2 = ic *
m_Nvc;
1453 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1454 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1455 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1456 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1458 w2[2 * ic + id3 + iv] += wt1r;
1459 w2[2 * ic + 1 + id3 + iv] += wt1i;
1460 w2[2 * ic + id4 + iv] += wt2r;
1461 w2[2 * ic + 1 + id4 + iv] += wt2i;
1471 int itask,
double *vcp1,
const double *v1)
1473 int Nvc2 = 2 *
m_Nvc;
1475 int Nvcd2 = Nvcd / 2;
1479 int id3 =
m_Nvc * 2;
1480 int id4 =
m_Nvc * 3;
1484 int isite =
m_arg[itask].isite;
1485 int isite_cp =
m_arg[itask].isite_cpt;
1489 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1490 const double *w1 = &v1[Nvcd * isite];
1495 if (
m_arg[itask].kt1 == 1) {
1498 for (
int iz = 0; iz <
m_Mz; ++iz) {
1499 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1500 int is = ixy + Nxy * (iz +
m_Nz * it);
1501 int is2 = ixy + Nxy * iz;
1503 int ig =
m_Ndf * is;
1504 int ix1 = Nvc2 * is2;
1505 int ix2 = ix1 +
m_Nvc;
1507 for (
int ic = 0; ic <
m_Nc; ++ic) {
1508 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1509 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1510 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1511 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1514 for (
int ic = 0; ic <
m_Nc; ++ic) {
1516 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1517 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1518 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1519 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1531 int itask,
double *v2,
const double *vcp2)
1533 int Nvc2 = 2 *
m_Nvc;
1535 int Nvcd2 = Nvcd / 2;
1539 int id3 =
m_Nvc * 2;
1540 int id4 =
m_Nvc * 3;
1545 double wt1r, wt1i, wt2r, wt2i;
1547 int isite =
m_arg[itask].isite;
1548 int isite_cp =
m_arg[itask].isite_cpt;
1550 double *w2 = &v2[Nvcd * isite];
1553 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1557 if (
m_arg[itask].kt0 == 1) {
1560 for (
int iz = 0; iz <
m_Mz; ++iz) {
1561 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1562 int is = ixy + Nxy * (iz +
m_Nz * it);
1563 int is2 = ixy + Nxy * iz;
1565 int ix1 = Nvc2 * is2;
1566 int ix2 = ix1 +
m_Nvc;
1568 for (
int ic = 0; ic <
m_Nc; ++ic) {
1570 int ici = 2 * ic + 1;
1571 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1572 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1573 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1574 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1584 int itask,
double *v2,
const double *v1)
1590 int id3 =
m_Nvc * 2;
1591 int id4 =
m_Nvc * 3;
1596 double wt1r, wt1i, wt2r, wt2i;
1598 int isite =
m_arg[itask].isite;
1600 double *w2 = &v2[Nvcd * isite];
1601 const double *w1 = &v1[Nvcd * isite];
1604 int kt0 =
m_arg[itask].kt0;
1606 int Nxyz = Nxy *
m_Nz;
1608 for (
int it = kt0; it <
m_Mt; ++it) {
1609 for (
int iz = 0; iz <
m_Mz; ++iz) {
1610 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1611 int is = ixy + Nxy * (iz + m_Nz * it);
1613 int in = Nvcd * (is - Nxyz);
1614 int ig =
m_Ndf * (is - Nxyz);
1616 for (
int ic = 0; ic <
m_Nc; ++ic) {
1617 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1618 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1619 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1620 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1623 for (
int ic = 0; ic <
m_Nc; ++ic) {
1625 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1626 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1627 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1628 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1630 w2[ic2 + id1 + iv] += wt1r;
1631 w2[ic2 + 1 + id1 + iv] += wt1i;
1632 w2[ic2 + id2 + iv] += wt2r;
1633 w2[ic2 + 1 + id2 + iv] += wt2i;
1643 int itask,
double *vcp1,
const double *v1)
1645 int Nvc2 = 2 *
m_Nvc;
1647 int Nvcd2 = Nvcd / 2;
1651 int id3 =
m_Nvc * 2;
1652 int id4 =
m_Nvc * 3;
1654 int isite =
m_arg[itask].isite;
1655 int isite_cp =
m_arg[itask].isite_cpt;
1662 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1663 const double *w1 = &v1[Nvcd * isite];
1665 if (
m_arg[itask].kt0 == 1) {
1668 for (
int iz = 0; iz <
m_Mz; ++iz) {
1669 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1670 int is = ixy + Nxy * (iz +
m_Nz * it);
1671 int is2 = ixy + Nxy * iz;
1674 int ix1 = Nvc2 * is2;
1675 int ix2 = ix1 +
m_Nvc;
1677 for (
int ic = 0; ic <
m_Nc; ++ic) {
1678 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1679 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1680 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1681 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
// NOTE(review): the line carrying this function's name is missing from this
// listing. The (itask, v2, vcp2) parameters, the m_bw_recv buffer and the kt1
// guard suggest mult_tp2_chiral_thread -- TODO confirm against the full file.
//
// Visible behaviour: for a task whose m_arg entry has kt1 == 1, reads two
// m_Nvc-wide blocks per (ixy, iz) site from the buffer obtained from
// m_bw_recv[idir], applies mult_uv_r / mult_uv_i with the array u, and
// accumulates the results into v2.
//
//   itask : index into m_arg selecting this task's isite, isite_cpt and kt1.
//   v2    : output array, accumulated into through w2 = &v2[Nvcd * isite].
//   vcp2  : not referenced in any line visible here.
1693 int itask,
double *v2,
const double *vcp2)
// Nvc2 is the stride of one packed site in the receive buffer (see ix1 / ix2).
1695 int Nvc2 = 2 *
m_Nvc;
1697 int Nvcd2 = Nvcd / 2;
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index in v2.
1701 int id3 =
m_Nvc * 2;
1702 int id4 =
m_Nvc * 3;
1706 double wt1r, wt1i, wt2r, wt2i;
1708 int isite =
m_arg[itask].isite;
1709 int isite_cp =
m_arg[itask].isite_cpt;
1711 double *w2 = &v2[Nvcd * isite];
// The declaration receiving this pointer is on a line missing from the listing
// (presumably w1, which is the array read in the innermost loop). The buffer
// is addressed at byte offset sizeof(double) * Nvcd2 * isite_cp.
1714 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt1 == 1 consume the received buffer.
1719 if (
m_arg[itask].kt1 == 1) {
1722 for (
int iz = 0; iz <
m_Mz; ++iz) {
1723 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index including the it term (it is set on a line not shown).
// is2 : the same site with the it term dropped; it addresses the packed buffer.
1724 int is = ixy + Nxy * (iz +
m_Nz * it);
1725 int is2 = ixy + Nxy * iz;
// ig: base index into u for site is (m_Ndf entries per site).
1727 int ig =
m_Ndf * is;
1728 int ix1 = Nvc2 * is2;
1729 int ix2 = ix1 +
m_Nvc;
1731 for (
int ic = 0; ic <
m_Nc; ++ic) {
// ic2 advances through u in steps of m_Nvc per ic.
1732 int ic2 = ic *
m_Nvc;
1734 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1735 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1736 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1737 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
// wt1 is added to both the id1 and id3 blocks, wt2 to both the id2 and id4
// blocks (iv is set on a line not shown).
1739 w2[2 * ic + id1 + iv] += wt1r;
1740 w2[2 * ic + 1 + id1 + iv] += wt1i;
1741 w2[2 * ic + id2 + iv] += wt2r;
1742 w2[2 * ic + 1 + id2 + iv] += wt2i;
1743 w2[2 * ic + id3 + iv] += wt1r;
1744 w2[2 * ic + 1 + id3 + iv] += wt1i;
1745 w2[2 * ic + id4 + iv] += wt2r;
1746 w2[2 * ic + 1 + id4 + iv] += wt2i;
// NOTE(review): the line carrying this function's name is missing from this
// listing. The (itask, v2, v1) parameters and the loop over it that reads the
// site at +Nxyz suggest mult_tpb_chiral_thread -- TODO confirm against the
// full file.
//
// Visible behaviour: for every site with it in [0, m_Mt - kt1), forms two
// temporary vectors from the v1 site one it-step ahead (id1 + id3 and
// id2 + id4), applies mult_uv_r / mult_uv_i with u at the current site, and
// accumulates the result into all four blocks of v2.
//
//   itask : index into m_arg selecting this task's isite and kt1.
//   v2    : output array, accumulated into through w2 = &v2[Nvcd * isite].
//   v1    : input array, read through w1 = &v1[Nvcd * isite].
1756 int itask,
double *v2,
const double *v1)
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index.
1762 int id3 =
m_Nvc * 2;
1763 int id4 =
m_Nvc * 3;
1768 double wt1r, wt1i, wt2r, wt2i;
1770 int isite =
m_arg[itask].isite;
1772 double *w2 = &v2[Nvcd * isite];
1773 const double *w1 = &v1[Nvcd * isite];
// kt1 shortens the it loop by one iteration when it is 1.
1776 int kt1 =
m_arg[itask].kt1;
// Nxyz equals the change in the site index is when it increases by one.
1778 int Nxyz = Nxy *
m_Nz;
1780 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1781 for (
int iz = 0; iz <
m_Mz; ++iz) {
1782 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1783 int is = ixy + Nxy * (iz + m_Nz * it);
// in: base index in v1 of the site one it-step ahead of is.
// ig: base index into u for the current site is.
1785 int in = Nvcd * (is + Nxyz);
1786 int ig =
m_Ndf * is;
// Build the two temporaries as sums of block pairs (id1, id3) and (id2, id4).
1788 for (
int ic = 0; ic <
m_Nc; ++ic) {
1789 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1790 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1791 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1792 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1795 for (
int ic = 0; ic <
m_Nc; ++ic) {
// ic2 advances through u in steps of m_Nvc per ic.
1796 int ic2 = ic *
m_Nvc;
1798 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1799 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1800 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1801 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
// wt1 is added to both the id1 and id3 blocks, wt2 to both the id2 and id4
// blocks (iv is set on a line not shown).
1803 w2[2 * ic + id1 + iv] += wt1r;
1804 w2[2 * ic + 1 + id1 + iv] += wt1i;
1805 w2[2 * ic + id2 + iv] += wt2r;
1806 w2[2 * ic + 1 + id2 + iv] += wt2i;
1807 w2[2 * ic + id3 + iv] += wt1r;
1808 w2[2 * ic + 1 + id3 + iv] += wt1i;
1809 w2[2 * ic + id4 + iv] += wt2r;
1810 w2[2 * ic + 1 + id4 + iv] += wt2i;
// NOTE(review): the line carrying this function's name is missing from this
// listing. The (itask, vcp1, v1) parameters, the m_fw_send buffer and the kt1
// guard suggest mult_tm1_chiral_thread -- TODO confirm against the full file.
//
// Visible behaviour: for a task whose m_arg entry has kt1 == 1, forms two
// temporary vectors per (ixy, iz) site as differences of v1 blocks
// (id1 - id3 and id2 - id4), applies mult_udagv_r / mult_udagv_i with u, and
// stores the results in the buffer obtained from m_fw_send[idir].
//
//   itask : index into m_arg selecting this task's isite, isite_cpt and kt1.
//   vcp1  : not referenced in any line visible here.
//   v1    : input array, read through w1 = &v1[Nvcd * isite].
1820 int itask,
double *vcp1,
const double *v1)
// Nvc2 is the stride of one packed site in the send buffer (see ix1 / ix2).
1822 int Nvc2 = 2 *
m_Nvc;
1824 int Nvcd2 = Nvcd / 2;
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index in v1.
1828 int id3 =
m_Nvc * 2;
1829 int id4 =
m_Nvc * 3;
1833 int isite =
m_arg[itask].isite;
1834 int isite_cp =
m_arg[itask].isite_cpt;
// The declaration receiving this pointer is on a line missing from the listing
// (presumably w2, which is the array written in the second ic loop). The
// buffer is addressed at byte offset sizeof(double) * Nvcd2 * isite_cp.
1838 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1839 const double *w1 = &v1[Nvcd * isite];
// Only tasks flagged kt1 == 1 write anything to the send buffer.
1844 if (
m_arg[itask].kt1 == 1) {
1847 for (
int iz = 0; iz <
m_Mz; ++iz) {
1848 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index including the it term (it is set on a line not shown).
// is2 : the same site with the it term dropped; it addresses the packed buffer.
1849 int is = ixy + Nxy * (iz +
m_Nz * it);
1850 int is2 = ixy + Nxy * iz;
// ig: base index into u for site is (m_Ndf entries per site).
1852 int ig =
m_Ndf * is;
1853 int ix1 = Nvc2 * is2;
1854 int ix2 = ix1 +
m_Nvc;
// Build the two temporaries as differences of block pairs (id1, id3) and
// (id2, id4).
1856 for (
int ic = 0; ic <
m_Nc; ++ic) {
1857 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1858 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1859 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1860 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
// Results are stored (assigned, not accumulated) into the send buffer; icr is
// set on a line not shown and indexes both u and the buffer here.
1863 for (
int ic = 0; ic <
m_Nc; ++ic) {
1865 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1866 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1867 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1868 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
// NOTE(review): the line carrying this function's name is missing from this
// listing. The (itask, v2, vcp2) parameters, the m_fw_recv buffer and the kt0
// guard suggest mult_tm2_chiral_thread -- TODO confirm against the full file.
//
// Visible behaviour: for a task whose m_arg entry has kt0 == 1, reads two
// m_Nvc-wide blocks per (ixy, iz) site from the buffer obtained from
// m_fw_recv[idir], scales them by bc2, adds them to the id1 / id2 blocks of
// v2 and subtracts them from the id3 / id4 blocks.
//
//   itask : index into m_arg selecting this task's isite, isite_cpt and kt0.
//   v2    : output array, accumulated into through w2 = &v2[Nvcd * isite].
//   vcp2  : not referenced in any line visible here.
1880 int itask,
double *v2,
const double *vcp2)
// Nvc2 is the stride of one packed site in the receive buffer (see ix1 / ix2).
1882 int Nvc2 = 2 *
m_Nvc;
1884 int Nvcd2 = Nvcd / 2;
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index in v2.
1888 int id3 =
m_Nvc * 2;
1889 int id4 =
m_Nvc * 3;
// NOTE(review): these four temporaries are not used in any line visible here.
1894 double wt1r, wt1i, wt2r, wt2i;
1896 int isite =
m_arg[itask].isite;
1897 int isite_cp =
m_arg[itask].isite_cpt;
1899 double *w2 = &v2[Nvcd * isite];
// The declaration receiving this pointer is on a line missing from the listing
// (presumably w1, which is the array read in the innermost loop). The buffer
// is addressed at byte offset sizeof(double) * Nvcd2 * isite_cp.
1902 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt0 == 1 consume the received buffer.
1906 if (
m_arg[itask].kt0 == 1) {
1909 for (
int iz = 0; iz <
m_Mz; ++iz) {
1910 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index including the it term (it is set on a line not shown).
// is2 : the same site with the it term dropped; it addresses the packed buffer.
1911 int is = ixy + Nxy * (iz +
m_Nz * it);
1912 int is2 = ixy + Nxy * iz;
1914 int ix1 = Nvc2 * is2;
1915 int ix2 = ix1 +
m_Nvc;
1917 for (
int ic = 0; ic <
m_Nc; ++ic) {
// ici is the odd partner of icr (icr is set on a line not shown; presumably
// 2 * ic -- TODO confirm).
1919 int ici = 2 * ic + 1;
// The ix1 block goes to id1 with + and to id3 with -; the ix2 block goes to
// id2 with + and to id4 with -.
1920 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1921 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1922 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1923 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1924 w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1925 w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1926 w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1927 w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
// NOTE(review): the line carrying this function's name is missing from this
// listing. The (itask, v2, v1) parameters and the loop over it that reads the
// site at -Nxyz suggest mult_tmb_chiral_thread -- TODO confirm against the
// full file.
//
// Visible behaviour: for every site with it in [kt0, m_Mt), forms two
// temporary vectors from the v1 site one it-step behind (id1 - id3 and
// id2 - id4), applies mult_udagv_r / mult_udagv_i with u taken at that same
// earlier site, adds the result to the id1 / id2 blocks of v2 and subtracts
// it from the id3 / id4 blocks.
//
//   itask : index into m_arg selecting this task's isite and kt0.
//   v2    : output array, accumulated into through w2 = &v2[Nvcd * isite].
//   v1    : input array, read through w1 = &v1[Nvcd * isite].
1937 int itask,
double *v2,
const double *v1)
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index.
1943 int id3 =
m_Nvc * 2;
1944 int id4 =
m_Nvc * 3;
1949 double wt1r, wt1i, wt2r, wt2i;
1951 int isite =
m_arg[itask].isite;
1953 double *w2 = &v2[Nvcd * isite];
1954 const double *w1 = &v1[Nvcd * isite];
// kt0 makes the it loop skip its first iteration when it is 1.
1957 int kt0 =
m_arg[itask].kt0;
// Nxyz equals the change in the site index is when it increases by one.
1959 int Nxyz = Nxy *
m_Nz;
1961 for (
int it = kt0; it <
m_Mt; ++it) {
1962 for (
int iz = 0; iz <
m_Mz; ++iz) {
1963 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1964 int is = ixy + Nxy * (iz + m_Nz * it);
// in / ig: base indices in v1 and u of the site one it-step behind is.
1966 int in = Nvcd * (is - Nxyz);
1967 int ig =
m_Ndf * (is - Nxyz);
// Build the two temporaries as differences of block pairs (id1, id3) and
// (id2, id4).
1969 for (
int ic = 0; ic <
m_Nc; ++ic) {
1970 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1971 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1972 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1973 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
// ic2 is set on a line not shown; it indexes both u and the output pair
// (ic2, ic2 + 1) here.
1976 for (
int ic = 0; ic <
m_Nc; ++ic) {
1978 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1979 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1980 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1981 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
// wt1 / wt2 are added to the id1 / id2 blocks and subtracted from id3 / id4.
1983 w2[ic2 + id1 + iv] += wt1r;
1984 w2[ic2 + 1 + id1 + iv] += wt1i;
1985 w2[ic2 + id2 + iv] += wt2r;
1986 w2[ic2 + 1 + id2 + iv] += wt2i;
1987 w2[ic2 + id3 + iv] -= wt1r;
1988 w2[ic2 + 1 + id3 + iv] -= wt1i;
1989 w2[ic2 + id4 + iv] -= wt2r;
1990 w2[ic2 + 1 + id4 + iv] -= wt2i;
// NOTE(review): the line carrying this function's name is missing from this
// listing. The block-swapping copy suggests gm5_dirac_thread -- TODO confirm
// against the full file.
//
// Visible behaviour: for every site of the task's sub-block, copies v1 into
// v2 with the four m_Nvc-wide blocks exchanged pairwise: id1 <-> id3 and
// id2 <-> id4.
//
//   itask : index into m_arg selecting this task's isite.
//   v2    : output array, overwritten through w2 = &v2[Nvcd * isite].
//   v1    : input array, read through w1 = &v1[Nvcd * isite].
//
// v2 and v1 must not refer to the same storage: the id1 / id2 entries are
// overwritten before they are read back for the id3 / id4 assignments.
2000 int itask,
double *v2,
const double *v1)
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index.
2007 int id3 =
m_Nvc * 2;
2008 int id4 =
m_Nvc * 3;
2010 int isite =
m_arg[itask].isite;
2011 double *w2 = &v2[Nvcd * isite];
2012 const double *w1 = &v1[Nvcd * isite];
2014 for (
int it = 0; it <
m_Mt; ++it) {
2015 for (
int iz = 0; iz <
m_Mz; ++iz) {
2016 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// iv: base index of site (ixy, iz, it), Nvcd entries per site.
2017 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
2018 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
2019 w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2020 w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2021 w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2022 w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
// NOTE(review): the line carrying this function's name is missing from this
// listing. The copy with a sign flip on the last two blocks suggests
// gm5_chiral_thread -- TODO confirm against the full file.
//
// Visible behaviour: for every site of the task's sub-block, copies the id1
// and id2 blocks of v1 to v2 unchanged and the id3 and id4 blocks with their
// sign reversed.
//
//   itask : index into m_arg selecting this task's isite.
//   v2    : output array, overwritten through w2 = &v2[Nvcd * isite].
//   v1    : input array, read through w1 = &v1[Nvcd * isite].
2032 int itask,
double *v2,
const double *v1)
// id3 / id4: offsets 2 * m_Nvc and 3 * m_Nvc added to a site's base index.
2039 int id3 =
m_Nvc * 2;
2040 int id4 =
m_Nvc * 3;
2042 int isite =
m_arg[itask].isite;
2043 double *w2 = &v2[Nvcd * isite];
2044 const double *w1 = &v1[Nvcd * isite];
2046 for (
int it = 0; it <
m_Mt; ++it) {
2047 for (
int iz = 0; iz <
m_Mz; ++iz) {
2048 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// iv: base index of site (ixy, iz, it), Nvcd entries per site.
2049 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
// Each element is read and written at the same index, one element at a time.
2050 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
2051 w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2052 w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2053 w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2054 w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
void clear_thread(int, double *)
const double * ptr(const int jin, const int site, const int jex) const
void mult_ym1_thread(int, double *, const double *)
void mult_yp2_thread(int, double *, const double *)
void mult_xm1_thread(int, double *, const double *)
std::vector< Channel * > m_bw_recv
void mult_zm1_thread(int, double *, const double *)
static const std::string class_name
std::vector< Channel * > m_bw_send
void general(const char *format,...)
void gm5_dirac_thread(int, double *, const double *)
std::vector< mult_arg > m_arg
void mult_tp2_chiral_thread(int, double *, const double *)
void mult_tp2_dirac_thread(int, double *, const double *)
void mult_ymb_thread(int, double *, const double *)
void mult_ypb_thread(int, double *, const double *)
void mult_zm2_thread(int, double *, const double *)
void mult_tmb_dirac_thread(int, double *, const double *)
void mult_tp1_dirac_thread(int, double *, const double *)
void gm5_chiral_thread(int, double *, const double *)
void mult_tmb_chiral_thread(int, double *, const double *)
std::vector< double > m_boundary2
b.c. for each node.
void mult_yp1_thread(int, double *, const double *)
Bridge::VerboseLevel m_vl
void mult_xp1_thread(int, double *, const double *)
void mult_zp1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_recv
void mult_tm1_dirac_thread(int, double *, const double *)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
const Field_G * m_U
gauge configuration.
void mult_ym2_thread(int, double *, const double *)
void crucial(const char *format,...)
void mult_tp1_chiral_thread(int, double *, const double *)
void mult_xpb_thread(int, double *, const double *)
void daypx_thread(int, double *, double, const double *)
void mult_tpb_chiral_thread(int, double *, const double *)
std::vector< Channel * > m_fw_send
void mult_xp2_thread(int, double *, const double *)
void mult_zpb_thread(int, double *, const double *)
void mult_zp2_thread(int, double *, const double *)
void mult_tm2_dirac_thread(int, double *, const double *)
void mult_xmb_thread(int, double *, const double *)
void mult_tm2_chiral_thread(int, double *, const double *)
void mult_tm1_chiral_thread(int, double *, const double *)
void mult_tpb_dirac_thread(int, double *, const double *)
void mult_xm2_thread(int, double *, const double *)
void mult_zmb_thread(int, double *, const double *)