19 #if defined USE_GROUP_SU3
20 #include "fopr_Wilson_impl_SU3.inc"
21 #elif defined USE_GROUP_SU2
22 #include "fopr_Wilson_impl_SU2.inc"
23 #elif defined USE_GROUP_SU_N
24 #include "fopr_Wilson_impl_SU_N.inc"
// Per-task setup: visit every (ith_t, ith_z) pair and fill in that task's m_arg entry.
// NOTE(review): this listing omits source lines (the embedded numbers jump
// 79 -> 87 and 90 -> 94), so further per-task fields may be assigned on lines
// that are not visible here.
77 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
78 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index: ith_z varies fastest, ith_t slowest.
79 int itask = ith_z + m_Ntask_z * ith_t;
// Edge flags: kt0 / kz0 are set to 1 for the first task along t / z,
// kt1 / kz1 for the last task along t / z. The value given to non-edge tasks
// is not assigned on any visible line (presumably initialised earlier -- confirm).
87 if (ith_t == 0)
m_arg[itask].kt0 = 1;
88 if (ith_z == 0)
m_arg[itask].kz0 = 1;
89 if (ith_t == m_Ntask_t - 1)
m_arg[itask].kt1 = 1;
90 if (ith_z == m_Ntask_z - 1)
m_arg[itask].kz1 = 1;
// Site offsets used for the z- and t-direction boundary data:
// the z offset advances with ith_t in steps of m_Mt * Nxy sites,
// the t offset advances with ith_z in steps of m_Mz * Nxy sites.
94 m_arg[itask].isite_cpz = ith_t *
m_Mt * Nxy;
95 m_arg[itask].isite_cpt = ith_z *
m_Mz * Nxy;
// Multiplier applied per site in the byte offsets / sizes computed below:
// 2 * Nc * Nd / 2 doubles (presumably re/im x colour x half of the spinor
// components -- confirm against the definitions of Nc and Nd).
102 int Nvcd2 = 2 * Nc * Nd / 2;
// Per-task tables, one entry per task (m_Ntask entries each), filled in by the
// direction-specific loops that follow. How the tables are consumed is on
// lines not shown in this listing.
// NOTE(review): the element type is int, while the values stored later are
// built from sizeof(double) (a std::size_t product); a very large local volume
// could overflow on the implicit narrowing -- worth checking.
// Destination task id for each task.
104 std::vector<int> destid(
m_Ntask);
// Byte offset and byte size of each task's slice (single-slice directions).
105 std::vector<int> offset(
m_Ntask);
106 std::vector<int> datasize(
m_Ntask);
// Byte offsets of the "upper" and "lower" slices (directions that are split across tasks).
107 std::vector<int> offset_up(
m_Ntask);
108 std::vector<int> offset_lw(
m_Ntask);
// Byte sizes of the "upper" and "lower" slices.
109 std::vector<int> datasize_up(
m_Ntask);
110 std::vector<int> datasize_lw(
m_Ntask);
// Slice layout for a direction in which every task owns its own slice.
// The slice covers m_Mz * m_Mt * m_Ny sites, i.e. the face has no x extent,
// so this looks like the x-direction setup -- TODO confirm against the omitted lines.
// NOTE(review): source lines 115-116 (where itask and isite_cp are presumably
// declared) are missing from this listing.
113 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
114 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Each task is its own destination.
117 destid[itask] = itask;
// Byte offset of this task's slice: Nvcd2 doubles per site, starting at site isite_cp.
118 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
// Byte size of the slice: Nvcd2 doubles for each of the m_Mz * m_Mt * m_Ny sites.
119 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * m_Ny;
// Slice layout for a second direction in which every task owns its own slice.
// The slice covers m_Mz * m_Mt * m_Nx sites, i.e. the face has no y extent,
// so this looks like the y-direction setup -- TODO confirm against the omitted lines.
// NOTE(review): source lines 130-131 (where itask and isite_cp are presumably
// declared) are missing from this listing.
128 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
129 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Each task is its own destination.
132 destid[itask] = itask;
// Byte offset of this task's slice: Nvcd2 doubles per site, starting at site isite_cp.
133 offset[itask] =
sizeof(double) * Nvcd2 * isite_cp;
// Byte size of the slice: Nvcd2 doubles for each of the m_Mz * m_Mt * m_Nx sites.
134 datasize[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Mt * m_Nx;
// z-direction slice layout: each task gets an "upper" and a "lower" slice,
// both empty unless assigned below.
// NOTE(review): source lines 151 and 157 are missing from this listing, so a
// guard around the lower-slice assignments (and a possible destid reassignment
// inside the final if) cannot be seen here -- read the comments below with that in mind.
143 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
144 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index: ith_z varies fastest.
145 int itask = ith_z + m_Ntask_z * ith_t;
// Start with zero offset and zero size for both slices.
147 offset_up[itask] = 0;
148 offset_lw[itask] = 0;
149 datasize_up[itask] = 0;
150 datasize_lw[itask] = 0;
// Destination: the task with ith_z == m_Ntask_z - 1 in the same ith_t row.
152 destid[itask] = (m_Ntask_z - 1) + ith_t * m_Ntask_z;
// Lower slice: one face of m_Mt * m_Nx * m_Ny sites per ith_t, Nvcd2 doubles
// per site; the offset selects the face belonging to this ith_t (in bytes).
153 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx * m_Ny;
154 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx * m_Ny;
// The last task along z gets an upper slice with the same offset and size formula.
156 if (ith_z == m_Ntask_z - 1) {
158 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_t *
m_Mt *
m_Nx * m_Ny;
159 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mt *
m_Nx * m_Ny;
// t-direction slice layout: each task gets an "upper" and a "lower" slice,
// both empty unless assigned below.
// NOTE(review): source line 177 is missing from this listing, so a guard around
// the lower-slice assignments cannot be seen here -- confirm against the full file.
169 for (
int ith_t = 0; ith_t <
m_Ntask_t; ++ith_t) {
170 for (
int ith_z = 0; ith_z <
m_Ntask_z; ++ith_z) {
// Flattened task index: ith_z varies fastest.
171 int itask = ith_z + m_Ntask_z * ith_t;
// Start with zero offset and zero size for both slices.
173 offset_up[itask] = 0;
174 offset_lw[itask] = 0;
175 datasize_up[itask] = 0;
176 datasize_lw[itask] = 0;
// Destination: the task with ith_t == m_Ntask_t - 1 in the same ith_z column.
178 destid[itask] = ith_z + (m_Ntask_t - 1) * m_Ntask_z;
// Lower slice: one face of m_Mz * m_Nx * m_Ny sites per ith_z, Nvcd2 doubles
// per site; the offset selects the face belonging to this ith_z (in bytes).
179 offset_lw[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx * m_Ny;
180 datasize_lw[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx * m_Ny;
// The last task along t is redirected to the first-t task of the same ith_z
// column (task index ith_z) and gets an upper slice with the same offset and
// size formula as the lower one.
182 if (ith_t == m_Ntask_t - 1) {
183 destid[itask] = ith_z;
184 offset_up[itask] =
sizeof(double) * Nvcd2 * ith_z *
m_Mz *
m_Nx * m_Ny;
185 datasize_up[itask] =
sizeof(double) * Nvcd2 *
m_Mz *
m_Nx * m_Ny;
198 int itask,
double *v2,
double fac,
const double *v1)
203 int isite =
m_arg[itask].isite;
205 const double *w1 = &v1[Nvcd * isite];
206 double *w2 = &v2[Nvcd * isite];
208 for (
int it = 0; it <
m_Mt; ++it) {
209 for (
int iz = 0; iz <
m_Mz; ++iz) {
210 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
211 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
212 w2[iv] = fac * w2[iv] + w1[iv];
226 int isite =
m_arg[itask].isite;
227 double *w2 = &v2[Nvcd * isite];
229 for (
int it = 0; it <
m_Mt; ++it) {
230 for (
int iz = 0; iz <
m_Mz; ++iz) {
231 for (
int ivxy = 0; ivxy < Nvxy; ++ivxy) {
232 int iv = ivxy + Nvxy * (iz +
m_Nz * it);
242 int itask,
double *vcp1,
const double *v1)
244 int Nvc2 = 2 *
m_Nvc;
246 int Nvcd2 = Nvcd / 2;
256 int isite =
m_arg[itask].isite;
257 int isite_cp =
m_arg[itask].isite_cpx;
260 const double *w1 = &v1[Nvcd * isite];
262 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
266 for (
int it = 0; it <
m_Mt; ++it) {
267 for (
int iz = 0; iz <
m_Mz; ++iz) {
268 for (
int iy = 0; iy <
m_Ny; ++iy) {
269 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
270 int is2 = iy + m_Ny * (iz + m_Mz * it);
272 int ix1 = Nvc2 * is2;
273 int ix2 = ix1 +
m_Nvc;
275 for (
int ic = 0; ic <
m_Nc; ++ic) {
276 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in]);
277 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in]);
278 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in]);
279 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in]);
291 int itask,
double *v2,
const double *vcp2)
293 int Nvc2 = 2 *
m_Nvc;
295 int Nvcd2 = Nvcd / 2;
304 double wt1r, wt1i, wt2r, wt2i;
306 int isite =
m_arg[itask].isite;
307 int isite_cp =
m_arg[itask].isite_cpx;
309 double *w2 = &v2[Nvcd * isite];
312 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
318 for (
int it = 0; it <
m_Mt; ++it) {
319 for (
int iz = 0; iz <
m_Mz; ++iz) {
320 for (
int iy = 0; iy <
m_Ny; ++iy) {
321 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
322 int is2 = iy + m_Ny * (iz + m_Mz * it);
325 int ix1 = Nvc2 * is2;
326 int ix2 = ix1 +
m_Nvc;
328 for (
int ic = 0; ic <
m_Nc; ++ic) {
329 int ic2 = ic *
m_Nvc;
331 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
332 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
333 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
334 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
336 w2[2 * ic + id1 + iv] += wt1r;
337 w2[2 * ic + 1 + id1 + iv] += wt1i;
338 w2[2 * ic + id2 + iv] += wt2r;
339 w2[2 * ic + 1 + id2 + iv] += wt2i;
340 w2[2 * ic + id3 + iv] += wt2i;
341 w2[2 * ic + 1 + id3 + iv] += -wt2r;
342 w2[2 * ic + id4 + iv] += wt1i;
343 w2[2 * ic + 1 + id4 + iv] += -wt1r;
353 int itask,
double *v2,
const double *v1)
365 double wt1r, wt1i, wt2r, wt2i;
367 int isite =
m_arg[itask].isite;
369 const double *w1 = &v1[Nvcd * isite];
370 double *w2 = &v2[Nvcd * isite];
373 for (
int it = 0; it <
m_Mt; ++it) {
374 for (
int iz = 0; iz <
m_Mz; ++iz) {
375 for (
int iy = 0; iy <
m_Ny; ++iy) {
376 for (
int ix = 0; ix <
m_Nx - 1; ++ix) {
377 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
379 int in = Nvcd * (is + 1);
382 for (
int ic = 0; ic <
m_Nc; ++ic) {
383 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id4 + in];
384 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id4 + in];
385 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id3 + in];
386 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id3 + in];
389 for (
int ic = 0; ic <
m_Nc; ++ic) {
390 int ic2 = ic *
m_Nvc;
392 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
393 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
394 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
395 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
397 w2[2 * ic + id1 + iv] += wt1r;
398 w2[2 * ic + 1 + id1 + iv] += wt1i;
399 w2[2 * ic + id2 + iv] += wt2r;
400 w2[2 * ic + 1 + id2 + iv] += wt2i;
401 w2[2 * ic + id3 + iv] += wt2i;
402 w2[2 * ic + 1 + id3 + iv] += -wt2r;
403 w2[2 * ic + id4 + iv] += wt1i;
404 w2[2 * ic + 1 + id4 + iv] += -wt1r;
415 int itask,
double *vcp1,
const double *v1)
417 int Nvc2 = 2 *
m_Nvc;
419 int Nvcd2 = Nvcd / 2;
428 int isite =
m_arg[itask].isite;
429 int isite_cp =
m_arg[itask].isite_cpx;
431 const double *w1 = &v1[Nvcd * isite];
434 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
441 for (
int it = 0; it <
m_Mt; ++it) {
442 for (
int iz = 0; iz <
m_Mz; ++iz) {
443 for (
int iy = 0; iy <
m_Ny; ++iy) {
444 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
445 int is2 = iy + m_Ny * (iz + m_Mz * it);
448 int ix1 = Nvc2 * is2;
449 int ix2 = ix1 +
m_Nvc;
451 for (
int ic = 0; ic <
m_Nc; ++ic) {
452 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
453 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
454 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
455 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
458 for (
int ic = 0; ic <
m_Nc; ++ic) {
460 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
461 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
462 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
463 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
475 int itask,
double *v2,
const double *vcp2)
477 int Nvc2 = 2 *
m_Nvc;
479 int Nvcd2 = Nvcd / 2;
489 double wt1r, wt1i, wt2r, wt2i;
491 int isite =
m_arg[itask].isite;
492 int isite_cp =
m_arg[itask].isite_cpx;
494 double *w2 = &v2[Nvcd * isite];
497 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
502 for (
int it = 0; it <
m_Mt; ++it) {
503 for (
int iz = 0; iz <
m_Mz; ++iz) {
504 for (
int iy = 0; iy <
m_Ny; ++iy) {
505 int is = ix +
m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
506 int is2 = iy + m_Ny * (iz + m_Mz * it);
508 int ix1 = Nvc2 * is2;
509 int ix2 = ix1 +
m_Nvc;
511 for (
int ic = 0; ic <
m_Nc; ++ic) {
513 int ici = 2 * ic + 1;
514 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
515 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
516 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
517 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
518 w2[icr + id3 + iv] += -bc2 * w1[ici + ix2];
519 w2[ici + id3 + iv] += +bc2 * w1[icr + ix2];
520 w2[icr + id4 + iv] += -bc2 * w1[ici + ix1];
521 w2[ici + id4 + iv] += +bc2 * w1[icr + ix1];
531 int itask,
double *v2,
const double *v1)
543 double wt1r, wt1i, wt2r, wt2i;
545 int isite =
m_arg[itask].isite;
547 const double *w1 = &v1[Nvcd * isite];
548 double *w2 = &v2[Nvcd * isite];
551 for (
int it = 0; it <
m_Mt; ++it) {
552 for (
int iz = 0; iz <
m_Mz; ++iz) {
553 for (
int iy = 0; iy <
m_Ny; ++iy) {
554 for (
int ix = 1; ix <
m_Nx; ++ix) {
555 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
557 int in = Nvcd * (is - 1);
558 int ig =
m_Ndf * (is - 1);
560 for (
int ic = 0; ic <
m_Nc; ++ic) {
561 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id4 + in];
562 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id4 + in];
563 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id3 + in];
564 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id3 + in];
567 for (
int ic = 0; ic <
m_Nc; ++ic) {
570 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
571 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
572 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
573 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
575 w2[2 * ic + id1 + iv] += wt1r;
576 w2[2 * ic + 1 + id1 + iv] += wt1i;
577 w2[2 * ic + id2 + iv] += wt2r;
578 w2[2 * ic + 1 + id2 + iv] += wt2i;
579 w2[2 * ic + id3 + iv] += -wt2i;
580 w2[2 * ic + 1 + id3 + iv] += +wt2r;
581 w2[2 * ic + id4 + iv] += -wt1i;
582 w2[2 * ic + 1 + id4 + iv] += +wt1r;
593 int itask,
double *vcp1,
const double *v1)
595 int Nvc2 = 2 *
m_Nvc;
597 int Nvcd2 = Nvcd / 2;
604 int isite =
m_arg[itask].isite;
605 int isite_cp =
m_arg[itask].isite_cpy;
610 const double *w1 = &v1[Nvcd * isite];
613 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
618 for (
int it = 0; it <
m_Mt; ++it) {
619 for (
int iz = 0; iz <
m_Mz; ++iz) {
620 for (
int ix = 0; ix <
m_Nx; ++ix) {
621 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
622 int is2 = ix + m_Nx * (iz + m_Mz * it);
624 int ix1 = Nvc2 * is2;
625 int ix2 = ix1 +
m_Nvc;
627 for (
int ic = 0; ic <
m_Nc; ++ic) {
628 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in]);
629 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in]);
630 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in]);
631 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in]);
643 int itask,
double *v2,
const double *vcp2)
645 int Nvc2 = 2 *
m_Nvc;
647 int Nvcd2 = Nvcd / 2;
656 double wt1r, wt1i, wt2r, wt2i;
658 int isite =
m_arg[itask].isite;
659 int isite_cp =
m_arg[itask].isite_cpy;
661 double *w2 = &v2[Nvcd * isite];
664 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
670 for (
int it = 0; it <
m_Mt; ++it) {
671 for (
int iz = 0; iz <
m_Mz; ++iz) {
672 for (
int ix = 0; ix <
m_Nx; ++ix) {
673 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
674 int is2 = ix + m_Nx * (iz + m_Mz * it);
677 int ix1 = Nvc2 * is2;
678 int ix2 = ix1 +
m_Nvc;
680 for (
int ic = 0; ic <
m_Nc; ++ic) {
681 int ic2 = ic *
m_Nvc;
683 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
684 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
685 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
686 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
688 w2[2 * ic + id1 + iv] += wt1r;
689 w2[2 * ic + 1 + id1 + iv] += wt1i;
690 w2[2 * ic + id2 + iv] += wt2r;
691 w2[2 * ic + 1 + id2 + iv] += wt2i;
692 w2[2 * ic + id3 + iv] += -wt2r;
693 w2[2 * ic + 1 + id3 + iv] += -wt2i;
694 w2[2 * ic + id4 + iv] += wt1r;
695 w2[2 * ic + 1 + id4 + iv] += wt1i;
705 int itask,
double *v2,
const double *v1)
717 double wt1r, wt1i, wt2r, wt2i;
719 int isite =
m_arg[itask].isite;
721 double *w2 = &v2[Nvcd * isite];
722 const double *w1 = &v1[Nvcd * isite];
725 for (
int it = 0; it <
m_Mt; ++it) {
726 for (
int iz = 0; iz <
m_Mz; ++iz) {
727 for (
int iy = 0; iy <
m_Ny - 1; ++iy) {
728 for (
int ix = 0; ix <
m_Nx; ++ix) {
729 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
731 int in = Nvcd * (is +
m_Nx);
734 for (
int ic = 0; ic <
m_Nc; ++ic) {
735 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id4 + in];
736 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id4 + in];
737 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id3 + in];
738 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id3 + in];
741 for (
int ic = 0; ic <
m_Nc; ++ic) {
742 int ic2 = ic *
m_Nvc;
744 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
745 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
746 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
747 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
749 w2[2 * ic + id1 + iv] += wt1r;
750 w2[2 * ic + 1 + id1 + iv] += wt1i;
751 w2[2 * ic + id2 + iv] += wt2r;
752 w2[2 * ic + 1 + id2 + iv] += wt2i;
753 w2[2 * ic + id3 + iv] += -wt2r;
754 w2[2 * ic + 1 + id3 + iv] += -wt2i;
755 w2[2 * ic + id4 + iv] += wt1r;
756 w2[2 * ic + 1 + id4 + iv] += wt1i;
767 int itask,
double *vcp1,
const double *v1)
769 int Nvc2 = 2 *
m_Nvc;
771 int Nvcd2 = Nvcd / 2;
780 int isite =
m_arg[itask].isite;
781 int isite_cp =
m_arg[itask].isite_cpy;
785 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
787 const double *w1 = &v1[Nvcd * isite];
794 for (
int it = 0; it <
m_Mt; ++it) {
795 for (
int iz = 0; iz <
m_Mz; ++iz) {
796 for (
int ix = 0; ix <
m_Nx; ++ix) {
797 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
798 int is2 = ix + m_Nx * (iz + m_Mz * it);
801 int ix1 = Nvc2 * is2;
802 int ix2 = ix1 +
m_Nvc;
804 for (
int ic = 0; ic <
m_Nc; ++ic) {
805 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
806 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
807 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
808 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
811 for (
int ic = 0; ic <
m_Nc; ++ic) {
813 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
814 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
815 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
816 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
828 int itask,
double *v2,
const double *vcp2)
830 int Nvc2 = 2 *
m_Nvc;
832 int Nvcd2 = Nvcd / 2;
842 double wt1r, wt1i, wt2r, wt2i;
844 int isite =
m_arg[itask].isite;
845 int isite_cp =
m_arg[itask].isite_cpy;
847 double *w2 = &v2[Nvcd * isite];
850 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
855 for (
int it = 0; it <
m_Mt; ++it) {
856 for (
int iz = 0; iz <
m_Mz; ++iz) {
857 for (
int ix = 0; ix <
m_Nx; ++ix) {
858 int is = ix + m_Nx * (iy +
m_Ny * (iz +
m_Nz * it));
859 int is2 = ix + m_Nx * (iz + m_Mz * it);
861 int ix1 = Nvc2 * is2;
862 int ix2 = ix1 +
m_Nvc;
864 for (
int ic = 0; ic <
m_Nc; ++ic) {
866 int ici = 2 * ic + 1;
867 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
868 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
869 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
870 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
871 w2[icr + id3 + iv] += bc2 * w1[icr + ix2];
872 w2[ici + id3 + iv] += bc2 * w1[ici + ix2];
873 w2[icr + id4 + iv] += -bc2 * w1[icr + ix1];
874 w2[ici + id4 + iv] += -bc2 * w1[ici + ix1];
884 int itask,
double *v2,
const double *v1)
896 double wt1r, wt1i, wt2r, wt2i;
898 int isite =
m_arg[itask].isite;
900 double *w2 = &v2[Nvcd * isite];
901 const double *w1 = &v1[Nvcd * isite];
904 for (
int it = 0; it <
m_Mt; ++it) {
905 for (
int iz = 0; iz <
m_Mz; ++iz) {
906 for (
int iy = 1; iy <
m_Ny; ++iy) {
907 for (
int ix = 0; ix <
m_Nx; ++ix) {
908 int is = ix + m_Nx * (iy + m_Ny * (iz +
m_Nz * it));
910 int in = Nvcd * (is -
m_Nx);
913 for (
int ic = 0; ic <
m_Nc; ++ic) {
914 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id4 + in];
915 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id4 + in];
916 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id3 + in];
917 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id3 + in];
920 for (
int ic = 0; ic <
m_Nc; ++ic) {
922 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
923 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
924 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
925 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
927 w2[ic2 + id1 + iv] += wt1r;
928 w2[ic2 + 1 + id1 + iv] += wt1i;
929 w2[ic2 + id2 + iv] += wt2r;
930 w2[ic2 + 1 + id2 + iv] += wt2i;
931 w2[ic2 + id3 + iv] += wt2r;
932 w2[ic2 + 1 + id3 + iv] += wt2i;
933 w2[ic2 + id4 + iv] += -wt1r;
934 w2[ic2 + 1 + id4 + iv] += -wt1i;
945 int itask,
double *vcp1,
const double *v1)
947 int Nvc2 = 2 *
m_Nvc;
949 int Nvcd2 = Nvcd / 2;
956 int isite =
m_arg[itask].isite;
957 int isite_cp =
m_arg[itask].isite_cpz;
964 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
965 const double *w1 = &v1[Nvcd * isite];
967 if (
m_arg[itask].kz0 == 1) {
970 for (
int it = 0; it <
m_Mt; ++it) {
971 for (
int ixy = 0; ixy < Nxy; ++ixy) {
972 int is = ixy + Nxy * (iz +
m_Nz * it);
973 int is2 = ixy + Nxy * it;
976 int ix1 = Nvc2 * is2;
977 int ix2 = ix1 +
m_Nvc;
979 for (
int ic = 0; ic <
m_Nc; ++ic) {
980 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in]);
981 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in]);
982 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in]);
983 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in]);
995 int itask,
double *v2,
const double *vcp2)
997 int Nvc2 = 2 *
m_Nvc;
999 int Nvcd2 = Nvcd / 2;
1003 int id3 =
m_Nvc * 2;
1004 int id4 =
m_Nvc * 3;
1008 double wt1r, wt1i, wt2r, wt2i;
1010 int isite =
m_arg[itask].isite;
1011 int isite_cp =
m_arg[itask].isite_cpz;
1013 double *w2 = &v2[Nvcd * isite];
1016 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1021 if (
m_arg[itask].kz1 == 1) {
1024 for (
int it = 0; it <
m_Mt; ++it) {
1025 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1026 int is = ixy + Nxy * (iz +
m_Nz * it);
1027 int is2 = ixy + Nxy * it;
1029 int ig =
m_Ndf * is;
1030 int ix1 = Nvc2 * is2;
1031 int ix2 = ix1 +
m_Nvc;
1033 for (
int ic = 0; ic <
m_Nc; ++ic) {
1034 int ic2 = ic *
m_Nvc;
1036 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1037 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1038 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1039 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1041 w2[2 * ic + id1 + iv] += wt1r;
1042 w2[2 * ic + 1 + id1 + iv] += wt1i;
1043 w2[2 * ic + id2 + iv] += wt2r;
1044 w2[2 * ic + 1 + id2 + iv] += wt2i;
1045 w2[2 * ic + id3 + iv] += wt1i;
1046 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1047 w2[2 * ic + id4 + iv] += -wt2i;
1048 w2[2 * ic + 1 + id4 + iv] += wt2r;
1058 int itask,
double *v2,
const double *v1)
1064 int id3 =
m_Nvc * 2;
1065 int id4 =
m_Nvc * 3;
1070 double wt1r, wt1i, wt2r, wt2i;
1072 int isite =
m_arg[itask].isite;
1074 double *w2 = &v2[Nvcd * isite];
1075 const double *w1 = &v1[Nvcd * isite];
1078 int kz1 =
m_arg[itask].kz1;
1081 for (
int it = 0; it <
m_Mt; ++it) {
1082 for (
int iz = 0; iz <
m_Mz - kz1; ++iz) {
1083 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1084 int is = ixy + Nxy * (iz +
m_Nz * it);
1086 int in = Nvcd * (is + Nxy);
1087 int ig =
m_Ndf * is;
1089 for (
int ic = 0; ic <
m_Nc; ++ic) {
1090 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + 1 + id3 + in];
1091 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + id3 + in];
1092 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + 1 + id4 + in];
1093 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + id4 + in];
1096 for (
int ic = 0; ic <
m_Nc; ++ic) {
1097 int ic2 = ic *
m_Nvc;
1099 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1100 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1101 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1102 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1104 w2[2 * ic + id1 + iv] += wt1r;
1105 w2[2 * ic + 1 + id1 + iv] += wt1i;
1106 w2[2 * ic + id2 + iv] += wt2r;
1107 w2[2 * ic + 1 + id2 + iv] += wt2i;
1108 w2[2 * ic + id3 + iv] += wt1i;
1109 w2[2 * ic + 1 + id3 + iv] += -wt1r;
1110 w2[2 * ic + id4 + iv] += -wt2i;
1111 w2[2 * ic + 1 + id4 + iv] += wt2r;
1121 int itask,
double *vcp1,
const double *v1)
1123 int Nvc2 = 2 *
m_Nvc;
1125 int Nvcd2 = Nvcd / 2;
1129 int id3 =
m_Nvc * 2;
1130 int id4 =
m_Nvc * 3;
1134 int isite =
m_arg[itask].isite;
1135 int isite_cp =
m_arg[itask].isite_cpz;
1139 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1140 const double *w1 = &v1[Nvcd * isite];
1145 if (
m_arg[itask].kz1 == 1) {
1148 for (
int it = 0; it <
m_Mt; ++it) {
1149 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1150 int is = ixy + Nxy * (iz +
m_Nz * it);
1151 int is2 = ixy + Nxy * it;
1153 int ig =
m_Ndf * is;
1154 int ix1 = Nvc2 * is2;
1155 int ix2 = ix1 +
m_Nvc;
1157 for (
int ic = 0; ic <
m_Nc; ++ic) {
1158 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1159 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1160 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1161 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1164 for (
int ic = 0; ic <
m_Nc; ++ic) {
1166 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1167 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1168 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1169 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1181 int itask,
double *v2,
const double *vcp2)
1183 int Nvc2 = 2 *
m_Nvc;
1185 int Nvcd2 = Nvcd / 2;
1189 int id3 =
m_Nvc * 2;
1190 int id4 =
m_Nvc * 3;
1195 double wt1r, wt1i, wt2r, wt2i;
1197 int isite =
m_arg[itask].isite;
1198 int isite_cp =
m_arg[itask].isite_cpz;
1200 double *w2 = &v2[Nvcd * isite];
1203 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1207 if (
m_arg[itask].kz0 == 1) {
1211 for (
int it = 0; it <
m_Mt; ++it) {
1212 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1213 int is = ixy + Nxy * (iz +
m_Nz * it);
1214 int is2 = ixy + Nxy * it;
1216 int ix1 = Nvc2 * is2;
1217 int ix2 = ix1 +
m_Nvc;
1219 for (
int ic = 0; ic <
m_Nc; ++ic) {
1221 int ici = 2 * ic + 1;
1222 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1223 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1224 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1225 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1226 w2[icr + id3 + iv] += -bc2 * w1[ici + ix1];
1227 w2[ici + id3 + iv] += bc2 * w1[icr + ix1];
1228 w2[icr + id4 + iv] += bc2 * w1[ici + ix2];
1229 w2[ici + id4 + iv] += -bc2 * w1[icr + ix2];
1239 int itask,
double *v2,
const double *v1)
1245 int id3 =
m_Nvc * 2;
1246 int id4 =
m_Nvc * 3;
1251 double wt1r, wt1i, wt2r, wt2i;
1253 int isite =
m_arg[itask].isite;
1255 double *w2 = &v2[Nvcd * isite];
1256 const double *w1 = &v1[Nvcd * isite];
1259 int kz0 =
m_arg[itask].kz0;
1262 for (
int it = 0; it <
m_Mt; ++it) {
1263 for (
int iz = kz0; iz <
m_Mz; ++iz) {
1264 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1265 int is = ixy + Nxy * (iz +
m_Nz * it);
1267 int in = Nvcd * (is - Nxy);
1268 int ig =
m_Ndf * (is - Nxy);
1270 for (
int ic = 0; ic <
m_Nc; ++ic) {
1271 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + 1 + id3 + in];
1272 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + id3 + in];
1273 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + 1 + id4 + in];
1274 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + id4 + in];
1277 for (
int ic = 0; ic <
m_Nc; ++ic) {
1279 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1280 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1281 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1282 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1284 w2[ic2 + id1 + iv] += wt1r;
1285 w2[ic2 + 1 + id1 + iv] += wt1i;
1286 w2[ic2 + id2 + iv] += wt2r;
1287 w2[ic2 + 1 + id2 + iv] += wt2i;
1288 w2[ic2 + id3 + iv] += -wt1i;
1289 w2[ic2 + 1 + id3 + iv] += wt1r;
1290 w2[ic2 + id4 + iv] += wt2i;
1291 w2[ic2 + 1 + id4 + iv] += -wt2r;
1301 int itask,
double *vcp1,
const double *v1)
1303 int Nvc2 = 2 *
m_Nvc;
1305 int Nvcd2 = Nvcd / 2;
1309 int id3 =
m_Nvc * 2;
1310 int id4 =
m_Nvc * 3;
1312 int isite =
m_arg[itask].isite;
1313 int isite_cp =
m_arg[itask].isite_cpt;
1320 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1321 const double *w1 = &v1[Nvcd * isite];
1323 if (
m_arg[itask].kt0 == 1) {
1326 for (
int iz = 0; iz <
m_Mz; ++iz) {
1327 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1328 int is = ixy + Nxy * (iz +
m_Nz * it);
1329 int is2 = ixy + Nxy * iz;
1332 int ix1 = Nvc2 * is2;
1333 int ix2 = ix1 +
m_Nvc;
1335 for (
int ic = 0; ic <
m_Nc; ++ic) {
1336 w2[2 * ic + ix1] = 2.0 * bc2 * w1[2 * ic + id3 + in];
1337 w2[2 * ic + 1 + ix1] = 2.0 * bc2 * w1[2 * ic + 1 + id3 + in];
1338 w2[2 * ic + ix2] = 2.0 * bc2 * w1[2 * ic + id4 + in];
1339 w2[2 * ic + 1 + ix2] = 2.0 * bc2 * w1[2 * ic + 1 + id4 + in];
1351 int itask,
double *v2,
const double *vcp2)
1353 int Nvc2 = 2 *
m_Nvc;
1355 int Nvcd2 = Nvcd / 2;
1359 int id3 =
m_Nvc * 2;
1360 int id4 =
m_Nvc * 3;
1364 double wt1r, wt1i, wt2r, wt2i;
1366 int isite =
m_arg[itask].isite;
1367 int isite_cp =
m_arg[itask].isite_cpt;
1369 double *w2 = &v2[Nvcd * isite];
1372 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1377 if (
m_arg[itask].kt1 == 1) {
1380 for (
int iz = 0; iz <
m_Mz; ++iz) {
1381 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1382 int is = ixy + Nxy * (iz +
m_Nz * it);
1383 int is2 = ixy + Nxy * iz;
1385 int ig =
m_Ndf * is;
1386 int ix1 = Nvc2 * is2;
1387 int ix2 = ix1 +
m_Nvc;
1389 for (
int ic = 0; ic <
m_Nc; ++ic) {
1390 int ic2 = ic *
m_Nvc;
1392 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1393 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1394 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1395 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
1397 w2[2 * ic + id3 + iv] += wt1r;
1398 w2[2 * ic + 1 + id3 + iv] += wt1i;
1399 w2[2 * ic + id4 + iv] += wt2r;
1400 w2[2 * ic + 1 + id4 + iv] += wt2i;
1410 int itask,
double *v2,
const double *v1)
1416 int id3 =
m_Nvc * 2;
1417 int id4 =
m_Nvc * 3;
1422 double wt1r, wt1i, wt2r, wt2i;
1424 int isite =
m_arg[itask].isite;
1426 double *w2 = &v2[Nvcd * isite];
1427 const double *w1 = &v1[Nvcd * isite];
1430 int kt1 =
m_arg[itask].kt1;
1432 int Nxyz = Nxy *
m_Nz;
1434 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1435 for (
int iz = 0; iz <
m_Mz; ++iz) {
1436 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1437 int is = ixy + Nxy * (iz + m_Nz * it);
1439 int in = Nvcd * (is + Nxyz);
1440 int ig =
m_Ndf * is;
1442 for (
int ic = 0; ic <
m_Nc; ++ic) {
1443 vt1[2 * ic] = 2.0 * w1[2 * ic + id3 + in];
1444 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id3 + in];
1445 vt2[2 * ic] = 2.0 * w1[2 * ic + id4 + in];
1446 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id4 + in];
1449 for (
int ic = 0; ic <
m_Nc; ++ic) {
1450 int ic2 = ic *
m_Nvc;
1452 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1453 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1454 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1455 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
1457 w2[2 * ic + id3 + iv] += wt1r;
1458 w2[2 * ic + 1 + id3 + iv] += wt1i;
1459 w2[2 * ic + id4 + iv] += wt2r;
1460 w2[2 * ic + 1 + id4 + iv] += wt2i;
1470 int itask,
double *vcp1,
const double *v1)
1472 int Nvc2 = 2 *
m_Nvc;
1474 int Nvcd2 = Nvcd / 2;
1478 int id3 =
m_Nvc * 2;
1479 int id4 =
m_Nvc * 3;
1483 int isite =
m_arg[itask].isite;
1484 int isite_cp =
m_arg[itask].isite_cpt;
1488 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1489 const double *w1 = &v1[Nvcd * isite];
1494 if (
m_arg[itask].kt1 == 1) {
1497 for (
int iz = 0; iz <
m_Mz; ++iz) {
1498 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1499 int is = ixy + Nxy * (iz +
m_Nz * it);
1500 int is2 = ixy + Nxy * iz;
1502 int ig =
m_Ndf * is;
1503 int ix1 = Nvc2 * is2;
1504 int ix2 = ix1 +
m_Nvc;
1506 for (
int ic = 0; ic <
m_Nc; ++ic) {
1507 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1508 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1509 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1510 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1513 for (
int ic = 0; ic <
m_Nc; ++ic) {
1515 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1516 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1517 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1518 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
1530 int itask,
double *v2,
const double *vcp2)
1532 int Nvc2 = 2 *
m_Nvc;
1534 int Nvcd2 = Nvcd / 2;
1538 int id3 =
m_Nvc * 2;
1539 int id4 =
m_Nvc * 3;
1544 double wt1r, wt1i, wt2r, wt2i;
1546 int isite =
m_arg[itask].isite;
1547 int isite_cp =
m_arg[itask].isite_cpt;
1549 double *w2 = &v2[Nvcd * isite];
1552 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1556 if (
m_arg[itask].kt0 == 1) {
1559 for (
int iz = 0; iz <
m_Mz; ++iz) {
1560 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1561 int is = ixy + Nxy * (iz +
m_Nz * it);
1562 int is2 = ixy + Nxy * iz;
1564 int ix1 = Nvc2 * is2;
1565 int ix2 = ix1 +
m_Nvc;
1567 for (
int ic = 0; ic <
m_Nc; ++ic) {
1569 int ici = 2 * ic + 1;
1570 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1571 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1572 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1573 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1583 int itask,
double *v2,
const double *v1)
1589 int id3 =
m_Nvc * 2;
1590 int id4 =
m_Nvc * 3;
1595 double wt1r, wt1i, wt2r, wt2i;
1597 int isite =
m_arg[itask].isite;
1599 double *w2 = &v2[Nvcd * isite];
1600 const double *w1 = &v1[Nvcd * isite];
1603 int kt0 =
m_arg[itask].kt0;
1605 int Nxyz = Nxy *
m_Nz;
1607 for (
int it = kt0; it <
m_Mt; ++it) {
1608 for (
int iz = 0; iz <
m_Mz; ++iz) {
1609 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1610 int is = ixy + Nxy * (iz + m_Nz * it);
1612 int in = Nvcd * (is - Nxyz);
1613 int ig =
m_Ndf * (is - Nxyz);
1615 for (
int ic = 0; ic <
m_Nc; ++ic) {
1616 vt1[2 * ic] = 2.0 * w1[2 * ic + id1 + in];
1617 vt1[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id1 + in];
1618 vt2[2 * ic] = 2.0 * w1[2 * ic + id2 + in];
1619 vt2[2 * ic + 1] = 2.0 * w1[2 * ic + 1 + id2 + in];
1622 for (
int ic = 0; ic <
m_Nc; ++ic) {
1624 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1625 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1626 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1627 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
1629 w2[ic2 + id1 + iv] += wt1r;
1630 w2[ic2 + 1 + id1 + iv] += wt1i;
1631 w2[ic2 + id2 + iv] += wt2r;
1632 w2[ic2 + 1 + id2 + iv] += wt2i;
1642 int itask,
double *vcp1,
const double *v1)
1644 int Nvc2 = 2 *
m_Nvc;
1646 int Nvcd2 = Nvcd / 2;
1650 int id3 =
m_Nvc * 2;
1651 int id4 =
m_Nvc * 3;
1653 int isite =
m_arg[itask].isite;
1654 int isite_cp =
m_arg[itask].isite_cpt;
1661 = (
double *)
m_bw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1662 const double *w1 = &v1[Nvcd * isite];
1664 if (
m_arg[itask].kt0 == 1) {
1667 for (
int iz = 0; iz <
m_Mz; ++iz) {
1668 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1669 int is = ixy + Nxy * (iz +
m_Nz * it);
1670 int is2 = ixy + Nxy * iz;
1673 int ix1 = Nvc2 * is2;
1674 int ix2 = ix1 +
m_Nvc;
1676 for (
int ic = 0; ic <
m_Nc; ++ic) {
1677 w2[2 * ic + ix1] = bc2 * (w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in]);
1678 w2[2 * ic + 1 + ix1] = bc2 * (w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in]);
1679 w2[2 * ic + ix2] = bc2 * (w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in]);
1680 w2[2 * ic + 1 + ix2] = bc2 * (w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in]);
// Per-task kernel (itask, v2, vcp2): reads packed two-block site data from the
// m_bw_recv[idir] buffer, multiplies both blocks with the link data `u` via
// mult_uv_r / mult_uv_i and accumulates the result into v2.
// The function-name line, the braces and several declarations (id1, id2, idir,
// it, iv, u, Nvcd, Nxy and the left-hand side of 1713) are absent from this
// listing. vcp2 is not referenced in the visible lines.
// NOTE(review): presumably Fopr_Wilson_impl::mult_tpb_chiral_thread -- confirm.
1692 int itask,
double *v2,
const double *vcp2)
// Stride of one packed site in the receive buffer: two m_Nvc-wide blocks.
1694 int Nvc2 = 2 *
m_Nvc;
// Half of the per-site data length; used for the buffer byte offset on 1713.
1696 int Nvcd2 = Nvcd / 2;
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
1700 int id3 =
m_Nvc * 2;
1701 int id4 =
m_Nvc * 3;
// Scratch values returned by the mult_uv_r / mult_uv_i calls below.
1705 double wt1r, wt1i, wt2r, wt2i;
// Per-task offsets taken from m_arg (field offset and boundary-buffer offset).
1707 int isite =
m_arg[itask].isite;
1708 int isite_cp =
m_arg[itask].isite_cpt;
1710 double *w2 = &v2[Nvcd * isite];
// Source pointer into the backward receive buffer at this task's byte offset.
// NOTE(review): the declaration receiving this value is missing here;
// presumably `const double *w1`, which is what the loop below reads from.
1713 = (
double *)
m_bw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt1 consume the buffer (kt1 is set for
// ith_t == m_Ntask_t - 1 in the setup code near the top of the file).
1718 if (
m_arg[itask].kt1 == 1) {
1721 for (
int iz = 0; iz <
m_Mz; ++iz) {
1722 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index inside the task's slab at time index `it`
//       (declaration of `it` not visible here).
// is2 : index of the same site inside the packed (xy, z) slice.
1723 int is = ixy + Nxy * (iz +
m_Nz * it);
1724 int is2 = ixy + Nxy * iz;
// Offset of this site's link data in `u`, and of its two packed blocks.
1726 int ig =
m_Ndf * is;
1727 int ix1 = Nvc2 * is2;
1728 int ix2 = ix1 +
m_Nvc;
1730 for (
int ic = 0; ic <
m_Nc; ++ic) {
// Offset of the ic-th m_Nvc-wide block of the link data at ig.
1731 int ic2 = ic *
m_Nvc;
1733 wt1r = mult_uv_r(&u[ic2 + ig], &w1[ix1], m_Nc);
1734 wt1i = mult_uv_i(&u[ic2 + ig], &w1[ix1], m_Nc);
1735 wt2r = mult_uv_r(&u[ic2 + ig], &w1[ix2], m_Nc);
1736 wt2i = mult_uv_i(&u[ic2 + ig], &w1[ix2], m_Nc);
// The first product is added to blocks 1 and 3, the second to blocks 2 and 4.
1738 w2[2 * ic + id1 + iv] += wt1r;
1739 w2[2 * ic + 1 + id1 + iv] += wt1i;
1740 w2[2 * ic + id2 + iv] += wt2r;
1741 w2[2 * ic + 1 + id2 + iv] += wt2i;
1742 w2[2 * ic + id3 + iv] += wt1r;
1743 w2[2 * ic + 1 + id3 + iv] += wt1i;
1744 w2[2 * ic + id4 + iv] += wt2r;
1745 w2[2 * ic + 1 + id4 + iv] += wt2i;
// Per-task kernel (itask, v2, v1): for every site whose neighbour one time
// slice ahead lies inside the local data, forms block1+block3 and
// block2+block4 of that neighbour, multiplies them with the link data `u` of
// the current site via mult_uv_r / mult_uv_i and accumulates into v2.
// The function-name line, the braces and several declarations (id1, id2, iv,
// u, vt1, vt2, Nvcd, Nxy) are absent from this listing.
// NOTE(review): presumably Fopr_Wilson_impl::mult_tp2_chiral_thread -- confirm.
1755 int itask,
double *v2,
const double *v1)
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
1761 int id3 =
m_Nvc * 2;
1762 int id4 =
m_Nvc * 3;
// Scratch values returned by the mult_uv_r / mult_uv_i calls below.
1767 double wt1r, wt1i, wt2r, wt2i;
// Per-task field offset taken from m_arg.
1769 int isite =
m_arg[itask].isite;
1771 double *w2 = &v2[Nvcd * isite];
1772 const double *w1 = &v1[Nvcd * isite];
// kt1 is 1 for tasks with ith_t == m_Ntask_t - 1 (setup code near the top of
// the file); it shortens the time loop below by one slice for those tasks.
1775 int kt1 =
m_arg[itask].kt1;
// Number of sites in one time slice; adding it to a site index moves one
// step in `it`.
1777 int Nxyz = Nxy *
m_Nz;
1779 for (
int it = 0; it <
m_Mt - kt1; ++it) {
1780 for (
int iz = 0; iz <
m_Mz; ++iz) {
1781 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1782 int is = ixy + Nxy * (iz + m_Nz * it);
// in : data offset of the neighbour at it + 1; ig : link offset of this site.
1784 int in = Nvcd * (is + Nxyz);
1785 int ig =
m_Ndf * is;
// Combine the neighbour's blocks pairwise into the temporaries vt1 / vt2.
1787 for (
int ic = 0; ic <
m_Nc; ++ic) {
1788 vt1[2 * ic] = w1[2 * ic + id1 + in] + w1[2 * ic + id3 + in];
1789 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] + w1[2 * ic + 1 + id3 + in];
1790 vt2[2 * ic] = w1[2 * ic + id2 + in] + w1[2 * ic + id4 + in];
1791 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] + w1[2 * ic + 1 + id4 + in];
1794 for (
int ic = 0; ic <
m_Nc; ++ic) {
// Offset of the ic-th m_Nvc-wide block of the link data at ig.
1795 int ic2 = ic *
m_Nvc;
1797 wt1r = mult_uv_r(&u[ic2 + ig], vt1, m_Nc);
1798 wt1i = mult_uv_i(&u[ic2 + ig], vt1, m_Nc);
1799 wt2r = mult_uv_r(&u[ic2 + ig], vt2, m_Nc);
1800 wt2i = mult_uv_i(&u[ic2 + ig], vt2, m_Nc);
// The first product is added to blocks 1 and 3, the second to blocks 2 and 4.
1802 w2[2 * ic + id1 + iv] += wt1r;
1803 w2[2 * ic + 1 + id1 + iv] += wt1i;
1804 w2[2 * ic + id2 + iv] += wt2r;
1805 w2[2 * ic + 1 + id2 + iv] += wt2i;
1806 w2[2 * ic + id3 + iv] += wt1r;
1807 w2[2 * ic + 1 + id3 + iv] += wt1i;
1808 w2[2 * ic + id4 + iv] += wt2r;
1809 w2[2 * ic + 1 + id4 + iv] += wt2i;
// Per-task kernel (itask, vcp1, v1): for one time slice of v1, forms
// block1-block3 and block2-block4 of each site, multiplies them with the link
// data `u` via mult_udagv_r / mult_udagv_i and stores the two resulting blocks
// in the m_fw_send[idir] buffer.
// The function-name line, the braces and several declarations (id1, id2, idir,
// it, in, icr, u, vt1, vt2, Nvcd, Nxy and the left-hand side of 1837) are
// absent from this listing. vcp1 is not referenced in the visible lines.
// NOTE(review): presumably Fopr_Wilson_impl::mult_tm1_chiral_thread -- confirm.
1819 int itask,
double *vcp1,
const double *v1)
// Stride of one packed site in the send buffer: two m_Nvc-wide blocks.
1821 int Nvc2 = 2 *
m_Nvc;
// Half of the per-site data length; used for the buffer byte offset on 1837.
1823 int Nvcd2 = Nvcd / 2;
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
1827 int id3 =
m_Nvc * 2;
1828 int id4 =
m_Nvc * 3;
// Per-task offsets taken from m_arg (field offset and boundary-buffer offset).
1832 int isite =
m_arg[itask].isite;
1833 int isite_cp =
m_arg[itask].isite_cpt;
// Destination pointer into the forward send buffer at this task's byte offset.
// NOTE(review): the declaration receiving this value is missing here;
// presumably `double *w2`, which is what the loop below writes to.
1837 = (
double *)
m_fw_send[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
1838 const double *w1 = &v1[Nvcd * isite];
// Only tasks flagged kt1 pack data (kt1 is set for ith_t == m_Ntask_t - 1 in
// the setup code near the top of the file).
1843 if (
m_arg[itask].kt1 == 1) {
1846 for (
int iz = 0; iz <
m_Mz; ++iz) {
1847 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index inside the task's slab at time index `it`
//       (declaration of `it` not visible here).
// is2 : index of the same site inside the packed (xy, z) slice.
1848 int is = ixy + Nxy * (iz +
m_Nz * it);
1849 int is2 = ixy + Nxy * iz;
// Offset of this site's link data in `u`, and of its two packed blocks.
1851 int ig =
m_Ndf * is;
1852 int ix1 = Nvc2 * is2;
1853 int ix2 = ix1 +
m_Nvc;
// Combine the site's blocks pairwise (differences) into vt1 / vt2.
1855 for (
int ic = 0; ic <
m_Nc; ++ic) {
1856 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1857 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1858 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1859 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
// NOTE(review): the declaration of icr (listing line 1863) is missing;
// presumably `int icr = 2 * ic` -- confirm against the full source.
1862 for (
int ic = 0; ic <
m_Nc; ++ic) {
1864 w2[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1, m_Nc);
1865 w2[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1, m_Nc);
1866 w2[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2, m_Nc);
1867 w2[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2, m_Nc);
// Per-task kernel (itask, v2, vcp2): reads packed two-block site data from the
// m_fw_recv[idir] buffer and accumulates it, scaled by bc2, into v2: added to
// blocks 1 and 2, subtracted from blocks 3 and 4. No link multiplication is
// done in the visible lines.
// The function-name line, the braces and several declarations (id1, id2, idir,
// it, iv, icr, bc2, Nvcd, Nxy and the left-hand side of 1901) are absent from
// this listing. vcp2 is not referenced in the visible lines.
// NOTE(review): presumably Fopr_Wilson_impl::mult_tmb_chiral_thread -- confirm.
1879 int itask,
double *v2,
const double *vcp2)
// Stride of one packed site in the receive buffer: two m_Nvc-wide blocks.
1881 int Nvc2 = 2 *
m_Nvc;
// Half of the per-site data length; used for the buffer byte offset on 1901.
1883 int Nvcd2 = Nvcd / 2;
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
1887 int id3 =
m_Nvc * 2;
1888 int id4 =
m_Nvc * 3;
// NOTE(review): these four scratch variables are not used in any line visible
// in this listing -- check whether they are dead in the full source.
1893 double wt1r, wt1i, wt2r, wt2i;
// Per-task offsets taken from m_arg (field offset and boundary-buffer offset).
1895 int isite =
m_arg[itask].isite;
1896 int isite_cp =
m_arg[itask].isite_cpt;
1898 double *w2 = &v2[Nvcd * isite];
// Source pointer into the forward receive buffer at this task's byte offset.
// NOTE(review): the declaration receiving this value is missing here;
// presumably `const double *w1`, which is what the loop below reads from.
1901 = (
double *)
m_fw_recv[idir]->ptr(
sizeof(
double) * Nvcd2 * isite_cp);
// Only tasks flagged kt0 consume the buffer (kt0 is set for ith_t == 0 in the
// setup code near the top of the file).
1905 if (
m_arg[itask].kt0 == 1) {
1908 for (
int iz = 0; iz <
m_Mz; ++iz) {
1909 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// is  : site index inside the task's slab at time index `it`
//       (declaration of `it` not visible here).
// is2 : index of the same site inside the packed (xy, z) slice.
1910 int is = ixy + Nxy * (iz +
m_Nz * it);
1911 int is2 = ixy + Nxy * iz;
// Start of the first and second packed block for this site.
1913 int ix1 = Nvc2 * is2;
1914 int ix2 = ix1 +
m_Nvc;
// icr / ici index the paired entries of component ic; the declaration of icr
// (listing line 1917) is missing, presumably `int icr = 2 * ic`.
1916 for (
int ic = 0; ic <
m_Nc; ++ic) {
1918 int ici = 2 * ic + 1;
1919 w2[icr + id1 + iv] += bc2 * w1[icr + ix1];
1920 w2[ici + id1 + iv] += bc2 * w1[ici + ix1];
1921 w2[icr + id2 + iv] += bc2 * w1[icr + ix2];
1922 w2[ici + id2 + iv] += bc2 * w1[ici + ix2];
1923 w2[icr + id3 + iv] -= bc2 * w1[icr + ix1];
1924 w2[ici + id3 + iv] -= bc2 * w1[ici + ix1];
1925 w2[icr + id4 + iv] -= bc2 * w1[icr + ix2];
1926 w2[ici + id4 + iv] -= bc2 * w1[ici + ix2];
// Per-task kernel (itask, v2, v1): for every site whose neighbour one time
// slice behind lies inside the local data, forms block1-block3 and
// block2-block4 of that neighbour, multiplies them with the neighbour's link
// data via mult_udagv_r / mult_udagv_i and accumulates into v2 (added to
// blocks 1 and 2, subtracted from blocks 3 and 4).
// The function-name line, the braces and several declarations (id1, id2, iv,
// ic2, u, vt1, vt2, Nvcd, Nxy) are absent from this listing.
// NOTE(review): presumably Fopr_Wilson_impl::mult_tm2_chiral_thread -- confirm.
1936 int itask,
double *v2,
const double *v1)
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
1942 int id3 =
m_Nvc * 2;
1943 int id4 =
m_Nvc * 3;
// Scratch values returned by the mult_udagv_r / mult_udagv_i calls below.
1948 double wt1r, wt1i, wt2r, wt2i;
// Per-task field offset taken from m_arg.
1950 int isite =
m_arg[itask].isite;
1952 double *w2 = &v2[Nvcd * isite];
1953 const double *w1 = &v1[Nvcd * isite];
// kt0 is 1 for tasks with ith_t == 0 (setup code near the top of the file);
// it makes the time loop below start at slice 1 for those tasks.
1956 int kt0 =
m_arg[itask].kt0;
// Number of sites in one time slice; subtracting it from a site index moves
// one step back in `it`.
1958 int Nxyz = Nxy *
m_Nz;
1960 for (
int it = kt0; it <
m_Mt; ++it) {
1961 for (
int iz = 0; iz <
m_Mz; ++iz) {
1962 for (
int ixy = 0; ixy < Nxy; ++ixy) {
1963 int is = ixy + Nxy * (iz + m_Nz * it);
// Both the field data (in) and the link data (ig) are taken from the
// neighbour at it - 1.
1965 int in = Nvcd * (is - Nxyz);
1966 int ig =
m_Ndf * (is - Nxyz);
// Combine the neighbour's blocks pairwise (differences) into vt1 / vt2.
1968 for (
int ic = 0; ic <
m_Nc; ++ic) {
1969 vt1[2 * ic] = w1[2 * ic + id1 + in] - w1[2 * ic + id3 + in];
1970 vt1[2 * ic + 1] = w1[2 * ic + 1 + id1 + in] - w1[2 * ic + 1 + id3 + in];
1971 vt2[2 * ic] = w1[2 * ic + id2 + in] - w1[2 * ic + id4 + in];
1972 vt2[2 * ic + 1] = w1[2 * ic + 1 + id2 + in] - w1[2 * ic + 1 + id4 + in];
// NOTE(review): the declaration of ic2 (listing line 1976) is missing;
// presumably `int ic2 = 2 * ic` -- confirm against the full source.
1975 for (
int ic = 0; ic <
m_Nc; ++ic) {
1977 wt1r = mult_udagv_r(&u[ic2 + ig], vt1, m_Nc);
1978 wt1i = mult_udagv_i(&u[ic2 + ig], vt1, m_Nc);
1979 wt2r = mult_udagv_r(&u[ic2 + ig], vt2, m_Nc);
1980 wt2i = mult_udagv_i(&u[ic2 + ig], vt2, m_Nc);
// Products are added to blocks 1 and 2 and subtracted from blocks 3 and 4.
1982 w2[ic2 + id1 + iv] += wt1r;
1983 w2[ic2 + 1 + id1 + iv] += wt1i;
1984 w2[ic2 + id2 + iv] += wt2r;
1985 w2[ic2 + 1 + id2 + iv] += wt2i;
1986 w2[ic2 + id3 + iv] -= wt1r;
1987 w2[ic2 + 1 + id3 + iv] -= wt1i;
1988 w2[ic2 + id4 + iv] -= wt2r;
1989 w2[ic2 + 1 + id4 + iv] -= wt2i;
// Per-task kernel (itask, v2, v1): copies v1 into v2 for every site of the
// task's slab while exchanging the m_Nvc-wide blocks pairwise
// (block 1 <-> block 3, block 2 <-> block 4).
// The function-name line, the braces and the declarations of id1, id2, Nvcd
// and Nxy are absent from this listing.
// NOTE(review): presumably Fopr_Wilson_impl::gm5_dirac_thread -- confirm.
1999 int itask,
double *v2,
const double *v1)
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
2006 int id3 =
m_Nvc * 2;
2007 int id4 =
m_Nvc * 3;
// Per-task field offset taken from m_arg; w2 / w1 point at the task's slab.
2009 int isite =
m_arg[itask].isite;
2010 double *w2 = &v2[Nvcd * isite];
2011 const double *w1 = &v1[Nvcd * isite];
2013 for (
int it = 0; it <
m_Mt; ++it) {
2014 for (
int iz = 0; iz <
m_Mz; ++iz) {
2015 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Offset of this site's data inside the slab.
2016 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
// Not safe in place: w2[.. id1 ..] is overwritten before w1[.. id1 ..] is
// read two lines later, so v2 must not alias v1.
2017 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
2018 w2[ivc + id1 + iv] = w1[ivc + id3 + iv];
2019 w2[ivc + id2 + iv] = w1[ivc + id4 + iv];
2020 w2[ivc + id3 + iv] = w1[ivc + id1 + iv];
2021 w2[ivc + id4 + iv] = w1[ivc + id2 + iv];
// Per-task kernel (itask, v2, v1): copies v1 into v2 for every site of the
// task's slab, keeping blocks 1 and 2 unchanged and negating blocks 3 and 4.
// The function-name line, the braces and the declarations of id1, id2, Nvcd
// and Nxy are absent from this listing.
// NOTE(review): presumably Fopr_Wilson_impl::gm5_chiral_thread -- confirm.
2031 int itask,
double *v2,
const double *v1)
// Offsets of the third and fourth m_Nvc-wide blocks inside one site's data.
2038 int id3 =
m_Nvc * 2;
2039 int id4 =
m_Nvc * 3;
// Per-task field offset taken from m_arg; w2 / w1 point at the task's slab.
2041 int isite =
m_arg[itask].isite;
2042 double *w2 = &v2[Nvcd * isite];
2043 const double *w1 = &v1[Nvcd * isite];
2045 for (
int it = 0; it <
m_Mt; ++it) {
2046 for (
int iz = 0; iz <
m_Mz; ++iz) {
2047 for (
int ixy = 0; ixy < Nxy; ++ixy) {
// Offset of this site's data inside the slab.
2048 int iv = Nvcd * (ixy + Nxy * (iz +
m_Nz * it));
// Each element is read and written at the same index, so the sign flip of
// blocks 3 and 4 is element-wise.
2049 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
2050 w2[ivc + id1 + iv] = w1[ivc + id1 + iv];
2051 w2[ivc + id2 + iv] = w1[ivc + id2 + iv];
2052 w2[ivc + id3 + iv] = -w1[ivc + id3 + iv];
2053 w2[ivc + id4 + iv] = -w1[ivc + id4 + iv];
void mult_zpb_thread(int, double *, const double *)
void mult_tp1_chiral_thread(int, double *, const double *)
const double * ptr(const int jin, const int site, const int jex) const
void mult_tm1_dirac_thread(int, double *, const double *)
const Field_G * m_U
gauge configuration.
void mult_yp1_thread(int, double *, const double *)
void mult_yp2_thread(int, double *, const double *)
void mult_zp2_thread(int, double *, const double *)
void general(const char *format,...)
std::vector< mult_arg > m_arg
void mult_ym2_thread(int, double *, const double *)
void clear_thread(int, double *)
void mult_tpb_chiral_thread(int, double *, const double *)
void gm5_chiral_thread(int, double *, const double *)
std::vector< Channel * > m_bw_recv
void mult_tpb_dirac_thread(int, double *, const double *)
void mult_tp2_dirac_thread(int, double *, const double *)
void mult_zm2_thread(int, double *, const double *)
void mult_xm2_thread(int, double *, const double *)
void mult_tm1_chiral_thread(int, double *, const double *)
void mult_tmb_dirac_thread(int, double *, const double *)
void mult_tm2_dirac_thread(int, double *, const double *)
void mult_tp1_dirac_thread(int, double *, const double *)
static int get_num_threads_available()
returns number of threads (works outside of parallel region).
void mult_zmb_thread(int, double *, const double *)
void crucial(const char *format,...)
void mult_xmb_thread(int, double *, const double *)
void gm5_dirac_thread(int, double *, const double *)
std::vector< double > m_boundary2
b.c. for each node.
void daypx_thread(int, double *, double, const double *)
void mult_tmb_chiral_thread(int, double *, const double *)
void mult_ypb_thread(int, double *, const double *)
void mult_ymb_thread(int, double *, const double *)
void mult_xpb_thread(int, double *, const double *)
void mult_ym1_thread(int, double *, const double *)
void mult_xp1_thread(int, double *, const double *)
void mult_tm2_chiral_thread(int, double *, const double *)
static const std::string class_name
Bridge::VerboseLevel m_vl
void mult_tp2_chiral_thread(int, double *, const double *)
void mult_zp1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_send
void mult_xm1_thread(int, double *, const double *)
void mult_xp2_thread(int, double *, const double *)
void mult_zm1_thread(int, double *, const double *)
std::vector< Channel * > m_fw_recv
std::vector< Channel * > m_bw_send