14 #define AFOPR_CLOVER_COARSE_TIMER
16 #ifdef AFOPR_CLOVER_COARSE_TIMER
18 #define TIMER_mult_start timer_mult->start();
19 #define TIMER_mult_stop timer_mult->stop();
20 #define TIMER_pack_start timer_pack->start();
21 #define TIMER_pack_stop timer_pack->stop();
22 #define TIMER_bulk_start timer_bulk->start();
23 #define TIMER_bulk_stop timer_bulk->stop();
24 #define TIMER_boundary_start timer_boundary->start();
25 #define TIMER_boundary_stop timer_boundary->stop();
26 #define TIMER_comm_start timer_comm->start();
27 #define TIMER_comm_stop timer_comm->stop();
28 #define TIMER_comm_recv_wait_start timer_comm_recv_wait->start();
29 #define TIMER_comm_recv_wait_stop timer_comm_recv_wait->stop();
30 #define TIMER_comm_send_wait_start timer_comm_send_wait->start();
31 #define TIMER_comm_send_wait_stop timer_comm_send_wait->stop();
32 #define TIMER_comm_recv_start_start timer_comm_recv_start->start();
33 #define TIMER_comm_recv_start_stop timer_comm_recv_start->stop();
34 #define TIMER_comm_send_start_start timer_comm_send_start->start();
35 #define TIMER_comm_send_start_stop timer_comm_send_start->stop();
36 #define TIMER_comm_test_all_start timer_comm_test_all->start();
37 #define TIMER_comm_test_all_stop timer_comm_test_all->stop();
38 #define TIMER_clear_start timer_clear->start();
39 #define TIMER_clear_stop timer_clear->stop();
41 #define TIMER_mult_start
42 #define TIMER_mult_stop
43 #define TIMER_pack_start
44 #define TIMER_pack_stop
45 #define TIMER_bulk_start
46 #define TIMER_bulk_stop
47 #define TIMER_boundary_start
48 #define TIMER_boundary_stop
49 #define TIMER_comm_start
50 #define TIMER_comm_stop
51 #define TIMER_comm_recv_wait_start
52 #define TIMER_comm_recv_wait_stop
53 #define TIMER_comm_send_wait_start
54 #define TIMER_comm_send_wait_stop
55 #define TIMER_comm_recv_start_start
56 #define TIMER_comm_recv_start_stop
57 #define TIMER_comm_send_start_start
58 #define TIMER_comm_send_start_stop
59 #define TIMER_comm_test_all_start
60 #define TIMER_comm_test_all_stop
61 #define TIMER_clear_start
62 #define TIMER_clear_stop
68 #ifndef QXS_DATA_ALIGNMENT
69 constexpr
int alignment = 256;
71 constexpr
int alignment = QXS_DATA_ALIGNMENT;
75 static inline void accum_mult_u_i(
Vsimd_t *out,
81 const int nh = ncol / 2;
89 for (
int j = 0; j < nh; j++) {
91 load_vec(vu, &u[
VLEN * 4 * j], 4);
92 load_vec(vin, &in[
VLEN * 4 * j], 4);
95 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
96 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
97 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
98 add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
101 add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
102 sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
103 add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
104 add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
106 load_vec(vu, &u[
VLEN * 4 * (j + nh)], 4);
108 add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
109 sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
110 add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
111 add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
114 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
115 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
116 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
117 add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
123 static inline void accum_mult_u_yp_i(
Vsimd_t *out,
129 const int nh = ncol / 2;
137 for (
int j = 0; j < nh; j++) {
139 load_vec(vu, &u[
VLEN * 4 * j], 4);
142 shift_vec2_ybw(in, &in1[
VLEN * 4 * j], &in2[
VLEN * 4 * j], 4);
146 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
147 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
148 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
149 add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
152 add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
153 sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
154 add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
155 add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
157 load_vec(vu, &u[
VLEN * 4 * (j + nh)], 4);
159 add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
160 sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
161 add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
162 add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
165 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
166 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
167 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
168 add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
174 static inline void accum_mult_u(
real_t *out,
186 const int nh = ncol / 2;
188 for (
int i = 0; i < nh; i++) {
190 load_vec(tmp, &out[
VLEN * 4 * i], 4);
191 accum_mult_u_i(tmp, in, u0, i, ncol);
192 save_vec(&out[
VLEN * 4 * i], tmp, 4);
198 static inline void accum_mult_udag_i(
Vsimd_t *out,
204 const int dof2 = ncol * ncol;
205 const int nh = ncol / 2;
211 for (
int j = 0; j < nh; j++) {
216 load_vec(vu, &u[
VLEN * 4 * i], 4);
217 load_vec(vin, &in[
VLEN * 4 * j], 4);
220 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
221 add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
222 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
223 sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
226 sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
227 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
228 sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
229 add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
231 load_vec(vu, &u[
VLEN * (2 * ncol + 4 * i)], 4);
233 sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
234 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
235 sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
236 add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
239 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
240 add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
241 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
242 sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
248 static inline void accum_mult_udag_xm_i(
Vsimd_t *out,
254 const int nh = ncol / 2;
260 for (
int j = 0; j < nh; j++) {
272 load_vec(vin, &in[
VLEN * 4 * j], 4);
275 shift_vec2_xfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
279 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
280 add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
281 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
282 sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
285 sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
286 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
287 sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
288 add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
291 shift_vec2_xfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
295 sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
296 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
297 sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
298 add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
301 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
302 add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
303 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
304 sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
310 static inline void accum_mult_udag_ym_i(
Vsimd_t *out,
316 const int nh = ncol / 2;
322 for (
int j = 0; j < nh; j++) {
335 shift_vec2_yfw(in, &in1[
VLEN * 4 * j], &in2[
VLEN * 4 * j], 4);
336 load_vec(vin, in, 4);
339 shift_vec2_yfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
343 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
344 add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
345 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
346 sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
349 sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
350 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
351 sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
352 add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
355 shift_vec2_yfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
359 sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
360 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
361 sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
362 add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
365 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
366 add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
367 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
368 sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
374 static inline void accum_mult_udag_ym_i(
Vsimd_t *out,
380 const int nh = ncol / 2;
386 for (
int j = 0; j < nh; j++) {
398 load_vec(vin, &in[
VLEN * 4 * j], 4);
401 shift_vec2_yfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
405 add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
406 add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
407 add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
408 sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
411 sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
412 sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
413 sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
414 add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
417 shift_vec2_yfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
421 sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
422 sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
423 sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
424 add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
427 add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
428 add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
429 add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
430 sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
436 static inline void accum_mult_udag(
real_t *out,
441 const int nh = ncol / 2;
443 for (
int i = 0; i < nh; i++) {
445 load_vec(tmp, &out[
VLEN * 4 * i], 4);
446 accum_mult_udag_i(tmp, in, u0, i, ncol);
447 save_vec(&out[
VLEN * 4 * i], tmp, 4);
453 static inline void set_mult_u(
real_t *out,
458 const int nh = ncol / 2;
460 for (
int i = 0; i < nh; i++) {
463 accum_mult_u_i(tmp, in, u0, i, ncol);
464 save_vec(&out[
VLEN * 4 * i], tmp, 4);
470 static inline void set_mult_udag(
real_t *out,
475 const int nh = ncol / 2;
477 for (
int i = 0; i < nh; i++) {
480 accum_mult_udag_i(tmp, in, u, i, ncol);
481 save_vec(&out[
VLEN * (4 * i)], tmp, 4);
487 static inline void accum_buf(
real_t *out,
real_t *in,
const int ncol)
491 for (
int i = 0; i < 2 * ncol; i++) {
492 load_vec(&vin, &in[
VLEN * i], 1);
494 add_vec(&
vout, &vin, 1);
501 static inline void copy_buf(
real_t *out,
real_t *in,
const int ncol)
503 for (
int i = 0; i <
VLEN * 2 * ncol; i++) {
517 static inline void mult_coarse_xp1(
real_t *buf,
real_t *in,
const int ncol)
519 const int nh = ncol / 2;
520 for (
int i = 0; i < nh; i++) {
522 load_vec(tmp, &in[
VLEN * 4 * i], 4);
523 save_vec1_x(&buf[
VLENY * 4 * i], tmp, 0, 4);
528 static inline void mult_coarse_xp2(
real_t *out,
535 const int nh = ncol / 2;
537 shift_vec0_xbw(in, in0, 2 * ncol);
539 for (
int i = 0; i < nh; i++) {
542 shift_vec1_xbw(buf, &buf0[
VLENY * 4 * i], 4);
543 load_vec(vin, &in[
VLEN * 4 * i], 4);
544 add_vec(vin, buf, 4);
545 save_vec(&in[
VLEN * 4 * i], vin, 4);
547 accum_mult_u(out, in, u0, ncol);
551 static inline void mult_coarse_xpb(
real_t *out,
554 const int ncol,
real_t *work)
557 shift_vec2_xbw(in, in1, in2, 2 * ncol);
558 accum_mult_u(out, in, u, ncol);
565 static inline void mult_coarse_xm1(
real_t *buf,
real_t *u,
real_t *in,
const int ncol)
567 const int nh = ncol / 2;
568 for (
int i = 0; i < nh; i++) {
571 accum_mult_udag_i(tmp, in, u, i, ncol);
572 save_vec1_x(&buf[
VLENY * 4 * i], tmp,
VLENX - 1, 4);
577 static inline void mult_coarse_xm2(
real_t *out,
582 const int nh = ncol / 2;
583 for (
int i = 0; i < nh; i++) {
588 accum_mult_udag_i(vtmp, in0, u0, i, ncol);
592 save_vec(tmp1, vtmp, 4);
593 shift_vec0_xfw(tmp2, tmp1, 4);
594 load_vec(vtmp, tmp2, 4);
597 shift_vec1_xfw(buf, &buf0[
VLENY * 4 * i], 4);
598 add_vec(vtmp, buf, 4);
602 load_vec(tmpout, &out[
VLEN * 4 * i], 4);
603 add_vec(tmpout, vtmp, 4);
604 save_vec(&out[
VLEN * 4 * i], tmpout, 4);
609 static inline void mult_coarse_xmb(
real_t *out,
616 shift_vec2_xfw(in, in1, in2, 2 * ncol);
618 const int nh = ncol / 2;
619 for (
int i = 0; i < nh; i++) {
621 load_vec(tmp, &out[
VLEN * 4 * i], 4);
622 accum_mult_udag_xm_i(tmp, in, u1, u2, i, ncol);
623 save_vec(&out[
VLEN * 4 * i], tmp, 4);
631 static inline void mult_coarse_yp1(
real_t *buf,
real_t *in,
const int ncol)
633 const int nh = ncol / 2;
634 for (
int i = 0; i < nh; i++) {
636 load_vec(tmp, &in[
VLEN * 4 * i], 4);
637 save_vec1_y(&buf[
VLENX * 4 * i], tmp, 0, 4);
642 static inline void mult_coarse_yp2(
real_t *out,
649 const int nh = ncol / 2;
651 shift_vec0_ybw(in, in0, 2 * ncol);
653 for (
int i = 0; i < nh; i++) {
656 shift_vec1_ybw(buf, &buf0[
VLENX * 4 * i], 4);
657 load_vec(vin, &in[
VLEN * 4 * i], 4);
658 add_vec(vin, buf, 4);
659 save_vec(&in[
VLEN * 4 * i], vin, 4);
661 for (
int i = 0; i < nh; i++) {
663 load_vec(tmp, &out[
VLEN * 4 * i], 4);
664 accum_mult_u_i(tmp, in, u0, i, ncol);
665 save_vec(&out[
VLEN * 4 * i], tmp, 4);
670 static inline void mult_coarse_ypb(
real_t *out,
677 shift_vec2_ybw(in, in1, in2, 2 * ncol);
678 const int nh = ncol / 2;
679 for (
int i = 0; i < nh; i++) {
681 load_vec(tmp, &out[
VLEN * 4 * i], 4);
682 accum_mult_u_i(tmp, in, u, i, ncol);
683 save_vec(&out[
VLEN * 4 * i], tmp, 4);
691 static inline void mult_coarse_ym1(
real_t *buf,
real_t *u,
real_t *in,
const int ncol)
693 const int nh = ncol / 2;
694 for (
int i = 0; i < nh; i++) {
697 accum_mult_udag_i(tmp, in, u, i, ncol);
698 save_vec1_y(&buf[
VLENX * 4 * i], tmp,
VLENY - 1, 4);
703 static inline void mult_coarse_ym2(
real_t *out,
708 const int nh = ncol / 2;
709 for (
int i = 0; i < nh; i++) {
714 accum_mult_udag_i(vtmp, in0, u0, i, ncol);
718 save_vec(tmp1, vtmp, 4);
719 shift_vec0_yfw(tmp2, tmp1, 4);
720 load_vec(vtmp, tmp2, 4);
723 shift_vec1_yfw(buf, &buf0[
VLENX * 4 * i], 4);
724 add_vec(vtmp, buf, 4);
728 load_vec(tmpout, &out[
VLEN * 4 * i], 4);
729 add_vec(tmpout, vtmp, 4);
730 save_vec(&out[
VLEN * 4 * i], tmpout, 4);
735 static inline void mult_coarse_ymb(
real_t *out,
742 shift_vec2_yfw(in, in1, in2, 2 * ncol);
744 const int nh = ncol / 2;
745 for (
int i = 0; i < nh; i++) {
747 load_vec(tmp, &out[
VLEN * 4 * i], 4);
748 accum_mult_udag_ym_i(tmp, in, u1, u2, i, ncol);
749 save_vec(&out[
VLEN * 4 * i], tmp, 4);
757 static inline void mult_coarse_zp1(
real_t *out,
real_t *in,
const int ncol)
759 copy_buf(out, in, ncol);
763 static inline void mult_coarse_zp2(
real_t *out,
real_t *u,
real_t *buf,
const int ncol)
765 accum_mult_u(out, buf, u, ncol);
769 static inline void mult_coarse_zpb(
real_t *out,
real_t *u,
real_t *in,
const int ncol)
771 accum_mult_u(out, in, u, ncol);
778 static inline void mult_coarse_zm1(
real_t *out,
real_t *u,
real_t *buf,
const int ncol)
780 set_mult_udag(out, buf, u, ncol);
784 static inline void mult_coarse_zm2(
real_t *out,
real_t *buf,
const int ncol)
786 accum_buf(out, buf, ncol);
790 static inline void mult_coarse_zmb(
real_t *out,
real_t *u,
real_t *in,
const int ncol)
792 accum_mult_udag(out, in, u, ncol);
799 static inline void mult_coarse_tp1(
real_t *out,
real_t *in,
const int ncol)
801 copy_buf(out, in, ncol);
805 static inline void mult_coarse_tp2(
real_t *out,
real_t *u,
real_t *buf,
const int ncol)
807 accum_mult_u(out, buf, u, ncol);
811 static inline void mult_coarse_tpb(
real_t *out,
real_t *u,
real_t *in,
const int ncol)
813 accum_mult_u(out, in, u, ncol);
820 static inline void mult_coarse_tm1(
real_t *out,
real_t *u,
real_t *buf,
const int ncol)
822 set_mult_udag(out, buf, u, ncol);
826 static inline void mult_coarse_tm2(
real_t *out,
real_t *buf,
const int ncol)
828 accum_buf(out, buf, ncol);
832 static inline void mult_coarse_tmb(
real_t *out,
real_t *u,
real_t *in,
const int ncol)
834 accum_mult_udag(out, in, u, ncol);
840 template<
typename AFIELD>
842 =
"AFopr_Clover_coarse";
845 template<
typename AFIELD>
858 for (
int mu = 0; mu < Ndim; ++mu) {
861 do_comm_any += do_comm[mu];
862 vout.
general(
" do_comm[%d] = %d\n", mu, do_comm[mu]);
865 m_bdsize.resize(Ndim);
870 int NinF = 2 * Nc * Nd;
871 workvec1.reset(NinF, fine_nvol, 1);
872 workvec2.reset(NinF, fine_nvol, 1);
873 workvec3.reset(NinF, fine_nvol, 1);
876 #ifdef AFOPR_CLOVER_COARSE_TIMER
877 timer_mult.reset(
new Timer(
"afopr_Clover_coarse: mult "));
878 timer_pack.reset(
new Timer(
"afopr_Clover_coarse: pack "));
879 timer_bulk.reset(
new Timer(
"afopr_Clover_coarse: bulk "));
880 timer_boundary.reset(
new Timer(
"afopr_Clover_coarse: boundary "));
881 timer_comm.reset(
new Timer(
"afopr_Clover_coarse: comm "));
882 timer_comm_recv_wait.reset(
new Timer(
"afopr_Clover_coarse: comm_recv_wait "));
883 timer_comm_send_wait.reset(
new Timer(
"afopr_Clover_coarse: comm_send_wait "));
884 timer_comm_recv_start.reset(
new Timer(
"afopr_Clover_coarse: comm_recv_start"));
885 timer_comm_send_start.reset(
new Timer(
"afopr_Clover_coarse: comm_send_start"));
886 timer_comm_test_all.reset(
new Timer(
"afopr_Clover_coarse: comm_test_all "));
887 timer_clear.reset(
new Timer(
"afopr_Clover_coarse: clear "));
893 template<
typename AFIELD>
898 chsend_up.resize(Ndim);
899 chrecv_up.resize(Ndim);
900 chsend_dn.resize(Ndim);
901 chrecv_dn.resize(Ndim);
903 for (
int mu = 0; mu < Ndim; ++mu) {
904 size_t Nvsize = m_bdsize[mu] *
sizeof(
real_t);
906 chsend_dn[mu].send_init(Nvsize, mu, -1);
907 chsend_up[mu].send_init(Nvsize, mu, 1);
909 chrecv_up[mu].recv_init(Nvsize, mu, 1);
910 chrecv_dn[mu].recv_init(Nvsize, mu, -1);
912 void *buf_up = (
void *)chsend_dn[mu].ptr();
913 chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
914 void *buf_dn = (
void *)chsend_up[mu].ptr();
915 chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
918 if (do_comm[mu] == 1) {
919 chset_send.append(chsend_up[mu]);
920 chset_send.append(chsend_dn[mu]);
921 chset_recv.append(chrecv_up[mu]);
922 chset_recv.append(chrecv_dn[mu]);
929 template<
typename AFIELD>
935 for (
int i = 0; i < work_shifted.size(); i++) {
936 free(work_shifted[i]);
937 work_shifted[i] =
nullptr;
940 #ifdef AFOPR_CLOVER_COARSE_TIMER
941 timer_mult->report();
942 timer_clear->report();
943 timer_pack->report();
944 timer_bulk->report();
945 timer_boundary->report();
946 timer_comm->report();
947 timer_comm_recv_wait->report();
948 timer_comm_send_wait->report();
949 timer_comm_recv_start->report();
950 timer_comm_send_start->report();
951 timer_comm_test_all->report();
957 template<
typename AFIELD>
960 const string str_vlevel = params.
get_string(
"verbose_level");
965 std::vector<int> coarse_lattice;
968 err += params.
fetch_int(
"number_of_testvectors", num_testvectors);
971 vout.
crucial(m_vl,
"Error at %s: input parameter not found.\n",
976 set_parameters(num_testvectors, coarse_lattice);
981 template<
typename AFIELD>
983 const int num_testvectors,
984 const std::vector<int>& coarse_lattice)
989 assert(coarse_lattice.size() == Ndim);
991 m_num_testvectors = num_testvectors;
992 m_ncol = 2 * num_testvectors;
994 m_Nc2 = m_ncol * m_ncol;
996 m_Ndf = 2 * m_Nc * m_Nc;
997 int Nc2 = m_ncol * m_ncol;
999 m_Nx = coarse_lattice[0];
1000 m_Ny = coarse_lattice[1];
1001 m_Nz = coarse_lattice[2];
1002 m_Nt = coarse_lattice[3];
1003 m_Nst = m_Nx * m_Ny * m_Nz * m_Nt;
1004 m_Nstv = m_Nst /
VLEN;
1005 m_Nxv = m_Nx /
VLENX;
1006 m_Nyv = m_Ny /
VLENY;
1009 if (m_Nxv *
VLENX != m_Nx) {
1010 vout.
crucial(
"%s: bad coarse lattice size in x-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(),
VLENX, m_Nx);
1013 if (m_Nyv *
VLENY != m_Ny) {
1014 vout.
crucial(
"%s: bad coarse lattice size in y-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(),
VLENY, m_Ny);
1019 m_bdsize[0] = m_Nvc * m_Ny * m_Nz * m_Nt;
1020 m_bdsize[1] = m_Nvc * m_Nx * m_Nz * m_Nt;
1021 m_bdsize[2] = m_Nvc * m_Nx * m_Ny * m_Nt;
1022 m_bdsize[3] = m_Nvc * m_Nx * m_Ny * m_Nz;
1026 size_t coarse_nvol = m_Nst;
1029 m_U.reset(m_Ndf, m_Nst, Ndim);
1030 m_Clov.reset(m_Ndf, m_Nst, 1);
1032 tmp_buffer1.resize(coarse_nvol);
1033 tmp_buffer2.resize(coarse_nvol);
1036 int pool_size = ((
sizeof(
real_t) *
VLEN * m_Nvc - 1) / alignment + 1) * alignment;
1038 for (
int i = 0; i < work_shifted.size(); i++) {
1039 free(work_shifted[i]);
1040 work_shifted[i] =
nullptr;
1042 work_shifted.resize(nthreads);
1043 for (
int i = 0; i < nthreads; i++) {
1044 posix_memalign((
void **)&work_shifted[i], alignment, pool_size);
1046 vout.
detailed(m_vl,
"shifted buffer: size=%d, alignment=%d\n", pool_size, alignment);
1050 for (
int th = 0; th < m_list_boundary.size(); th++) {
1051 vout.
detailed(m_vl,
" thread=%d, number of boundary sites = %d\n", th, m_list_boundary[th].size());
1053 vout.
general(m_vl,
"Parameters of %s:\n", class_name.c_str());
1054 for (
int mu = 0; mu < Ndim; ++mu) {
1055 vout.
general(m_vl,
" coarse_lattice_size[%d] = %2d\n",
1056 mu, coarse_lattice[mu]);
1062 template<
typename AFIELD>
1065 int work_xp = m_ncol;
1066 int work_xm = m_ncol;
1067 int work_yp = m_ncol;
1068 int work_ym = m_ncol;
1069 int work_zp = m_ncol;
1071 int work_tp = m_ncol;
1073 std::vector<int> workload(m_Nstv);
1074 for (
int site = 0; site < m_Nstv; ++site) {
1077 for (
int site = 0; site < m_Nstv; ++site) {
1078 int ix = site % m_Nxv;
1079 int iyzt = site / m_Nxv;
1080 int iy = iyzt % m_Nyv;
1081 int izt = site / (m_Nxv * m_Nyv);
1082 int iz = izt % m_Nz;
1083 int it = izt / m_Nz;
1084 if (do_comm[0] == 1) {
1085 if (ix == m_Nxv - 1) { workload[site] += work_xp; }
1086 if (ix == 0) { workload[site] += work_xm; }
1088 if (do_comm[1] == 1) {
1089 if (iy == m_Nyv - 1) { workload[site] += work_yp; }
1090 if (iy == 0) { workload[site] += work_ym; }
1092 if (do_comm[2] == 1) {
1093 if (iz == m_Nz - 1) { workload[site] += work_zp; }
1094 if (iz == 0) { workload[site] += work_zm; }
1096 if (do_comm[3] == 1) {
1097 if (it == m_Nt - 1) { workload[site] += work_tp; }
1098 if (it == 0) { workload[site] += work_tm; }
1104 if (nth > 2) { nth--; }
1106 std::vector<std::vector<int> > tmp_list;
1107 std::vector<int> work(nth);
1108 std::vector<int> tmp_list_next(nth);
1109 tmp_list.resize(nth);
1110 for (
int i = 0; i < nth; i++) {
1111 tmp_list[i].resize(m_Nstv);
1113 for (
int i = 0; i < nth; i++) {
1116 for (
int i = 0; i < nth; i++) {
1117 tmp_list_next[i] = 0;
1119 int th_min_work = 0;
1124 int max_work_site = -1;
1125 for (
int site = 0; site < m_Nstv; site++) {
1126 if (workload[site] > max_work) {
1127 max_work = workload[site];
1128 max_work_site = site;
1131 if (max_work == 0) {
1135 tmp_list[th_min_work][tmp_list_next[th_min_work]] = max_work_site;
1136 tmp_list_next[th_min_work]++;
1137 work[th_min_work] += max_work;
1138 workload[max_work_site] = 0;
1141 int min_work = work[th_min_work];
1142 for (
int th = 0; th < nth; th++) {
1143 if (work[th] < min_work) {
1144 min_work = work[th];
1151 m_list_boundary.resize(nth0);
1152 m_list_boundary[0].resize(0);
1153 for (
int th = 0; th < nth; th++) {
1155 if (nth0 > nth) { th0++; }
1156 int size = tmp_list_next[th];
1157 m_list_boundary[th0].resize(size);
1158 vout.
general(
"hoge: setting list: th0=%d/%d, size=%d, load=%d\n",
1159 th0, nth0, size, work[th]);
1160 for (
int i = 0; i < size; i++) {
1162 th0, nth0, i, tmp_list[th][i]);
1164 m_list_boundary[th0][i] = tmp_list[th][i];
1171 template<
typename AFIELD>
1174 const std::vector<AFIELD>& atestvec)
1176 int ith, nth, coarse_is, coarse_ns;
1177 const int coarse_nvol = m_U.nvol();
1178 set_threadtask(ith, nth, coarse_is, coarse_ns, coarse_nvol);
1180 real_t *out_clov = m_Clov.ptr(0);
1181 real_t *out_gauge = m_U.ptr(0);
1182 const int num_vectors = m_num_testvectors;
1183 const int coarse_Nc2 = m_ncol * m_ncol;
1184 assert(m_Nc2 == coarse_Nc2);
1189 if (fine_afopr ==
nullptr) {
1190 vout.
crucial(
"%s: in generate_coarse_op, a bad fine operator"
1191 " is given (mustbbe AFopr_Clover_dd<AFIELD>).\n",
1192 class_name.c_str());
1199 std::vector<int> coarse_lattice(4);
1200 coarse_lattice[0] = m_Nx;
1201 coarse_lattice[1] = m_Ny;
1202 coarse_lattice[2] = m_Nz;
1203 coarse_lattice[3] = m_Nt;
1216 for (
int i1 = 0; i1 < num_vectors; ++i1) {
1217 for (
int ch1 = -1; ch1 < 2; ch1 += 2) {
1221 fine_afopr->
mult_dd(workvec2, workvec1);
1223 int I = 2 * i1 + (ch1 + 1) / 2;
1225 for (
int i2 = 0; i2 < num_vectors; ++i2) {
1228 block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
1233 block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
1237 for (
int s = coarse_is; s < coarse_ns; s++) {
1238 int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1239 int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1240 out[idx_r] += real(tmp_buffer1[s]);
1241 out[idx_i] += imag(tmp_buffer1[s]);
1245 for (
int s = coarse_is; s < coarse_ns; s++) {
1246 int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1247 int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1248 out[idx_r] += real(tmp_buffer2[s]);
1249 out[idx_i] += imag(tmp_buffer2[s]);
1255 for (
int mu = 0; mu < 4; mu++) {
1257 fine_afopr->
mult_dup(workvec2, workvec1, mu);
1258 real_t *out = out_gauge + mu * 2 * coarse_Nc2 * coarse_nvol;
1261 int I = 2 * i1 + (ch1 + 1) / 2;
1262 for (
int i2 = 0; i2 < num_vectors; ++i2) {
1265 block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
1270 block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
1274 for (
int s = coarse_is; s < coarse_ns; ++s) {
1275 int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1276 int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1277 out[idx_r] += real(tmp_buffer1[s]);
1278 out[idx_i] += imag(tmp_buffer1[s]);
1282 for (
int s = coarse_is; s < coarse_ns; ++s) {
1283 int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1284 int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1285 out[idx_r] += real(tmp_buffer2[s]);
1286 out[idx_i] += imag(tmp_buffer2[s]);
1298 double clv2 = m_Clov.norm2();
1299 double u2 = m_U.norm2();
1300 vout.
general(
"%s: |m_Clov|^2 = %23.15e\n", class_name.c_str(), clv2);
1301 vout.
general(
"%s: |m_U|^2 = %23.15e\n", class_name.c_str(), u2);
1304 for (
int i = 0; i < 2 * num_vectors; ++i) {
1305 for (
int j = 0; j < 2 * num_vectors; ++j) {
1308 int idx_r = index_coarse.idx_Gr(j, i, s, mu);
1309 int idx_i = index_coarse.idx_Gi(j, i, s, mu);
1311 m_U.cmp(idx_r), m_U.cmp(idx_i));
1321 template<
typename AFIELD>
1324 vout.
crucial(m_vl,
"%s: set_config is called\n", class_name.c_str());
1330 template<
typename AFIELD>
1338 template<
typename AFIELD>
1346 template<
typename AFIELD>
1354 }
else if (mu == 1) {
1356 }
else if (mu == 2) {
1358 }
else if (mu == 3) {
1361 vout.
crucial(m_vl,
"%s: mult_up for %d direction is undefined.",
1362 class_name.c_str(), mu);
1369 template<
typename AFIELD>
1377 }
else if (mu == 1) {
1379 }
else if (mu == 2) {
1381 }
else if (mu == 3) {
1384 vout.
crucial(m_vl,
"%s: mult_dn for %d direction is undefined.",
1385 class_name.c_str(), mu);
1392 template<
typename AFIELD>
1398 if (ith == 0) m_mode = mode;
1405 template<
typename AFIELD>
1413 template<
typename AFIELD>
1416 if (m_mode ==
"D") {
1418 }
else if (m_mode ==
"DdagD") {
1420 }
else if (m_mode ==
"Ddag") {
1422 }
else if (m_mode ==
"H") {
1425 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
1432 template<
typename AFIELD>
1435 if (m_mode ==
"D") {
1437 }
else if (m_mode ==
"DdagD") {
1439 }
else if (m_mode ==
"Ddag") {
1441 }
else if (m_mode ==
"H") {
1444 vout.
crucial(m_vl,
"%s: mode undefined.\n", class_name.c_str());
1451 template<
typename AFIELD>
1461 template<
typename AFIELD>
1472 template<
typename AFIELD>
1482 template<
typename AFIELD>
1497 template<
typename AFIELD>
1512 template<
typename AFIELD>
1515 int ith, nth, is, ns;
1516 set_threadtask(ith, nth, is, ns, m_Nstv);
1517 for (
int s = is; s < ns; ++s) {
1519 const real_t *in = w + s * 2 *
VLEN * m_ncol;
1520 for (
int i = 0; i < m_ncol / 2; ++i) {
1522 load_vec(tmp, &in[4 * i *
VLEN], 4);
1523 scal_vec(tmp,
real_t(-1), 2);
1524 save_vec(&out[4 * i *
VLEN], tmp, 4);
1531 template<
typename AFIELD>
1534 real_t *u = m_Clov.ptr(0);
1536 int ith, nth, is, ns;
1537 set_threadtask(ith, nth, is, ns, m_Nstv);
1540 int nv =
VLEN * m_Nvc;
1541 int nv2 =
VLEN * m_Ndf;
1543 for (
int site = is; site < ns; ++site) {
1544 accum_mult_u(&v2[nv * site], &v1[nv * site],
1545 &u[nv2 * site], m_Nc);
1553 template<
typename AFIELD>
1562 int ith, nth, is, ns;
1563 set_threadtask(ith, nth, is, ns, m_Nstv);
1564 const bool time_keeper = (ith == nth - 1);
1568 real_t *cp = m_Clov.ptr(0);
1570 int Nsize[4] = { m_Nxv, m_Nyv, m_Nz, m_Nt };
1575 if (do_comm_any > 0) {
1593 buf1_zp, buf1_zm, buf1_tp, buf1_tm,
1594 up, wp, Nsize, m_ncol, do_comm);
1640 buf2_xp, buf2_xm, buf2_yp, buf2_ym,
1641 buf2_zp, buf2_zm, buf2_tp, buf2_tm,
1642 Nsize, m_ncol, do_comm, work_shifted[ith],
1643 m_list_boundary[ith]);
1662 template<
typename AFIELD>
1692 template<
typename AFIELD>
1701 template<
typename AFIELD>
1704 int ith, nth, is, ns;
1705 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1708 clear_vec(vzero, 2);
1709 for (
int site = is; site < ns; ++site) {
1711 for (
int ic = 0; ic < m_Nvc; ic += 2) {
1712 save_vec(&out[
VLEN * ic], vzero, 2);
1719 template<
typename AFIELD>
1724 int ith, nth, is, ns;
1725 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1730 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1733 real_t *work = work_shifted[ith];
1736 if (do_comm[0] > 0) {
1742 for (
int site = is; site < ns; ++site) {
1743 int ix = site % m_Nxv;
1744 int iyzt = site / m_Nxv;
1746 int ibf =
VLENY * m_Nvc * iyzt;
1747 mult_coarse_xp1(&buf1[ibf], &v1[
VLEN * m_Nvc * site], m_Nc);
1757 chrecv_up[0].start();
1758 chsend_dn[0].start();
1759 chrecv_up[0].wait();
1760 chsend_dn[0].wait();
1774 for (
int site = is; site < ns; ++site) {
1775 int ix = site % m_Nxv;
1776 int iyzt = site / m_Nxv;
1778 if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
1779 int nei = (ix + 1) + m_Nxv * iyzt;
1780 if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
1781 mult_coarse_xpb(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
1782 &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei], m_Nc, work);
1784 int ibf =
VLENY * m_Nvc * iyzt;
1785 mult_coarse_xp2(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
1786 &v1[
VLEN * m_Nvc * site], &buf2[ibf], m_Nc, work);
1799 template<
typename AFIELD>
1804 int ith, nth, is, ns;
1805 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1810 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1813 real_t *work = work_shifted[ith];
1817 if (do_comm[0] > 0) {
1823 for (
int site = is; site < ns; ++site) {
1824 int ix = site % m_Nxv;
1825 int iyzt = site / m_Nxv;
1826 if (ix == m_Nxv - 1) {
1827 int ibf =
VLENY * m_Nvc * iyzt;
1828 mult_coarse_xm1(&buf1[ibf], &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
1837 chrecv_dn[0].start();
1838 chsend_up[0].start();
1839 chrecv_dn[0].wait();
1840 chsend_up[0].wait();
1854 for (
int site = is; site < ns; ++site) {
1855 int ix = site % m_Nxv;
1856 int iyzt = site / m_Nxv;
1858 if ((ix > 0) || (do_comm[0] == 0)) {
1859 int ix2 = (ix - 1 + m_Nxv) % m_Nxv;
1860 int nei = ix2 + m_Nxv * iyzt;
1861 mult_coarse_xmb(&v2[
VLEN * m_Nvc * site],
1862 &u[
VLEN * m_Ndf * site], &u[
VLEN * m_Ndf * nei],
1863 &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
1866 int ibf =
VLENY * m_Nvc * iyzt;
1867 mult_coarse_xm2(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
1868 &v1[
VLEN * m_Nvc * site], &buf2[ibf], m_Nc);
1881 template<
typename AFIELD>
1885 int Nxyv = m_Nxv * m_Nyv;
1887 int ith, nth, is, ns;
1888 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1893 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1896 if (do_comm[1] > 0) {
1902 for (
int site = is; site < ns; ++site) {
1903 int ix = site % m_Nxv;
1904 int iy = (site / m_Nxv) % m_Nyv;
1905 int izt = site / Nxyv;
1906 int ixzt = ix + m_Nxv * izt;
1908 int ibf =
VLENX * m_Nvc * ixzt;
1909 mult_coarse_yp1(&buf1[ibf], &v1[
VLEN * m_Nvc * site], m_Nc);
1919 chrecv_up[1].start();
1920 chsend_dn[1].start();
1921 chrecv_up[1].wait();
1922 chsend_dn[1].wait();
1937 real_t *work = work_shifted[thread];
1938 for (
int site = is; site < ns; ++site) {
1939 int ix = site % m_Nxv;
1940 int iy = (site / m_Nxv) % m_Nyv;
1941 int izt = site / Nxyv;
1942 int ixzt = ix + m_Nxv * izt;
1944 if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
1945 int iy2 = (iy + 1) % m_Nyv;
1946 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1947 mult_coarse_ypb(&v2[
VLEN * m_Nvc * site],
1948 &u[
VLEN * m_Ndf * site],
1949 &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
1952 int ibf =
VLENX * m_Nvc * ixzt;
1953 mult_coarse_yp2(&v2[
VLEN * m_Nvc * site],
1954 &u[
VLEN * m_Ndf * site],
1955 &v1[
VLEN * m_Nvc * site], &buf2[ibf],
1970 template<
typename AFIELD>
1974 int Nxyv = m_Nxv * m_Nyv;
1976 int ith, nth, is, ns;
1977 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1982 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1986 if (do_comm[1] > 0) {
1992 for (
int site = is; site < ns; ++site) {
1993 int ix = site % m_Nxv;
1994 int iy = (site / m_Nxv) % m_Nyv;
1995 int izt = site / Nxyv;
1996 if (iy == m_Nyv - 1) {
1997 int ibf =
VLENX * m_Nvc * (ix + m_Nxv * izt);
1998 mult_coarse_ym1(&buf1[ibf], &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
2008 chrecv_dn[1].start();
2009 chsend_up[1].start();
2010 chrecv_dn[1].wait();
2011 chsend_up[1].wait();
2027 real_t *work = work_shifted[thread];
2029 for (
int site = is; site < ns; ++site) {
2030 int ix = site % m_Nxv;
2031 int iy = (site / m_Nxv) % m_Nyv;
2032 int izt = site / Nxyv;
2034 if ((iy != 0) || (do_comm[idir] == 0)) {
2035 int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
2036 int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
2037 mult_coarse_ymb(&v2[
VLEN * m_Nvc * site],
2038 &u[
VLEN * m_Ndf * site], &u[
VLEN * m_Ndf * nei],
2039 &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
2042 int ibf =
VLENX * m_Nvc * (ix + m_Nxv * izt);
2043 mult_coarse_ym2(&v2[
VLEN * m_Nvc * site],
2044 &u[
VLEN * m_Ndf * site],
2045 &v1[
VLEN * m_Nvc * site],
2058 template<
typename AFIELD>
2062 int Nxyv = m_Nxv * m_Nyv;
2064 int ith, nth, is, ns;
2065 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2070 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2073 if (do_comm[2] > 0) {
2079 for (
int site = is; site < ns; ++site) {
2080 int ixy = site % Nxyv;
2081 int iz = (site / Nxyv) % m_Nz;
2082 int it = site / (Nxyv * m_Nz);
2083 int ixyt = ixy + Nxyv * it;
2085 mult_coarse_zp1(&buf1[
VLEN * m_Nvc * ixyt], &v1[
VLEN * m_Nvc * site], m_Nc);
2095 chrecv_up[2].start();
2096 chsend_dn[2].start();
2097 chrecv_up[2].wait();
2098 chsend_dn[2].wait();
2113 for (
int site = is; site < ns; ++site) {
2114 int ixy = site % Nxyv;
2115 int iz = (site / Nxyv) % m_Nz;
2116 int it = site / (Nxyv * m_Nz);
2118 if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
2119 int iz2 = (iz + 1) % m_Nz;
2120 int nei = ixy + Nxyv * (iz2 + m_Nz * it);
2121 mult_coarse_zpb(&v2[
VLEN * m_Nvc * site],
2122 &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * nei], m_Nc);
2124 int ixyt = ixy + Nxyv * it;
2125 mult_coarse_zp2(&v2[
VLEN * m_Nvc * site],
2126 &u[
VLEN * m_Ndf * site], &buf2[
VLEN * m_Nvc * ixyt], m_Nc);
2139 template<
typename AFIELD>
2143 int Nxyv = m_Nxv * m_Nyv;
2145 int ith, nth, is, ns;
2146 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2151 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2155 if (do_comm[2] > 0) {
2161 for (
int site = is; site < ns; ++site) {
2162 int ixy = site % Nxyv;
2163 int iz = (site / Nxyv) % m_Nz;
2164 int it = site / (Nxyv * m_Nz);
2165 if (iz == m_Nz - 1) {
2166 int ixyt = ixy + Nxyv * it;
2167 mult_coarse_zm1(&buf1[
VLEN * m_Nvc * ixyt],
2168 &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
2178 chrecv_dn[2].start();
2179 chsend_up[2].start();
2180 chrecv_dn[2].wait();
2181 chsend_up[2].wait();
2196 for (
int site = is; site < ns; ++site) {
2197 int ixy = site % Nxyv;
2198 int iz = (site / Nxyv) % m_Nz;
2199 int it = site / (Nxyv * m_Nz);
2201 if ((iz > 0) || (do_comm[2] == 0)) {
2202 int iz2 = (iz - 1 + m_Nz) % m_Nz;
2203 int nei = ixy + Nxyv * (iz2 + m_Nz * it);
2204 mult_coarse_zmb(&v2[
VLEN * m_Nvc * site],
2205 &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
2207 int ixyt = ixy + Nxyv * it;
2208 mult_coarse_zm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyt], m_Nc);
2220 template<
typename AFIELD>
2224 int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2226 int ith, nth, is, ns;
2227 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2232 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2237 if (do_comm[3] > 0) {
2243 for (
int site = is; site < ns; ++site) {
2244 int ixyz = site % Nxyzv;
2245 int it = site / Nxyzv;
2247 mult_coarse_tp1(&buf1[
VLEN * m_Nvc * ixyz], &v1[
VLEN * m_Nvc * site], m_Nc);
2257 chrecv_up[3].start();
2258 chsend_dn[3].start();
2259 chrecv_up[3].wait();
2260 chsend_dn[3].wait();
2275 for (
int site = is; site < ns; ++site) {
2276 int ixyz = site % Nxyzv;
2277 int it = site / Nxyzv;
2279 if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
2280 int it2 = (it + 1) % m_Nt;
2281 int nei = ixyz + Nxyzv * it2;
2282 mult_coarse_tpb(&v2[
VLEN * m_Nvc * site],
2283 &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * nei], m_Nc);
2285 mult_coarse_tp2(&v2[
VLEN * m_Nvc * site],
2286 &u[
VLEN * m_Ndf * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
2299 template<
typename AFIELD>
2303 int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2305 int ith, nth, is, ns;
2306 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2310 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2312 if (do_comm[3] > 0) {
2313 for (
int site = is; site < ns; ++site) {
2314 int ixyz = site % Nxyzv;
2315 int it = site / Nxyzv;
2316 if (it == m_Nt - 1) {
2317 mult_coarse_tm1(&buf1[
VLEN * m_Nvc * ixyz],
2318 &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
2326 template<
typename AFIELD>
2330 int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2332 int ith, nth, is, ns;
2333 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2337 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2339 for (
int site = is; site < ns; ++site) {
2340 int ixyz = site % Nxyzv;
2341 int it = site / Nxyzv;
2343 if ((it > 0) || (do_comm[3] == 0)) {
2344 int it2 = (it - 1 + m_Nt) % m_Nt;
2345 int nei = ixyz + Nxyzv * it2;
2346 mult_coarse_tmb(&v2[
VLEN * m_Nvc * site],
2347 &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
2349 mult_coarse_tm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
2356 template<
typename AFIELD>
2360 int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2362 int ith, nth, is, ns;
2363 set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2368 real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2372 if (do_comm[3] > 0) {
2378 for (
int site = is; site < ns; ++site) {
2379 int ixyz = site % Nxyzv;
2380 int it = site / Nxyzv;
2381 if (it == m_Nt - 1) {
2382 mult_coarse_tm1(&buf1[
VLEN * m_Nvc * ixyz],
2383 &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
2393 chrecv_dn[3].start();
2394 chsend_up[3].start();
2395 chrecv_dn[3].wait();
2396 chsend_up[3].wait();
2410 for (
int site = is; site < ns; ++site) {
2411 int ixyz = site % Nxyzv;
2412 int it = site / Nxyzv;
2414 if ((it > 0) || (do_comm[3] == 0)) {
2415 int it2 = (it - 1 + m_Nt) % m_Nt;
2416 int nei = ixyz + Nxyzv * it2;
2417 mult_coarse_tmb(&v2[
VLEN * m_Nvc * site],
2418 &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
2420 mult_coarse_tm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
2433 template<
typename AFIELD>
2441 double flop_site, flop;
2448 flop_site =
static_cast<double>(5 * 8 * m_Nc * m_Nc);
2450 flop = flop_site *
static_cast<double>(Lvol);
2451 if ((mode ==
"DdagD") || (mode ==
"DDdag")) flop *= 2.0;