// Enable the fine-grained timers of AFopr_Clover_coarse.
// Comment this define out to compile all TIMER_* hooks away to nothing.
#define AFOPR_CLOVER_COARSE_TIMER

#ifdef AFOPR_CLOVER_COARSE_TIMER
// Timers enabled: each hook expands to a start()/stop() call on the
// corresponding unique_ptr<Timer> member.
#define TIMER_mult_start               timer_mult->start();
#define TIMER_mult_stop                timer_mult->stop();
#define TIMER_pack_start               timer_pack->start();
#define TIMER_pack_stop                timer_pack->stop();
#define TIMER_bulk_start               timer_bulk->start();
#define TIMER_bulk_stop                timer_bulk->stop();
#define TIMER_boundary_start           timer_boundary->start();
#define TIMER_boundary_stop            timer_boundary->stop();
#define TIMER_comm_start               timer_comm->start();
#define TIMER_comm_stop                timer_comm->stop();
#define TIMER_comm_recv_wait_start     timer_comm_recv_wait->start();
#define TIMER_comm_recv_wait_stop      timer_comm_recv_wait->stop();
#define TIMER_comm_send_wait_start     timer_comm_send_wait->start();
#define TIMER_comm_send_wait_stop      timer_comm_send_wait->stop();
#define TIMER_comm_recv_start_start    timer_comm_recv_start->start();
#define TIMER_comm_recv_start_stop     timer_comm_recv_start->stop();
#define TIMER_comm_send_start_start    timer_comm_send_start->start();
#define TIMER_comm_send_start_stop     timer_comm_send_start->stop();
#define TIMER_comm_test_all_start      timer_comm_test_all->start();
#define TIMER_comm_test_all_stop       timer_comm_test_all->stop();
#define TIMER_clear_start              timer_clear->start();
#define TIMER_clear_stop               timer_clear->stop();
#else
// Timers disabled: every hook expands to nothing (zero overhead).
#define TIMER_mult_start
#define TIMER_mult_stop
#define TIMER_pack_start
#define TIMER_pack_stop
#define TIMER_bulk_start
#define TIMER_bulk_stop
#define TIMER_boundary_start
#define TIMER_boundary_stop
#define TIMER_comm_start
#define TIMER_comm_stop
#define TIMER_comm_recv_wait_start
#define TIMER_comm_recv_wait_stop
#define TIMER_comm_send_wait_start
#define TIMER_comm_send_wait_stop
#define TIMER_comm_recv_start_start
#define TIMER_comm_recv_start_stop
#define TIMER_comm_send_start_start
#define TIMER_comm_send_start_stop
#define TIMER_comm_test_all_start
#define TIMER_comm_test_all_stop
#define TIMER_clear_start
#define TIMER_clear_stop
#endif
   68 #ifndef QXS_DATA_ALIGNMENT 
   69   constexpr 
int alignment = 256;
 
   71   constexpr 
int alignment = QXS_DATA_ALIGNMENT;
 
   75   static inline void accum_mult_u_i(
Vsimd_t *out,
 
   81     const int nh = ncol / 2;
 
   89     for (
int j = 0; j < nh; j++) {
 
   91       load_vec(vu, &u[
VLEN * 4 * j], 4);
 
   92       load_vec(vin, &in[
VLEN * 4 * j], 4);
 
   95       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
   96       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
   97       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
   98       add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  101       add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
 
  102       sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
 
  103       add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
 
  104       add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
 
  106       load_vec(vu, &u[
VLEN * 4 * (j + nh)], 4);
 
  108       add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
 
  109       sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
 
  110       add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
 
  111       add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
 
  114       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  115       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  116       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  117       add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  123   static inline void accum_mult_u_yp_i(
Vsimd_t *out,
 
  129     const int nh = ncol / 2;
 
  137     for (
int j = 0; j < nh; j++) {
 
  139       load_vec(vu, &u[
VLEN * 4 * j], 4);
 
  142       shift_vec2_ybw(in, &in1[
VLEN * 4 * j], &in2[
VLEN * 4 * j], 4);
 
  146       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
  147       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
  148       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
  149       add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  152       add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
 
  153       sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
 
  154       add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
 
  155       add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
 
  157       load_vec(vu, &u[
VLEN * 4 * (j + nh)], 4);
 
  159       add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
 
  160       sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
 
  161       add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
 
  162       add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
 
  165       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  166       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  167       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  168       add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  174   static inline void accum_mult_u(
real_t *out,
 
  186     const int nh = ncol / 2;
 
  188     for (
int i = 0; i < nh; i++) {
 
  190       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  191       accum_mult_u_i(tmp, in, u0, i, ncol);
 
  192       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  198   static inline void accum_mult_udag_i(
Vsimd_t *out,
 
  204     const int dof2 = ncol * ncol;
 
  205     const int nh   = ncol / 2;
 
  211     for (
int j = 0; j < nh; j++) {
 
  216       load_vec(vu, &u[
VLEN * 4 * i], 4);
 
  217       load_vec(vin, &in[
VLEN * 4 * j], 4);
 
  220       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
  221       add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
  222       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
  223       sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  226       sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
 
  227       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
 
  228       sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
 
  229       add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
 
  231       load_vec(vu, &u[
VLEN * (2 * ncol + 4 * i)], 4);
 
  233       sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
 
  234       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
 
  235       sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
 
  236       add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
 
  239       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  240       add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  241       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  242       sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  248   static inline void accum_mult_udag_xm_i(
Vsimd_t *out,
 
  254     const int nh = ncol / 2;
 
  260     for (
int j = 0; j < nh; j++) {
 
  272       load_vec(vin, &in[
VLEN * 4 * j], 4);
 
  275       shift_vec2_xfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
 
  279       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
  280       add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
  281       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
  282       sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  285       sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
 
  286       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
 
  287       sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
 
  288       add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
 
  291       shift_vec2_xfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
 
  295       sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
 
  296       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
 
  297       sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
 
  298       add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
 
  301       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  302       add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  303       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  304       sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  310   static inline void accum_mult_udag_ym_i(
Vsimd_t *out,
 
  316     const int nh = ncol / 2;
 
  322     for (
int j = 0; j < nh; j++) {
 
  335       shift_vec2_yfw(in, &in1[
VLEN * 4 * j], &in2[
VLEN * 4 * j], 4);
 
  336       load_vec(vin, in, 4);
 
  339       shift_vec2_yfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
 
  343       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
  344       add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
  345       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
  346       sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  349       sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
 
  350       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
 
  351       sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
 
  352       add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
 
  355       shift_vec2_yfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
 
  359       sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
 
  360       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
 
  361       sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
 
  362       add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
 
  365       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  366       add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  367       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  368       sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  374   static inline void accum_mult_udag_ym_i(
Vsimd_t *out,
 
  380     const int nh = ncol / 2;
 
  386     for (
int j = 0; j < nh; j++) {
 
  398       load_vec(vin, &in[
VLEN * 4 * j], 4);
 
  401       shift_vec2_yfw(u, &uu1[
VLEN * 4 * i], &uu2[
VLEN * 4 * i], 4);
 
  405       add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
 
  406       add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
 
  407       add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
 
  408       sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
 
  411       sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
 
  412       sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
 
  413       sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
 
  414       add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
 
  417       shift_vec2_yfw(u, &uu1[
VLEN * (2 * ncol + 4 * i)], &uu2[
VLEN * (2 * ncol + 4 * i)], 4);
 
  421       sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
 
  422       sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
 
  423       sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
 
  424       add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
 
  427       add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
 
  428       add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
 
  429       add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
 
  430       sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
 
  436   static inline void accum_mult_udag(
real_t *out,
 
  441     const int nh = ncol / 2;
 
  443     for (
int i = 0; i < nh; i++) {
 
  445       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  446       accum_mult_udag_i(tmp, in, u0, i, ncol);
 
  447       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  453   static inline void set_mult_u(
real_t *out,
 
  458     const int nh = ncol / 2;
 
  460     for (
int i = 0; i < nh; i++) {
 
  463       accum_mult_u_i(tmp, in, u0, i, ncol);
 
  464       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  470   static inline void set_mult_udag(
real_t *out,
 
  475     const int nh = ncol / 2;
 
  477     for (
int i = 0; i < nh; i++) {
 
  480       accum_mult_udag_i(tmp, in, u, i, ncol);
 
  481       save_vec(&out[
VLEN * (4 * i)], tmp, 4);
 
  487   static inline void accum_buf(
real_t *out, 
real_t *in, 
const int ncol)
 
  491     for (
int i = 0; i < 2 * ncol; i++) { 
 
  492       load_vec(&vin, &in[
VLEN * i], 1);
 
  494       add_vec(&
vout, &vin, 1);
 
  501   static inline void copy_buf(
real_t *out, 
real_t *in, 
const int ncol)
 
  503     for (
int i = 0; i < 
VLEN * 2 * ncol; i++) { 
 
  517   static inline void mult_coarse_xp1(
real_t *buf, 
real_t *in, 
const int ncol)
 
  519     const int nh = ncol / 2;
 
  520     for (
int i = 0; i < nh; i++) {
 
  522       load_vec(tmp, &in[
VLEN * 4 * i], 4);
 
  523       save_vec1_x(&buf[
VLENY * 4 * i], tmp, 0, 4);
 
  528   static inline void mult_coarse_xp2(
real_t *out,
 
  535     const int nh = ncol / 2;
 
  537     shift_vec0_xbw(in, in0, 2 * ncol); 
 
  539     for (
int i = 0; i < nh; i++) {
 
  542       shift_vec1_xbw(buf, &buf0[
VLENY * 4 * i], 4);
 
  543       load_vec(vin, &in[
VLEN * 4 * i], 4);
 
  544       add_vec(vin, buf, 4);
 
  545       save_vec(&in[
VLEN * 4 * i], vin, 4);
 
  547     accum_mult_u(out, in, u0, ncol);
 
  551   static inline void mult_coarse_xpb(
real_t *out,
 
  554                                      const int ncol, 
real_t *work)
 
  557     shift_vec2_xbw(in, in1, in2, 2 * ncol); 
 
  558     accum_mult_u(out, in, u, ncol);
 
  565   static inline void mult_coarse_xm1(
real_t *buf, 
real_t *u, 
real_t *in, 
const int ncol)
 
  567     const int nh = ncol / 2;
 
  568     for (
int i = 0; i < nh; i++) {
 
  571       accum_mult_udag_i(tmp, in, u, i, ncol);
 
  572       save_vec1_x(&buf[
VLENY * 4 * i], tmp, 
VLENX - 1, 4);
 
  577   static inline void mult_coarse_xm2(
real_t *out,
 
  582     const int nh = ncol / 2;
 
  583     for (
int i = 0; i < nh; i++) {
 
  588       accum_mult_udag_i(vtmp, in0, u0, i, ncol);
 
  592       save_vec(tmp1, vtmp, 4);
 
  593       shift_vec0_xfw(tmp2, tmp1, 4);
 
  594       load_vec(vtmp, tmp2, 4);
 
  597       shift_vec1_xfw(buf, &buf0[
VLENY * 4 * i], 4);
 
  598       add_vec(vtmp, buf, 4);
 
  602       load_vec(tmpout, &out[
VLEN * 4 * i], 4);
 
  603       add_vec(tmpout, vtmp, 4);
 
  604       save_vec(&out[
VLEN * 4 * i], tmpout, 4);
 
  609   static inline void mult_coarse_xmb(
real_t *out,
 
  616     shift_vec2_xfw(in, in1, in2, 2 * ncol); 
 
  618     const int nh = ncol / 2;
 
  619     for (
int i = 0; i < nh; i++) {
 
  621       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  622       accum_mult_udag_xm_i(tmp, in, u1, u2, i, ncol);
 
  623       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  631   static inline void mult_coarse_yp1(
real_t *buf, 
real_t *in, 
const int ncol)
 
  633     const int nh = ncol / 2;
 
  634     for (
int i = 0; i < nh; i++) {
 
  636       load_vec(tmp, &in[
VLEN * 4 * i], 4);
 
  637       save_vec1_y(&buf[
VLENX * 4 * i], tmp, 0, 4);
 
  642   static inline void mult_coarse_yp2(
real_t *out,
 
  649     const int nh = ncol / 2;
 
  651     shift_vec0_ybw(in, in0, 2 * ncol); 
 
  653     for (
int i = 0; i < nh; i++) {
 
  656       shift_vec1_ybw(buf, &buf0[
VLENX * 4 * i], 4);
 
  657       load_vec(vin, &in[
VLEN * 4 * i], 4);
 
  658       add_vec(vin, buf, 4);
 
  659       save_vec(&in[
VLEN * 4 * i], vin, 4);
 
  661     for (
int i = 0; i < nh; i++) {
 
  663       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  664       accum_mult_u_i(tmp, in, u0, i, ncol);
 
  665       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  670   static inline void mult_coarse_ypb(
real_t *out,
 
  677     shift_vec2_ybw(in, in1, in2, 2 * ncol); 
 
  678     const int nh = ncol / 2;
 
  679     for (
int i = 0; i < nh; i++) {
 
  681       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  682       accum_mult_u_i(tmp, in, u, i, ncol);
 
  683       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  691   static inline void mult_coarse_ym1(
real_t *buf, 
real_t *u, 
real_t *in, 
const int ncol)
 
  693     const int nh = ncol / 2;
 
  694     for (
int i = 0; i < nh; i++) {
 
  697       accum_mult_udag_i(tmp, in, u, i, ncol);
 
  698       save_vec1_y(&buf[
VLENX * 4 * i], tmp, 
VLENY - 1, 4);
 
  703   static inline void mult_coarse_ym2(
real_t *out,
 
  708     const int nh = ncol / 2;
 
  709     for (
int i = 0; i < nh; i++) {
 
  714       accum_mult_udag_i(vtmp, in0, u0, i, ncol);
 
  718       save_vec(tmp1, vtmp, 4);
 
  719       shift_vec0_yfw(tmp2, tmp1, 4);
 
  720       load_vec(vtmp, tmp2, 4);
 
  723       shift_vec1_yfw(buf, &buf0[
VLENX * 4 * i], 4);
 
  724       add_vec(vtmp, buf, 4);
 
  728       load_vec(tmpout, &out[
VLEN * 4 * i], 4);
 
  729       add_vec(tmpout, vtmp, 4);
 
  730       save_vec(&out[
VLEN * 4 * i], tmpout, 4);
 
  735   static inline void mult_coarse_ymb(
real_t *out,
 
  742     shift_vec2_yfw(in, in1, in2, 2 * ncol); 
 
  744     const int nh = ncol / 2;
 
  745     for (
int i = 0; i < nh; i++) {
 
  747       load_vec(tmp, &out[
VLEN * 4 * i], 4);
 
  748       accum_mult_udag_ym_i(tmp, in, u1, u2, i, ncol);
 
  749       save_vec(&out[
VLEN * 4 * i], tmp, 4);
 
  757   static inline void mult_coarse_zp1(
real_t *out, 
real_t *in, 
const int ncol)
 
  759     copy_buf(out, in, ncol);
 
  763   static inline void mult_coarse_zp2(
real_t *out, 
real_t *u, 
real_t *buf, 
const int ncol)
 
  765     accum_mult_u(out, buf, u, ncol);
 
  769   static inline void mult_coarse_zpb(
real_t *out, 
real_t *u, 
real_t *in, 
const int ncol)
 
  771     accum_mult_u(out, in, u, ncol);
 
  778   static inline void mult_coarse_zm1(
real_t *out, 
real_t *u, 
real_t *buf, 
const int ncol)
 
  780     set_mult_udag(out, buf, u, ncol);
 
  784   static inline void mult_coarse_zm2(
real_t *out, 
real_t *buf, 
const int ncol)
 
  786     accum_buf(out, buf, ncol);
 
  790   static inline void mult_coarse_zmb(
real_t *out, 
real_t *u, 
real_t *in, 
const int ncol)
 
  792     accum_mult_udag(out, in, u, ncol);
 
  799   static inline void mult_coarse_tp1(
real_t *out, 
real_t *in, 
const int ncol)
 
  801     copy_buf(out, in, ncol);
 
  805   static inline void mult_coarse_tp2(
real_t *out, 
real_t *u, 
real_t *buf, 
const int ncol)
 
  807     accum_mult_u(out, buf, u, ncol);
 
  811   static inline void mult_coarse_tpb(
real_t *out, 
real_t *u, 
real_t *in, 
const int ncol)
 
  813     accum_mult_u(out, in, u, ncol);
 
  820   static inline void mult_coarse_tm1(
real_t *out, 
real_t *u, 
real_t *buf, 
const int ncol)
 
  822     set_mult_udag(out, buf, u, ncol);
 
  826   static inline void mult_coarse_tm2(
real_t *out, 
real_t *buf, 
const int ncol)
 
  828     accum_buf(out, buf, ncol);
 
  832   static inline void mult_coarse_tmb(
real_t *out, 
real_t *u, 
real_t *in, 
const int ncol)
 
  834     accum_mult_udag(out, in, u, ncol);
 
  840 template<
typename AFIELD>
 
  842   = 
"AFopr_Clover_coarse";
 
  845 template<
typename AFIELD>
 
  858   for (
int mu = 0; mu < Ndim; ++mu) {
 
  861     do_comm_any += do_comm[mu];
 
  862     vout.
general(
"  do_comm[%d] = %d\n", mu, do_comm[mu]);
 
  865   m_bdsize.resize(Ndim);
 
  870   int NinF      = 2 * Nc * Nd;
 
  871   workvec1.reset(NinF, fine_nvol, 1);
 
  872   workvec2.reset(NinF, fine_nvol, 1);
 
  873   workvec3.reset(NinF, fine_nvol, 1);
 
  876 #ifdef AFOPR_CLOVER_COARSE_TIMER 
  877   timer_mult.reset(
new            Timer(
"afopr_Clover_coarse: mult           "));
 
  878   timer_pack.reset(
new            Timer(
"afopr_Clover_coarse: pack           "));
 
  879   timer_bulk.reset(
new            Timer(
"afopr_Clover_coarse: bulk           "));
 
  880   timer_boundary.reset(
new        Timer(
"afopr_Clover_coarse: boundary       "));
 
  881   timer_comm.reset(
new            Timer(
"afopr_Clover_coarse: comm           "));
 
  882   timer_comm_recv_wait.reset(
new  Timer(
"afopr_Clover_coarse: comm_recv_wait "));
 
  883   timer_comm_send_wait.reset(
new  Timer(
"afopr_Clover_coarse: comm_send_wait "));
 
  884   timer_comm_recv_start.reset(
new Timer(
"afopr_Clover_coarse: comm_recv_start"));
 
  885   timer_comm_send_start.reset(
new Timer(
"afopr_Clover_coarse: comm_send_start"));
 
  886   timer_comm_test_all.reset(
new   Timer(
"afopr_Clover_coarse: comm_test_all  "));
 
  887   timer_clear.reset(
new           Timer(
"afopr_Clover_coarse: clear          "));
 
  893 template<
typename AFIELD>
 
  898   chsend_up.resize(Ndim);
 
  899   chrecv_up.resize(Ndim);
 
  900   chsend_dn.resize(Ndim);
 
  901   chrecv_dn.resize(Ndim);
 
  903   for (
int mu = 0; mu < Ndim; ++mu) {
 
  904     size_t Nvsize = m_bdsize[mu] * 
sizeof(
real_t);
 
  906     chsend_dn[mu].send_init(Nvsize, mu, -1);
 
  907     chsend_up[mu].send_init(Nvsize, mu, 1);
 
  909     chrecv_up[mu].recv_init(Nvsize, mu, 1);
 
  910     chrecv_dn[mu].recv_init(Nvsize, mu, -1);
 
  912     void *buf_up = (
void *)chsend_dn[mu].ptr();
 
  913     chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
 
  914     void *buf_dn = (
void *)chsend_up[mu].ptr();
 
  915     chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
 
  918     if (do_comm[mu] == 1) {
 
  919       chset_send.append(chsend_up[mu]);
 
  920       chset_send.append(chsend_dn[mu]);
 
  921       chset_recv.append(chrecv_up[mu]);
 
  922       chset_recv.append(chrecv_dn[mu]);
 
  929 template<
typename AFIELD>
 
  935   for (
int i = 0; i < work_shifted.size(); i++) {
 
  936     free(work_shifted[i]);
 
  937     work_shifted[i] = 
nullptr;
 
  940 #ifdef AFOPR_CLOVER_COARSE_TIMER 
  941   timer_mult->report();
 
  942   timer_clear->report();
 
  943   timer_pack->report();
 
  944   timer_bulk->report();
 
  945   timer_boundary->report();
 
  946   timer_comm->report();
 
  947   timer_comm_recv_wait->report();
 
  948   timer_comm_send_wait->report();
 
  949   timer_comm_recv_start->report();
 
  950   timer_comm_send_start->report();
 
  951   timer_comm_test_all->report();
 
  957 template<
typename AFIELD>
 
  960   const string str_vlevel = params.
get_string(
"verbose_level");
 
  965   std::vector<int> coarse_lattice;
 
  968   err += params.
fetch_int(
"number_of_testvectors", num_testvectors);
 
  971     vout.
crucial(m_vl, 
"Error at %s: input parameter not found.\n",
 
  976   set_parameters(num_testvectors, coarse_lattice);
 
  981 template<
typename AFIELD>
 
  983   const int num_testvectors,
 
  984   const std::vector<int>& coarse_lattice)
 
  989   assert(coarse_lattice.size() == Ndim);
 
  991   m_num_testvectors = num_testvectors;
 
  992   m_ncol            = 2 * num_testvectors; 
 
  994   m_Nc2 = m_ncol * m_ncol;
 
  996   m_Ndf = 2 * m_Nc * m_Nc; 
 
  997   int Nc2 = m_ncol * m_ncol;
 
  999   m_Nx   = coarse_lattice[0];
 
 1000   m_Ny   = coarse_lattice[1];
 
 1001   m_Nz   = coarse_lattice[2];
 
 1002   m_Nt   = coarse_lattice[3];
 
 1003   m_Nst  = m_Nx * m_Ny * m_Nz * m_Nt;
 
 1004   m_Nstv = m_Nst / 
VLEN;
 
 1005   m_Nxv  = m_Nx / 
VLENX;
 
 1006   m_Nyv  = m_Ny / 
VLENY;
 
 1009   if (m_Nxv * 
VLENX != m_Nx) {
 
 1010     vout.
crucial(
"%s: bad coarse lattice size in x-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(), 
VLENX, m_Nx);
 
 1013   if (m_Nyv * 
VLENY != m_Ny) {
 
 1014     vout.
crucial(
"%s: bad coarse lattice size in y-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(), 
VLENY, m_Ny);
 
 1019   m_bdsize[0] = m_Nvc * m_Ny * m_Nz * m_Nt;
 
 1020   m_bdsize[1] = m_Nvc * m_Nx * m_Nz * m_Nt;
 
 1021   m_bdsize[2] = m_Nvc * m_Nx * m_Ny * m_Nt;
 
 1022   m_bdsize[3] = m_Nvc * m_Nx * m_Ny * m_Nz;
 
 1026   size_t coarse_nvol = m_Nst;
 
 1029   m_U.reset(m_Ndf, m_Nst, Ndim);   
 
 1030   m_Clov.reset(m_Ndf, m_Nst, 1);   
 
 1032   tmp_buffer1.resize(coarse_nvol);
 
 1033   tmp_buffer2.resize(coarse_nvol);
 
 1036   int pool_size = ((
sizeof(
real_t) * 
VLEN * m_Nvc - 1) / alignment + 1) * alignment;
 
 1038   for (
int i = 0; i < work_shifted.size(); i++) {
 
 1039     free(work_shifted[i]);
 
 1040     work_shifted[i] = 
nullptr;
 
 1042   work_shifted.resize(nthreads);
 
 1043   for (
int i = 0; i < nthreads; i++) {
 
 1044     posix_memalign((
void **)&work_shifted[i], alignment, pool_size);
 
 1046   vout.
detailed(m_vl, 
"shifted buffer: size=%d, alignment=%d\n", pool_size, alignment);
 
 1050   for (
int th = 0; th < m_list_boundary.size(); th++) {
 
 1051     vout.
detailed(m_vl, 
"  thread=%d,  number of boundary sites = %d\n", th, m_list_boundary[th].size());
 
 1053   vout.
general(m_vl, 
"Parameters of %s:\n", class_name.c_str());
 
 1054   for (
int mu = 0; mu < Ndim; ++mu) {
 
 1055     vout.
general(m_vl, 
"  coarse_lattice_size[%d] = %2d\n",
 
 1056                  mu, coarse_lattice[mu]);
 
 1062 template<
typename AFIELD>
 
 1065   int              work_xp = m_ncol;
 
 1066   int              work_xm = m_ncol;
 
 1067   int              work_yp = m_ncol;
 
 1068   int              work_ym = m_ncol;
 
 1069   int              work_zp = m_ncol;
 
 1071   int              work_tp = m_ncol;
 
 1073   std::vector<int> workload(m_Nstv);
 
 1074   for (
int site = 0; site < m_Nstv; ++site) {
 
 1077   for (
int site = 0; site < m_Nstv; ++site) {
 
 1078     int ix   = site % m_Nxv;
 
 1079     int iyzt = site / m_Nxv;
 
 1080     int iy   = iyzt % m_Nyv;
 
 1081     int izt  = site / (m_Nxv * m_Nyv);
 
 1082     int iz   = izt % m_Nz;
 
 1083     int it   = izt / m_Nz;
 
 1084     if (do_comm[0] == 1) {
 
 1085       if (ix == m_Nxv - 1) { workload[site] += work_xp; }
 
 1086       if (ix == 0) { workload[site] += work_xm; }
 
 1088     if (do_comm[1] == 1) {
 
 1089       if (iy == m_Nyv - 1) { workload[site] += work_yp; }
 
 1090       if (iy == 0) { workload[site] += work_ym; }
 
 1092     if (do_comm[2] == 1) {
 
 1093       if (iz == m_Nz - 1) { workload[site] += work_zp; }
 
 1094       if (iz == 0) { workload[site] += work_zm; }
 
 1096     if (do_comm[3] == 1) {
 
 1097       if (it == m_Nt - 1) { workload[site] += work_tp; }
 
 1098       if (it == 0) { workload[site] += work_tm; }
 
 1104   if (nth > 2) { nth--; }  
 
 1106   std::vector<std::vector<int> > tmp_list;
 
 1107   std::vector<int>               work(nth);
 
 1108   std::vector<int>               tmp_list_next(nth);
 
 1109   tmp_list.resize(nth);
 
 1110   for (
int i = 0; i < nth; i++) {
 
 1111     tmp_list[i].resize(m_Nstv);
 
 1113   for (
int i = 0; i < nth; i++) {
 
 1116   for (
int i = 0; i < nth; i++) {
 
 1117     tmp_list_next[i] = 0;
 
 1119   int th_min_work = 0;
 
 1124     int max_work_site = -1;
 
 1125     for (
int site = 0; site < m_Nstv; site++) {
 
 1126       if (workload[site] > max_work) {
 
 1127         max_work      = workload[site];
 
 1128         max_work_site = site;
 
 1131     if (max_work == 0) {  
 
 1135     tmp_list[th_min_work][tmp_list_next[th_min_work]] = max_work_site;
 
 1136     tmp_list_next[th_min_work]++;
 
 1137     work[th_min_work]      += max_work;
 
 1138     workload[max_work_site] = 0;
 
 1141     int min_work = work[th_min_work];
 
 1142     for (
int th = 0; th < nth; th++) {
 
 1143       if (work[th] < min_work) {
 
 1144         min_work    = work[th];
 
 1151   m_list_boundary.resize(nth0);
 
 1152   m_list_boundary[0].resize(0);
 
 1153   for (
int th = 0; th < nth; th++) {
 
 1155     if (nth0 > nth) { th0++; }
 
 1156     int size = tmp_list_next[th];
 
 1157     m_list_boundary[th0].resize(size);
 
 1158     vout.
general(
"hoge: setting list: th0=%d/%d, size=%d, load=%d\n",
 
 1159                  th0, nth0, size, work[th]);
 
 1160     for (
int i = 0; i < size; i++) {
 
 1162                    th0, nth0, i, tmp_list[th][i]);
 
 1164       m_list_boundary[th0][i] = tmp_list[th][i];
 
 1171 template<
typename AFIELD>
 
 1174   const std::vector<AFIELD>& atestvec)
 
 1176   int       ith, nth, coarse_is, coarse_ns;
 
 1177   const int coarse_nvol = m_U.nvol();
 
 1178   set_threadtask(ith, nth, coarse_is, coarse_ns, coarse_nvol);
 
 1180   real_t    *out_clov   = m_Clov.ptr(0);
 
 1181   real_t    *out_gauge  = m_U.ptr(0);
 
 1182   const int num_vectors = m_num_testvectors;
 
 1183   const int coarse_Nc2  = m_ncol * m_ncol;
 
 1184   assert(m_Nc2 == coarse_Nc2);
 
 1189   if (fine_afopr == 
nullptr) {
 
 1190     vout.
crucial(
"%s: in generate_coarse_op, a bad fine operator" 
 1191                  " is given (mustbbe AFopr_Clover_dd<AFIELD>).\n",
 
 1192                  class_name.c_str());
 
 1199   std::vector<int> coarse_lattice(4);
 
 1200   coarse_lattice[0] = m_Nx;
 
 1201   coarse_lattice[1] = m_Ny;
 
 1202   coarse_lattice[2] = m_Nz;
 
 1203   coarse_lattice[3] = m_Nt;
 
 1216   for (
int i1 = 0; i1 < num_vectors; ++i1) {
 
 1217     for (
int ch1 = -1; ch1 < 2; ch1 += 2) { 
 
 1221       fine_afopr->
mult_dd(workvec2, workvec1);
 
 1223       int    I    = 2 * i1 + (ch1 + 1) / 2;
 
 1225       for (
int i2 = 0; i2 < num_vectors; ++i2) {
 
 1228         block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
 
 1233         block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
 
 1237         for (
int s = coarse_is; s < coarse_ns; s++) {
 
 1238           int idx_r = index_coarse.idx_Gr(J, I, s, 0);
 
 1239           int idx_i = index_coarse.idx_Gi(J, I, s, 0);
 
 1240           out[idx_r] += real(tmp_buffer1[s]);
 
 1241           out[idx_i] += imag(tmp_buffer1[s]);
 
 1245         for (
int s = coarse_is; s < coarse_ns; s++) {
 
 1246           int idx_r = index_coarse.idx_Gr(J, I, s, 0);
 
 1247           int idx_i = index_coarse.idx_Gi(J, I, s, 0);
 
 1248           out[idx_r] += real(tmp_buffer2[s]);
 
 1249           out[idx_i] += imag(tmp_buffer2[s]);
 
 1255       for (
int mu = 0; mu < 4; mu++) {
 
 1257         fine_afopr->
mult_dup(workvec2, workvec1, mu);
 
 1258         real_t *out = out_gauge + mu * 2 * coarse_Nc2 * coarse_nvol;
 
 1261         int I = 2 * i1 + (ch1 + 1) / 2;
 
 1262         for (
int i2 = 0; i2 < num_vectors; ++i2) {
 
 1265           block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
 
 1270           block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
 
 1274           for (
int s = coarse_is; s < coarse_ns; ++s) {
 
 1275             int idx_r = index_coarse.idx_Gr(J, I, s, 0);
 
 1276             int idx_i = index_coarse.idx_Gi(J, I, s, 0);
 
 1277             out[idx_r] += real(tmp_buffer1[s]);
 
 1278             out[idx_i] += imag(tmp_buffer1[s]);
 
 1282           for (
int s = coarse_is; s < coarse_ns; ++s) {
 
 1283             int idx_r = index_coarse.idx_Gr(J, I, s, 0);
 
 1284             int idx_i = index_coarse.idx_Gi(J, I, s, 0);
 
 1285             out[idx_r] += real(tmp_buffer2[s]);
 
 1286             out[idx_i] += imag(tmp_buffer2[s]);
 
 1298     double clv2 = m_Clov.norm2();
 
 1299     double u2   = m_U.norm2();
 
 1300     vout.
general(
"%s: |m_Clov|^2 = %23.15e\n", class_name.c_str(), clv2);
 
 1301     vout.
general(
"%s: |m_U|^2    = %23.15e\n", class_name.c_str(), u2);
 
 1304     for (
int i = 0; i < 2 * num_vectors; ++i) {
 
 1305       for (
int j = 0; j < 2 * num_vectors; ++j) {
 
 1308         int idx_r = index_coarse.idx_Gr(j, i, s, mu);
 
 1309         int idx_i = index_coarse.idx_Gi(j, i, s, mu);
 
 1311                      m_U.cmp(idx_r), m_U.cmp(idx_i));
 
 1321 template<
typename AFIELD>
 
 1324   vout.
crucial(m_vl, 
"%s: set_config is called\n", class_name.c_str());
 
 1330 template<
typename AFIELD>
 
 1338 template<
typename AFIELD>
 
 1346 template<
typename AFIELD>
 
 1354   } 
else if (mu == 1) {
 
 1356   } 
else if (mu == 2) {
 
 1358   } 
else if (mu == 3) {
 
 1361     vout.
crucial(m_vl, 
"%s: mult_up for %d direction is undefined.",
 
 1362                  class_name.c_str(), mu);
 
 1369 template<
typename AFIELD>
 
 1377   } 
else if (mu == 1) {
 
 1379   } 
else if (mu == 2) {
 
 1381   } 
else if (mu == 3) {
 
 1384     vout.
crucial(m_vl, 
"%s: mult_dn for %d direction is undefined.",
 
 1385                  class_name.c_str(), mu);
 
 1392 template<
typename AFIELD>
 
 1398   if (ith == 0) m_mode = mode;
 
 1405 template<
typename AFIELD>
 
 1413 template<
typename AFIELD>
 
 1416   if (m_mode == 
"D") {
 
 1418   } 
else if (m_mode == 
"DdagD") {
 
 1420   } 
else if (m_mode == 
"Ddag") {
 
 1422   } 
else if (m_mode == 
"H") {
 
 1425     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
 1432 template<
typename AFIELD>
 
 1435   if (m_mode == 
"D") {
 
 1437   } 
else if (m_mode == 
"DdagD") {
 
 1439   } 
else if (m_mode == 
"Ddag") {
 
 1441   } 
else if (m_mode == 
"H") {
 
 1444     vout.
crucial(m_vl, 
"%s: mode undefined.\n", class_name.c_str());
 
 1451 template<
typename AFIELD>
 
 1461 template<
typename AFIELD>
 
 1472 template<
typename AFIELD>
 
 1482 template<
typename AFIELD>
 
 1497 template<
typename AFIELD>
 
 1512 template<
typename AFIELD>
 
 1515   int ith, nth, is, ns;
 
 1516   set_threadtask(ith, nth, is, ns, m_Nstv);
 
 1517   for (
int s = is; s < ns; ++s) {
 
 1519     const real_t *in = w + s * 2 * 
VLEN * m_ncol;
 
 1520     for (
int i = 0; i < m_ncol / 2; ++i) {
 
 1522       load_vec(tmp, &in[4 * i * 
VLEN], 4);
 
 1523       scal_vec(tmp, 
real_t(-1), 2); 
 
 1524       save_vec(&out[4 * i * 
VLEN], tmp, 4);
 
 1531 template<
typename AFIELD>
 
 1534   real_t *u = m_Clov.ptr(0);
 
 1536   int ith, nth, is, ns;
 
 1537   set_threadtask(ith, nth, is, ns, m_Nstv);
 
 1540   int nv  = 
VLEN * m_Nvc;
 
 1541   int nv2 = 
VLEN * m_Ndf;
 
 1543   for (
int site = is; site < ns; ++site) {
 
 1544     accum_mult_u(&v2[nv * site], &v1[nv * site],
 
 1545                  &u[nv2 * site], m_Nc);
 
 1553 template<
typename AFIELD>
 
 1562   int ith, nth, is, ns;
 
 1563   set_threadtask(ith, nth, is, ns, m_Nstv);
 
 1564   const bool time_keeper = (ith == nth - 1);
 
 1568   real_t     *cp         = m_Clov.ptr(0);
 
 1570   int Nsize[4] = { m_Nxv, m_Nyv, m_Nz, m_Nt };
 
 1575   if (do_comm_any > 0) {
 
 1593                              buf1_zp, buf1_zm, buf1_tp, buf1_tm,
 
 1594                              up, wp, Nsize, m_ncol, do_comm);
 
 1640                            buf2_xp, buf2_xm, buf2_yp, buf2_ym,
 
 1641                            buf2_zp, buf2_zm, buf2_tp, buf2_tm,
 
 1642                            Nsize, m_ncol, do_comm, work_shifted[ith],
 
 1643                            m_list_boundary[ith]);
 
 1662 template<
typename AFIELD>
 
 1692 template<
typename AFIELD>
 
 1701 template<
typename AFIELD>
 
 1704   int ith, nth, is, ns;
 
 1705   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1708   clear_vec(vzero, 2);
 
 1709   for (
int site = is; site < ns; ++site) {
 
 1711     for (
int ic = 0; ic < m_Nvc; ic += 2) {
 
 1712       save_vec(&out[
VLEN * ic], vzero, 2);
 
 1719 template<
typename AFIELD>
 
 1724   int ith, nth, is, ns;
 
 1725   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1730   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1733   real_t *work = work_shifted[ith];
 
 1736   if (do_comm[0] > 0) {
 
 1742     for (
int site = is; site < ns; ++site) {
 
 1743       int ix   = site % m_Nxv;
 
 1744       int iyzt = site / m_Nxv;
 
 1746         int ibf = 
VLENY * m_Nvc * iyzt;
 
 1747         mult_coarse_xp1(&buf1[ibf], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 1757       chrecv_up[0].start();
 
 1758       chsend_dn[0].start();
 
 1759       chrecv_up[0].wait();
 
 1760       chsend_dn[0].wait();
 
 1774   for (
int site = is; site < ns; ++site) {
 
 1775     int ix   = site % m_Nxv;
 
 1776     int iyzt = site / m_Nxv;
 
 1778     if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
 
 1779       int nei = (ix + 1) + m_Nxv * iyzt;
 
 1780       if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
 
 1781       mult_coarse_xpb(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
 
 1782                       &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei], m_Nc, work);
 
 1784       int ibf = 
VLENY * m_Nvc * iyzt;
 
 1785       mult_coarse_xp2(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
 
 1786                       &v1[
VLEN * m_Nvc * site], &buf2[ibf], m_Nc, work);
 
 1799 template<
typename AFIELD>
 
 1804   int ith, nth, is, ns;
 
 1805   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1810   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1813   real_t *work = work_shifted[ith];
 
 1817   if (do_comm[0] > 0) {
 
 1823     for (
int site = is; site < ns; ++site) {
 
 1824       int ix   = site % m_Nxv;
 
 1825       int iyzt = site / m_Nxv;
 
 1826       if (ix == m_Nxv - 1) {
 
 1827         int ibf = 
VLENY * m_Nvc * iyzt;
 
 1828         mult_coarse_xm1(&buf1[ibf], &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 1837       chrecv_dn[0].start();
 
 1838       chsend_up[0].start();
 
 1839       chrecv_dn[0].wait();
 
 1840       chsend_up[0].wait();
 
 1854   for (
int site = is; site < ns; ++site) {
 
 1855     int ix   = site % m_Nxv;
 
 1856     int iyzt = site / m_Nxv;
 
 1858     if ((ix > 0) || (do_comm[0] == 0)) {
 
 1859       int ix2 = (ix - 1 + m_Nxv) % m_Nxv;
 
 1860       int nei = ix2 + m_Nxv * iyzt;
 
 1861       mult_coarse_xmb(&v2[
VLEN * m_Nvc * site],
 
 1862                       &u[
VLEN * m_Ndf * site], &u[
VLEN * m_Ndf * nei],
 
 1863                       &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
 
 1866       int ibf = 
VLENY * m_Nvc * iyzt;
 
 1867       mult_coarse_xm2(&v2[
VLEN * m_Nvc * site], &u[
VLEN * m_Ndf * site],
 
 1868                       &v1[
VLEN * m_Nvc * site], &buf2[ibf], m_Nc);
 
 1881 template<
typename AFIELD>
 
 1885   int Nxyv = m_Nxv * m_Nyv;
 
 1887   int ith, nth, is, ns;
 
 1888   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1893   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1896   if (do_comm[1] > 0) {
 
 1902     for (
int site = is; site < ns; ++site) {
 
 1903       int ix   = site % m_Nxv;
 
 1904       int iy   = (site / m_Nxv) % m_Nyv;
 
 1905       int izt  = site / Nxyv;
 
 1906       int ixzt = ix + m_Nxv * izt;
 
 1908         int ibf = 
VLENX * m_Nvc * ixzt;
 
 1909         mult_coarse_yp1(&buf1[ibf], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 1919       chrecv_up[1].start();
 
 1920       chsend_dn[1].start();
 
 1921       chrecv_up[1].wait();
 
 1922       chsend_dn[1].wait();
 
 1937   real_t *work  = work_shifted[thread];
 
 1938   for (
int site = is; site < ns; ++site) {
 
 1939     int ix   = site % m_Nxv;
 
 1940     int iy   = (site / m_Nxv) % m_Nyv;
 
 1941     int izt  = site / Nxyv;
 
 1942     int ixzt = ix + m_Nxv * izt;
 
 1944     if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
 
 1945       int iy2 = (iy + 1) % m_Nyv;
 
 1946       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
 1947       mult_coarse_ypb(&v2[
VLEN * m_Nvc * site],
 
 1948                       &u[
VLEN * m_Ndf * site],
 
 1949                       &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
 
 1952       int ibf = 
VLENX * m_Nvc * ixzt;
 
 1953       mult_coarse_yp2(&v2[
VLEN * m_Nvc * site],
 
 1954                       &u[
VLEN * m_Ndf * site],
 
 1955                       &v1[
VLEN * m_Nvc * site], &buf2[ibf],
 
 1970 template<
typename AFIELD>
 
 1974   int Nxyv = m_Nxv * m_Nyv;
 
 1976   int ith, nth, is, ns;
 
 1977   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 1982   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 1986   if (do_comm[1] > 0) {
 
 1992     for (
int site = is; site < ns; ++site) {
 
 1993       int ix  = site % m_Nxv;
 
 1994       int iy  = (site / m_Nxv) % m_Nyv;
 
 1995       int izt = site / Nxyv;
 
 1996       if (iy == m_Nyv - 1) {
 
 1997         int ibf = 
VLENX * m_Nvc * (ix + m_Nxv * izt);
 
 1998         mult_coarse_ym1(&buf1[ibf], &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2008       chrecv_dn[1].start();
 
 2009       chsend_up[1].start();
 
 2010       chrecv_dn[1].wait();
 
 2011       chsend_up[1].wait();
 
 2027   real_t *work  = work_shifted[thread];
 
 2029   for (
int site = is; site < ns; ++site) {
 
 2030     int ix  = site % m_Nxv;
 
 2031     int iy  = (site / m_Nxv) % m_Nyv;
 
 2032     int izt = site / Nxyv;
 
 2034     if ((iy != 0) || (do_comm[idir] == 0)) {
 
 2035       int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
 
 2036       int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
 
 2037       mult_coarse_ymb(&v2[
VLEN * m_Nvc * site],
 
 2038                       &u[
VLEN * m_Ndf * site], &u[
VLEN * m_Ndf * nei],
 
 2039                       &v1[
VLEN * m_Nvc * site], &v1[
VLEN * m_Nvc * nei],
 
 2042       int ibf = 
VLENX * m_Nvc * (ix + m_Nxv * izt);
 
 2043       mult_coarse_ym2(&v2[
VLEN * m_Nvc * site],
 
 2044                       &u[
VLEN * m_Ndf * site],
 
 2045                       &v1[
VLEN * m_Nvc * site],
 
 2058 template<
typename AFIELD>
 
 2062   int Nxyv = m_Nxv * m_Nyv;
 
 2064   int ith, nth, is, ns;
 
 2065   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2070   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2073   if (do_comm[2] > 0) {
 
 2079     for (
int site = is; site < ns; ++site) {
 
 2080       int ixy  = site % Nxyv;
 
 2081       int iz   = (site / Nxyv) % m_Nz;
 
 2082       int it   = site / (Nxyv * m_Nz);
 
 2083       int ixyt = ixy + Nxyv * it;
 
 2085         mult_coarse_zp1(&buf1[
VLEN * m_Nvc * ixyt], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2095       chrecv_up[2].start();
 
 2096       chsend_dn[2].start();
 
 2097       chrecv_up[2].wait();
 
 2098       chsend_dn[2].wait();
 
 2113   for (
int site = is; site < ns; ++site) {
 
 2114     int ixy = site % Nxyv;
 
 2115     int iz  = (site / Nxyv) % m_Nz;
 
 2116     int it  = site / (Nxyv * m_Nz);
 
 2118     if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
 
 2119       int iz2 = (iz + 1) % m_Nz;
 
 2120       int nei = ixy + Nxyv * (iz2 + m_Nz * it);
 
 2121       mult_coarse_zpb(&v2[
VLEN * m_Nvc * site],
 
 2122                       &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * nei], m_Nc);
 
 2124       int ixyt = ixy + Nxyv * it;
 
 2125       mult_coarse_zp2(&v2[
VLEN * m_Nvc * site],
 
 2126                       &u[
VLEN * m_Ndf * site], &buf2[
VLEN * m_Nvc * ixyt], m_Nc);
 
 2139 template<
typename AFIELD>
 
 2143   int Nxyv = m_Nxv * m_Nyv;
 
 2145   int ith, nth, is, ns;
 
 2146   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2151   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2155   if (do_comm[2] > 0) {
 
 2161     for (
int site = is; site < ns; ++site) {
 
 2162       int ixy = site % Nxyv;
 
 2163       int iz  = (site / Nxyv) % m_Nz;
 
 2164       int it  = site / (Nxyv * m_Nz);
 
 2165       if (iz == m_Nz - 1) {
 
 2166         int ixyt = ixy + Nxyv * it;
 
 2167         mult_coarse_zm1(&buf1[
VLEN * m_Nvc * ixyt],
 
 2168                         &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2178       chrecv_dn[2].start();
 
 2179       chsend_up[2].start();
 
 2180       chrecv_dn[2].wait();
 
 2181       chsend_up[2].wait();
 
 2196   for (
int site = is; site < ns; ++site) {
 
 2197     int ixy = site % Nxyv;
 
 2198     int iz  = (site / Nxyv) % m_Nz;
 
 2199     int it  = site / (Nxyv * m_Nz);
 
 2201     if ((iz > 0) || (do_comm[2] == 0)) {
 
 2202       int iz2 = (iz - 1 + m_Nz) % m_Nz;
 
 2203       int nei = ixy + Nxyv * (iz2 + m_Nz * it);
 
 2204       mult_coarse_zmb(&v2[
VLEN * m_Nvc * site],
 
 2205                       &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
 
 2207       int ixyt = ixy + Nxyv * it;
 
 2208       mult_coarse_zm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyt], m_Nc);
 
 2220 template<
typename AFIELD>
 
 2224   int Nxyzv = m_Nxv * m_Nyv * m_Nz;
 
 2226   int ith, nth, is, ns;
 
 2227   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2232   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2237   if (do_comm[3] > 0) {
 
 2243     for (
int site = is; site < ns; ++site) {
 
 2244       int ixyz = site % Nxyzv;
 
 2245       int it   = site / Nxyzv;
 
 2247         mult_coarse_tp1(&buf1[
VLEN * m_Nvc * ixyz], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2257       chrecv_up[3].start();
 
 2258       chsend_dn[3].start();
 
 2259       chrecv_up[3].wait();
 
 2260       chsend_dn[3].wait();
 
 2275   for (
int site = is; site < ns; ++site) {
 
 2276     int ixyz = site % Nxyzv;
 
 2277     int it   = site / Nxyzv;
 
 2279     if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
 
 2280       int it2 = (it + 1) % m_Nt;
 
 2281       int nei = ixyz + Nxyzv * it2;
 
 2282       mult_coarse_tpb(&v2[
VLEN * m_Nvc * site],
 
 2283                       &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * nei], m_Nc);
 
 2285       mult_coarse_tp2(&v2[
VLEN * m_Nvc * site],
 
 2286                       &u[
VLEN * m_Ndf * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
 
 2299 template<
typename AFIELD>
 
 2303   int Nxyzv = m_Nxv * m_Nyv * m_Nz;
 
 2305   int ith, nth, is, ns;
 
 2306   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2310   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2312   if (do_comm[3] > 0) {
 
 2313     for (
int site = is; site < ns; ++site) {
 
 2314       int ixyz = site % Nxyzv;
 
 2315       int it   = site / Nxyzv;
 
 2316       if (it == m_Nt - 1) {
 
 2317         mult_coarse_tm1(&buf1[
VLEN * m_Nvc * ixyz],
 
 2318                         &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2326 template<
typename AFIELD>
 
 2330   int Nxyzv = m_Nxv * m_Nyv * m_Nz;
 
 2332   int ith, nth, is, ns;
 
 2333   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2337   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2339   for (
int site = is; site < ns; ++site) {
 
 2340     int ixyz = site % Nxyzv;
 
 2341     int it   = site / Nxyzv;
 
 2343     if ((it > 0) || (do_comm[3] == 0)) {
 
 2344       int it2 = (it - 1 + m_Nt) % m_Nt;
 
 2345       int nei = ixyz + Nxyzv * it2;
 
 2346       mult_coarse_tmb(&v2[
VLEN * m_Nvc * site],
 
 2347                       &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
 
 2349       mult_coarse_tm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
 
 2356 template<
typename AFIELD>
 
 2360   int Nxyzv = m_Nxv * m_Nyv * m_Nz;
 
 2362   int ith, nth, is, ns;
 
 2363   set_threadtask_mult(ith, nth, is, ns, m_Nstv);
 
 2368   real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
 
 2372   if (do_comm[3] > 0) {
 
 2378     for (
int site = is; site < ns; ++site) {
 
 2379       int ixyz = site % Nxyzv;
 
 2380       int it   = site / Nxyzv;
 
 2381       if (it == m_Nt - 1) {
 
 2382         mult_coarse_tm1(&buf1[
VLEN * m_Nvc * ixyz],
 
 2383                         &u[
VLEN * m_Ndf * site], &v1[
VLEN * m_Nvc * site], m_Nc);
 
 2393       chrecv_dn[3].start();
 
 2394       chsend_up[3].start();
 
 2395       chrecv_dn[3].wait();
 
 2396       chsend_up[3].wait();
 
 2410   for (
int site = is; site < ns; ++site) {
 
 2411     int ixyz = site % Nxyzv;
 
 2412     int it   = site / Nxyzv;
 
 2414     if ((it > 0) || (do_comm[3] == 0)) {
 
 2415       int it2 = (it - 1 + m_Nt) % m_Nt;
 
 2416       int nei = ixyz + Nxyzv * it2;
 
 2417       mult_coarse_tmb(&v2[
VLEN * m_Nvc * site],
 
 2418                       &u[
VLEN * m_Ndf * nei], &v1[
VLEN * m_Nvc * nei], m_Nc);
 
 2420       mult_coarse_tm2(&v2[
VLEN * m_Nvc * site], &buf2[
VLEN * m_Nvc * ixyz], m_Nc);
 
 2433 template<
typename AFIELD>
 
 2441   double flop_site, flop;
 
 2448   flop_site = 
static_cast<double>(5 * 8 * m_Nc * m_Nc);
 
 2450   flop = flop_site * 
static_cast<double>(Lvol);
 
 2451   if ((mode == 
"DdagD") || (mode == 
"DDdag")) flop *= 2.0;