Bridge++  Ver. 2.0.2
afopr_Clover_coarse-tmpl.h
1 
13 
14 #define AFOPR_CLOVER_COARSE_TIMER
15 
16 #ifdef AFOPR_CLOVER_COARSE_TIMER
17 #include "lib/Tools/timer.h"
18 #define TIMER_mult_start timer_mult->start();
19 #define TIMER_mult_stop timer_mult->stop();
20 #define TIMER_pack_start timer_pack->start();
21 #define TIMER_pack_stop timer_pack->stop();
22 #define TIMER_bulk_start timer_bulk->start();
23 #define TIMER_bulk_stop timer_bulk->stop();
24 #define TIMER_boundary_start timer_boundary->start();
25 #define TIMER_boundary_stop timer_boundary->stop();
26 #define TIMER_comm_start timer_comm->start();
27 #define TIMER_comm_stop timer_comm->stop();
28 #define TIMER_comm_recv_wait_start timer_comm_recv_wait->start();
29 #define TIMER_comm_recv_wait_stop timer_comm_recv_wait->stop();
30 #define TIMER_comm_send_wait_start timer_comm_send_wait->start();
31 #define TIMER_comm_send_wait_stop timer_comm_send_wait->stop();
32 #define TIMER_comm_recv_start_start timer_comm_recv_start->start();
33 #define TIMER_comm_recv_start_stop timer_comm_recv_start->stop();
34 #define TIMER_comm_send_start_start timer_comm_send_start->start();
35 #define TIMER_comm_send_start_stop timer_comm_send_start->stop();
36 #define TIMER_comm_test_all_start timer_comm_test_all->start();
37 #define TIMER_comm_test_all_stop timer_comm_test_all->stop();
38 #define TIMER_clear_start timer_clear->start();
39 #define TIMER_clear_stop timer_clear->stop();
40 #else
41 #define TIMER_mult_start
42 #define TIMER_mult_stop
43 #define TIMER_pack_start
44 #define TIMER_pack_stop
45 #define TIMER_bulk_start
46 #define TIMER_bulk_stop
47 #define TIMER_boundary_start
48 #define TIMER_boundary_stop
49 #define TIMER_comm_start
50 #define TIMER_comm_stop
51 #define TIMER_comm_recv_wait_start
52 #define TIMER_comm_recv_wait_stop
53 #define TIMER_comm_send_wait_start
54 #define TIMER_comm_send_wait_stop
55 #define TIMER_comm_recv_start_start
56 #define TIMER_comm_recv_start_stop
57 #define TIMER_comm_send_start_start
58 #define TIMER_comm_send_start_stop
59 #define TIMER_comm_test_all_start
60 #define TIMER_comm_test_all_stop
61 #define TIMER_clear_start
62 #define TIMER_clear_stop
63 #endif
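// Note: when AFOPR_CLOVER_COARSE_TIMER is not defined, all TIMER_* hooks above
// expand to nothing, so the timing instrumentation in the mult routines below
// compiles away with no overhead.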
64 
65 
66 //====================================================================
67 namespace {
68 #ifndef QXS_DATA_ALIGNMENT
69  constexpr int alignment = 256;
70 #else
71  constexpr int alignment = QXS_DATA_ALIGNMENT;
72 #endif
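 // 'alignment' is the byte alignment (default 256, overridable via
 // QXS_DATA_ALIGNMENT) used later in set_parameters() when allocating the
 // per-thread work buffers with posix_memalign().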
73 
74  //====================================================================
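 // Conventions for the accum_mult_* kernels below (inferred from the code):
 // each pair index j carries four VLEN-packed real components, ordered as
 // velem = (e1r, e1i, e2r, e2i), i.e. real/imaginary parts of the two
 // chirality components of a coarse index pair (cf. the "ch:--", "ch:-+"
 // comments).  add_dot_vec / sub_dot_vec accumulate +/- element-wise
 // products, so each group of four calls performs one complex
 // multiply-accumulate:
 //   out.re += u.re*in.re - u.im*in.im,  out.im += u.re*in.im + u.im*in.re.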
75  static inline void accum_mult_u_i(Vsimd_t *out,
76  real_t *in,
77  real_t *u0,
78  int i,
79  const int ncol)
80  {
81  const int nh = ncol / 2;
82 
83  enum velem
84  {
85  e1r, e1i, e2r, e2i
86  };
87 
88  real_t *u = u0 + VLEN * 4 * i * ncol;
89  for (int j = 0; j < nh; j++) {
90  Vsimd_t vu[4], vin[4];
91  load_vec(vu, &u[VLEN * 4 * j], 4);
92  load_vec(vin, &in[VLEN * 4 * j], 4);
93 
94  // out1 += (u[2*j ]) * in[2*j]; // ch:--
95  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
96  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
97  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
98  add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
99 
100  // out1 += u[2*j+1] * in[2*j+1]; // ch:-+
101  add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
102  sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
103  add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
104  add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
105 
106  load_vec(vu, &u[VLEN * 4 * (j + nh)], 4);
107  // out2 += (u[2*j ]) * in[2*j]; // ch:--
108  add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
109  sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
110  add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
111  add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
112 
113  // out2 += u[2*j+1] * in[2*j+1]; // ch:-+
114  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
115  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
116  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
117  add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
118  } // j
119  }
120 
121 
122  //====================================================================
123  static inline void accum_mult_u_yp_i(Vsimd_t *out,
124  real_t *in1, real_t *in2,
125  real_t *u0,
126  int i,
127  const int ncol)
128  {
129  const int nh = ncol / 2;
130 
131  enum velem
132  {
133  e1r, e1i, e2r, e2i
134  };
135 
136  real_t *u = u0 + VLEN * 4 * ncol * i;
137  for (int j = 0; j < nh; j++) {
138  Vsimd_t vu[4], vin[4];
139  load_vec(vu, &u[VLEN * 4 * j], 4);
140  // shifted input vector
141  real_t in[VLEN * 4];
142  shift_vec2_ybw(in, &in1[VLEN * 4 * j], &in2[VLEN * 4 * j], 4);
     load_vec(vin, in, 4); // load the shifted input into vin before it is used below
143 
144 
145  // out1 += (u[2*j ]) * in[2*j]; // ch:--
146  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
147  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
148  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
149  add_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
150 
151  // out1 += u[2*j+1] * in[2*j+1]; // ch:-+
152  add_dot_vec(&out[e1r], &vu[e2r], &vin[e2r], 1);
153  sub_dot_vec(&out[e1r], &vu[e2i], &vin[e2i], 1);
154  add_dot_vec(&out[e1i], &vu[e2r], &vin[e2i], 1);
155  add_dot_vec(&out[e1i], &vu[e2i], &vin[e2r], 1);
156 
157  load_vec(vu, &u[VLEN * 4 * (j + nh)], 4);
158  // out2 += (u[2*j ]) * in[2*j]; // ch:--
159  add_dot_vec(&out[e2r], &vu[e1r], &vin[e1r], 1);
160  sub_dot_vec(&out[e2r], &vu[e1i], &vin[e1i], 1);
161  add_dot_vec(&out[e2i], &vu[e1r], &vin[e1i], 1);
162  add_dot_vec(&out[e2i], &vu[e1i], &vin[e1r], 1);
163 
164  // out2 += u[2*j+1] * in[2*j+1]; // ch:-+
165  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
166  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
167  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
168  add_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
169  } // j
170  }
171 
172 
173  //====================================================================
174  static inline void accum_mult_u(real_t *out,
175  real_t *in,
176  real_t *u0,
177  const int ncol)
178  { // simple implementation
179  /*
180  for(int i=0; i<ncol; i++){
181  for(int j=0; j<ncol; j++){
182  out[i]+=u0[i*ncol+j]*in[j];
183  }
184  }
185  */
186  const int nh = ncol / 2;
187 
188  for (int i = 0; i < nh; i++) {
189  Vsimd_t tmp[4];
190  load_vec(tmp, &out[VLEN * 4 * i], 4);
191  accum_mult_u_i(tmp, in, u0, i, ncol);
192  save_vec(&out[VLEN * 4 * i], tmp, 4);
193  }
194  }
195 
196 
197  //====================================================================
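 // The *_udag_* variants below apply the Hermitian conjugate of the coarse
 // link: the gauge block is addressed with the row/column roles of i and j
 // interchanged and the complex-conjugate sign pattern is used
 // (out.re += u.re*in.re + u.im*in.im, out.im += u.re*in.im - u.im*in.re);
 // the chirality off-diagonal blocks additionally enter with a minus sign
 // (cf. the "ch:-+", "ch:+-" comments), presumably reflecting the gamma_5
 // structure of the coarse hopping term.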
198  static inline void accum_mult_udag_i(Vsimd_t *out,
199  real_t *in,
200  real_t *u0,
201  int i,
202  const int ncol)
203  {
204  const int dof2 = ncol * ncol;
205  const int nh = ncol / 2;
206 
207  enum velem
208  {
209  e1r, e1i, e2r, e2i
210  };
211  for (int j = 0; j < nh; j++) {
212  // const complex_t *u = u0 + 2 * j * ncol;
213  real_t *u = u0 + VLEN * 4 * j * ncol;
214 
215  Vsimd_t vu[4], vin[4];
216  load_vec(vu, &u[VLEN * 4 * i], 4);
217  load_vec(vin, &in[VLEN * 4 * j], 4);
218 
219  // out1 += conj(u[2*i ]) * in[2*j]; // ch:--
220  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
221  add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
222  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
223  sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
224 
225  // out2 -= conj(u[2*i+1]) * in[2*j]; // ch:-+
226  sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
227  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
228  sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
229  add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
230 
231  load_vec(vu, &u[VLEN * (2 * ncol + 4 * i)], 4);
232  // out1 -= conj(u[ncol+2*i ]) * in[2*j+1]; // ch:+-
233  sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
234  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
235  sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
236  add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
237 
238  // out2 += conj(u[ncol+2*i+1]) * in[2*j+1]; // ch:++
239  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
240  add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
241  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
242  sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
243  }
244  }
245 
246 
247  //====================================================================
248  static inline void accum_mult_udag_xm_i(Vsimd_t *out,
249  real_t *in,
250  real_t *u1, real_t *u2,
251  int i,
252  const int ncol)
253  {
254  const int nh = ncol / 2;
255 
256  enum velem
257  {
258  e1r, e1i, e2r, e2i
259  };
260  for (int j = 0; j < nh; j++) {
261  // const complex_t *u = u0 + 2 * j * ncol;
262  real_t *uu1 = u1 + VLEN * 4 * j * ncol;
263  real_t *uu2 = u2 + VLEN * 4 * j * ncol;
264 
265  // work area for field shifting
266  real_t u[VLEN * 4];
267 
268  // vector variables: treat shifted variables
269  Vsimd_t vu[4], vin[4];
270 
271  // shifted input vector
272  load_vec(vin, &in[VLEN * 4 * j], 4);
273 
274  // shifted gauge field: for ch -*
275  shift_vec2_xfw(u, &uu1[VLEN * 4 * i], &uu2[VLEN * 4 * i], 4);
276  load_vec(vu, u, 4);
277 
278  // out1 += conj(u[2*i ]) * in[2*j]; // ch:--
279  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
280  add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
281  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
282  sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
283 
284  // out2 -= conj(u[2*i+1]) * in[2*j]; // ch:-+
285  sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
286  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
287  sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
288  add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
289 
290  // shifted gauge field: for ch +*
291  shift_vec2_xfw(u, &uu1[VLEN * (2 * ncol + 4 * i)], &uu2[VLEN * (2 * ncol + 4 * i)], 4);
292  load_vec(vu, u, 4);
293 
294  // out1 -= conj(u[ncol+2*i ]) * in[2*j+1]; // ch:+-
295  sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
296  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
297  sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
298  add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
299 
300  // out2 += conj(u[ncol+2*i+1]) * in[2*j+1]; // ch:++
301  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
302  add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
303  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
304  sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
305  }
306  }
307 
308 
309  //====================================================================
310  static inline void accum_mult_udag_ym_i(Vsimd_t *out,
311  real_t *in1, real_t *in2,
312  real_t *u1, real_t *u2,
313  int i,
314  const int ncol)
315  {
316  const int nh = ncol / 2;
317 
318  enum velem
319  {
320  e1r, e1i, e2r, e2i
321  };
322  for (int j = 0; j < nh; j++) {
323  // const complex_t *u = u0 + 2 * j * ncol;
324  real_t *uu1 = u1 + VLEN * 4 * j * ncol;
325  real_t *uu2 = u2 + VLEN * 4 * j * ncol;
326 
327  // work area for field shifting
328  real_t u[VLEN * 4];
329  real_t in[VLEN * 4];
330 
331  // vector variables: treat shifted variables
332  Vsimd_t vu[4], vin[4];
333 
334  // shifted input vector
335  shift_vec2_yfw(in, &in1[VLEN * 4 * j], &in2[VLEN * 4 * j], 4);
336  load_vec(vin, in, 4);
337 
338  // shifted gauge field: for ch -*
339  shift_vec2_yfw(u, &uu1[VLEN * 4 * i], &uu2[VLEN * 4 * i], 4);
340  load_vec(vu, u, 4);
341 
342  // out1 += conj(u[2*i ]) * in[2*j]; // ch:--
343  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
344  add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
345  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
346  sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
347 
348  // out2 -= conj(u[2*i+1]) * in[2*j]; // ch:-+
349  sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
350  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
351  sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
352  add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
353 
354  // shifted gauge field: for ch +*
355  shift_vec2_yfw(u, &uu1[VLEN * (2 * ncol + 4 * i)], &uu2[VLEN * (2 * ncol + 4 * i)], 4);
356  load_vec(vu, u, 4);
357 
358  // out1 -= conj(u[ncol+2*i ]) * in[2*j+1]; // ch:+-
359  sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
360  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
361  sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
362  add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
363 
364  // out2 += conj(u[ncol+2*i+1]) * in[2*j+1]; // ch:++
365  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
366  add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
367  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
368  sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
369  }
370  }
371 
372 
373  //====================================================================
374  static inline void accum_mult_udag_ym_i(Vsimd_t *out,
375  real_t *in,
376  real_t *u1, real_t *u2,
377  int i,
378  const int ncol)
379  {
380  const int nh = ncol / 2;
381 
382  enum velem
383  {
384  e1r, e1i, e2r, e2i
385  };
386  for (int j = 0; j < nh; j++) {
387  // const complex_t *u = u0 + 2 * j * ncol;
388  real_t *uu1 = u1 + VLEN * 4 * j * ncol;
389  real_t *uu2 = u2 + VLEN * 4 * j * ncol;
390 
391  // work area for field shifting
392  real_t u[VLEN * 4];
393 
394  // vector variables: treat shifted variables
395  Vsimd_t vu[4], vin[4];
396 
397  // shifted input vector
398  load_vec(vin, &in[VLEN * 4 * j], 4);
399 
400  // shifted gauge field: for ch -*
401  shift_vec2_yfw(u, &uu1[VLEN * 4 * i], &uu2[VLEN * 4 * i], 4);
402  load_vec(vu, u, 4);
403 
404  // out1 += conj(u[2*i ]) * in[2*j]; // ch:--
405  add_dot_vec(&out[e1r], &vu[e1r], &vin[e1r], 1);
406  add_dot_vec(&out[e1r], &vu[e1i], &vin[e1i], 1);
407  add_dot_vec(&out[e1i], &vu[e1r], &vin[e1i], 1);
408  sub_dot_vec(&out[e1i], &vu[e1i], &vin[e1r], 1);
409 
410  // out2 -= conj(u[2*i+1]) * in[2*j]; // ch:-+
411  sub_dot_vec(&out[e2r], &vu[e2r], &vin[e1r], 1);
412  sub_dot_vec(&out[e2r], &vu[e2i], &vin[e1i], 1);
413  sub_dot_vec(&out[e2i], &vu[e2r], &vin[e1i], 1);
414  add_dot_vec(&out[e2i], &vu[e2i], &vin[e1r], 1);
415 
416  // shifted gauge field: for ch +*
417  shift_vec2_yfw(u, &uu1[VLEN * (2 * ncol + 4 * i)], &uu2[VLEN * (2 * ncol + 4 * i)], 4);
418  load_vec(vu, u, 4);
419 
420  // out1 -= conj(u[ncol+2*i ]) * in[2*j+1]; // ch:+-
421  sub_dot_vec(&out[e1r], &vu[e1r], &vin[e2r], 1);
422  sub_dot_vec(&out[e1r], &vu[e1i], &vin[e2i], 1);
423  sub_dot_vec(&out[e1i], &vu[e1r], &vin[e2i], 1);
424  add_dot_vec(&out[e1i], &vu[e1i], &vin[e2r], 1);
425 
426  // out2 += conj(u[ncol+2*i+1]) * in[2*j+1]; // ch:++
427  add_dot_vec(&out[e2r], &vu[e2r], &vin[e2r], 1);
428  add_dot_vec(&out[e2r], &vu[e2i], &vin[e2i], 1);
429  add_dot_vec(&out[e2i], &vu[e2r], &vin[e2i], 1);
430  sub_dot_vec(&out[e2i], &vu[e2i], &vin[e2r], 1);
431  }
432  }
433 
434 
435  //====================================================================
436  static inline void accum_mult_udag(real_t *out,
437  real_t *in,
438  real_t *u0,
439  const int ncol)
440  {
441  const int nh = ncol / 2;
442 
443  for (int i = 0; i < nh; i++) {
444  Vsimd_t tmp[4];
445  load_vec(tmp, &out[VLEN * 4 * i], 4);
446  accum_mult_udag_i(tmp, in, u0, i, ncol);
447  save_vec(&out[VLEN * 4 * i], tmp, 4);
448  }
449  }
450 
451 
452  //====================================================================
453  static inline void set_mult_u(real_t *out,
454  real_t *in,
455  real_t *u0,
456  const int ncol)
457  {
458  const int nh = ncol / 2;
459 
460  for (int i = 0; i < nh; i++) {
461  Vsimd_t tmp[4];
462  clear_vec(tmp, 4);
463  accum_mult_u_i(tmp, in, u0, i, ncol);
464  save_vec(&out[VLEN * 4 * i], tmp, 4);
465  }
466  }
467 
468 
469  //====================================================================
470  static inline void set_mult_udag(real_t *out,
471  real_t *in,
472  real_t *u,
473  const int ncol)
474  {
475  const int nh = ncol / 2;
476 
477  for (int i = 0; i < nh; i++) {
478  Vsimd_t tmp[4];
479  clear_vec(tmp, 4);
480  accum_mult_udag_i(tmp, in, u, i, ncol);
481  save_vec(&out[VLEN * (4 * i)], tmp, 4);
482  }
483  }
484 
485 
486  //====================================================================
487  static inline void accum_buf(real_t *out, real_t *in, const int ncol)
488  {
489  Vsimd_t vin;
490  Vsimd_t vout;
491  for (int i = 0; i < 2 * ncol; i++) { // 2 for complex
492  load_vec(&vin, &in[VLEN * i], 1);
493  load_vec(&vout, &out[VLEN * i], 1);
494  add_vec(&vout, &vin, 1);
495  save_vec(&out[VLEN * i], &vout, 1);
496  }
497  }
498 
499 
500  //====================================================================
501  static inline void copy_buf(real_t *out, real_t *in, const int ncol)
502  {
503  for (int i = 0; i < VLEN * 2 * ncol; i++) { // 2 for complex
504  out[i] = in[i];
505  }
506  // Vsimd_t vin;
507  // for(int i=0; i< 2*ncol; i++){ // 2 for complex
508  // load_vec(&vin, &in[VLEN*i], 1);
509  // save_vec(&out[VLEN*i], &vin, 1);
510  // }
511  }
512 
513 
515  // for mult_xp
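 // Each direction below follows the same three-part pattern:
 //  *_1 packs the contribution to be sent to the neighbouring rank (the bare
 //      field for the + directions, u^dag times the field for the - directions),
 //  *_2 combines the received buffer with the local part and accumulates the
 //      result into the output on the boundary sites,
 //  *_b handles bulk sites (or the no-communication case) entirely locally.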
517  static inline void mult_coarse_xp1(real_t *buf, real_t *in, const int ncol)
518  {
519  const int nh = ncol / 2;
520  for (int i = 0; i < nh; i++) {
521  Vsimd_t tmp[4];
522  load_vec(tmp, &in[VLEN * 4 * i], 4);
523  save_vec1_x(&buf[VLENY * 4 * i], tmp, 0, 4);
524  }
525  }
526 
527 
528  static inline void mult_coarse_xp2(real_t *out,
529  real_t *u0,
530  real_t *in0,
531  real_t *buf0,
532  const int ncol,
533  real_t *work)
534  {
535  const int nh = ncol / 2;
536  real_t *in = work;
537  shift_vec0_xbw(in, in0, 2 * ncol); // 2 for complex
538 
539  for (int i = 0; i < nh; i++) {
540  Vsimd_t vin[4], buf[4];
541  // merge the buffer
542  shift_vec1_xbw(buf, &buf0[VLENY * 4 * i], 4);
543  load_vec(vin, &in[VLEN * 4 * i], 4);
544  add_vec(vin, buf, 4);
545  save_vec(&in[VLEN * 4 * i], vin, 4);
546  }
547  accum_mult_u(out, in, u0, ncol);
548  }
549 
550 
551  static inline void mult_coarse_xpb(real_t *out,
552  real_t *u,
553  real_t *in1, real_t *in2,
554  const int ncol, real_t *work)
555  {
556  real_t *in = work;
557  shift_vec2_xbw(in, in1, in2, 2 * ncol); // 2 for complex
558  accum_mult_u(out, in, u, ncol);
559  }
560 
561 
563  // for mult_xm
565  static inline void mult_coarse_xm1(real_t *buf, real_t *u, real_t *in, const int ncol)
566  {
567  const int nh = ncol / 2;
568  for (int i = 0; i < nh; i++) {
569  Vsimd_t tmp[4];
570  clear_vec(tmp, 4);
571  accum_mult_udag_i(tmp, in, u, i, ncol);
572  save_vec1_x(&buf[VLENY * 4 * i], tmp, VLENX - 1, 4);
573  }
574  }
575 
576 
577  static inline void mult_coarse_xm2(real_t *out,
578  real_t *u0, real_t *in0,
579  real_t *buf0,
580  const int ncol)
581  {
582  const int nh = ncol / 2;
583  for (int i = 0; i < nh; i++) {
584  Vsimd_t vtmp[4], buf[4];
585 
586  // multiply udag
587  clear_vec(vtmp, 4);
588  accum_mult_udag_i(vtmp, in0, u0, i, ncol);
589 
590  // shift the result
591  real_t tmp1[4 * VLEN], tmp2[4 * VLEN];
592  save_vec(tmp1, vtmp, 4);
593  shift_vec0_xfw(tmp2, tmp1, 4);
594  load_vec(vtmp, tmp2, 4);
595 
596  // merge the buffer
597  shift_vec1_xfw(buf, &buf0[VLENY * 4 * i], 4);
598  add_vec(vtmp, buf, 4);
599 
600  // accumulate to the output
601  Vsimd_t tmpout[4];
602  load_vec(tmpout, &out[VLEN * 4 * i], 4);
603  add_vec(tmpout, vtmp, 4);
604  save_vec(&out[VLEN * 4 * i], tmpout, 4);
605  }
606  }
607 
608 
609  static inline void mult_coarse_xmb(real_t *out,
610  real_t *u1, real_t *u2,
611  real_t *in1, real_t *in2,
612  const int ncol,
613  real_t *work)
614  {
615  real_t *in = work;
616  shift_vec2_xfw(in, in1, in2, 2 * ncol); // 2 for complex
617 
618  const int nh = ncol / 2;
619  for (int i = 0; i < nh; i++) {
620  Vsimd_t tmp[4];
621  load_vec(tmp, &out[VLEN * 4 * i], 4);
622  accum_mult_udag_xm_i(tmp, in, u1, u2, i, ncol);
623  save_vec(&out[VLEN * 4 * i], tmp, 4);
624  }
625  }
626 
627 
629  // for mult_yp
631  static inline void mult_coarse_yp1(real_t *buf, real_t *in, const int ncol)
632  {
633  const int nh = ncol / 2;
634  for (int i = 0; i < nh; i++) {
635  Vsimd_t tmp[4];
636  load_vec(tmp, &in[VLEN * 4 * i], 4);
637  save_vec1_y(&buf[VLENX * 4 * i], tmp, 0, 4);
638  }
639  }
640 
641 
642  static inline void mult_coarse_yp2(real_t *out,
643  real_t *u0,
644  real_t *in0,
645  real_t *buf0,
646  const int ncol,
647  real_t *work)
648  {
649  const int nh = ncol / 2;
650  real_t *in = work;
651  shift_vec0_ybw(in, in0, 2 * ncol); // 2 for complex
652 
653  for (int i = 0; i < nh; i++) {
654  Vsimd_t vin[4], buf[4];
655  // merge the buffer
656  shift_vec1_ybw(buf, &buf0[VLENX * 4 * i], 4);
657  load_vec(vin, &in[VLEN * 4 * i], 4);
658  add_vec(vin, buf, 4);
659  save_vec(&in[VLEN * 4 * i], vin, 4);
660  }
661  for (int i = 0; i < nh; i++) {
662  Vsimd_t tmp[4];
663  load_vec(tmp, &out[VLEN * 4 * i], 4);
664  accum_mult_u_i(tmp, in, u0, i, ncol);
665  save_vec(&out[VLEN * 4 * i], tmp, 4);
666  }
667  }
668 
669 
670  static inline void mult_coarse_ypb(real_t *out,
671  real_t *u,
672  real_t *in1, real_t *in2,
673  const int ncol,
674  real_t *work)
675  {
676  real_t *in = work;
677  shift_vec2_ybw(in, in1, in2, 2 * ncol); // 2 for complex
678  const int nh = ncol / 2;
679  for (int i = 0; i < nh; i++) {
680  Vsimd_t tmp[4];
681  load_vec(tmp, &out[VLEN * 4 * i], 4);
682  accum_mult_u_i(tmp, in, u, i, ncol);
683  save_vec(&out[VLEN * 4 * i], tmp, 4);
684  }
685  }
686 
687 
689  // for mult_ym
691  static inline void mult_coarse_ym1(real_t *buf, real_t *u, real_t *in, const int ncol)
692  {
693  const int nh = ncol / 2;
694  for (int i = 0; i < nh; i++) {
695  Vsimd_t tmp[4];
696  clear_vec(tmp, 4);
697  accum_mult_udag_i(tmp, in, u, i, ncol);
698  save_vec1_y(&buf[VLENX * 4 * i], tmp, VLENY - 1, 4);
699  }
700  }
701 
702 
703  static inline void mult_coarse_ym2(real_t *out,
704  real_t *u0, real_t *in0,
705  real_t *buf0,
706  const int ncol)
707  {
708  const int nh = ncol / 2;
709  for (int i = 0; i < nh; i++) {
710  Vsimd_t vtmp[4], buf[4];
711 
712  // multiply udag
713  clear_vec(vtmp, 4);
714  accum_mult_udag_i(vtmp, in0, u0, i, ncol);
715 
716  // shift the result
717  real_t tmp1[4 * VLEN], tmp2[4 * VLEN];
718  save_vec(tmp1, vtmp, 4);
719  shift_vec0_yfw(tmp2, tmp1, 4);
720  load_vec(vtmp, tmp2, 4);
721 
722  // merge the buffer
723  shift_vec1_yfw(buf, &buf0[VLENX * 4 * i], 4);
724  add_vec(vtmp, buf, 4);
725 
726  // accumulate to the output
727  Vsimd_t tmpout[4];
728  load_vec(tmpout, &out[VLEN * 4 * i], 4);
729  add_vec(tmpout, vtmp, 4);
730  save_vec(&out[VLEN * 4 * i], tmpout, 4);
731  }
732  }
733 
734 
735  static inline void mult_coarse_ymb(real_t *out,
736  real_t *u1, real_t *u2,
737  real_t *in1, real_t *in2,
738  const int ncol,
739  real_t *work)
740  {
741  real_t *in = work;
742  shift_vec2_yfw(in, in1, in2, 2 * ncol); // 2 for complex
743 
744  const int nh = ncol / 2;
745  for (int i = 0; i < nh; i++) {
746  Vsimd_t tmp[4];
747  load_vec(tmp, &out[VLEN * 4 * i], 4);
748  accum_mult_udag_ym_i(tmp, in, u1, u2, i, ncol);
749  save_vec(&out[VLEN * 4 * i], tmp, 4);
750  }
751  }
752 
753 
755  // for mult_zp
757  static inline void mult_coarse_zp1(real_t *out, real_t *in, const int ncol)
758  {
759  copy_buf(out, in, ncol);
760  }
761 
762 
763  static inline void mult_coarse_zp2(real_t *out, real_t *u, real_t *buf, const int ncol)
764  {
765  accum_mult_u(out, buf, u, ncol);
766  }
767 
768 
769  static inline void mult_coarse_zpb(real_t *out, real_t *u, real_t *in, const int ncol)
770  {
771  accum_mult_u(out, in, u, ncol);
772  }
773 
774 
776  // for mult_zm
778  static inline void mult_coarse_zm1(real_t *out, real_t *u, real_t *buf, const int ncol)
779  {
780  set_mult_udag(out, buf, u, ncol);
781  }
782 
783 
784  static inline void mult_coarse_zm2(real_t *out, real_t *buf, const int ncol)
785  {
786  accum_buf(out, buf, ncol);
787  }
788 
789 
790  static inline void mult_coarse_zmb(real_t *out, real_t *u, real_t *in, const int ncol)
791  {
792  accum_mult_udag(out, in, u, ncol);
793  }
794 
795 
797  // for mult_tp
799  static inline void mult_coarse_tp1(real_t *out, real_t *in, const int ncol)
800  {
801  copy_buf(out, in, ncol);
802  }
803 
804 
805  static inline void mult_coarse_tp2(real_t *out, real_t *u, real_t *buf, const int ncol)
806  {
807  accum_mult_u(out, buf, u, ncol);
808  }
809 
810 
811  static inline void mult_coarse_tpb(real_t *out, real_t *u, real_t *in, const int ncol)
812  {
813  accum_mult_u(out, in, u, ncol);
814  }
815 
816 
818  // for mult_tm
820  static inline void mult_coarse_tm1(real_t *out, real_t *u, real_t *buf, const int ncol)
821  {
822  set_mult_udag(out, buf, u, ncol);
823  }
824 
825 
826  static inline void mult_coarse_tm2(real_t *out, real_t *buf, const int ncol)
827  {
828  accum_buf(out, buf, ncol);
829  }
830 
831 
832  static inline void mult_coarse_tmb(real_t *out, real_t *u, real_t *in, const int ncol)
833  {
834  accum_mult_udag(out, in, u, ncol);
835  }
836 } // anonymous namespace
837 
838 //====================================================================
839 
840 template<typename AFIELD>
841 const std::string AFopr_Clover_coarse<AFIELD>::class_name
842  = "AFopr_Clover_coarse";
843 
844 //====================================================================
845 template<typename AFIELD>
847 {
849 
850  m_repr = "Dirac"; // currently only the Dirac representation is available.
851 
852  int req_comm = 1; // set 1 to force communication at all times
853  //int req_comm = 0; // set 0 to communicate only when necessary
854 
855  int Ndim = CommonParameters::Ndim();
856 
857  do_comm_any = 0;
858  for (int mu = 0; mu < Ndim; ++mu) {
859  do_comm[mu] = 1;
860  if ((req_comm == 0) && (Communicator::npe(mu) == 1)) do_comm[mu] = 0;
861  do_comm_any += do_comm[mu];
862  vout.general(" do_comm[%d] = %d\n", mu, do_comm[mu]);
863  }
864 
865  m_bdsize.resize(Ndim);
866 
867  int fine_nvol = CommonParameters::Nvol();
868  int Nc = CommonParameters::Nc();
869  int Nd = CommonParameters::Nd();
870  int NinF = 2 * Nc * Nd;
871  workvec1.reset(NinF, fine_nvol, 1);
872  workvec2.reset(NinF, fine_nvol, 1);
873  workvec3.reset(NinF, fine_nvol, 1);
874 
875  // reset the timers
876 #ifdef AFOPR_CLOVER_COARSE_TIMER
877  timer_mult.reset(new Timer("afopr_Clover_coarse: mult "));
878  timer_pack.reset(new Timer("afopr_Clover_coarse: pack "));
879  timer_bulk.reset(new Timer("afopr_Clover_coarse: bulk "));
880  timer_boundary.reset(new Timer("afopr_Clover_coarse: boundary "));
881  timer_comm.reset(new Timer("afopr_Clover_coarse: comm "));
882  timer_comm_recv_wait.reset(new Timer("afopr_Clover_coarse: comm_recv_wait "));
883  timer_comm_send_wait.reset(new Timer("afopr_Clover_coarse: comm_send_wait "));
884  timer_comm_recv_start.reset(new Timer("afopr_Clover_coarse: comm_recv_start"));
885  timer_comm_send_start.reset(new Timer("afopr_Clover_coarse: comm_send_start"));
886  timer_comm_test_all.reset(new Timer("afopr_Clover_coarse: comm_test_all "));
887  timer_clear.reset(new Timer("afopr_Clover_coarse: clear "));
888 #endif
889 }
890 
891 
892 //====================================================================
893 template<typename AFIELD>
894 void AFopr_Clover_coarse<AFIELD>::setup_channels()
895 {
896  int Ndim = CommonParameters::Ndim();
897 
898  chsend_up.resize(Ndim);
899  chrecv_up.resize(Ndim);
900  chsend_dn.resize(Ndim);
901  chrecv_dn.resize(Ndim);
902 
903  for (int mu = 0; mu < Ndim; ++mu) {
904  size_t Nvsize = m_bdsize[mu] * sizeof(real_t);
905 
906  chsend_dn[mu].send_init(Nvsize, mu, -1);
907  chsend_up[mu].send_init(Nvsize, mu, 1);
908 #ifdef USE_MPI
909  chrecv_up[mu].recv_init(Nvsize, mu, 1);
910  chrecv_dn[mu].recv_init(Nvsize, mu, -1);
911 #else
912  void *buf_up = (void *)chsend_dn[mu].ptr();
913  chrecv_up[mu].recv_init(Nvsize, mu, 1, buf_up);
914  void *buf_dn = (void *)chsend_up[mu].ptr();
915  chrecv_dn[mu].recv_init(Nvsize, mu, -1, buf_dn);
916 #endif
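 // Without MPI the receive channels are initialized directly on top of the
 // send buffers of the opposite direction, so the periodic boundary exchange
 // needs no actual data transfer within a single process.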
917 
918  if (do_comm[mu] == 1) {
919  chset_send.append(chsend_up[mu]);
920  chset_send.append(chsend_dn[mu]);
921  chset_recv.append(chrecv_up[mu]);
922  chset_recv.append(chrecv_dn[mu]);
923  }
924  }
925 }
926 
927 
928 //====================================================================
929 template<typename AFIELD>
930 void AFopr_Clover_coarse<AFIELD>::tidyup()
931 {
932  vout.general("%s: tidyup\n", class_name.c_str());
935  for (int i = 0; i < work_shifted.size(); i++) {
936  free(work_shifted[i]);
937  work_shifted[i] = nullptr;
938  }
939 
940 #ifdef AFOPR_CLOVER_COARSE_TIMER
941  timer_mult->report();
942  timer_clear->report();
943  timer_pack->report();
944  timer_bulk->report();
945  timer_boundary->report();
946  timer_comm->report();
947  timer_comm_recv_wait->report();
948  timer_comm_send_wait->report();
949  timer_comm_recv_start->report();
950  timer_comm_send_start->report();
951  timer_comm_test_all->report();
952 #endif
953 }
954 
955 
956 //====================================================================
957 template<typename AFIELD>
959 {
960  const string str_vlevel = params.get_string("verbose_level");
961  m_vl = vout.set_verbose_level(str_vlevel);
962 
963  //- fetch and check input parameters
964  int num_testvectors;
965  std::vector<int> coarse_lattice;
966 
967  int err = 0;
968  err += params.fetch_int("number_of_testvectors", num_testvectors);
969  err += params.fetch_int_vector("coarse_lattice_size", coarse_lattice);
970  if (err) {
971  vout.crucial(m_vl, "Error at %s: input parameter not found.\n",
972  class_name.c_str());
973  exit(EXIT_FAILURE);
974  }
975 
976  set_parameters(num_testvectors, coarse_lattice);
977 }
978 
979 
980 //====================================================================
981 template<typename AFIELD>
983  const int num_testvectors,
984  const std::vector<int>& coarse_lattice)
985 {
987 
988  int Ndim = CommonParameters::Ndim();
989  assert(coarse_lattice.size() == Ndim);
990 
991  m_num_testvectors = num_testvectors;
992  m_ncol = 2 * num_testvectors; // factor 2 for the two chiralities
993  m_Nc = m_ncol;
994  m_Nc2 = m_ncol * m_ncol;
995  m_Nvc = 2 * m_Nc; // 2 for complex
996  m_Ndf = 2 * m_Nc * m_Nc; // 2 for complex
997  int Nc2 = m_ncol * m_ncol;
998 
999  m_Nx = coarse_lattice[0];
1000  m_Ny = coarse_lattice[1];
1001  m_Nz = coarse_lattice[2];
1002  m_Nt = coarse_lattice[3];
1003  m_Nst = m_Nx * m_Ny * m_Nz * m_Nt;
1004  m_Nstv = m_Nst / VLEN;
1005  m_Nxv = m_Nx / VLENX;
1006  m_Nyv = m_Ny / VLENY;
1007 
1008  // sanity check
1009  if (m_Nxv * VLENX != m_Nx) {
1010  vout.crucial("%s: bad coarse lattice size in x-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(), VLENX, m_Nx);
1011  exit(EXIT_FAILURE);
1012  }
1013  if (m_Nyv * VLENY != m_Ny) {
1014  vout.crucial("%s: bad coarse lattice size in y-direction: must be a multiple of %d (given: %d)\n", class_name.c_str(), VLENY, m_Ny);
1015  exit(EXIT_FAILURE);
1016  }
1017 
1018 
1019  m_bdsize[0] = m_Nvc * m_Ny * m_Nz * m_Nt;
1020  m_bdsize[1] = m_Nvc * m_Nx * m_Nz * m_Nt;
1021  m_bdsize[2] = m_Nvc * m_Nx * m_Ny * m_Nt;
1022  m_bdsize[3] = m_Nvc * m_Nx * m_Ny * m_Nz;
1023 
1024  setup_channels();
1025 
1026  size_t coarse_nvol = m_Nst;
1027  m_coarse_lvol = coarse_nvol * CommonParameters::NPE();
1028 
1029  m_U.reset(m_Ndf, m_Nst, Ndim); // hopping term
1030  m_Clov.reset(m_Ndf, m_Nst, 1); // on-site term
1031 
1032  tmp_buffer1.resize(coarse_nvol);
1033  tmp_buffer2.resize(coarse_nvol);
1034 
1036  int pool_size = ((sizeof(real_t) * VLEN * m_Nvc - 1) / alignment + 1) * alignment;
1037 
1038  for (int i = 0; i < work_shifted.size(); i++) {
1039  free(work_shifted[i]);
1040  work_shifted[i] = nullptr;
1041  }
1042  work_shifted.resize(nthreads);
1043  for (int i = 0; i < nthreads; i++) {
1044  posix_memalign((void **)&work_shifted[i], alignment, pool_size);
1045  }
1046  vout.detailed(m_vl, "shifted buffer: size=%d, alignment=%d\n", pool_size, alignment);
1047 
1048  set_list();
1049  vout.detailed(m_vl, "setting list vector, done\n");
1050  for (int th = 0; th < m_list_boundary.size(); th++) {
1051  vout.detailed(m_vl, " thread=%d, number of boundary sites = %d\n", th, m_list_boundary[th].size());
1052  }
1053  vout.general(m_vl, "Parameters of %s:\n", class_name.c_str());
1054  for (int mu = 0; mu < Ndim; ++mu) {
1055  vout.general(m_vl, " coarse_lattice_size[%d] = %2d\n",
1056  mu, coarse_lattice[mu]);
1057  }
1058 }
1059 
1060 
1061 //====================================================================
1062 template<typename AFIELD>
1063 void AFopr_Clover_coarse<AFIELD>::set_list()
1064 {
1065  int work_xp = m_ncol;
1066  int work_xm = m_ncol;
1067  int work_yp = m_ncol;
1068  int work_ym = m_ncol;
1069  int work_zp = m_ncol;
1070  int work_zm = 1;
1071  int work_tp = m_ncol;
1072  int work_tm = 1;
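 // Workload weights per boundary site: x/y and +z/+t boundaries require a
 // full ncol x ncol multiplication in the boundary pass, whereas the -z/-t
 // boundary pass only accumulates an already multiplied receive buffer
 // (mult_coarse_zm2 / mult_coarse_tm2 reduce to accum_buf), hence weight 1.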
1073  std::vector<int> workload(m_Nstv);
1074  for (int site = 0; site < m_Nstv; ++site) {
1075  workload[site] = 0;
1076  }
1077  for (int site = 0; site < m_Nstv; ++site) {
1078  int ix = site % m_Nxv;
1079  int iyzt = site / m_Nxv;
1080  int iy = iyzt % m_Nyv;
1081  int izt = site / (m_Nxv * m_Nyv);
1082  int iz = izt % m_Nz;
1083  int it = izt / m_Nz;
1084  if (do_comm[0] == 1) {
1085  if (ix == m_Nxv - 1) { workload[site] += work_xp; }
1086  if (ix == 0) { workload[site] += work_xm; }
1087  } // do_comm[0] == 1
1088  if (do_comm[1] == 1) {
1089  if (iy == m_Nyv - 1) { workload[site] += work_yp; }
1090  if (iy == 0) { workload[site] += work_ym; }
1091  } // do_comm[1] == 1
1092  if (do_comm[2] == 1) {
1093  if (iz == m_Nz - 1) { workload[site] += work_zp; }
1094  if (iz == 0) { workload[site] += work_zm; }
1095  } // do_comm[2] == 1
1096  if (do_comm[3] == 1) {
1097  if (it == m_Nt - 1) { workload[site] += work_tp; }
1098  if (it == 0) { workload[site] += work_tm; }
1099  } // do_comm[3] == 1
1100  } // site
1101 
1103  int nth = nth0;
1104  if (nth > 2) { nth--; } // do not use master thread
1105 
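 // Greedy (longest-processing-time-first) scheduling: repeatedly hand the
 // heaviest remaining boundary site to the thread with the least accumulated
 // work; the master thread, which issues the communication calls in mult(),
 // is excluded when more than two threads are available.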
1106  std::vector<std::vector<int> > tmp_list;
1107  std::vector<int> work(nth);
1108  std::vector<int> tmp_list_next(nth);
1109  tmp_list.resize(nth);
1110  for (int i = 0; i < nth; i++) {
1111  tmp_list[i].resize(m_Nstv);
1112  }
1113  for (int i = 0; i < nth; i++) {
1114  work[i] = 0;
1115  }
1116  for (int i = 0; i < nth; i++) {
1117  tmp_list_next[i] = 0;
1118  }
1119  int th_min_work = 0;
1120  while (1)
1121  {
1122  // find the next site with the maximum load
1123  int max_work = 0;
1124  int max_work_site = -1;
1125  for (int site = 0; site < m_Nstv; site++) {
1126  if (workload[site] > max_work) {
1127  max_work = workload[site];
1128  max_work_site = site;
1129  }
1130  }
1131  if (max_work == 0) { // no work is left
1132  break;
1133  }
1134  // assign the work to the thread with the minimum work so far
1135  tmp_list[th_min_work][tmp_list_next[th_min_work]] = max_work_site;
1136  tmp_list_next[th_min_work]++;
1137  work[th_min_work] += max_work;
1138  workload[max_work_site] = 0;
1139 
1140  // find the thread with the least accumulated work for the next assignment
1141  int min_work = work[th_min_work];
1142  for (int th = 0; th < nth; th++) {
1143  if (work[th] < min_work) {
1144  min_work = work[th];
1145  th_min_work = th;
1146  }
1147  }
1148  }
1149 
1150  // resize and set the list vector
1151  m_list_boundary.resize(nth0);
1152  m_list_boundary[0].resize(0);
1153  for (int th = 0; th < nth; th++) {
1154  int th0 = th;
1155  if (nth0 > nth) { th0++; }
1156  int size = tmp_list_next[th];
1157  m_list_boundary[th0].resize(size);
1158  vout.general("setting boundary list: th0=%d/%d, size=%d, load=%d\n",
1159  th0, nth0, size, work[th]);
1160  for (int i = 0; i < size; i++) {
1161  vout.general(" th0=%d/%d, i=%d site=%d\n",
1162  th0, nth0, i, tmp_list[th][i]);
1163 
1164  m_list_boundary[th0][i] = tmp_list[th][i];
1165  }
1166  }
1167 }
1168 
1169 
1170 //====================================================================
1171 template<typename AFIELD>
1173  AFopr_dd<AFIELD> *fine_afopr_,
1174  const std::vector<AFIELD>& atestvec)
1175 {
1176  int ith, nth, coarse_is, coarse_ns;
1177  const int coarse_nvol = m_U.nvol();
1178  set_threadtask(ith, nth, coarse_is, coarse_ns, coarse_nvol);
1179 
1180  real_t *out_clov = m_Clov.ptr(0);
1181  real_t *out_gauge = m_U.ptr(0);
1182  const int num_vectors = m_num_testvectors;
1183  const int coarse_Nc2 = m_ncol * m_ncol;
1184  assert(m_Nc2 == coarse_Nc2);
1185 
1186  // must be AFopr_Clover_dd
1187  AFopr_Clover_dd<AFIELD> *fine_afopr
1188  = dynamic_cast<AFopr_Clover_dd<AFIELD> *>(fine_afopr_);
1189  if (fine_afopr == nullptr) {
1190  vout.crucial("%s: in generate_coarse_op, a bad fine operator"
1191  " is given (must be AFopr_Clover_dd<AFIELD>).\n",
1192  class_name.c_str());
1193  exit(EXIT_FAILURE);
1194  }
1195 
1196  m_Clov.set(0.0);
1197  m_U.set(0.0);
1198 
1199  std::vector<int> coarse_lattice(4);
1200  coarse_lattice[0] = m_Nx;
1201  coarse_lattice[1] = m_Ny;
1202  coarse_lattice[2] = m_Nz;
1203  coarse_lattice[3] = m_Nt;
1204  AIndex_block_lex<real_t, QXS> index_block(coarse_lattice);
1205 
1206  AIndex_coarse_lex<real_t, QXS> index_coarse(m_Nx, m_Ny, m_Nz, m_Nt, num_vectors, 2);
1207 
1208 #pragma omp barrier
1209 
1210  // coarse clover:
1211  // I = 2*i1 + chirality1
1212  // J = 2*i2 + chirality2
1213  // JI = (2*num_vectors)*J + I
1214  // <J|D|I>
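 // i.e. the coarse matrix element (J,I) on each coarse block is the block
 // inner product of test vector i2 (projected onto chirality ch2) with the
 // fine operator applied to the ch1-projected test vector i1; the final
 // scal(0.25) below compensates for project_chiral not carrying the 1/2
 // factors of the chiral projectors.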
1215 
1216  for (int i1 = 0; i1 < num_vectors; ++i1) {
1217  for (int ch1 = -1; ch1 < 2; ch1 += 2) { // ch=-1,+1
1218  fine_afopr->project_chiral(workvec1, atestvec[i1], ch1);
1219 
1220  // diag block: "clover"
1221  fine_afopr->mult_dd(workvec2, workvec1);
1222  real_t *out = out_clov;
1223  int I = 2 * i1 + (ch1 + 1) / 2;
1224 
1225  for (int i2 = 0; i2 < num_vectors; ++i2) {
1226  fine_afopr->project_chiral(workvec3, workvec2, -1);
1227 #pragma omp barrier
1228  block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
1229  index_block);
1230 #pragma omp barrier
1231  fine_afopr->project_chiral(workvec3, workvec2, 1);
1232 #pragma omp barrier
1233  block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
1234  index_block);
1235 
1236  int J = 2 * i2;
1237  for (int s = coarse_is; s < coarse_ns; s++) {
1238  int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1239  int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1240  out[idx_r] += real(tmp_buffer1[s]);
1241  out[idx_i] += imag(tmp_buffer1[s]);
1242  }
1243 
1244  ++J;
1245  for (int s = coarse_is; s < coarse_ns; s++) {
1246  int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1247  int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1248  out[idx_r] += real(tmp_buffer2[s]);
1249  out[idx_i] += imag(tmp_buffer2[s]);
1250  }
1251 #pragma omp barrier
1252  } // i2
1253 
1254  // hopping block: "gauge"
1255  for (int mu = 0; mu < 4; mu++) {
1256  workvec2.set(0.0);
1257  fine_afopr->mult_dup(workvec2, workvec1, mu);
1258  real_t *out = out_gauge + mu * 2 * coarse_Nc2 * coarse_nvol;
1259  // mu comes last
1260 
1261  int I = 2 * i1 + (ch1 + 1) / 2;
1262  for (int i2 = 0; i2 < num_vectors; ++i2) {
1263  fine_afopr->project_chiral(workvec3, workvec2, -1);
1264 #pragma omp barrier
1265  block_dotc(&tmp_buffer1[0], atestvec[i2], workvec3,
1266  index_block);
1267 #pragma omp barrier
1268  fine_afopr->project_chiral(workvec3, workvec2, 1);
1269 #pragma omp barrier
1270  block_dotc(&tmp_buffer2[0], atestvec[i2], workvec3,
1271  index_block);
1272 
1273  int J = 2 * i2;
1274  for (int s = coarse_is; s < coarse_ns; ++s) {
1275  int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1276  int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1277  out[idx_r] += real(tmp_buffer1[s]);
1278  out[idx_i] += imag(tmp_buffer1[s]);
1279  }
1280 
1281  ++J;
1282  for (int s = coarse_is; s < coarse_ns; ++s) {
1283  int idx_r = index_coarse.idx_Gr(J, I, s, 0);
1284  int idx_i = index_coarse.idx_Gi(J, I, s, 0);
1285  out[idx_r] += real(tmp_buffer2[s]);
1286  out[idx_i] += imag(tmp_buffer2[s]);
1287  }
1288 #pragma omp barrier
1289  } // i2
1290  } // mu
1291  }
1292  } // i1, ch1
1293  // rescale the operator since project_chiral does not include the factor 1/2
1294  m_U.scal(0.25);
1295  m_Clov.scal(0.25);
1296 
1297  {
1298  double clv2 = m_Clov.norm2();
1299  double u2 = m_U.norm2();
1300  vout.general("%s: |m_Clov|^2 = %23.15e\n", class_name.c_str(), clv2);
1301  vout.general("%s: |m_U|^2 = %23.15e\n", class_name.c_str(), u2);
1302 
1303 #ifdef DEBUG
1304  for (int i = 0; i < 2 * num_vectors; ++i) {
1305  for (int j = 0; j < 2 * num_vectors; ++j) {
1306  int s = 0;
1307  int mu = 3;
1308  int idx_r = index_coarse.idx_Gr(j, i, s, mu);
1309  int idx_i = index_coarse.idx_Gi(j, i, s, mu);
1310  vout.general("i = %d j = %d %f %f\n", i, j,
1311  m_U.cmp(idx_r), m_U.cmp(idx_i));
1312  //m_Clov.cmp(idx_r), m_Clov.cmp(idx_i));
1313  }
1314  }
1315 #endif
1316  }
1317 }
1318 
1319 
1320 //====================================================================
1321 template<typename AFIELD>
1323 {
1324  vout.crucial(m_vl, "%s: set_config is called\n", class_name.c_str());
1325  exit(EXIT_FAILURE);
1326 }
1327 
1328 
1329 //====================================================================
1330 template<typename AFIELD>
1332 {
1333  // no need
1334 }
1335 
1336 
1337 //====================================================================
1338 template<typename AFIELD>
1340 {
1341  // no need
1342 }
1343 
1344 
1345 //====================================================================
1346 template<typename AFIELD>
1348 {
1349  real_t *vp = v.ptr(0);
1350  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1351 
1352  if (mu == 0) {
1353  mult_xp(vp, wp);
1354  } else if (mu == 1) {
1355  mult_yp(vp, wp);
1356  } else if (mu == 2) {
1357  mult_zp(vp, wp);
1358  } else if (mu == 3) {
1359  mult_tp(vp, wp);
1360  } else {
1361  vout.crucial(m_vl, "%s: mult_up for %d direction is undefined.",
1362  class_name.c_str(), mu);
1363  exit(EXIT_FAILURE);
1364  }
1365 }
1366 
1367 
1368 //====================================================================
1369 template<typename AFIELD>
1371 {
1372  real_t *vp = v.ptr(0);
1373  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1374 
1375  if (mu == 0) {
1376  mult_xm(vp, wp);
1377  } else if (mu == 1) {
1378  mult_ym(vp, wp);
1379  } else if (mu == 2) {
1380  mult_zm(vp, wp);
1381  } else if (mu == 3) {
1382  mult_tm(vp, wp);
1383  } else {
1384  vout.crucial(m_vl, "%s: mult_dn for %d direction is undefined.",
1385  class_name.c_str(), mu);
1386  exit(EXIT_FAILURE);
1387  }
1388 }
1389 
1390 
1391 //====================================================================
1392 template<typename AFIELD>
1394 {
1395 #pragma omp barrier
1396 
1397  int ith = ThreadManager::get_thread_id();
1398  if (ith == 0) m_mode = mode;
1399 
1400 #pragma omp barrier
1401 }
1402 
1403 
1404 //====================================================================
1405 template<typename AFIELD>
1407 {
1408  return m_mode;
1409 }
1410 
1411 
1412 //====================================================================
1413 template<typename AFIELD>
1415 {
1416  if (m_mode == "D") {
1417  return D(v, w);
1418  } else if (m_mode == "DdagD") {
1419  return DdagD(v, w);
1420  } else if (m_mode == "Ddag") {
1421  return Ddag(v, w);
1422  } else if (m_mode == "H") {
1423  return H(v, w);
1424  } else {
1425  vout.crucial(m_vl, "%s: mode undefined.\n", class_name.c_str());
1426  exit(EXIT_FAILURE);
1427  }
1428 }
1429 
1430 
1431 //====================================================================
1432 template<typename AFIELD>
1434 {
1435  if (m_mode == "D") {
1436  return Ddag(v, w);
1437  } else if (m_mode == "DdagD") {
1438  return DdagD(v, w);
1439  } else if (m_mode == "Ddag") {
1440  return D(v, w);
1441  } else if (m_mode == "H") {
1442  return H(v, w);
1443  } else {
1444  vout.crucial(m_vl, "%s: mode undefined.\n", class_name.c_str());
1445  exit(EXIT_FAILURE);
1446  }
1447 }
1448 
1449 
1450 //====================================================================
1451 template<typename AFIELD>
1453 {
1454  mult_D(v, w);
1455  //mult_D_alt(v, w);
1456  // mult_D_alt_keep(v, w);
1457 }
1458 
1459 
1460 //====================================================================
1461 template<typename AFIELD>
1463 {
1464  D(m_v2, w);
1465  mult_gm5(v, m_v2);
1466  D(m_v2, v);
1467  mult_gm5(v, m_v2);
1468 }
1469 
1470 
1471 //====================================================================
1472 template<typename AFIELD>
1474 {
1475  mult_gm5(v, w);
1476  D(m_v2, v);
1477  mult_gm5(v, m_v2);
1478 }
1479 
1480 
1481 //====================================================================
1482 template<typename AFIELD>
1484 {
1485  real_t *vp = v.ptr(0);
1486  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1487 
1488 #pragma omp barrier
1489 
1490  mult_gm5(vp, wp);
1491 
1492 #pragma omp barrier
1493 }
1494 
1495 
1496 //====================================================================
1497 template<typename AFIELD>
1499 {
1500  real_t *vp = v.ptr(0);
1501  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1502 
1503 #pragma omp barrier
1504 
1505  mult_csw(vp, wp);
1506 
1507 #pragma omp barrier
1508 }
1509 
1510 
1511 //====================================================================
1512 template<typename AFIELD>
1514 {
1515  int ith, nth, is, ns;
1516  set_threadtask(ith, nth, is, ns, m_Nstv);
1517  for (int s = is; s < ns; ++s) {
1518  real_t *out = v + s * 2 * VLEN * m_ncol;
1519  const real_t *in = w + s * 2 * VLEN * m_ncol;
1520  for (int i = 0; i < m_ncol / 2; ++i) {
1521  Vsimd_t tmp[4]; // 2 for chirality, 2 for complex
1522  load_vec(tmp, &in[4 * i * VLEN], 4);
1523  scal_vec(tmp, real_t(-1), 2); // ( -re, -im, +re, +im)
1524  save_vec(&out[4 * i * VLEN], tmp, 4);
1525  }
1526  }
1527 }
1528 
1529 
1530 //====================================================================
1531 template<typename AFIELD>
1533 { // Dirac representation is assumed.
1534  real_t *u = m_Clov.ptr(0);
1535 
1536  int ith, nth, is, ns;
1537  set_threadtask(ith, nth, is, ns, m_Nstv);
1538 
1539 #pragma omp barrier
1540  int nv = VLEN * m_Nvc;
1541  int nv2 = VLEN * m_Ndf;
1542 
1543  for (int site = is; site < ns; ++site) {
1544  accum_mult_u(&v2[nv * site], &v1[nv * site],
1545  &u[nv2 * site], m_Nc);
1546  }
1547 
1548 #pragma omp barrier
1549 }
1550 
1551 
1552 //====================================================================
1553 template<typename AFIELD>
1555 {
1556 #pragma omp barrier
1557 #pragma omp master
1558  {
1560  }
1561 
1562  int ith, nth, is, ns;
1563  set_threadtask(ith, nth, is, ns, m_Nstv);
1564  const bool time_keeper = (ith == nth - 1);
1565  real_t *vp = v.ptr(0);
1566  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1567  real_t *up = m_U.ptr(0);
1568  real_t *cp = m_Clov.ptr(0);
1569 
1570  int Nsize[4] = { m_Nxv, m_Nyv, m_Nz, m_Nt };
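 // Communication/computation overlap in this routine: start the asynchronous
 // receives, pack all boundary faces (BridgeQXS::mult_coarse_1) and start the
 // sends, process the bulk sites (mult_coarse_b), then wait for the receives
 // and finish the boundary sites (mult_coarse_2) using the per-thread site
 // lists prepared in set_list(); finally wait for the sends to complete.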
1571 
1572  if (time_keeper) {
1574  }
1575  if (do_comm_any > 0) {
1576 #pragma omp master
1577  {
1579  chset_recv.start();
1581  }
1582 
1583  real_t *buf1_xp = (real_t *)chsend_dn[0].ptr();
1584  real_t *buf1_xm = (real_t *)chsend_up[0].ptr();
1585  real_t *buf1_yp = (real_t *)chsend_dn[1].ptr();
1586  real_t *buf1_ym = (real_t *)chsend_up[1].ptr();
1587  real_t *buf1_zp = (real_t *)chsend_dn[2].ptr();
1588  real_t *buf1_zm = (real_t *)chsend_up[2].ptr();
1589  real_t *buf1_tp = (real_t *)chsend_dn[3].ptr();
1590  real_t *buf1_tm = (real_t *)chsend_up[3].ptr();
1591 
1592  BridgeQXS::mult_coarse_1(buf1_xp, buf1_xm, buf1_yp, buf1_ym,
1593  buf1_zp, buf1_zm, buf1_tp, buf1_tm,
1594  up, wp, Nsize, m_ncol, do_comm);
1595  }
1596  if (time_keeper) {
1598  }
1599 
1600  // clear(vp); // redundant, to be deleted
1601 
1602 #pragma omp barrier
1603 
1604 #pragma omp master
1605  {
1608  chset_send.start();
1610  }
1611 
1612 
1613  if (time_keeper) { TIMER_bulk_start; }
1614  BridgeQXS::mult_coarse_b(vp, up, cp, wp, Nsize, m_ncol, do_comm, work_shifted[ith]);
1615  if (time_keeper) { TIMER_bulk_stop; } // due to load imbalance, this timer is not accurate
1616 
1617 #pragma omp master
1618  {
1620  chset_recv.wait();
1622  }
1623 #pragma omp barrier
1624 
1625 #pragma omp master
1626  {
1628  }
1629  if (time_keeper) { TIMER_boundary_start; }
1630  real_t *buf2_xp = (real_t *)chrecv_up[0].ptr();
1631  real_t *buf2_xm = (real_t *)chrecv_dn[0].ptr();
1632  real_t *buf2_yp = (real_t *)chrecv_up[1].ptr();
1633  real_t *buf2_ym = (real_t *)chrecv_dn[1].ptr();
1634  real_t *buf2_zp = (real_t *)chrecv_up[2].ptr();
1635  real_t *buf2_zm = (real_t *)chrecv_dn[2].ptr();
1636  real_t *buf2_tp = (real_t *)chrecv_up[3].ptr();
1637  real_t *buf2_tm = (real_t *)chrecv_dn[3].ptr();
1638 
1639  BridgeQXS::mult_coarse_2(vp, up, wp,
1640  buf2_xp, buf2_xm, buf2_yp, buf2_ym,
1641  buf2_zp, buf2_zm, buf2_tp, buf2_tm,
1642  Nsize, m_ncol, do_comm, work_shifted[ith],
1643  m_list_boundary[ith]);
1644 
1645 #pragma omp master
1646  {
1648  chset_send.wait();
1650  }
1651 #pragma omp barrier
1652  if (time_keeper) { TIMER_boundary_stop; }
1653 
1654 #pragma omp master
1655  {
1657  }
1658 }
1659 
1660 
1661 //====================================================================
1662 template<typename AFIELD>
1664 {
1665 #pragma omp master
1666  {
1668  }
1669 
1670  real_t *vp = v.ptr(0);
1671  real_t *wp = const_cast<AFIELD *>(&w)->ptr(0);
1672 
1673  clear(vp);
1674  mult_xp(vp, wp);
1675  mult_xm(vp, wp);
1676  mult_yp(vp, wp);
1677  mult_ym(vp, wp);
1678  mult_zp(vp, wp);
1679  mult_zm(vp, wp);
1680  mult_tp(vp, wp);
1681  mult_tm(vp, wp);
1682  mult_csw(vp, wp);
1683 
1684 #pragma omp master
1685  {
1687  }
1688 }
1689 
1690 
1691 //====================================================================
1692 template<typename AFIELD>
1694 {
1695  D(m_v2, w);
1696  mult_gm5(v, m_v2);
1697 }
1698 
1699 
1700 //====================================================================
1701 template<typename AFIELD>
1703 {
1704  int ith, nth, is, ns;
1705  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1706 
1707  Vsimd_t vzero[2]; // 2 for complex
1708  clear_vec(vzero, 2);
1709  for (int site = is; site < ns; ++site) {
1710  real_t *out = v + VLEN * m_Nvc * site;
1711  for (int ic = 0; ic < m_Nvc; ic += 2) {
1712  save_vec(&out[VLEN * ic], vzero, 2);
1713  }
1714  }
1715 }
1716 
1717 
1718 //====================================================================
1719 template<typename AFIELD>
1721 {
1722  int idir = 0;
1723 
1724  int ith, nth, is, ns;
1725  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1726 
1727  real_t *buf1 = (real_t *)chsend_dn[0].ptr();
1728  real_t *buf2 = (real_t *)chrecv_up[0].ptr();
1729 
1730  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1731 
1732  // work area for shifted vectors
1733  real_t *work = work_shifted[ith];
1734 
1735 #pragma omp barrier
1736  if (do_comm[0] > 0) {
1737 #pragma omp master
1738  {
1740  }
1741 
1742  for (int site = is; site < ns; ++site) {
1743  int ix = site % m_Nxv;
1744  int iyzt = site / m_Nxv;
1745  if (ix == 0) {
1746  int ibf = VLENY * m_Nvc * iyzt;
1747  mult_coarse_xp1(&buf1[ibf], &v1[VLEN * m_Nvc * site], m_Nc);
1748  }
1749  }
1750 
1751 #pragma omp barrier
1752 
1753 #pragma omp master
1754  {
1757  chrecv_up[0].start();
1758  chsend_dn[0].start();
1759  chrecv_up[0].wait();
1760  chsend_dn[0].wait();
1761  }
1762 #pragma omp barrier
1763 #pragma omp master
1764  {
1766  }
1767  } // if(do_comm[0] == 1)
1768 
1769 #pragma omp master
1770  {
1772  }
1773 
1774  for (int site = is; site < ns; ++site) {
1775  int ix = site % m_Nxv;
1776  int iyzt = site / m_Nxv;
1777 
1778  if ((ix < m_Nxv - 1) || (do_comm[0] == 0)) {
1779  int nei = (ix + 1) + m_Nxv * iyzt;
1780  if (ix == m_Nxv - 1) nei = 0 + m_Nxv * iyzt;
1781  mult_coarse_xpb(&v2[VLEN * m_Nvc * site], &u[VLEN * m_Ndf * site],
1782  &v1[VLEN * m_Nvc * site], &v1[VLEN * m_Nvc * nei], m_Nc, work);
1783  } else {
1784  int ibf = VLENY * m_Nvc * iyzt;
1785  mult_coarse_xp2(&v2[VLEN * m_Nvc * site], &u[VLEN * m_Ndf * site],
1786  &v1[VLEN * m_Nvc * site], &buf2[ibf], m_Nc, work);
1787  }
1788  }
1789 
1790 #pragma omp barrier
1791 #pragma omp master
1792  {
1794  }
1795 }
1796 
1797 
1798 //====================================================================
1799 template<typename AFIELD>
1801 {
1802  int idir = 0;
1803 
1804  int ith, nth, is, ns;
1805  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1806 
1807  real_t *buf1 = (real_t *)chsend_up[0].ptr();
1808  real_t *buf2 = (real_t *)chrecv_dn[0].ptr();
1809 
1810  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1811 
1812  // work area for shifted vectors
1813  real_t *work = work_shifted[ith];
1814 
1815 #pragma omp barrier
1816 
1817  if (do_comm[0] > 0) {
1818 #pragma omp master
1819  {
1821  }
1822 
1823  for (int site = is; site < ns; ++site) {
1824  int ix = site % m_Nxv;
1825  int iyzt = site / m_Nxv;
1826  if (ix == m_Nxv - 1) {
1827  int ibf = VLENY * m_Nvc * iyzt;
1828  mult_coarse_xm1(&buf1[ibf], &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * site], m_Nc);
1829  }
1830  }
1831 
1832 #pragma omp barrier
1833 #pragma omp master
1834  {
1837  chrecv_dn[0].start();
1838  chsend_up[0].start();
1839  chrecv_dn[0].wait();
1840  chsend_up[0].wait();
1841  }
1842 #pragma omp barrier
1843 #pragma omp master
1844  {
1846  }
1847  } // end of if(do_comm[0] > 0)
1848 
1849 #pragma omp master
1850  {
1852  }
1853 
1854  for (int site = is; site < ns; ++site) {
1855  int ix = site % m_Nxv;
1856  int iyzt = site / m_Nxv;
1857 
1858  if ((ix > 0) || (do_comm[0] == 0)) {
1859  int ix2 = (ix - 1 + m_Nxv) % m_Nxv;
1860  int nei = ix2 + m_Nxv * iyzt;
1861  mult_coarse_xmb(&v2[VLEN * m_Nvc * site],
1862  &u[VLEN * m_Ndf * site], &u[VLEN * m_Ndf * nei],
1863  &v1[VLEN * m_Nvc * site], &v1[VLEN * m_Nvc * nei],
1864  m_Nc, work);
1865  } else {
1866  int ibf = VLENY * m_Nvc * iyzt;
1867  mult_coarse_xm2(&v2[VLEN * m_Nvc * site], &u[VLEN * m_Ndf * site],
1868  &v1[VLEN * m_Nvc * site], &buf2[ibf], m_Nc);
1869  }
1870  }
1871 
1872 #pragma omp barrier
1873 #pragma omp master
1874  {
1876  }
1877 }
1878 
1879 
1880 //====================================================================
1881 template<typename AFIELD>
1883 {
1884  int idir = 1;
1885  int Nxyv = m_Nxv * m_Nyv;
1886 
1887  int ith, nth, is, ns;
1888  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1889 
1890  real_t *buf1 = (real_t *)chsend_dn[1].ptr();
1891  real_t *buf2 = (real_t *)chrecv_up[1].ptr();
1892 
1893  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1894 
1895 #pragma omp barrier
1896  if (do_comm[1] > 0) {
1897 #pragma omp master
1898  {
1900  }
1901 
1902  for (int site = is; site < ns; ++site) {
1903  int ix = site % m_Nxv;
1904  int iy = (site / m_Nxv) % m_Nyv;
1905  int izt = site / Nxyv;
1906  int ixzt = ix + m_Nxv * izt;
1907  if (iy == 0) {
1908  int ibf = VLENX * m_Nvc * ixzt;
1909  mult_coarse_yp1(&buf1[ibf], &v1[VLEN * m_Nvc * site], m_Nc);
1910  }
1911  }
1912 
1913 #pragma omp barrier
1914 
1915 #pragma omp master
1916  {
1919  chrecv_up[1].start();
1920  chsend_dn[1].start();
1921  chrecv_up[1].wait();
1922  chsend_dn[1].wait();
1923  }
1924 
1925 #pragma omp barrier
1926 #pragma omp master
1927  {
1929  }
1930  } // end of if(do_comm[1] > 0)
1931 #pragma omp master
1932  {
1934  }
1935 
1936  int thread = ThreadManager::get_thread_id();
1937  real_t *work = work_shifted[thread];
1938  for (int site = is; site < ns; ++site) {
1939  int ix = site % m_Nxv;
1940  int iy = (site / m_Nxv) % m_Nyv;
1941  int izt = site / Nxyv;
1942  int ixzt = ix + m_Nxv * izt;
1943 
1944  if ((iy < m_Nyv - 1) || (do_comm[1] == 0)) {
1945  int iy2 = (iy + 1) % m_Nyv;
1946  int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
1947  mult_coarse_ypb(&v2[VLEN * m_Nvc * site],
1948  &u[VLEN * m_Ndf * site],
1949  &v1[VLEN * m_Nvc * site], &v1[VLEN * m_Nvc * nei],
1950  m_Nc, work);
1951  } else {
1952  int ibf = VLENX * m_Nvc * ixzt;
1953  mult_coarse_yp2(&v2[VLEN * m_Nvc * site],
1954  &u[VLEN * m_Ndf * site],
1955  &v1[VLEN * m_Nvc * site], &buf2[ibf],
1956  m_Nc, work);
1957  }
1958  }
1959 
1960 
1961 #pragma omp barrier
1962 #pragma omp master
1963  {
1965  }
1966 }
1967 
1968 
1969 //====================================================================
1970 template<typename AFIELD>
1972 {
1973  int idir = 1;
1974  int Nxyv = m_Nxv * m_Nyv;
1975 
1976  int ith, nth, is, ns;
1977  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
1978 
1979  real_t *buf1 = (real_t *)chsend_up[1].ptr();
1980  real_t *buf2 = (real_t *)chrecv_dn[1].ptr();
1981 
1982  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
1983 
1984 
1985 #pragma omp barrier
1986  if (do_comm[1] > 0) {
1987 #pragma omp master
1988  {
1990  }
1991 
1992  for (int site = is; site < ns; ++site) {
1993  int ix = site % m_Nxv;
1994  int iy = (site / m_Nxv) % m_Nyv;
1995  int izt = site / Nxyv;
1996  if (iy == m_Nyv - 1) {
1997  int ibf = VLENX * m_Nvc * (ix + m_Nxv * izt);
1998  mult_coarse_ym1(&buf1[ibf], &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * site], m_Nc);
1999  }
2000  }
2001 
2002 #pragma omp barrier
2003 
2004 #pragma omp master
2005  {
2008  chrecv_dn[1].start();
2009  chsend_up[1].start();
2010  chrecv_dn[1].wait();
2011  chsend_up[1].wait();
2012  }
2013 
2014 #pragma omp barrier
2015 #pragma omp master
2016  {
2018  }
2019  }
2020 
2021 #pragma omp master
2022  {
2024  }
2025 
2026  int thread = ThreadManager::get_thread_id();
2027  real_t *work = work_shifted[thread];
2028 
2029  for (int site = is; site < ns; ++site) {
2030  int ix = site % m_Nxv;
2031  int iy = (site / m_Nxv) % m_Nyv;
2032  int izt = site / Nxyv;
2033 
2034  if ((iy != 0) || (do_comm[idir] == 0)) {
2035  int iy2 = (iy - 1 + m_Nyv) % m_Nyv;
2036  int nei = ix + m_Nxv * (iy2 + m_Nyv * izt);
2037  mult_coarse_ymb(&v2[VLEN * m_Nvc * site],
2038  &u[VLEN * m_Ndf * site], &u[VLEN * m_Ndf * nei],
2039  &v1[VLEN * m_Nvc * site], &v1[VLEN * m_Nvc * nei],
2040  m_Nc, work);
2041  } else {
2042  int ibf = VLENX * m_Nvc * (ix + m_Nxv * izt);
2043  mult_coarse_ym2(&v2[VLEN * m_Nvc * site],
2044  &u[VLEN * m_Ndf * site],
2045  &v1[VLEN * m_Nvc * site],
2046  &buf2[ibf], m_Nc);
2047  }
2048  }
2049 #pragma omp barrier
2050 #pragma omp master
2051  {
2053  }
2054 }
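
// --------------------------------------------------------------------
// The bulk loops above recover (ix, iy, izt) from the lexicographic index
// site = ix + Nxv*(iy + Nyv*izt) and wrap iy periodically to locate the
// neighbor.  A small self-check of that arithmetic; Nxv, Nyv and Nzt are
// illustrative extents standing in for m_Nxv, m_Nyv and the combined z-t
// extent used by these kernels.
#include <cassert>

static void check_site_decomposition(int Nxv, int Nyv, int Nzt)
{
  for (int site = 0; site < Nxv * Nyv * Nzt; ++site) {
    int ix  = site % Nxv;
    int iy  = (site / Nxv) % Nyv;
    int izt = site / (Nxv * Nyv);
    assert(site == ix + Nxv * (iy + Nyv * izt));  // decomposition is exact

    int iy2 = (iy - 1 + Nyv) % Nyv;               // downward neighbor in y
    int nei = ix + Nxv * (iy2 + Nyv * izt);
    assert(nei >= 0 && nei < Nxv * Nyv * Nzt);
  }
}
// --------------------------------------------------------------------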
2055 
2056 
2057 //====================================================================
2058 template<typename AFIELD>
2059 void AFopr_Clover_coarse<AFIELD>::mult_zp(real_t *v2, real_t *v1)
2060 {
2061  int idir = 2;
2062  int Nxyv = m_Nxv * m_Nyv;
2063 
2064  int ith, nth, is, ns;
2065  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2066 
2067  real_t *buf1 = (real_t *)chsend_dn[2].ptr();
2068  real_t *buf2 = (real_t *)chrecv_up[2].ptr();
2069 
2070  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2071 
2072 #pragma omp barrier
2073  if (do_comm[2] > 0) {
2074 #pragma omp master
2075  {
2077  }
2078 
2079  for (int site = is; site < ns; ++site) {
2080  int ixy = site % Nxyv;
2081  int iz = (site / Nxyv) % m_Nz;
2082  int it = site / (Nxyv * m_Nz);
2083  int ixyt = ixy + Nxyv * it;
2084  if (iz == 0) {
2085  mult_coarse_zp1(&buf1[VLEN * m_Nvc * ixyt], &v1[VLEN * m_Nvc * site], m_Nc);
2086  }
2087  }
2088 
2089 #pragma omp barrier
2090 
2091 #pragma omp master
2092  {
2095  chrecv_up[2].start();
2096  chsend_dn[2].start();
2097  chrecv_up[2].wait();
2098  chsend_dn[2].wait();
2099  }
2100 
2101 #pragma omp barrier
2102 #pragma omp master
2103  {
2105  }
2106  }
2107 
2108 #pragma omp master
2109  {
2111  }
2112 
2113  for (int site = is; site < ns; ++site) {
2114  int ixy = site % Nxyv;
2115  int iz = (site / Nxyv) % m_Nz;
2116  int it = site / (Nxyv * m_Nz);
2117 
2118  if ((iz != m_Nz - 1) || (do_comm[2] == 0)) {
2119  int iz2 = (iz + 1) % m_Nz;
2120  int nei = ixy + Nxyv * (iz2 + m_Nz * it);
2121  mult_coarse_zpb(&v2[VLEN * m_Nvc * site],
2122  &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * nei], m_Nc);
2123  } else {
2124  int ixyt = ixy + Nxyv * it;
2125  mult_coarse_zp2(&v2[VLEN * m_Nvc * site],
2126  &u[VLEN * m_Ndf * site], &buf2[VLEN * m_Nvc * ixyt], m_Nc);
2127  }
2128  }
2129 
2130 #pragma omp barrier
2131 #pragma omp master
2132  {
2134  }
2135 }
2136 
2137 
2138 //====================================================================
2139 template<typename AFIELD>
2140 void AFopr_Clover_coarse<AFIELD>::mult_zm(real_t *v2, real_t *v1)
2141 {
2142  int idir = 2;
2143  int Nxyv = m_Nxv * m_Nyv;
2144 
2145  int ith, nth, is, ns;
2146  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2147 
2148  real_t *buf1 = (real_t *)chsend_up[2].ptr();
2149  real_t *buf2 = (real_t *)chrecv_dn[2].ptr();
2150 
2151  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2152 
2153 #pragma omp barrier
2154 
2155  if (do_comm[2] > 0) {
2156 #pragma omp master
2157  {
2159  }
2160 
2161  for (int site = is; site < ns; ++site) {
2162  int ixy = site % Nxyv;
2163  int iz = (site / Nxyv) % m_Nz;
2164  int it = site / (Nxyv * m_Nz);
2165  if (iz == m_Nz - 1) {
2166  int ixyt = ixy + Nxyv * it;
2167  mult_coarse_zm1(&buf1[VLEN * m_Nvc * ixyt],
2168  &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * site], m_Nc);
2169  }
2170  }
2171 
2172 #pragma omp barrier
2173 
2174 #pragma omp master
2175  {
2178  chrecv_dn[2].start();
2179  chsend_up[2].start();
2180  chrecv_dn[2].wait();
2181  chsend_up[2].wait();
2182  }
2183 
2184 #pragma omp barrier
2185 #pragma omp master
2186  {
2188  }
2189  }
2190 
2191 #pragma omp master
2192  {
2194  }
2195 
2196  for (int site = is; site < ns; ++site) {
2197  int ixy = site % Nxyv;
2198  int iz = (site / Nxyv) % m_Nz;
2199  int it = site / (Nxyv * m_Nz);
2200 
2201  if ((iz > 0) || (do_comm[2] == 0)) {
2202  int iz2 = (iz - 1 + m_Nz) % m_Nz;
2203  int nei = ixy + Nxyv * (iz2 + m_Nz * it);
2204  mult_coarse_zmb(&v2[VLEN * m_Nvc * site],
2205  &u[VLEN * m_Ndf * nei], &v1[VLEN * m_Nvc * nei], m_Nc);
2206  } else {
2207  int ixyt = ixy + Nxyv * it;
2208  mult_coarse_zm2(&v2[VLEN * m_Nvc * site], &buf2[VLEN * m_Nvc * ixyt], m_Nc);
2209  }
2210  }
2211 #pragma omp barrier
2212 #pragma omp master
2213  {
2215  }
2216 }
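
// Boundary-buffer indexing in the kernels above, assuming the QXS layout
// VLEN = VLENX * VLENY (as the VLENX/VLENY macros suggest): a y-face site
// contributes only its VLENX x-lanes, so the y buffers advance by
// VLENX * m_Nvc per (ix, izt) pair, while the z and t faces carry whole
// SIMD blocks and advance by VLEN * m_Nvc per boundary site.  With local
// blocked extents Nxv, Nyv, Nz, Nt this implies buffer lengths of
//   y face: VLENX * m_Nvc * (Nxv * Nz * Nt)   real_t entries,
//   z face: VLEN  * m_Nvc * (Nxv * Nyv * Nt)  real_t entries,
//   t face: VLEN  * m_Nvc * (Nxv * Nyv * Nz)  real_t entries.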
2217 
2218 
2219 //====================================================================
2220 template<typename AFIELD>
2221 void AFopr_Clover_coarse<AFIELD>::mult_tp(real_t *v2, real_t *v1)
2222 {
2223  int idir = 3;
2224  int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2225 
2226  int ith, nth, is, ns;
2227  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2228 
2229  real_t *buf1 = (real_t *)chsend_dn[3].ptr();
2230  real_t *buf2 = (real_t *)chrecv_up[3].ptr();
2231 
2232  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2233 
2234 
2235 #pragma omp barrier
2236 
2237  if (do_comm[3] > 0) {
2238 #pragma omp master
2239  {
2241  }
2242 
2243  for (int site = is; site < ns; ++site) {
2244  int ixyz = site % Nxyzv;
2245  int it = site / Nxyzv;
2246  if (it == 0) {
2247  mult_coarse_tp1(&buf1[VLEN * m_Nvc * ixyz], &v1[VLEN * m_Nvc * site], m_Nc);
2248  }
2249  }
2250 
2251 #pragma omp barrier
2252 
2253 #pragma omp master
2254  {
2257  chrecv_up[3].start();
2258  chsend_dn[3].start();
2259  chrecv_up[3].wait();
2260  chsend_dn[3].wait();
2261  }
2262 
2263 #pragma omp barrier
2264 #pragma omp master
2265  {
2267  }
2268  }
2269 
2270 #pragma omp master
2271  {
2273  }
2274 
2275  for (int site = is; site < ns; ++site) {
2276  int ixyz = site % Nxyzv;
2277  int it = site / Nxyzv;
2278 
2279  if ((it < m_Nt - 1) || (do_comm[3] == 0)) {
2280  int it2 = (it + 1) % m_Nt;
2281  int nei = ixyz + Nxyzv * it2;
2282  mult_coarse_tpb(&v2[VLEN * m_Nvc * site],
2283  &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * nei], m_Nc);
2284  } else {
2285  mult_coarse_tp2(&v2[VLEN * m_Nvc * site],
2286  &u[VLEN * m_Ndf * site], &buf2[VLEN * m_Nvc * ixyz], m_Nc);
2287  }
2288  }
2289 
2290 #pragma omp barrier
2291 #pragma omp master
2292  {
2294  }
2295 }
2296 
2297 
2298 //====================================================================
2299 template<typename AFIELD>
2300 void AFopr_Clover_coarse<AFIELD>::mult_tm1(real_t *v2, real_t *v1)
2301 {
2302  int idir = 3;
2303  int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2304 
2305  int ith, nth, is, ns;
2306  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2307 
2308  real_t *buf1 = (real_t *)chsend_up[3].ptr();
2309 
2310  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2311 
2312  if (do_comm[3] > 0) {
2313  for (int site = is; site < ns; ++site) {
2314  int ixyz = site % Nxyzv;
2315  int it = site / Nxyzv;
2316  if (it == m_Nt - 1) {
2317  mult_coarse_tm1(&buf1[VLEN * m_Nvc * ixyz],
2318  &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * site], m_Nc);
2319  }
2320  }
2321  }
2322 }
2323 
2324 
2325 //====================================================================
2326 template<typename AFIELD>
2327 void AFopr_Clover_coarse<AFIELD>::mult_tmb2(real_t *v2, real_t *v1)
2328 {
2329  int idir = 3;
2330  int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2331 
2332  int ith, nth, is, ns;
2333  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2334 
2335  real_t *buf2 = (real_t *)chrecv_dn[3].ptr();
2336 
2337  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2338 
2339  for (int site = is; site < ns; ++site) {
2340  int ixyz = site % Nxyzv;
2341  int it = site / Nxyzv;
2342 
2343  if ((it > 0) || (do_comm[3] == 0)) {
2344  int it2 = (it - 1 + m_Nt) % m_Nt;
2345  int nei = ixyz + Nxyzv * it2;
2346  mult_coarse_tmb(&v2[VLEN * m_Nvc * site],
2347  &u[VLEN * m_Ndf * nei], &v1[VLEN * m_Nvc * nei], m_Nc);
2348  } else {
2349  mult_coarse_tm2(&v2[VLEN * m_Nvc * site], &buf2[VLEN * m_Nvc * ixyz], m_Nc);
2350  }
2351  }
2352 }
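
// mult_tm1 above only fills the t-direction send buffer, while mult_tmb2
// applies the bulk and received-boundary contributions; together they
// appear to cover the same work as mult_tm, split so that the exchange
// can be issued and overlapped elsewhere.  A generic sketch of that usage
// (pack_halo, exchange and compute_with_halo are illustrative names, not
// Bridge++ API):
//
//   pack_halo(send_buf, v1);              // cf. mult_tm1: boundary packing only
//   exchange.start();                     // cf. chsend_up[3] / chrecv_dn[3]
//   /* ... other work can overlap here ... */
//   exchange.wait();
//   compute_with_halo(v2, v1, recv_buf);  // cf. mult_tmb2: bulk + boundary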
2353 
2354 
2355 //====================================================================
2356 template<typename AFIELD>
2357 void AFopr_Clover_coarse<AFIELD>::mult_tm(real_t *v2, real_t *v1)
2358 {
2359  int idir = 3;
2360  int Nxyzv = m_Nxv * m_Nyv * m_Nz;
2361 
2362  int ith, nth, is, ns;
2363  set_threadtask_mult(ith, nth, is, ns, m_Nstv);
2364 
2365  real_t *buf1 = (real_t *)chsend_up[3].ptr();
2366  real_t *buf2 = (real_t *)chrecv_dn[3].ptr();
2367 
2368  real_t *u = m_U.ptr(m_Ndf * m_Nst * idir);
2369 
2370 #pragma omp barrier
2371 
2372  if (do_comm[3] > 0) {
2373 #pragma omp master
2374  {
2376  }
2377 
2378  for (int site = is; site < ns; ++site) {
2379  int ixyz = site % Nxyzv;
2380  int it = site / Nxyzv;
2381  if (it == m_Nt - 1) {
2382  mult_coarse_tm1(&buf1[VLEN * m_Nvc * ixyz],
2383  &u[VLEN * m_Ndf * site], &v1[VLEN * m_Nvc * site], m_Nc);
2384  }
2385  }
2386 
2387 #pragma omp barrier
2388 
2389 #pragma omp master
2390  {
2393  chrecv_dn[3].start();
2394  chsend_up[3].start();
2395  chrecv_dn[3].wait();
2396  chsend_up[3].wait();
2397  }
2398 #pragma omp barrier
2399 #pragma omp master
2400  {
2402  }
2403  }
2404 
2405 #pragma omp master
2406  {
2408  }
2409 
2410  for (int site = is; site < ns; ++site) {
2411  int ixyz = site % Nxyzv;
2412  int it = site / Nxyzv;
2413 
2414  if ((it > 0) || (do_comm[3] == 0)) {
2415  int it2 = (it - 1 + m_Nt) % m_Nt;
2416  int nei = ixyz + Nxyzv * it2;
2417  mult_coarse_tmb(&v2[VLEN * m_Nvc * site],
2418  &u[VLEN * m_Ndf * nei], &v1[VLEN * m_Nvc * nei], m_Nc);
2419  } else {
2420  mult_coarse_tm2(&v2[VLEN * m_Nvc * site], &buf2[VLEN * m_Nvc * ixyz], m_Nc);
2421  }
2422  }
2423 
2424 #pragma omp barrier
2425 #pragma omp master
2426  {
2428  }
2429 }
2430 
2431 
2432 //====================================================================
2433 template<typename AFIELD>
2434 double AFopr_Clover_coarse<AFIELD>::flop_count(const std::string mode)
2435 {
2436  // The following counting explicitly depends on the implementation.
2437  // It must be recalculated whenever the code is modified.
2438  // The present counting is based on rev.1107. [24 Aug 2014 H.Matsufuru]
2439 
2440  int Lvol = m_Nst * CommonParameters::NPE();
2441  double flop_site, flop;
2442 
2443  // flop_site = static_cast<double>(m_Nc * m_Nc * (4 * m_Nc));
2444  // each matrix multiplication takes 8 N^2 flops;
2445  // there is room to improve this by exploiting the Hermiticity of the matrix,
2446  // but that is not implemented yet.
2447  // [28 Mar 2021 I.Kanamori]
2448  flop_site = static_cast<double>(5 * 8 * m_Nc * m_Nc);
2449 
2450  flop = flop_site * static_cast<double>(Lvol);
2451  if ((mode == "DdagD") || (mode == "DDdag")) flop *= 2.0;
2452 
2453  return flop;
2454 }
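
// Worked example of the counting above, with an illustrative coarse size
// m_Nc = 48 (the value itself is not fixed in this file):
//   flops per matrix multiplication : 8 * 48 * 48     = 18432
//   flops per coarse site           : 5 * 8 * 48 * 48 = 92160
//   total                           : 92160 * Lvol, doubled for the
//                                     "DdagD" / "DDdag" modes.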
2455 
2456 
2457 //============================================================END=====