15 template<
typename AFIELD,
typename AFIELD2>
20 #ifndef AFILED_HAS_SUB
21 template<
typename AFIELD>
29 #ifndef AFILED_HAS_ADD
30 template<
typename AFIELD>
/// Flop estimate for the restriction step, per site as the name suggests.
///
/// @param ncol  multiplier applied to the fixed factor (callers in this file
///              pass m_coarse_ncol).
/// @return ncol * (2 * 6 + 8 * 12), i.e. 108 * ncol.
static inline int get_flop_restrict_site(int ncol)
{
  // Fixed factor charged for each unit of ncol.
  const int flop_per_ncol = 2 * 6 + 8 * 12;

  return ncol * flop_per_ncol;
}
/// Flop estimate for the prolongation step, per site as the name suggests.
///
/// @param ncol  multiplier applied to the fixed factor (callers in this file
///              pass m_coarse_ncol).
/// @return ncol * (2 * 6 + 8 * 12), i.e. 108 * ncol.
static inline int get_flop_prolong_site(int ncol)
{
  // Fixed factor charged for each unit of ncol.
  const int flop_per_ncol = 2 * 6 + 8 * 12;

  return ncol * flop_per_ncol;
}
50 template<
typename INDEX,
typename AFIELD>
58 template<
typename AFIELD,
typename AFIELD2>
// Fragment of a member-function template; its signature and braces are not
// part of this excerpt. The visible statements size the two coarse work
// vectors from the operator reached through solver->get_fopr().
68 template<
typename AFIELD,
typename AFIELD2>
// Field dimensions as reported by the solver's operator.
73 int nin =
solver->get_fopr()->field_nin();
74 int nvol =
solver->get_fopr()->field_nvol();
75 int nex =
solver->get_fopr()->field_nex();
// Both coarse work vectors are reset with the same (nin, nvol, nex) arguments.
76 m_coarse_v.reset(nin, nvol, nex);
77 m_coarse_w.reset(nin, nvol, nex);
// Integer division of nin by 2. This member is what the flop accounting
// later passes to get_flop_restrict_site() / get_flop_prolong_site().
// NOTE(review): presumably nin counts real and imaginary parts separately -- confirm.
78 m_coarse_ncol = nin / 2;
// Fragment of a member-function template; its signature and braces are not
// part of this excerpt. The visible statements size the four fine-level work
// vectors from the operator reached through solver->get_fopr().
83 template<
typename AFIELD,
typename AFIELD2>
// Field dimensions as reported by the solver's operator.
88 int nin =
solver->get_fopr()->field_nin();
89 int nvol =
solver->get_fopr()->field_nvol();
90 int nex =
solver->get_fopr()->field_nex();
// All four fine work vectors are reset with the same (nin, nvol, nex) arguments.
91 m_fine_w.reset(nin, nvol, nex);
92 m_fine_v.reset(nin, nvol, nex);
93 m_fine_r.reset(nin, nvol, nex);
94 m_fine_tmp.reset(nin, nvol, nex);
99 template<
typename AFIELD,
typename AFIELD2>
// Fragment of a member-function template; its signature, braces and several
// intermediate statements (including the definitions of nin, nvol, nex,
// time0, time1, elapsed_time, Lvol, index_f and index_d) are not part of this
// excerpt. Visible flow: normalise, convert v into m_fine_w, call
// mult_single(), convert m_fine_v back into v, then book timings and flops.
107 template<
typename AFIELD,
typename AFIELD2>
// Size checks on the fine work vectors; with the standard assert macro these
// vanish when NDEBUG is defined.
114 assert(m_fine_v.check_size(nin, nvol, nex));
115 assert(m_fine_w.check_size(nin, nvol, nex));
// Scale v by 1 / sqrt(norm2(w)).
// NOTE(review): scale_in is 0 when norm2(w) is 0, which makes 1.0 / scale_in
// infinite -- check whether a guard exists in the lines not shown.
132 double norm2_in =
norm2(w);
133 double scale_in = sqrt(norm2_in);
135 v.
scal(1.0 / scale_in);
// First contribution to the timer that is accumulated into m_time_double below.
138 double time_double = time1 - time0;
// Conversion from v into m_fine_w, timed as time_d2f.
// NOTE(review): argument order suggests (dst index, dst, src index, src) -- confirm.
145 convert(index_f, m_fine_w, index_d, v);
148 double time_d2f = time1 - time0;
// The work on the converted fields: m_fine_v is the output, m_fine_w the input.
153 mult_single(m_fine_v, m_fine_w);
// Conversion from m_fine_v back into v, timed as time_f2d.
160 convert(index_d, v, index_f, m_fine_v);
163 double time_f2d = time1 - time0;
// Second contribution to time_double (the timed statements are not shown).
170 time_double += (time1 - time0);
// Flops booked for this call outside mult_single(): 4 * 24 per unit of Lvol.
176 const double flop_other_double = 4 * 24 *
static_cast<double>(Lvol);
177 m_accum_flop_double += flop_other_double;
// Fold this call's timings into the running totals printed by the timer report.
189 m_time_mult_total += elapsed_time;
190 m_time_d2f += time_d2f;
191 m_time_f2d += time_f2d;
192 m_time_double += time_double;
// Fragment of a member-function template; its signature, braces and many
// intermediate statements (including every assignment to time0 / time1 and any
// conditional guards) are not part of this excerpt. Visible order of the
// stages: operator application, restriction, coarse solve, prolongation,
// smoother, with two optional-looking residual printouts in between.
198 template<
typename AFIELD,
typename AFIELD2>
// Apply the fine operator to w, storing the result in m_fine_r.
227 m_afoprF->mult(m_fine_r, w);
232 double time_residual = time1 - time0;
// Restriction: build the coarse vector m_coarse_v from m_fine_r.
237 m_multigrid->make_coarse_vector(m_coarse_v, m_fine_r);
241 double time_restriction = time1 - time0;
// Coarse solve. The iteration count and diff are initialised to -1 and are
// passed to solve(), which presumably overwrites them -- confirm.
247 int coarse_nconv = -1;
248 real_t coarse_diff = -1.0;
250 m_coarse_solver->solve(m_coarse_w, m_coarse_v, coarse_nconv, coarse_diff);
// Trailing arguments of a call whose first line is not shown.
252 class_name.c_str(), coarse_nconv, coarse_diff);
255 double time_coarse_solver = time1 - time0;
// Prolongation: build the fine vector m_fine_tmp from m_coarse_w.
263 m_multigrid->make_fine_vector(m_fine_tmp, m_coarse_w);
266 double time_prolongation = time1 - time0;
// Apply the fine operator to v and print the squared norm of the result.
// NOTE(review): this may sit inside a debug-only guard in the lines not shown.
274 m_afoprF->mult(m_fine_r, v);
278 double r2 = m_fine_r.norm2();
279 vout.
general(
"%s: after the coarse solver, r2=%15.7e\n", class_name.c_str(), r2);
284 time_residual += (time1 - time0);
// Smoother: solve with m_fine_r as input and m_fine_tmp as output.
293 int smoother_nconv = -1;
294 real_t smoother_diff = -1.0;
295 m_smoother->solve(m_fine_tmp, m_fine_r, smoother_nconv, smoother_diff);
// Trailing arguments of a call whose first line is not shown.
298 class_name.c_str(), smoother_nconv, smoother_diff);
301 double time_smoother = time1 - time0;
// Same printout as after the coarse solver, this time after the smoother.
310 m_afoprF->mult(m_fine_r, v);
312 double r2 = m_fine_r.norm2();
313 vout.
general(
"%s: after the smoother, r2=%15.7e\n", class_name.c_str(), r2);
318 time_residual += (time1 - time0);
// NOTE(review): unlike the other timers this takes time1 itself rather than a
// difference -- confirm against how time1 is set in the lines not shown.
320 double time_single_total = time1;
// Fold this call's per-stage timings into the running totals and count the call.
329 m_time_restriction += time_restriction;
330 m_time_coarse_solver += time_coarse_solver;
331 m_time_prolongation += time_prolongation;
332 m_time_smoother += time_smoother;
333 m_time_residual += time_residual;
334 m_time_mult_single_total += time_single_total;
335 ++m_num_mult_single_called;
343 template<
typename AFIELD,
typename AFIELD2>
// Fragment of a member-function template; its signature, braces, the
// definition of Lvol and the declaration of tmp are not part of this excerpt.
// The visible statements add one call's worth of flops to the accumulators.
354 template<
typename AFIELD,
typename AFIELD2>
// Flop figures returned by the file-local helpers for m_coarse_ncol.
361 const double flop_restrict_site = get_flop_restrict_site(m_coarse_ncol);
362 const double flop_prolong_site = get_flop_prolong_site(m_coarse_ncol);
// Everything that is neither coarse solver nor smoother:
//   m_afoprF->flop_count() twice, plus
//   (restrict + prolong + 4 * 24) scaled by Lvol.
// NOTE(review): the operator count appears twice in the sum -- presumably one
// per operator application; confirm against the multiplication routine.
363 const double flop_other_float
364 = m_afoprF->flop_count()
365 + (flop_restrict_site + flop_prolong_site + 4 * 24)
366 *
static_cast<double>(Lvol)
367 + m_afoprF->flop_count();
// Per-category accumulators.
369 m_accum_flop_coarse += m_coarse_solver->flop_count();
370 m_accum_flop_smoother += m_smoother->flop_count();
371 m_accum_flop_other += flop_other_float;
// tmp collects the same three contributions and is then added to the float total.
372 tmp += m_coarse_solver->flop_count();
373 tmp += m_smoother->flop_count();
374 tmp += flop_other_float;
375 m_accum_flop_float += tmp;
// Restriction and prolongation totals: the per-site figure scaled by Lvol.
376 m_accum_flop_restrict += flop_restrict_site *
static_cast<double>(Lvol);
377 m_accum_flop_prolong += flop_prolong_site *
static_cast<double>(Lvol);
// Log the values just computed.
378 vout.
general(
"update_flop_count: flop_other_float=%e, m_accum_flop_float=%e, tmp=%e\n",
379 flop_other_float, m_accum_flop_float, tmp);
// Fragment of a member-function template; its signature, braces and a few
// intermediate lines are not part of this excerpt. The visible statements set
// the listed flop accumulators, timers and call counters back to zero.
// NOTE(review): m_accum_flop_restrict, m_accum_flop_prolong, m_time_d2f,
// m_time_f2d and m_time_double are not among the visible resets -- check the
// lines not shown.
386 template<
typename AFIELD,
typename AFIELD2>
// Flop accumulators.
389 m_accum_flop_coarse = 0.0;
390 m_accum_flop_smoother = 0.0;
391 m_accum_flop_other = 0.0;
392 m_accum_flop_float = 0.0;
393 m_accum_flop_double = 0.0;
// Elapsed-time accumulators.
398 m_time_residual = 0.0;
399 m_time_restriction = 0.0;
400 m_time_coarse_solver = 0.0;
401 m_time_smoother = 0.0;
402 m_time_prolongation = 0.0;
403 m_time_mult_total = 0.0;
404 m_time_mult_single_total = 0.0;
// Call counters.
406 m_num_mult_called = 0;
407 m_num_mult_single_called = 0;
// Fragment of a member-function template; its signature, braces and a few
// intermediate lines are not part of this excerpt. The visible statements
// print the accumulated timers through vout.general() and then derived
// GFlop/s figures (flops / seconds * 1.0e-9).
412 template<
typename AFIELD,
typename AFIELD2>
// Header line followed by one line per accumulated timer.
415 vout.
general(
"%s: time budget\n", class_name.c_str());
416 vout.
general(
"Elapsed time: restriction : total %14.6e\n", m_time_restriction);
417 vout.
general(
"Elapsed time: coarse solver : total %14.6e\n", m_time_coarse_solver);
418 vout.
general(
"Elapsed time: prolongation : total %14.6e\n", m_time_prolongation);
419 vout.
general(
"Elapsed time: smoother : total %14.6e\n", m_time_smoother);
// NOTE(review): "resudial" in the message below looks like a typo for
// "residual"; it is runtime output, so it is left unchanged here.
420 vout.
general(
"Elapsed time: resudial(+lin.alg) : total %14.6e\n", m_time_residual);
421 vout.
general(
"Elapsed time: convert(f2d) : total %14.6e\n", m_time_f2d);
422 vout.
general(
"Elapsed time: convert(d2f) : total %14.6e\n", m_time_d2f);
423 vout.
general(
"Elapsed time: double : total %14.6e\n", m_time_double);
424 vout.
general(
"Elapsed time: mult_single(total) : total %14.6e\n", m_time_mult_single_total);
425 vout.
general(
"Elapsed time: mult (total) : total %14.6e\n", m_time_mult_total);
// Printed with %d -- NOTE(review): confirm m_num_mult_called is an int.
426 vout.
general(
" number of mult call : %d\n", m_num_mult_called);
// Both expressions continue on lines that are not part of this excerpt.
428 double flop_restrict = get_flop_restrict_site(m_coarse_ncol)
430 double flop_prolong = get_flop_prolong_site(m_coarse_ncol)
// Throughput figures. NOTE(review): each divides by an accumulated timer; if
// that timer is still 0 the printed value is inf or nan -- confirm this is
// acceptable for the report.
433 vout.
general(
" Flops: smoother (float) : %14.6e GFlop/s\n", flop_count_smoother() / m_time_smoother * 1.0e-9);
434 vout.
general(
" Flops: coarse solver (float) : %14.6e GFlop/s\n", flop_count_coarse() / m_time_coarse_solver * 1.0e-9);
435 vout.
general(
" Flops: float total : %14.6e GFlop/s\n", m_accum_flop_float / m_time_mult_single_total * 1.0e-9);
436 vout.
general(
" Flops: restrict (float) : %14.6e GFlop/s\n", flop_restrict / m_time_restriction * 1.0e-9);
437 vout.
general(
" Flops: prolong (float) : %14.6e GFlop/s\n", flop_prolong / m_time_prolongation * 1.0e-9);