22 namespace ThreadManager_OpenMP_Reduce {
23 template<
typename REALTYPE>
26 std::vector<REALTYPE>& array_reduction,
27 const int each_buf_size,
28 const int i_thread,
const int Nthread)
30 typedef REALTYPE real_t;
32 std::vector<real_t> sum;
34 real_t *psum =
nullptr;
42 for (
int i = 0; i < num; i++) {
51 const int n = (remaining < each_buf_size)?num:each_buf_size;
52 for (
int i = 0; i < n; i++) {
53 array_reduction[i_thread * each_buf_size + i] = pa[i];
58 for (
int i = 0; i < Nthread; i++) {
59 for (
int j = 0; j < n; j++) {
60 psum[j] += array_reduction[i * each_buf_size + j];
63 psum += each_buf_size;
66 remaining -= each_buf_size;
77 const int total_buf_size = each_buf_size * Nthread;
80 const int n = (remaining < total_buf_size)?num:total_buf_size;
83 for (
int i = 0; i < n; i++) {
84 array_reduction[i] = pa[i];
99 for (
int i = 0; i < n; i++) {
100 pa[i] = array_reduction[i];
102 pa += total_buf_size;
103 remaining -= total_buf_size;
131 if (omp_get_thread_num() == 0) {
132 Nthread_env = omp_get_num_threads();
137 if ((Nthread == Nthread_env) || (Nthread == 0)) {
145 omp_set_num_threads(Nthread);
167 return omp_get_num_threads();
174 return omp_get_thread_num();
208 const int i_thread,
const int Nthread)
219 const int i_thread,
const int Nthread)
229 const int i_thread,
const int Nthread)
240 const int i_thread,
const int Nthread)
250 const int i_thread,
const int Nthread)
261 const int i_thread,
const int Nthread)
277 vout.
crucial(
m_vl,
"Single-thread %s is called in parallel region.\n", name.c_str());
static int m_Nthread
number of threads.
static const int each_buf_size
reduction buffer size for each thread (double)
static int get_num_threads()
returns available number of threads.
static const int each_buf_sizeDC
reduction buffer size for each thread (dcomplex)
void general(const char *format,...)
static std::vector< float > m_darray_reductionF
static Bridge::VerboseLevel Vlevel()
static int reduce_sum(int count, dcomplex *recv_buf, dcomplex *send_buf, int pattern=0)
make a global sum of an array of dcomplex over the communicator. pattern specifies the dimensions to ...
static void wait()
barrier among threads inside a node.
static int get_thread_id()
returns thread id.
static void init(int Nthread)
setup: called in main only once.
static std::vector< dcomplex > m_darray_reductionDC
static const std::string class_name
static void barrier(const int Nthread)
barrier among threads inside a node.
static void sync_barrier_all()
barrier among all the threads and nodes.
static void finalize()
finalization.
void paranoiac(const char *format,...)
static const int each_buf_sizeF
reduction buffer size for each thread (float)
void crucial(const char *format,...)
static void reduce_sum_global(dcomplex &value, const int i_thread, const int Nthread)
global reduction with summation: dcomplex values are assumed thread local.
static Bridge::VerboseLevel m_vl
verbose level.
void sum_global(REALTYPE *a, const int num, std::vector< REALTYPE > &array_reduction, const int each_buf_size, const int i_thread, const int Nthread)
static std::vector< double > m_darray_reduction
static int sync()
synchronize within small world.
static void assert_single_thread(const std::string &class_name)
assert currently running on single thread.