Bridge++  Ver. 2.0.2
afield-tmpl.h
Go to the documentation of this file.
1 
10 #include <cassert>
11 
12 #include "lib_alt_QXS/inline/afield_th-inc.h"
13 
14 template<typename REALTYPE>
15 const std::string AField<REALTYPE, QXS>::class_name = "AField";
16 
17 //====================================================================
18 template<typename REALTYPE>
19 void AField<REALTYPE, QXS>::init(const int nin, const int nvol, const int nex,
20  const element_type cmpl)
21 {
22  m_nin = nin;
23  m_nvol = nvol;
24  m_nex = nex;
25  m_element_type = cmpl;
26 
27  m_offset = 0;
28 
29  if (m_nvol % VLEN != 0) {
30  vout.crucial("%s: bad nvol (too small?), must be a multiple of VLEN %d (given nvol=%d)\n",
31  class_name.c_str(), VLEN, m_nvol);
32  exit(EXIT_FAILURE);
33  }
34 
35  m_nsize = m_nin * m_nvol * m_nex;
36  m_nsize_off = m_nsize + m_offset + m_offset;
37  // offset is set both before and after the net data
38  m_nsizev = m_nsize / VLEN;
39  if (m_nsize_off != m_field.size()) {
40  m_field.resize(m_nsize_off, 0.0f);
41  vout.detailed("%s: data resized: nin = %d, nvol = %d, nex = %d\n",
42  class_name.c_str(), m_nin, m_nvol, m_nex);
43  }
44 }
45 
46 
47 //====================================================================
48 template<typename REALTYPE>
50 {
51  // currently do nothing.
52 }
53 
54 
55 //====================================================================
56 template<typename REALTYPE>
57 void AField<REALTYPE, QXS>::set(const REALTYPE a)
58 {
59  int ith, nth, is, ns;
60  set_threadtask(ith, nth, is, ns, size());
61 #pragma omp barrier
62  for (int i = is; i < ns; ++i) {
63  m_field[m_offset + i] = a;
64  }
65 #pragma omp barrier
66 }
67 
68 
69 //====================================================================
70 template<typename REALTYPE>
72 {
73  assert(check_size(w));
74 #pragma omp barrier
75 
76  int ith, nth, is, ns;
77  set_threadtask(ith, nth, is, ns, size());
78 
79  for (int i = is; i < ns; ++i) {
80  m_field[m_offset + i] = REALTYPE(w.cmp(i));
81  }
82 
83 #pragma omp barrier
84 }
85 
86 
87 //====================================================================
88 template<typename REALTYPE>
90 {
91  assert(check_size(w));
92 
93 #pragma omp barrier
94 
95 #ifdef USE_QXS_ACLE
96  real_t *__restrict__ vp = ptr(0);
97  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
98 #else
99  real_t *vp = ptr(0);
100  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
101 #endif
102  assert(vp != wp);
103 
104  int ith, nth, is, ns;
105  set_threadtask(ith, nth, is, ns, m_nsizev);
106 
107  svbool_t pg = set_predicate();
108  for (int i = is; i < ns; ++i) {
109  svreal_t vt;
110  load_vec(pg, vt, &wp[VLEN * i]);
111  save_vec(pg, &vp[VLEN * i], vt);
112  }
113 
114 #pragma omp barrier
115 }
116 
117 
118 //====================================================================
119 template<typename REALTYPE>
120 void AField<REALTYPE, QXS>::copy(const int ex,
121  const AField<REALTYPE, QXS>& w, const int ex_w)
122 {
123  assert(nin() == w.nin());
124  assert(nvol() == w.nvol());
125 
126 #pragma omp barrier
127 
128 #ifdef USE_QXS_ACLE
129  real_t *__restrict__ vp = ptr(0);
130  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
131 #else
132  real_t *vp = ptr(0);
133  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
134 #endif
135  assert(vp != wp);
136 
137  int sizev = m_nin * (m_nvol / VLEN);
138  int ith, nth, is, ns;
139  set_threadtask(ith, nth, is, ns, sizev);
140 
141  svbool_t pg = set_predicate();
142  for (int i = is; i < ns; ++i) {
143  svreal_t vt;
144  load_vec(pg, vt, &wp[VLEN * (i + sizev * ex_w)]);
145  save_vec(pg, &vp[VLEN * (i + sizev * ex)], vt);
146  }
147 
148 #pragma omp barrier
149 }
150 
151 
152 //====================================================================
153 template<typename REALTYPE>
154 void AField<REALTYPE, QXS>::axpy(const REALTYPE a, const AField<REALTYPE, QXS>& w)
155 {
156  assert(check_size(w));
157 
158 #pragma omp barrier
159 
160 #ifdef USE_QXS_ACLE
161  real_t *__restrict__ vp = ptr(0);
162  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
163 #else
164  real_t *vp = ptr(0);
165  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
166 #endif
167  assert(vp != wp);
168 
169  int ith, nth, is, ns;
170  set_threadtask(ith, nth, is, ns, m_nsizev);
171 
172  svbool_t pg = set_predicate();
173  for (int i = is; i < ns; ++i) {
174  svreal_t vt, wt;
175  load_vec(pg, wt, &wp[VLEN * i]);
176  load_vec(pg, vt, &vp[VLEN * i]);
177  axpy_vec(pg, vt, a, wt);
178  save_vec(pg, &vp[VLEN * i], vt);
179  }
180 
181 #pragma omp barrier
182 }
183 
184 
185 //====================================================================
186 template<typename REALTYPE>
187 void AField<REALTYPE, QXS>::axpy(const int ex, const REALTYPE a,
188  const AField<REALTYPE, QXS>& w,
189  const int ex_w)
190 {
191  assert(nin() == w.nin());
192  assert(nvol() == w.nvol());
193 
194 #pragma omp barrier
195 
196  int sizev = m_nin * (m_nvol / VLEN);
197 
198  int ith, nth, is, ns;
199  set_threadtask(ith, nth, is, ns, sizev);
200 
201 #ifdef USE_QXS_ACLE
202  real_t *__restrict__ vp = ptr(VLEN * sizev * ex);
203  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(VLEN * sizev * ex_w);
204 #else
205  real_t *vp = ptr(VLEN * sizev * ex);
206  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(VLEN * sizev * ex_w);
207 #endif
208  assert(vp != wp);
209 
210  svbool_t pg = set_predicate();
211  for (int i = is; i < ns; ++i) {
212  svreal_t vt, wt;
213  load_vec(pg, wt, &wp[VLEN * i]);
214  load_vec(pg, vt, &vp[VLEN * i]);
215  axpy_vec(pg, vt, a, wt);
216  save_vec(pg, &vp[VLEN * i], vt);
217  }
218 
219 #pragma omp barrier
220 }
221 
222 
223 //====================================================================
224 template<typename REALTYPE>
225 void AField<REALTYPE, QXS>::axpy(const REALTYPE ar, const REALTYPE ai,
226  const AField<REALTYPE, QXS>& w)
227 {
228  assert(check_size(w));
229 
230 #pragma omp barrier
231 
232 #ifdef USE_QXS_ACLE
233  REALTYPE *__restrict__ vp = this->ptr(0);
234  REALTYPE *__restrict__ wp
235  = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
236 #else
237  REALTYPE *vp = this->ptr(0);
238  REALTYPE *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
239 #endif
240 
241  svbool_t pg = set_predicate();
242 
243  int ith, nth, is, ns;
244  set_threadtask(ith, nth, is, ns, m_nsizev / 2);
245 
246  for (int i = is; i < ns; ++i) {
247  int inr = VLEN * (2 * i);
248  int ini = VLEN * (2 * i + 1);
249  svreal_t wtr, wti, vtr, vti;
250  load_vec(pg, wtr, &wp[inr]);
251  load_vec(pg, wti, &wp[ini]);
252  load_vec(pg, vtr, &vp[inr]);
253  load_vec(pg, vti, &vp[ini]);
254  axpy_vec(pg, vtr, ar, wtr);
255  axpy_vec(pg, vtr, -ai, wti);
256  axpy_vec(pg, vti, ar, wti);
257  axpy_vec(pg, vti, ai, wtr);
258  save_vec(pg, &vp[inr], vtr);
259  save_vec(pg, &vp[ini], vti);
260  }
261 
262 #pragma omp barrier
263 }
264 
265 
266 //====================================================================
267 template<typename REALTYPE>
268 void AField<REALTYPE, QXS>::axpy(const int ex,
269  const REALTYPE atr, const REALTYPE ati,
270  const AField<REALTYPE, QXS>& w, const int ex_w)
271 {
272  assert(nin() == w.nin());
273  assert(nvol() == w.nvol());
274  assert(ex < nex());
275  assert(ex_w < w.nex());
276 
277  int Nin = this->nin();
278  int Nex = this->nex();
279  int Nstv = this->nvol() / VLEN;
280 
281  // set_threadtask(ith, nth, is, ns, Nstv);
282 
283 #pragma omp barrier
284 
285 #ifdef USE_QXS_ACLE
286  REALTYPE *__restrict__ vp = this->ptr(0) + VLEN * Nin * Nstv * ex;
287  REALTYPE *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0) + VLEN * Nin * Nstv * ex_w;
288 #else
289  REALTYPE *vp = this->ptr(0) + VLEN * Nin * Nstv * ex;
290  REALTYPE *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0)
291  + VLEN * Nin * Nstv * ex_w;
292 #endif
293  assert(vp != wp);
294 
295  svbool_t pg = set_predicate();
296  int Nstvin2 = Nstv * Nin / 2;
297 
298  int ith, nth, is, ns;
299  set_threadtask(ith, nth, is, ns, Nstvin2);
300 
301  for (int i2 = 2 * is * VLEN; i2 < 2 * ns * VLEN; i2 += 2 * VLEN) {
302  int inr = i2;
303  int ini = i2 + VLEN;
304  svreal_t wtr, wti, vtr, vti;
305  load_vec(pg, wtr, &wp[inr]);
306  load_vec(pg, vtr, &vp[inr]);
307  load_vec(pg, wti, &wp[ini]);
308  load_vec(pg, vti, &vp[ini]);
309  axpy_vec(pg, vtr, atr, wtr);
310  axpy_vec(pg, vtr, -ati, wti);
311  axpy_vec(pg, vti, atr, wti);
312  axpy_vec(pg, vti, ati, wtr);
313  save_vec(pg, &vp[inr], vtr);
314  save_vec(pg, &vp[ini], vti);
315  }
316 
317 #pragma omp barrier
318 }
319 
320 
321 //====================================================================
322 template<typename REALTYPE>
323 void AField<REALTYPE, QXS>::aypx(const REALTYPE a,
324  const AField<REALTYPE, QXS>& w)
325 {
326  assert(check_size(w));
327 
328 #pragma omp barrier
329 
330  int ith, nth, is, ns;
331  set_threadtask(ith, nth, is, ns, m_nsizev);
332 
333 #ifdef USE_QXS_ACLE
334  real_t *__restrict__ vp = ptr(0);
335  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
336 #else
337  real_t *vp = ptr(0);
338  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
339 #endif
340  assert(vp != wp);
341 
342  svbool_t pg = set_predicate();
343  for (int i = is; i < ns; ++i) {
344  svreal_t vt, wt;
345  load_vec(pg, wt, &wp[VLEN * i]);
346  load_vec(pg, vt, &vp[VLEN * i]);
347  aypx_vec(pg, a, vt, wt);
348  save_vec(pg, &vp[VLEN * i], vt);
349  }
350 
351 #pragma omp barrier
352 }
353 
354 
355 //====================================================================
356 template<typename REALTYPE>
357 void AField<REALTYPE, QXS>::aypx(const REALTYPE ar, const REALTYPE ai,
358  const AField<REALTYPE, QXS>& w)
359 {
360  assert(check_size(w));
361 
362 #pragma omp barrier
363 
364 #ifdef USE_QXS_ACLE
365  REALTYPE *__restrict__ vp = this->ptr(0);
366  REALTYPE *__restrict__ wp
367  = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
368 #else
369  REALTYPE *vp = this->ptr(0);
370  REALTYPE *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
371 #endif
372  assert(vp != wp);
373 
374  svbool_t pg = set_predicate();
375 
376  int ith, nth, is, ns;
377  set_threadtask(ith, nth, is, ns, m_nsizev / 2);
378 
379  for (int i = is; i < ns; ++i) {
380  int inr = VLEN * (2 * i);
381  int ini = VLEN * (2 * i + 1);
382  svreal_t wtr, wti, vtr, vti;
383  load_vec(pg, wtr, &wp[inr]);
384  load_vec(pg, wti, &wp[ini]);
385  load_vec(pg, vtr, &vp[inr]);
386  load_vec(pg, vti, &vp[ini]);
387  axpy_vec(pg, wtr, ar, vtr);
388  axpy_vec(pg, wti, ar, vti);
389  axpy_vec(pg, wtr, -ai, vti);
390  axpy_vec(pg, wti, ai, vtr);
391  save_vec(pg, &vp[inr], wtr);
392  save_vec(pg, &vp[ini], wti);
393  }
394 
395 #pragma omp barrier
396 }
397 
398 
399 //====================================================================
400 template<typename REALTYPE>
401 void AField<REALTYPE, QXS>::aypx(const int ex,
402  const REALTYPE atr, const REALTYPE ati,
403  const AField<REALTYPE, QXS>& w,
404  const int ex_w)
405 {
406  assert(nin() == w.nin());
407  assert(nvol() == w.nvol());
408  assert(ex < nex());
409  assert(ex_w < w.nex());
410 
411  int Nin = this->nin();
412  int Nex = this->nex();
413  int Nstv = this->nvol() / VLEN;
414 
415 #pragma omp barrier
416 
417 #ifdef USE_QXS_ACLE
418  REALTYPE *__restrict__ vp = this->ptr(0) + VLEN * Nin * Nstv * ex;
419  REALTYPE *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0) + VLEN * Nin * Nstv * ex_w;
420 #else
421  REALTYPE *vp = this->ptr(0) + VLEN * Nin * Nstv * ex;
422  REALTYPE *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0)
423  + VLEN * Nin * Nstv * ex_w;
424 #endif
425  assert(vp != wp);
426 
427  svbool_t pg = set_predicate();
428 
429  int ith, nth, is, ns;
430  set_threadtask(ith, nth, is, ns, Nstv * Nin / 2);
431 
432  for (int i2 = 2 * is * VLEN; i2 < 2 * ns * VLEN; i2 += 2 * VLEN) {
433  int inr = i2;
434  int ini = i2 + VLEN;
435  svreal_t wtr, wti, vtr, vti;
436  load_vec(pg, wtr, &wp[inr]);
437  load_vec(pg, wti, &wp[ini]);
438  load_vec(pg, vtr, &vp[inr]);
439  load_vec(pg, vti, &vp[ini]);
440  axpy_vec(pg, wtr, atr, vtr);
441  axpy_vec(pg, wti, atr, vti);
442  axpy_vec(pg, wtr, -ati, vti);
443  axpy_vec(pg, wti, ati, vtr);
444  save_vec(pg, &vp[inr], wtr);
445  save_vec(pg, &vp[ini], wti);
446  }
447 #pragma omp barrier
448 }
449 
450 
451 //====================================================================
452 template<typename REALTYPE>
453 void AField<REALTYPE, QXS>::scal(const REALTYPE a)
454 {
455  int Nin = this->nin();
456  int Nex = this->nex();
457  int Nstv = this->nvol() / VLEN;
458 
459  int ith, nth, is, ns;
460 
461  real_t *vp = ptr(0);
462 
463 #pragma omp barrier
464 
465  svbool_t pg = set_predicate();
466  set_threadtask(ith, nth, is, ns, Nstv * Nin);
467  for (int ex = 0; ex < Nex; ++ex) {
468  real_t *v = vp + VLEN * Nstv * Nin * ex;
469  for (int iv = is * VLEN; iv < ns * VLEN; iv += VLEN) {
470  svreal_t vt;
471  load_vec(pg, vt, &v[iv]);
472  scal_vec(pg, vt, a);
473  save_vec(pg, &v[iv], vt);
474  }
475  }
476 #pragma omp barrier
477 }
478 
479 
480 //====================================================================
481 template<typename REALTYPE>
482 void AField<REALTYPE, QXS>::scal(const REALTYPE ar, const REALTYPE ai)
483 {
484  int Nin = this->nin();
485  int Nex = this->nex();
486  int Nstv = this->nvol() / VLEN;
487 
488 #pragma omp barrier
489 
490 #ifdef USE_QXS_ACLE
491  REALTYPE *__restrict__ vp = this->ptr(0);
492 #else
493  REALTYPE *vp = this->ptr(0);
494 #endif
495 
496  int ith, nth, is, ns;
497  set_threadtask(ith, nth, is, ns, Nstv * Nin / 2);
498  svbool_t pg = set_predicate();
499 
500  for (int ex = 0; ex < Nex; ++ex) {
501  real_t *v = vp + VLEN * Nstv * Nin * ex;
502  for (int i = is; i < ns; ++i) {
503  int inr = VLEN * (2 * i);
504  int ini = VLEN * (2 * i + 1);
505  svreal_t vtr, vti;
506  load_vec(pg, vtr, &vp[inr]);
507  load_vec(pg, vti, &vp[ini]);
508  axpy_vec(pg, vtr, ar, vtr);
509  axpy_vec(pg, vtr, -ai, vti);
510  axpy_vec(pg, vti, ar, vti);
511  axpy_vec(pg, vti, ai, vtr);
512  save_vec(pg, &vp[inr], vtr);
513  save_vec(pg, &vp[ini], vti);
514  }
515  }
516 
517 #pragma omp barrier
518 }
519 
520 
521 //====================================================================
522 template<typename REALTYPE>
524 {
525  int Nin = this->nin();
526  int Nex = this->nex();
527  int Nstv = this->nvol() / VLEN;
528 
529  int ith, nth, is, ns;
530  set_threadtask(ith, nth, is, ns, Nstv);
531 
532 #ifdef USE_QXS_ACLE
533  real_t *__restrict__ vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
534  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
535 #else
536  real_t *vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
537  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
538 #endif
539  assert(vp != wp);
540 
541 #pragma omp barrier
542 
543  svbool_t pg = set_predicate();
544 
545  svreal_t yt;
546  set_vec(pg, yt, 0.0);
547 
548  for (int ex = 0; ex < Nex; ++ex) {
549  svreal_t tmp_yt;
550  set_vec(pg, tmp_yt, 0.0);
551  for (int site = is; site < ns; ++site) {
552  real_t *v = &vp[VLEN * Nin * (site + Nstv * ex)];
553  real_t *w = &wp[VLEN * Nin * (site + Nstv * ex)];
554  svreal_t xt, vt, wt;
555  set_vec(pg, xt, 0.0);
556  for (int in = 0; in < Nin; ++in) {
557  load_vec(pg, wt, &w[VLEN * in]);
558  load_vec(pg, vt, &v[VLEN * in]);
559  add_dot_vec(pg, xt, vt, wt);
560  }
561  add_vec(pg, tmp_yt, xt);
562  }
563  add_vec(pg, yt, tmp_yt);
564  }
565 
566  real_t a;
567  reduce_vec(pg, a, yt);
568 
569 #pragma omp barrier
570 
572 
573  return a;
574 }
575 
576 
577 //====================================================================
578 template<typename REALTYPE>
579 void AField<REALTYPE, QXS>::dotc(REALTYPE& atr, REALTYPE& ati,
580  const AField<REALTYPE, QXS>& w) const
581 {
582  assert(check_size(w));
583 
584  int Nin = this->nin();
585  int Nex = this->nex();
586  int Nstv = this->nvol() / VLEN;
587  int Nin2 = Nin / 2;
588 
589  int ith, nth, is, ns;
590 
591 #ifdef USE_QXS_ACLE
592  real_t *__restrict__ vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
593  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
594 #else
595  real_t *vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
596  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
597 #endif
598  assert(vp != wp);
599 
600 #pragma omp barrier
601 
602 
603  svbool_t pg = set_predicate();
604 
605  int Nouter = (Nstv > Nin2) ? Nstv : Nin2;
606  int Ninner2 = (Nstv * Nin2) / Nouter;
607  int Ninner = 2 * Ninner2;
608  set_threadtask(ith, nth, is, ns, Nouter); // for better load balance
609  svreal_t ytr, yti;
610  set_vec(pg, ytr, 0.0);
611  set_vec(pg, yti, 0.0);
612 
613  for (int ex = 0; ex < Nex; ++ex) {
614  svreal_t tmp_ytr, tmp_yti;
615  set_vec(pg, tmp_ytr, 0.0);
616  set_vec(pg, tmp_yti, 0.0);
617  for (int i = is; i < ns; ++i) {
618  real_t *v = &vp[VLEN * Ninner * (i + Nouter * ex)];
619  real_t *w = &wp[VLEN * Ninner * (i + Nouter * ex)];
620  svreal_t xtr, xti, vtr, vti, wtr, wti;
621  set_vec(pg, xtr, 0.0);
622  set_vec(pg, xti, 0.0);
623  for (int in = 0; in < Ninner2; ++in) {
624  int inr = 2 * in;
625  int ini = 2 * in + 1;
626  load_vec(pg, wtr, &w[VLEN * inr]);
627  load_vec(pg, wti, &w[VLEN * ini]);
628  load_vec(pg, vtr, &v[VLEN * inr]);
629  load_vec(pg, vti, &v[VLEN * ini]);
630  add_dot_vec(pg, xtr, vtr, wtr);
631  add_dot_vec(pg, xtr, vti, wti);
632  add_dot_vec(pg, xti, vtr, wti);
633  sub_dot_vec(pg, xti, vti, wtr);
634  }
635  add_vec(pg, tmp_ytr, xtr);
636  add_vec(pg, tmp_yti, xti);
637  }
638 
639  add_vec(pg, ytr, tmp_ytr);
640  add_vec(pg, yti, tmp_yti);
641  }
642 
643  reduce_vec(pg, atr, ytr);
644  reduce_vec(pg, ati, yti);
645 
646 #pragma omp barrier
647  real_t sum[2] = { atr, ati };
648  ThreadManager::reduce_sum_global(sum, 2, ith, nth);
649  atr = sum[0];
650  ati = sum[1];
651 }
652 
653 
654 //====================================================================
655 template<typename REALTYPE>
657 {
658  int Nin = this->nin();
659  int Nex = this->nex();
660  int Nstv = this->nvol() / VLEN;
661 
662  real_t *vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
663 
664 #pragma omp barrier
665 
666  int Nouter = (Nstv > Nin) ? Nstv : Nin;
667  int Ninner = (Nstv * Nin) / Nouter;
668 
669  int ith, nth, is, ns;
670  set_threadtask(ith, nth, is, ns, Nouter); // for better load balance
671 
672  svbool_t pg = set_predicate();
673 
674  svreal_t yt;
675  set_vec(pg, yt, 0.0);
676 
677  for (int ex = 0; ex < Nex; ++ex) {
678  svreal_t tmp_yt;
679  set_vec(pg, tmp_yt, 0.0);
680  for (int i = is; i < ns; ++i) {
681  real_t *v = &vp[VLEN * Ninner * (i + Nouter * ex)];
682  svreal_t xt, vt;
683  set_vec(pg, xt, 0.0);
684  for (int in = 0; in < Ninner; ++in) {
685  load_vec(pg, vt, &v[VLEN * in]);
686  add_norm2_vec(pg, xt, vt);
687  }
688  add_vec(pg, tmp_yt, xt);
689  }
690  add_vec(pg, yt, tmp_yt);
691  }
692 
693  real_t a;
694  reduce_vec(pg, a, yt);
695 
696 #pragma omp barrier
697 
699 
700  return a;
701 }
702 
703 
704 //====================================================================
705 template<typename REALTYPE>
707 AField<REALTYPE, QXS>::dotc_and_norm2(REALTYPE& norm2, REALTYPE& w_norm2,
708  const AField<REALTYPE, QXS>& w) const
709 {
710  if (m_element_type == Element_type::REAL) {
711  vout.crucial("%s: dotc_and_norm2 for real field is called\n",
712  class_name.c_str());
713 
714  exit(EXIT_FAILURE);
715  }
716  assert(check_size(w));
717 
718  int Nin = this->nin();
719  int Nex = this->nex();
720  int Nstv = this->nvol() / VLEN;
721  int Nin2 = Nin / 2;
722 
723 #ifdef USE_QXS_ACLE
724  real_t *__restrict__ vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
725  real_t *__restrict__ wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
726 #else
727  real_t *vp = const_cast<AField<REALTYPE, QXS> *>(this)->ptr(0);
728  real_t *wp = const_cast<AField<REALTYPE, QXS> *>(&w)->ptr(0);
729 #endif
730  assert(vp != wp);
731 
732  real_t atr = 0.0;
733  real_t ati = 0.0;
734  norm2 = 0.0;
735  w_norm2 = 0.0;
736 #pragma omp barrier
737 
738  int Nouter = (Nstv > Nin2) ? Nstv : Nin2;
739  int Ninner2 = (Nstv * Nin2) / Nouter;
740  int Ninner = 2 * Ninner2;
741 
742  int ith, nth, is, ns;
743  set_threadtask(ith, nth, is, ns, Nouter); // for better load balance
744 
745  svbool_t pg = set_predicate();
746 
747  svreal_t ytr, yti, w2, v2;
748  set_vec(pg, ytr, 0.0);
749  set_vec(pg, yti, 0.0);
750  set_vec(pg, w2, 0.0);
751  set_vec(pg, v2, 0.0);
752 
753  for (int ex = 0; ex < Nex; ++ex) {
754  svreal_t tmp_ytr, tmp_yti, tmp_vt2, tmp_wt2;
755  set_vec(pg, tmp_ytr, 0.0);
756  set_vec(pg, tmp_yti, 0.0);
757  set_vec(pg, tmp_vt2, 0.0);
758  set_vec(pg, tmp_wt2, 0.0);
759  for (int iout = is; iout < ns; ++iout) {
760  real_t *v = &vp[VLEN * Ninner * (iout + Nouter * ex)];
761  real_t *w = &wp[VLEN * Ninner * (iout + Nouter * ex)];
762  svreal_t xtr, xti, vtr, vti, wtr, wti, vt2, wt2;
763  set_vec(pg, xtr, 0.0);
764  set_vec(pg, xti, 0.0);
765  set_vec(pg, vt2, 0.0);
766  set_vec(pg, wt2, 0.0);
767  for (int in = 0; in < Ninner2; ++in) {
768  int inr = 2 * in;
769  int ini = 2 * in + 1;
770  load_vec(pg, wtr, &w[VLEN * inr]);
771  load_vec(pg, wti, &w[VLEN * ini]);
772  load_vec(pg, vtr, &v[VLEN * inr]);
773  load_vec(pg, vti, &v[VLEN * ini]);
774  add_dot_vec(pg, xtr, vtr, wtr);
775  add_dot_vec(pg, xtr, vti, wti);
776  add_dot_vec(pg, xti, vtr, wti);
777  sub_dot_vec(pg, xti, vti, wtr);
778  add_norm2_vec(pg, vt2, vtr);
779  add_norm2_vec(pg, vt2, vti);
780  add_norm2_vec(pg, wt2, wtr);
781  add_norm2_vec(pg, wt2, wti);
782  }
783  add_vec(pg, tmp_ytr, xtr);
784  add_vec(pg, tmp_yti, xti);
785  add_vec(pg, tmp_vt2, vt2);
786  add_vec(pg, tmp_wt2, wt2);
787  }
788  add_vec(pg, ytr, tmp_ytr);
789  add_vec(pg, yti, tmp_yti);
790  add_vec(pg, v2, tmp_vt2);
791  add_vec(pg, w2, tmp_wt2);
792  }
793 
794  reduce_vec(pg, atr, ytr);
795  reduce_vec(pg, ati, yti);
796  reduce_vec(pg, w_norm2, w2);
797  reduce_vec(pg, norm2, v2);
798  real_t sum[4] = { atr, ati, w_norm2, norm2 };
799 
800 #pragma omp barrier
801 
802  ThreadManager::reduce_sum_global(sum, 4, ith, nth);
803  complex_t result = { sum[0], sum[1] };
804 
805  w_norm2 = sum[2];
806  norm2 = sum[3];
807 
808  return result;
809 }
810 
811 
812 //============================================================END=====
AField< REALTYPE, QXS >::nex
int nex() const
returning size of extra d.o.f.
Definition: afield.h:115
AField< REALTYPE, QXS >::complex_t
ComplexTraits< REALTYPE >::complex_t complex_t
Definition: afield.h:39
AField< REALTYPE, QXS >
Definition: afield.h:35
AField
Definition: afield_base.h:16
AField< REALTYPE, QXS >::nvol
int nvol() const
returning size of site d.o.f.
Definition: afield.h:112
VLEN
#define VLEN
Definition: bridgeQXS_Clover_coarse_double.cpp:12
Bridge::BridgeIO::detailed
void detailed(const char *format,...)
Definition: bridgeIO.cpp:219
aypx
void aypx(const double a, Field &y, const Field &x)
aypx(y, a, x): y := a * y + x
Definition: field.cpp:509
Vsimd_t
Definition: vsimd_double-inc.h:13
axpy
void axpy(Field &y, const double a, const Field &x)
axpy(y, a, x): y := a * x + y
Definition: field.cpp:380
dot
double dot(const Field &y, const Field &x)
Definition: field.cpp:576
dotc_and_norm2
void dotc_and_norm2(dcomplex &yx, double &y2, double &x2, const Field &y, const Field &x)
Definition: field.cpp:806
norm2
REALTYPE norm2(const AField< REALTYPE, QXS > &v)
Definition: afield-inc.h:453
copy
void copy(Field &y, const Field &x)
copy(y, x): y = x
Definition: field.cpp:212
ThreadManager::reduce_sum_global
static void reduce_sum_global(dcomplex &value, const int i_thread, const int Nthread)
global reduction with summation: dcomplex values are assumed thread local.
Definition: threadManager.cpp:288
Element_type::REAL
@ REAL
Definition: bridge_defs.h:43
dotc
dcomplex dotc(const Field &y, const Field &x)
Definition: field.cpp:712
AField< REALTYPE, QXS >::real_t
REALTYPE real_t
Definition: afield.h:38
Field::cmp
double cmp(const int jin, const int site, const int jex) const
Definition: field.h:143
svbool_t
Definition: vsimd_double-inc.h:30
scal
void scal(Field &x, const double a)
scal(x, a): x = a * x
Definition: field.cpp:261
Bridge::BridgeIO::crucial
void crucial(const char *format,...)
Definition: bridgeIO.cpp:180
Field
Container of Field-type object.
Definition: field.h:46
AField< REALTYPE, QXS >::nin
int nin() const
returning size of inner (on site) d.o.f.
Definition: afield.h:109
Element_type::type
type
Definition: bridge_defs.h:41
Bridge::vout
BridgeIO vout
Definition: bridgeIO.cpp:512