12 #include "lib_alt_QXS/inline/afield_th-inc.h"
14 template<
typename REALTYPE>
18 template<
typename REALTYPE>
25 m_element_type = cmpl;
29 if (m_nvol %
VLEN != 0) {
30 vout.
crucial(
"%s: bad nvol (too small?), must be a multiple of VLEN %d (given nvol=%d)\n",
31 class_name.c_str(),
VLEN, m_nvol);
35 m_nsize = m_nin * m_nvol * m_nex;
36 m_nsize_off = m_nsize + m_offset + m_offset;
38 m_nsizev = m_nsize /
VLEN;
39 if (m_nsize_off != m_field.size()) {
40 m_field.resize(m_nsize_off, 0.0f);
41 vout.
detailed(
"%s: data resized: nin = %d, nvol = %d, nex = %d\n",
42 class_name.c_str(), m_nin, m_nvol, m_nex);
48 template<
typename REALTYPE>
56 template<
typename REALTYPE>
60 set_threadtask(ith, nth, is, ns, size());
62 for (
int i = is; i < ns; ++i) {
63 m_field[m_offset + i] = a;
70 template<
typename REALTYPE>
73 assert(check_size(w));
77 set_threadtask(ith, nth, is, ns, size());
79 for (
int i = is; i < ns; ++i) {
80 m_field[m_offset + i] = REALTYPE(w.
cmp(i));
88 template<
typename REALTYPE>
91 assert(check_size(w));
96 real_t *__restrict__ vp = ptr(0);
104 int ith, nth, is, ns;
105 set_threadtask(ith, nth, is, ns, m_nsizev);
108 for (
int i = is; i < ns; ++i) {
110 load_vec(pg, vt, &wp[
VLEN * i]);
111 save_vec(pg, &vp[
VLEN * i], vt);
119 template<
typename REALTYPE>
123 assert(nin() == w.
nin());
124 assert(nvol() == w.
nvol());
129 real_t *__restrict__ vp = ptr(0);
137 int sizev = m_nin * (m_nvol /
VLEN);
138 int ith, nth, is, ns;
139 set_threadtask(ith, nth, is, ns, sizev);
142 for (
int i = is; i < ns; ++i) {
144 load_vec(pg, vt, &wp[
VLEN * (i + sizev * ex_w)]);
145 save_vec(pg, &vp[
VLEN * (i + sizev * ex)], vt);
153 template<
typename REALTYPE>
156 assert(check_size(w));
161 real_t *__restrict__ vp = ptr(0);
169 int ith, nth, is, ns;
170 set_threadtask(ith, nth, is, ns, m_nsizev);
173 for (
int i = is; i < ns; ++i) {
175 load_vec(pg, wt, &wp[
VLEN * i]);
176 load_vec(pg, vt, &vp[
VLEN * i]);
177 axpy_vec(pg, vt, a, wt);
178 save_vec(pg, &vp[
VLEN * i], vt);
186 template<
typename REALTYPE>
191 assert(nin() == w.
nin());
192 assert(nvol() == w.
nvol());
196 int sizev = m_nin * (m_nvol /
VLEN);
198 int ith, nth, is, ns;
199 set_threadtask(ith, nth, is, ns, sizev);
202 real_t *__restrict__ vp = ptr(
VLEN * sizev * ex);
211 for (
int i = is; i < ns; ++i) {
213 load_vec(pg, wt, &wp[
VLEN * i]);
214 load_vec(pg, vt, &vp[
VLEN * i]);
215 axpy_vec(pg, vt, a, wt);
216 save_vec(pg, &vp[
VLEN * i], vt);
224 template<
typename REALTYPE>
228 assert(check_size(w));
233 REALTYPE *__restrict__ vp = this->ptr(0);
234 REALTYPE *__restrict__ wp
237 REALTYPE *vp = this->ptr(0);
243 int ith, nth, is, ns;
244 set_threadtask(ith, nth, is, ns, m_nsizev / 2);
246 for (
int i = is; i < ns; ++i) {
247 int inr =
VLEN * (2 * i);
248 int ini =
VLEN * (2 * i + 1);
250 load_vec(pg, wtr, &wp[inr]);
251 load_vec(pg, wti, &wp[ini]);
252 load_vec(pg, vtr, &vp[inr]);
253 load_vec(pg, vti, &vp[ini]);
254 axpy_vec(pg, vtr, ar, wtr);
255 axpy_vec(pg, vtr, -ai, wti);
256 axpy_vec(pg, vti, ar, wti);
257 axpy_vec(pg, vti, ai, wtr);
258 save_vec(pg, &vp[inr], vtr);
259 save_vec(pg, &vp[ini], vti);
267 template<
typename REALTYPE>
269 const REALTYPE atr,
const REALTYPE ati,
272 assert(nin() == w.
nin());
273 assert(nvol() == w.
nvol());
275 assert(ex_w < w.
nex());
277 int Nin = this->nin();
278 int Nex = this->nex();
279 int Nstv = this->nvol() /
VLEN;
286 REALTYPE *__restrict__ vp = this->ptr(0) +
VLEN * Nin * Nstv * ex;
289 REALTYPE *vp = this->ptr(0) +
VLEN * Nin * Nstv * ex;
291 +
VLEN * Nin * Nstv * ex_w;
296 int Nstvin2 = Nstv * Nin / 2;
298 int ith, nth, is, ns;
299 set_threadtask(ith, nth, is, ns, Nstvin2);
301 for (
int i2 = 2 * is *
VLEN; i2 < 2 * ns *
VLEN; i2 += 2 *
VLEN) {
305 load_vec(pg, wtr, &wp[inr]);
306 load_vec(pg, vtr, &vp[inr]);
307 load_vec(pg, wti, &wp[ini]);
308 load_vec(pg, vti, &vp[ini]);
309 axpy_vec(pg, vtr, atr, wtr);
310 axpy_vec(pg, vtr, -ati, wti);
311 axpy_vec(pg, vti, atr, wti);
312 axpy_vec(pg, vti, ati, wtr);
313 save_vec(pg, &vp[inr], vtr);
314 save_vec(pg, &vp[ini], vti);
322 template<
typename REALTYPE>
326 assert(check_size(w));
330 int ith, nth, is, ns;
331 set_threadtask(ith, nth, is, ns, m_nsizev);
334 real_t *__restrict__ vp = ptr(0);
343 for (
int i = is; i < ns; ++i) {
345 load_vec(pg, wt, &wp[
VLEN * i]);
346 load_vec(pg, vt, &vp[
VLEN * i]);
347 aypx_vec(pg, a, vt, wt);
348 save_vec(pg, &vp[
VLEN * i], vt);
356 template<
typename REALTYPE>
360 assert(check_size(w));
365 REALTYPE *__restrict__ vp = this->ptr(0);
366 REALTYPE *__restrict__ wp
369 REALTYPE *vp = this->ptr(0);
376 int ith, nth, is, ns;
377 set_threadtask(ith, nth, is, ns, m_nsizev / 2);
379 for (
int i = is; i < ns; ++i) {
380 int inr =
VLEN * (2 * i);
381 int ini =
VLEN * (2 * i + 1);
383 load_vec(pg, wtr, &wp[inr]);
384 load_vec(pg, wti, &wp[ini]);
385 load_vec(pg, vtr, &vp[inr]);
386 load_vec(pg, vti, &vp[ini]);
387 axpy_vec(pg, wtr, ar, vtr);
388 axpy_vec(pg, wti, ar, vti);
389 axpy_vec(pg, wtr, -ai, vti);
390 axpy_vec(pg, wti, ai, vtr);
391 save_vec(pg, &vp[inr], wtr);
392 save_vec(pg, &vp[ini], wti);
400 template<
typename REALTYPE>
402 const REALTYPE atr,
const REALTYPE ati,
406 assert(nin() == w.
nin());
407 assert(nvol() == w.
nvol());
409 assert(ex_w < w.
nex());
411 int Nin = this->nin();
412 int Nex = this->nex();
413 int Nstv = this->nvol() /
VLEN;
418 REALTYPE *__restrict__ vp = this->ptr(0) +
VLEN * Nin * Nstv * ex;
421 REALTYPE *vp = this->ptr(0) +
VLEN * Nin * Nstv * ex;
423 +
VLEN * Nin * Nstv * ex_w;
429 int ith, nth, is, ns;
430 set_threadtask(ith, nth, is, ns, Nstv * Nin / 2);
432 for (
int i2 = 2 * is *
VLEN; i2 < 2 * ns *
VLEN; i2 += 2 *
VLEN) {
436 load_vec(pg, wtr, &wp[inr]);
437 load_vec(pg, wti, &wp[ini]);
438 load_vec(pg, vtr, &vp[inr]);
439 load_vec(pg, vti, &vp[ini]);
440 axpy_vec(pg, wtr, atr, vtr);
441 axpy_vec(pg, wti, atr, vti);
442 axpy_vec(pg, wtr, -ati, vti);
443 axpy_vec(pg, wti, ati, vtr);
444 save_vec(pg, &vp[inr], wtr);
445 save_vec(pg, &vp[ini], wti);
452 template<
typename REALTYPE>
455 int Nin = this->nin();
456 int Nex = this->nex();
457 int Nstv = this->nvol() /
VLEN;
459 int ith, nth, is, ns;
466 set_threadtask(ith, nth, is, ns, Nstv * Nin);
467 for (
int ex = 0; ex < Nex; ++ex) {
471 load_vec(pg, vt, &v[iv]);
473 save_vec(pg, &v[iv], vt);
481 template<
typename REALTYPE>
484 int Nin = this->nin();
485 int Nex = this->nex();
486 int Nstv = this->nvol() /
VLEN;
491 REALTYPE *__restrict__ vp = this->ptr(0);
493 REALTYPE *vp = this->ptr(0);
496 int ith, nth, is, ns;
497 set_threadtask(ith, nth, is, ns, Nstv * Nin / 2);
500 for (
int ex = 0; ex < Nex; ++ex) {
502 for (
int i = is; i < ns; ++i) {
503 int inr =
VLEN * (2 * i);
504 int ini =
VLEN * (2 * i + 1);
506 load_vec(pg, vtr, &vp[inr]);
507 load_vec(pg, vti, &vp[ini]);
508 axpy_vec(pg, vtr, ar, vtr);
509 axpy_vec(pg, vtr, -ai, vti);
510 axpy_vec(pg, vti, ar, vti);
511 axpy_vec(pg, vti, ai, vtr);
512 save_vec(pg, &vp[inr], vtr);
513 save_vec(pg, &vp[ini], vti);
522 template<
typename REALTYPE>
525 int Nin = this->nin();
526 int Nex = this->nex();
527 int Nstv = this->nvol() /
VLEN;
529 int ith, nth, is, ns;
530 set_threadtask(ith, nth, is, ns, Nstv);
546 set_vec(pg, yt, 0.0);
548 for (
int ex = 0; ex < Nex; ++ex) {
550 set_vec(pg, tmp_yt, 0.0);
551 for (
int site = is; site < ns; ++site) {
552 real_t *v = &vp[
VLEN * Nin * (site + Nstv * ex)];
553 real_t *w = &wp[
VLEN * Nin * (site + Nstv * ex)];
555 set_vec(pg, xt, 0.0);
556 for (
int in = 0; in < Nin; ++in) {
557 load_vec(pg, wt, &w[
VLEN * in]);
558 load_vec(pg, vt, &v[
VLEN * in]);
559 add_dot_vec(pg, xt, vt, wt);
561 add_vec(pg, tmp_yt, xt);
563 add_vec(pg, yt, tmp_yt);
567 reduce_vec(pg, a, yt);
578 template<
typename REALTYPE>
582 assert(check_size(w));
584 int Nin = this->nin();
585 int Nex = this->nex();
586 int Nstv = this->nvol() /
VLEN;
589 int ith, nth, is, ns;
605 int Nouter = (Nstv > Nin2) ? Nstv : Nin2;
606 int Ninner2 = (Nstv * Nin2) / Nouter;
607 int Ninner = 2 * Ninner2;
608 set_threadtask(ith, nth, is, ns, Nouter);
610 set_vec(pg, ytr, 0.0);
611 set_vec(pg, yti, 0.0);
613 for (
int ex = 0; ex < Nex; ++ex) {
615 set_vec(pg, tmp_ytr, 0.0);
616 set_vec(pg, tmp_yti, 0.0);
617 for (
int i = is; i < ns; ++i) {
618 real_t *v = &vp[
VLEN * Ninner * (i + Nouter * ex)];
619 real_t *w = &wp[
VLEN * Ninner * (i + Nouter * ex)];
620 svreal_t xtr, xti, vtr, vti, wtr, wti;
621 set_vec(pg, xtr, 0.0);
622 set_vec(pg, xti, 0.0);
623 for (
int in = 0; in < Ninner2; ++in) {
625 int ini = 2 * in + 1;
626 load_vec(pg, wtr, &w[
VLEN * inr]);
627 load_vec(pg, wti, &w[
VLEN * ini]);
628 load_vec(pg, vtr, &v[
VLEN * inr]);
629 load_vec(pg, vti, &v[
VLEN * ini]);
630 add_dot_vec(pg, xtr, vtr, wtr);
631 add_dot_vec(pg, xtr, vti, wti);
632 add_dot_vec(pg, xti, vtr, wti);
633 sub_dot_vec(pg, xti, vti, wtr);
635 add_vec(pg, tmp_ytr, xtr);
636 add_vec(pg, tmp_yti, xti);
639 add_vec(pg, ytr, tmp_ytr);
640 add_vec(pg, yti, tmp_yti);
643 reduce_vec(pg, atr, ytr);
644 reduce_vec(pg, ati, yti);
647 real_t sum[2] = { atr, ati };
655 template<
typename REALTYPE>
658 int Nin = this->nin();
659 int Nex = this->nex();
660 int Nstv = this->nvol() /
VLEN;
666 int Nouter = (Nstv > Nin) ? Nstv : Nin;
667 int Ninner = (Nstv * Nin) / Nouter;
669 int ith, nth, is, ns;
670 set_threadtask(ith, nth, is, ns, Nouter);
675 set_vec(pg, yt, 0.0);
677 for (
int ex = 0; ex < Nex; ++ex) {
679 set_vec(pg, tmp_yt, 0.0);
680 for (
int i = is; i < ns; ++i) {
681 real_t *v = &vp[
VLEN * Ninner * (i + Nouter * ex)];
683 set_vec(pg, xt, 0.0);
684 for (
int in = 0; in < Ninner; ++in) {
685 load_vec(pg, vt, &v[
VLEN * in]);
686 add_norm2_vec(pg, xt, vt);
688 add_vec(pg, tmp_yt, xt);
690 add_vec(pg, yt, tmp_yt);
694 reduce_vec(pg, a, yt);
705 template<
typename REALTYPE>
711 vout.
crucial(
"%s: dotc_and_norm2 for real field is called\n",
716 assert(check_size(w));
718 int Nin = this->nin();
719 int Nex = this->nex();
720 int Nstv = this->nvol() /
VLEN;
738 int Nouter = (Nstv > Nin2) ? Nstv : Nin2;
739 int Ninner2 = (Nstv * Nin2) / Nouter;
740 int Ninner = 2 * Ninner2;
742 int ith, nth, is, ns;
743 set_threadtask(ith, nth, is, ns, Nouter);
748 set_vec(pg, ytr, 0.0);
749 set_vec(pg, yti, 0.0);
750 set_vec(pg, w2, 0.0);
751 set_vec(pg, v2, 0.0);
753 for (
int ex = 0; ex < Nex; ++ex) {
754 svreal_t tmp_ytr, tmp_yti, tmp_vt2, tmp_wt2;
755 set_vec(pg, tmp_ytr, 0.0);
756 set_vec(pg, tmp_yti, 0.0);
757 set_vec(pg, tmp_vt2, 0.0);
758 set_vec(pg, tmp_wt2, 0.0);
759 for (
int iout = is; iout < ns; ++iout) {
760 real_t *v = &vp[
VLEN * Ninner * (iout + Nouter * ex)];
761 real_t *w = &wp[
VLEN * Ninner * (iout + Nouter * ex)];
762 svreal_t xtr, xti, vtr, vti, wtr, wti, vt2, wt2;
763 set_vec(pg, xtr, 0.0);
764 set_vec(pg, xti, 0.0);
765 set_vec(pg, vt2, 0.0);
766 set_vec(pg, wt2, 0.0);
767 for (
int in = 0; in < Ninner2; ++in) {
769 int ini = 2 * in + 1;
770 load_vec(pg, wtr, &w[
VLEN * inr]);
771 load_vec(pg, wti, &w[
VLEN * ini]);
772 load_vec(pg, vtr, &v[
VLEN * inr]);
773 load_vec(pg, vti, &v[
VLEN * ini]);
774 add_dot_vec(pg, xtr, vtr, wtr);
775 add_dot_vec(pg, xtr, vti, wti);
776 add_dot_vec(pg, xti, vtr, wti);
777 sub_dot_vec(pg, xti, vti, wtr);
778 add_norm2_vec(pg, vt2, vtr);
779 add_norm2_vec(pg, vt2, vti);
780 add_norm2_vec(pg, wt2, wtr);
781 add_norm2_vec(pg, wt2, wti);
783 add_vec(pg, tmp_ytr, xtr);
784 add_vec(pg, tmp_yti, xti);
785 add_vec(pg, tmp_vt2, vt2);
786 add_vec(pg, tmp_wt2, wt2);
788 add_vec(pg, ytr, tmp_ytr);
789 add_vec(pg, yti, tmp_yti);
790 add_vec(pg, v2, tmp_vt2);
791 add_vec(pg, w2, tmp_wt2);
794 reduce_vec(pg, atr, ytr);
795 reduce_vec(pg, ati, yti);
796 reduce_vec(pg, w_norm2, w2);
797 reduce_vec(pg,
norm2, v2);