16 #if defined USE_GROUP_SU3
18 #elif defined USE_GROUP_SU2
20 #elif defined USE_GROUP_SU_N
25 #ifdef USE_FACTORY_AUTOREGISTER
27 bool init = Fopr_Staggered::register_factory();
174 const std::vector<int> bc)
176 assert(bc.size() ==
m_Ndim);
190 for (
int mu = 0; mu <
m_Ndim; ++mu) {
198 for (
int mu = 0; mu <
m_Ndim; ++mu) {
229 for (
int t = 0; t < Nt; ++t) {
230 int t2 = t + ipet * Nt;
231 for (
int z = 0; z < Nz; ++z) {
232 int z2 = z + ipez * Nz;
233 for (
int y = 0; y < Ny; ++y) {
234 int y2 = y + ipey * Ny;
235 for (
int x = 0; x < Nx; ++x) {
236 int x2 = x + ipex * Nx;
237 int is = idx_lex.
site(x, y, z, t);
246 if (((x2 + y2 + z2 + t2) % 2) == 1) {
253 if (((x2 + y2) % 2) == 1) {
256 if (((x2 + y2 + z2) % 2) == 1) {
287 assert(v.
nvol() == Nvol);
289 for (
int ex = 0; ex < Nex; ++ex) {
290 for (
int site = 0; site < Nvol; ++site) {
291 for (
int in = 0; in < Nin; ++in) {
293 v.
set(in, site, ex, vt);
331 int ith, nth, is, ns;
332 set_threadtask(ith, nth, is, ns,
m_Nvol);
336 for (
int mu = 0; mu <
m_Ndim; ++mu) {
337 for (
int site = is; site < ns; ++site) {
339 for (
int cc = 0; cc <
m_Ndf; ++cc) {
340 double ut = ph * U->
cmp(cc, site, mu);
341 m_U.
set(cc, site, mu, ut);
354 if (ith == 0)
m_mode = mode;
364 }
else if (
m_mode ==
"Ddag") {
366 }
else if (
m_mode ==
"DdagD") {
368 }
else if (
m_mode ==
"H") {
382 }
else if (
m_mode ==
"Ddag") {
384 }
else if (
m_mode ==
"DdagD") {
386 }
else if (
m_mode ==
"H") {
492 int ith, nth, is, ns;
493 set_threadtask(ith, nth, is, ns,
m_Nvol);
497 for (
int ex = 0; ex < Nex; ++ex) {
498 for (
int site = is; site < ns; ++site) {
500 for (
int in = 0; in < Nin; ++in) {
501 double vt = ph * w.
cmp(in, site, ex);
502 v.
set(in, site, ex, vt);
519 int ith, nth, is, ns;
520 set_threadtask(ith, nth, is, ns,
m_Nvol);
524 for (
int ex = 0; ex < Nex; ++ex) {
525 for (
int site = is; site < ns; ++site) {
527 for (
int in = 0; in < Nin; ++in) {
528 double vt = ph * v.
cmp(in, site, ex);
529 v.
set(in, site, ex, vt);
546 double *vp = v.
ptr(0);
547 const double *wp = w.
ptr(0);
550 int ith, nth, is, ns;
551 set_threadtask(ith, nth, is, ns,
m_Nvol);
555 for (
int site = is; site < ns; ++site) {
556 int ix = site %
m_Nx;
557 int iyzt = site /
m_Nx;
559 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
573 for (
int site = is; site < ns; ++site) {
574 int ix = site %
m_Nx;
575 int iyzt = site /
m_Nx;
576 int nei = ix + 1 +
m_Nx * iyzt;
579 for (
int ic = 0; ic <
m_Nc; ++ic) {
580 int ic2 = ic *
m_Nvc;
584 vp[2 * ic +
m_Nvc * site] += wtr;
585 vp[2 * ic + 1 +
m_Nvc * site] += wti;
588 for (
int ic = 0; ic <
m_Nc; ++ic) {
589 int ic2 = ic *
m_Nvc;
593 vp[2 * ic +
m_Nvc * site] += wtr;
594 vp[2 * ic + 1 +
m_Nvc * site] += wti;
611 double *vp = v.
ptr(0);
612 const double *wp = w.
ptr(0);
615 int ith, nth, is, ns;
616 set_threadtask(ith, nth, is, ns,
m_Nvol);
620 for (
int site = is; site < ns; ++site) {
621 int ix = site %
m_Nx;
622 int iyzt = site /
m_Nx;
623 if (ix ==
m_Nx - 1) {
624 for (
int ic = 0; ic <
m_Nc; ++ic) {
627 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
628 wti = mult_udagv_i(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
643 for (
int site = is; site < ns; ++site) {
644 int ix = site %
m_Nx;
645 int iyzt = site /
m_Nx;
646 int nei = ix - 1 +
m_Nx * iyzt;
649 for (
int ic = 0; ic <
m_Nc; ++ic) {
652 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
653 wti = mult_udagv_i(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
654 vp[2 * ic +
m_Nvc * site] -= wtr;
655 vp[2 * ic + 1 +
m_Nvc * site] -= wti;
658 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
676 double *vp = v.
ptr(0);
677 const double *wp = w.
ptr(0);
680 int ith, nth, is, ns;
681 set_threadtask(ith, nth, is, ns,
m_Nvol);
685 for (
int site = is; site < ns; ++site) {
686 int ix = site %
m_Nx;
687 int iyzt = site /
m_Nx;
688 int iy = iyzt %
m_Ny;
689 int izt = iyzt /
m_Ny;
690 int ixzt = ix +
m_Nx * izt;
692 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
706 for (
int site = is; site < ns; ++site) {
707 int ix = site %
m_Nx;
708 int iyzt = site /
m_Nx;
709 int iy = iyzt %
m_Ny;
710 int izt = iyzt /
m_Ny;
711 int ixzt = ix +
m_Nx * izt;
712 int nei = ix +
m_Nx * (iy + 1 +
m_Ny * izt);
715 for (
int ic = 0; ic <
m_Nc; ++ic) {
716 int ic2 = ic *
m_Nvc;
720 vp[2 * ic +
m_Nvc * site] += wtr;
721 vp[2 * ic + 1 +
m_Nvc * site] += wti;
724 for (
int ic = 0; ic <
m_Nc; ++ic) {
725 int ic2 = ic *
m_Nvc;
729 vp[2 * ic +
m_Nvc * site] += wtr;
730 vp[2 * ic + 1 +
m_Nvc * site] += wti;
747 double *vp = v.
ptr(0);
748 const double *wp = w.
ptr(0);
751 int ith, nth, is, ns;
752 set_threadtask(ith, nth, is, ns,
m_Nvol);
756 for (
int site = is; site < ns; ++site) {
757 int ix = site %
m_Nx;
758 int iyzt = site /
m_Nx;
759 int iy = iyzt %
m_Ny;
760 int izt = iyzt /
m_Ny;
761 int ixzt = ix +
m_Nx * izt;
762 if (iy ==
m_Ny - 1) {
763 for (
int ic = 0; ic <
m_Nc; ++ic) {
766 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
767 wti = mult_udagv_i(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
782 for (
int site = is; site < ns; ++site) {
783 int ix = site %
m_Nx;
784 int iyzt = site /
m_Nx;
785 int iy = iyzt %
m_Ny;
786 int izt = iyzt /
m_Ny;
787 int ixzt = ix +
m_Nx * izt;
788 int nei = ix +
m_Nx * (iy - 1 +
m_Ny * izt);
791 for (
int ic = 0; ic <
m_Nc; ++ic) {
794 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
795 wti = mult_udagv_i(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
796 vp[2 * ic +
m_Nvc * site] -= wtr;
797 vp[2 * ic + 1 +
m_Nvc * site] -= wti;
800 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
818 double *vp = v.
ptr(0);
819 const double *wp = w.
ptr(0);
822 int ith, nth, is, ns;
823 set_threadtask(ith, nth, is, ns,
m_Nvol);
829 for (
int site = is; site < ns; ++site) {
830 int ixy = site % Nxy;
831 int izt = site / Nxy;
834 int ixyt = ixy + Nxy * it;
836 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
850 for (
int site = is; site < ns; ++site) {
851 int ixy = site % Nxy;
852 int izt = site / Nxy;
855 int ixyt = ixy + Nxy * it;
856 int nei = ixy + Nxy * (iz + 1 +
m_Nz * it);
859 for (
int ic = 0; ic <
m_Nc; ++ic) {
860 int ic2 = ic *
m_Nvc;
864 vp[2 * ic +
m_Nvc * site] += wtr;
865 vp[2 * ic + 1 +
m_Nvc * site] += wti;
868 for (
int ic = 0; ic <
m_Nc; ++ic) {
869 int ic2 = ic *
m_Nvc;
873 vp[2 * ic +
m_Nvc * site] += wtr;
874 vp[2 * ic + 1 +
m_Nvc * site] += wti;
891 double *vp = v.
ptr(0);
892 const double *wp = w.
ptr(0);
895 int ith, nth, is, ns;
896 set_threadtask(ith, nth, is, ns,
m_Nvol);
902 for (
int site = is; site < ns; ++site) {
903 int ixy = site % Nxy;
904 int izt = site / Nxy;
907 int ixyt = ixy + Nxy * it;
908 if (iz ==
m_Nz - 1) {
909 for (
int ic = 0; ic <
m_Nc; ++ic) {
912 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
913 wti = mult_udagv_i(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
928 for (
int site = is; site < ns; ++site) {
929 int ixy = site % Nxy;
930 int izt = site / Nxy;
933 int ixyt = ixy + Nxy * it;
934 int nei = ixy + Nxy * (iz - 1 +
m_Nz * it);
937 for (
int ic = 0; ic <
m_Nc; ++ic) {
940 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
941 wti = mult_udagv_i(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
942 vp[2 * ic +
m_Nvc * site] -= wtr;
943 vp[2 * ic + 1 +
m_Nvc * site] -= wti;
946 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
964 double *vp = v.
ptr(0);
965 const double *wp = w.
ptr(0);
968 int ith, nth, is, ns;
969 set_threadtask(ith, nth, is, ns,
m_Nvol);
975 for (
int site = is; site < ns; ++site) {
976 int ixyz = site % Nxyz;
977 int it = site / Nxyz;
979 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
993 for (
int site = is; site < ns; ++site) {
994 int ixyz = site % Nxyz;
995 int it = site / Nxyz;
996 int nei = ixyz + Nxyz * (it + 1);
999 for (
int ic = 0; ic <
m_Nc; ++ic) {
1000 int ic2 = ic *
m_Nvc;
1002 wtr = mult_uv_r(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * nei],
m_Nc);
1003 wti = mult_uv_i(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * nei],
m_Nc);
1004 vp[2 * ic +
m_Nvc * site] += wtr;
1005 vp[2 * ic + 1 +
m_Nvc * site] += wti;
1008 for (
int ic = 0; ic <
m_Nc; ++ic) {
1009 int ic2 = ic *
m_Nvc;
1013 vp[2 * ic +
m_Nvc * site] += wtr;
1014 vp[2 * ic + 1 +
m_Nvc * site] += wti;
1031 double *vp = v.
ptr(0);
1032 const double *wp = w.
ptr(0);
1035 int ith, nth, is, ns;
1036 set_threadtask(ith, nth, is, ns,
m_Nvol);
1042 for (
int site = is; site < ns; ++site) {
1043 int ixyz = site % Nxyz;
1044 int it = site / Nxyz;
1045 if (it ==
m_Nt - 1) {
1046 for (
int ic = 0; ic <
m_Nc; ++ic) {
1049 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
1050 wti = mult_udagv_i(&up[ic2 +
m_Ndf * site], &wp[
m_Nvc * site],
m_Nc);
1065 for (
int site = is; site < ns; ++site) {
1066 int ixyz = site % Nxyz;
1067 int it = site / Nxyz;
1068 int nei = ixyz + Nxyz * (it - 1);
1071 for (
int ic = 0; ic <
m_Nc; ++ic) {
1074 wtr = mult_udagv_r(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
1075 wti = mult_udagv_i(&up[ic2 +
m_Ndf * nei], &wp[
m_Nvc * nei],
m_Nc);
1076 vp[2 * ic +
m_Nvc * site] -= wtr;
1077 vp[2 * ic + 1 +
m_Nvc * site] -= wti;
1080 for (
int ivc = 0; ivc <
m_Nvc; ++ivc) {
1100 double flop_vol = double(Nvol) * double(NPE);
1102 double gflop = double(flop_site) * flop_vol * 1.0e-9;
1104 if ((mode ==
"DdagD") || (mode ==
"DDdag")) {