/*!
        @file    $Id:: fopr_Wilson_impl.cpp #$

        @brief

        @author  <Hideo Matsufuru> hideo.matsufuru@kek.jp(matsufuru)
                 $LastChangedBy: sueda $

        @date    $LastChangedDate:: 2013-07-12 16:56:41 #$

        @version $LastChangedRevision: 930 $
*/

#include "fopr_Wilson_impl.h"

using std::valarray;
using std::string;

//====================================================================
namespace {
  // File-local colour-algebra helpers operating on interleaved (re, im)
  // doubles.  Every helper is hard-wired to three colour components: it
  // consumes v[0..5] and six entries of u.
  //
  // mult_uv_*   : u is read contiguously, u[0..5]  (three complex numbers).
  // mult_udagv_*: u is read with a stride of six doubles, u[0],u[1],
  //               u[6],u[7], u[12],u[13], and enters complex-conjugated.
  //
  // The accumulation order is kept strictly left-to-right so that the
  // floating-point result does not depend on how the sum is written.

  //! Real part of sum_c u_c * v_c  (c = 0,1,2; u_c = u[2c] + i u[2c+1]).
  inline double mult_uv_r(double *u, double *v)
  {
    double acc = u[0] * v[0] - u[1] * v[1];

    for (int c = 1; c < 3; ++c) {
      const int re = 2 * c;
      const int im = re + 1;
      acc = acc + u[re] * v[re] - u[im] * v[im];
    }
    return acc;
  }


  //! Imaginary part of sum_c u_c * v_c  (same layout as mult_uv_r).
  inline double mult_uv_i(double *u, double *v)
  {
    double acc = u[0] * v[1] + u[1] * v[0];

    for (int c = 1; c < 3; ++c) {
      const int re = 2 * c;
      const int im = re + 1;
      acc = acc + u[re] * v[im] + u[im] * v[re];
    }
    return acc;
  }


  //! Real part of sum_c conj(u_c) * v_c, with u_c = u[6c] + i u[6c+1].
  inline double mult_udagv_r(double *u, double *v)
  {
    double acc = u[0] * v[0] + u[1] * v[1];

    for (int c = 1; c < 3; ++c) {
      const int ure = 6 * c;      // stride of six doubles between u entries
      const int vre = 2 * c;
      acc = acc + u[ure] * v[vre] + u[ure + 1] * v[vre + 1];
    }
    return acc;
  }


  //! Imaginary part of sum_c conj(u_c) * v_c, with u_c = u[6c] + i u[6c+1].
  inline double mult_udagv_i(double *u, double *v)
  {
    double acc = u[0] * v[1] - u[1] * v[0];

    for (int c = 1; c < 3; ++c) {
      const int ure = 6 * c;      // stride of six doubles between u entries
      const int vre = 2 * c;
      acc = acc + u[ure] * v[vre + 1] - u[ure + 1] * v[vre];
    }
    return acc;
  }
}

//====================================================================
// Store the hopping parameter and the per-direction boundary conditions.
//
//   kappa : hopping parameter; copied to m_kappa (kappa = 0 is allowed).
//   bc    : boundary condition per direction; must hold exactly m_Ndim
//           entries, which are copied to m_boundary.
//
// m_boundary is expected to have been sized in init() already.
void Fopr_Wilson::Fopr_Wilson_impl::set_parameters(const double kappa, const valarray<int> bc)
{
  //- range check
  // NB. kappa = 0 is allowed.
  // The size check has to come first: everything below indexes bc[mu]
  // for mu < m_Ndim, which would read out of bounds for a short bc.
  assert(static_cast<int>(bc.size()) == m_Ndim);

  //- print input parameters
  vout.general(m_vl, "Parameters of Fopr_Wilson_impl:\n");
  vout.general(m_vl, "  kappa  = %8.4f\n", kappa);
  for (int mu = 0; mu < m_Ndim; ++mu) {
    vout.general(m_vl, "  boundary[%d] = %2d\n", mu, bc[mu]);
  }

  //- store values
  m_kappa = kappa;

  // m_boundary.resize(m_Ndim);  // already resized in init.
  for (int mu = 0; mu < m_Ndim; ++mu) {
    m_boundary[mu] = bc[mu];
  }
}


//====================================================================
// One-time setup of the implementation object.
//
//   repr : gamma-matrix representation name; "Dirac" and "Chiral" are the
//          only accepted values, anything else aborts.
//
// Caches lattice sizes from CommonParameters, sizes m_boundary, clears the
// gauge-field pointer, fetches the five gamma matrices for the chosen
// representation, and binds the representation-dependent member-function
// pointers.  m_mult / m_mult_dag stay bound to mult_undef until set_mode().
void Fopr_Wilson::Fopr_Wilson_impl::init(string repr)
{
  //- global parameters
  m_vl   = CommonParameters::Vlevel();
  m_Nvol = CommonParameters::Nvol();
  m_Ndim = CommonParameters::Ndim();

  m_boundary.resize(m_Ndim);
  m_U    = 0;
  m_repr = repr;

  //- gamma matrices: slots 0..3 hold GAMMA1..GAMMA4, slot 4 holds GAMMA5
  m_GM.resize(m_Ndim + 1);

  GammaMatrixSet *gamma_set = GammaMatrixSet::New(m_repr);

  m_GM[0] = gamma_set->get_GM(GammaMatrixSet::GAMMA1);
  m_GM[1] = gamma_set->get_GM(GammaMatrixSet::GAMMA2);
  m_GM[2] = gamma_set->get_GM(GammaMatrixSet::GAMMA3);
  m_GM[3] = gamma_set->get_GM(GammaMatrixSet::GAMMA4);
  m_GM[4] = gamma_set->get_GM(GammaMatrixSet::GAMMA5);

  //- no mode selected yet
  m_mult     = &Fopr_Wilson::Fopr_Wilson_impl::mult_undef;
  m_mult_dag = &Fopr_Wilson::Fopr_Wilson_impl::mult_undef;

  //- reject unknown representations up front
  if ((m_repr != "Dirac") && (m_repr != "Chiral")) {
    vout.crucial(m_vl, "Fopr_Wilson: input repr is undefined.\n");
    abort();
  }

  //- representation-dependent kernels
  if (m_repr == "Dirac") {
    m_D       = &Fopr_Wilson::Fopr_Wilson_impl::D_dirac;
    m_gm5     = &Fopr_Wilson::Fopr_Wilson_impl::gm5_dirac;
    m_mult_tp = &Fopr_Wilson::Fopr_Wilson_impl::mult_tp_dirac;
    m_mult_tm = &Fopr_Wilson::Fopr_Wilson_impl::mult_tm_dirac;
  } else {
    m_D       = &Fopr_Wilson::Fopr_Wilson_impl::D_chiral;
    m_gm5     = &Fopr_Wilson::Fopr_Wilson_impl::gm5_chiral;
    m_mult_tp = &Fopr_Wilson::Fopr_Wilson_impl::mult_tp_chiral;
    m_mult_tm = &Fopr_Wilson::Fopr_Wilson_impl::mult_tm_chiral;
  }

  delete gamma_set;
}


//====================================================================
// Select which operator m_mult / m_mult_dag dispatch to.
//
//   mode : one of "D", "Ddag", "DdagD", "H".  Any other value aborts.
//
// The mode string is recorded in m_mode before it is validated.
void Fopr_Wilson::Fopr_Wilson_impl::set_mode(std::string mode)
{
  m_mode = mode;

  if (m_mode == "D") {
    m_mult     = &Fopr_Wilson::Fopr_Wilson_impl::D;
    m_mult_dag = &Fopr_Wilson::Fopr_Wilson_impl::Ddag;
    return;
  }

  if (m_mode == "Ddag") {
    // the roles of D and Ddag are swapped
    m_mult     = &Fopr_Wilson::Fopr_Wilson_impl::Ddag;
    m_mult_dag = &Fopr_Wilson::Fopr_Wilson_impl::D;
    return;
  }

  if (m_mode == "DdagD") {
    // same kernel for both directions
    m_mult     = &Fopr_Wilson::Fopr_Wilson_impl::DdagD;
    m_mult_dag = &Fopr_Wilson::Fopr_Wilson_impl::DdagD;
    return;
  }

  if (m_mode == "H") {
    // same kernel for both directions
    m_mult     = &Fopr_Wilson::Fopr_Wilson_impl::H;
    m_mult_dag = &Fopr_Wilson::Fopr_Wilson_impl::H;
    return;
  }

  vout.crucial(m_vl, "Fopr_Wilson: input mode is undefined.\n");
  abort();
}


//====================================================================
std::string Fopr_Wilson::Fopr_Wilson_impl::get_mode() const
{
  return m_mode;
}


//====================================================================
// Apply the operator to f and store the result in w, using the
// Dirac-representation kernels for the t direction:
//
//   w = f - kappa * (sum of the eight hopping contributions)
//
// The order of the calls matters: mult_xp() assigns to w, whereas the
// remaining kernels visible in this file (mult_xm, mult_yp, mult_ym,
// mult_zp, mult_zm) add to it, so mult_xp() has to run first.
void Fopr_Wilson::Fopr_Wilson_impl::D_dirac(Field& w, const Field& f)
{
  // start from a cleared accumulator
  w = 0.0;

  // x direction, forward then backward
  mult_xp(w, f);
  mult_xm(w, f);

  // y direction
  mult_yp(w, f);
  mult_ym(w, f);

  // z direction
  mult_zp(w, f);
  mult_zm(w, f);

  // t direction, Dirac-representation kernels
  mult_tp_dirac(w, f);
  mult_tm_dirac(w, f);

  // w <- f - kappa * w
  w *= -m_kappa;
  w += f;
}


//====================================================================
// Apply the operator to f and store the result in w, using the
// chiral-representation kernels for the t direction:
//
//   w = f - kappa * (sum of the eight hopping contributions)
//
// The order of the calls matters: mult_xp() assigns to w, whereas the
// remaining kernels visible in this file (mult_xm, mult_yp, mult_ym,
// mult_zp, mult_zm) add to it, so mult_xp() has to run first.
void Fopr_Wilson::Fopr_Wilson_impl::D_chiral(Field& w, const Field& f)
{
  // Clear the accumulator explicitly, exactly as D_dirac() does, instead
  // of depending on mult_xp() happening to overwrite every element of w.
  w = 0.0;

  // x direction, forward then backward
  mult_xp(w, f);
  mult_xm(w, f);

  // y direction
  mult_yp(w, f);
  mult_ym(w, f);

  // z direction
  mult_zp(w, f);
  mult_zm(w, f);

  // t direction, chiral-representation kernels
  mult_tp_chiral(w, f);
  mult_tm_chiral(w, f);

  // w <- f - kappa * w
  w *= -m_kappa;
  w += f;
}


//====================================================================
// Dispatch to the forward hopping kernel of direction mu.
//
//   mu : 0 -> mult_xp, 1 -> mult_yp, 2 -> mult_zp,
//        3 -> the representation-dependent kernel bound to m_mult_tp.
//        Any other value aborts.
//   w  : output field, handed to the kernel unchanged.
//   f  : input field.
void Fopr_Wilson::Fopr_Wilson_impl::mult_up(int mu, Field& w, const Field& f)
{
  switch (mu) {
  case 0:
    mult_xp(w, f);
    break;

  case 1:
    mult_yp(w, f);
    break;

  case 2:
    mult_zp(w, f);
    break;

  case 3:
    (this->*m_mult_tp)(w, f);
    break;

  default:
    vout.crucial(m_vl, "Fopr_Wilson: illegal parameter mu in mult_up.\n");
    abort();
  }
}


//====================================================================
// Dispatch to the backward hopping kernel of direction mu.
//
//   mu : 0 -> mult_xm, 1 -> mult_ym, 2 -> mult_zm,
//        3 -> the representation-dependent kernel bound to m_mult_tm.
//        Any other value aborts.
//   w  : output field, handed to the kernel unchanged.
//   f  : input field.
void Fopr_Wilson::Fopr_Wilson_impl::mult_dn(int mu, Field& w, const Field& f)
{
  switch (mu) {
  case 0:
    mult_xm(w, f);
    break;

  case 1:
    mult_ym(w, f);
    break;

  case 2:
    mult_zm(w, f);
    break;

  case 3:
    (this->*m_mult_tm)(w, f);
    break;

  default:
    vout.crucial(m_vl, "Fopr_Wilson: illegal parameter mu in mult_dn.\n");
    abort();
  }
}


//====================================================================
// gamma_5 kernel bound to m_gm5 for the "Dirac" representation.
//
// For every site the four spin blocks (Nvc = 2 * Nc doubles each) of f are
// copied to w with blocks 1 <-> 3 and 2 <-> 4 exchanged:
//
//   w_1 = f_3,  w_2 = f_4,  w_3 = f_1,  w_4 = f_2.
void Fopr_Wilson::Fopr_Wilson_impl::gm5_dirac(Field& w, const Field& f)
{
  const int block     = 2 * CommonParameters::Nc();   // doubles per spin block
  const int n_spin    = CommonParameters::Nd();
  const int site_size = block * n_spin;               // doubles per site

  double *src = const_cast<Field *>(&f)->ptr(0);
  double *dst = w.ptr(0);

  for (int site = 0; site < m_Nvol; ++site) {
    double *in  = src + site_size * site;
    double *out = dst + site_size * site;

    for (int icc = 0; icc < block; ++icc) {
      out[icc]             = in[icc + 2 * block];
      out[icc + block]     = in[icc + 3 * block];
      out[icc + 2 * block] = in[icc];
      out[icc + 3 * block] = in[icc + block];
    }
  }
}


//====================================================================
// gamma_5 kernel bound to m_gm5 for the "Chiral" representation.
//
// For every site the four spin blocks (Nvc = 2 * Nc doubles each) of f are
// copied to w, with the sign of blocks 3 and 4 flipped:
//
//   w_1 = f_1,  w_2 = f_2,  w_3 = -f_3,  w_4 = -f_4.
void Fopr_Wilson::Fopr_Wilson_impl::gm5_chiral(Field& w, const Field& f)
{
  const int block     = 2 * CommonParameters::Nc();   // doubles per spin block
  const int n_spin    = CommonParameters::Nd();
  const int site_size = block * n_spin;               // doubles per site

  double *src = const_cast<Field *>(&f)->ptr(0);
  double *dst = w.ptr(0);

  for (int site = 0; site < m_Nvol; ++site) {
    double *in  = src + site_size * site;
    double *out = dst + site_size * site;

    for (int icc = 0; icc < block; ++icc) {
      out[icc]             = in[icc];
      out[icc + block]     = in[icc + block];
      out[icc + 2 * block] = -in[icc + 2 * block];
      out[icc + 3 * block] = -in[icc + 3 * block];
    }
  }
}


//====================================================================
// Apply the forward hopping kernel of direction mu to w, then mult_gm5()
// to the result, and return that as a new field.
//
//   mu : 0, 1, 2 select mult_xp / mult_yp / mult_zp; 3 selects
//        mult_tp_dirac when m_repr is "Dirac" and mult_tp_chiral
//        otherwise.  Any other value aborts.
//   w  : input field; not modified.
//
// The work fields are created with the same (nin, nvol, nex) as w.
const Field_F Fopr_Wilson::Fopr_Wilson_impl::mult_gm5p(int mu, const Field_F& w)
{
  Field vt(w.nin(), w.nvol(), w.nex());

  vt = 0.0;

  // w is passed straight through as a const Field& (as the t-direction
  // branch always did); no temporary copy of the whole field is needed.
  if (mu == 0) {
    mult_xp(vt, w);
  } else if (mu == 1) {
    mult_yp(vt, w);
  } else if (mu == 2) {
    mult_zp(vt, w);
  } else if (mu == 3) {
    if (m_repr == "Dirac") {
      mult_tp_dirac(vt, w);
    } else {
      mult_tp_chiral(vt, w);
    }
  } else {
    // report the reason before aborting, like mult_up() / mult_dn()
    vout.crucial(m_vl, "Fopr_Wilson: illegal parameter mu in mult_gm5p.\n");
    abort();
  }

  //  return (Field_F) mult_gm5(vt);

  Field vt2(w.nin(), w.nvol(), w.nex());
  mult_gm5(vt2, vt);

  return vt2;
}


//====================================================================
// Forward hopping kernel in the x direction (idir = 0).
//
// For every site the neighbour at x+1 is read from f, reduced to two
// colour vectors
//     h1 = f_1 + i f_4,   h2 = f_2 + i f_3      (f_s = spin block s),
// each multiplied by the link matrix stored for the site itself, and the
// result written to the four spin blocks of w as
//     ( U h1,  U h2,  -i U h2,  -i U h1 ).
//
// NOTE(review): this kernel ASSIGNS to w, while mult_xm/yp/ym/zp/zm add to
// it.  D_dirac()/D_chiral() call it first so the sum still comes out right,
// but mult_up(0, ...) therefore overwrites its output -- confirm that this
// asymmetry is intended.
//
// Sites at x = Nx-1 take their neighbour from the x = 0 slice delivered by
// Communicator::exchange(); the sender scales that slice by m_boundary[0]
// when Communicator::ipe(0) == 0.
//
// The helpers mult_uv_r / mult_uv_i read exactly three colour components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_xp(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 0;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  int ix, nn;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  const int Nhalo = Nvc * 2 * Ny * Nz * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack h1, h2 of the x = 0 slice, scaled by the boundary factor
  ix = Nx - 1;
  nn = 0;
  for (int iyzt = 0; iyzt < (Ny * Nz * Nt); iyzt++) {
    int in  = Nvc * ND * (nn + iyzt * Nx);
    int ix1 = Nvc * 2 * iyzt;
    int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ic++) {
      vcp1[2 * ic + ix1]     = bc2 * (v1[2 * ic + id1 + in] - v1[2 * ic + 1 + id4 + in]);
      vcp1[2 * ic + 1 + ix1] = bc2 * (v1[2 * ic + 1 + id1 + in] + v1[2 * ic + id4 + in]);
      vcp1[2 * ic + ix2]     = bc2 * (v1[2 * ic + id2 + in] - v1[2 * ic + 1 + id3 + in]);
      vcp1[2 * ic + 1 + ix2] = bc2 * (v1[2 * ic + 1 + id2 + in] + v1[2 * ic + id3 + in]);
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 0, 1, 1);

  // multiply the received vectors by the links of the x = Nx-1 slice
  ix = Nx - 1;
  nn = 0;
  for (int iyzt = 0; iyzt < (Ny * Nz * Nt); iyzt++) {
    int iv  = Nvc * ND * (ix + iyzt * Nx);
    int ig  = Ndf * (ix + iyzt * Nx + idir * Nst);
    int ix1 = Nvc * 2 * iyzt;
    int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ic++) {
      int ic2 = ic * Nvc;   // start of row ic in the link matrix

      wt1r = mult_uv_r(&u[ic2 + ig], &vcp2[ix1]);
      wt1i = mult_uv_i(&u[ic2 + ig], &vcp2[ix1]);
      wt2r = mult_uv_r(&u[ic2 + ig], &vcp2[ix2]);
      wt2i = mult_uv_i(&u[ic2 + ig], &vcp2[ix2]);

      v2[2 * ic + id1 + iv]     = wt1r;
      v2[2 * ic + 1 + id1 + iv] = wt1i;
      v2[2 * ic + id2 + iv]     = wt2r;
      v2[2 * ic + 1 + id2 + iv] = wt2i;
      v2[2 * ic + id3 + iv]     = wt2i;
      v2[2 * ic + 1 + id3 + iv] = -wt2r;
      v2[2 * ic + id4 + iv]     = wt1i;
      v2[2 * ic + 1 + id4 + iv] = -wt1r;
    }
  }

  //- bulk part
  for (int iyzt = 0; iyzt < (Ny * Nz * Nt); iyzt++) {
    for (int ix = 0; ix < (Nx - 1); ix++) {
      nn = ix + 1;

      int iv = Nvc * ND * (ix + iyzt * Nx);
      int in = Nvc * ND * (nn + iyzt * Nx);
      int ig = Ndf * (ix + iyzt * Nx + idir * Nst);

      for (int ic = 0; ic < Ncol; ic++) {
        vt1[2 * ic]     = v1[2 * ic + id1 + in] - v1[2 * ic + 1 + id4 + in];
        vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] + v1[2 * ic + id4 + in];
        vt2[2 * ic]     = v1[2 * ic + id2 + in] - v1[2 * ic + 1 + id3 + in];
        vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] + v1[2 * ic + id3 + in];
      }

      for (int ic = 0; ic < Ncol; ic++) {
        int ic2 = ic * Nvc;   // start of row ic in the link matrix

        wt1r = mult_uv_r(&u[ic2 + ig], vt1);
        wt1i = mult_uv_i(&u[ic2 + ig], vt1);
        wt2r = mult_uv_r(&u[ic2 + ig], vt2);
        wt2i = mult_uv_i(&u[ic2 + ig], vt2);

        v2[2 * ic + id1 + iv]     = wt1r;
        v2[2 * ic + 1 + id1 + iv] = wt1i;
        v2[2 * ic + id2 + iv]     = wt2r;
        v2[2 * ic + 1 + id2 + iv] = wt2i;
        v2[2 * ic + id3 + iv]     = wt2i;
        v2[2 * ic + 1 + id3 + iv] = -wt2r;
        v2[2 * ic + id4 + iv]     = wt1i;
        v2[2 * ic + 1 + id4 + iv] = -wt1r;
      }
    }
  }
}


//====================================================================
// Backward hopping kernel in the x direction (idir = 0); ADDS to w.
//
// For every site the neighbour at x-1 is read from f, reduced to
//     h1 = f_1 - i f_4,   h2 = f_2 - i f_3      (f_s = spin block s),
// each multiplied by the conjugate-transposed link stored at the
// neighbour site, and accumulated into the four spin blocks of w as
//     ( U^+ h1,  U^+ h2,  +i U^+ h2,  +i U^+ h1 ).
//
// Sites at x = 0 receive the already multiplied vectors of the x = Nx-1
// slice through Communicator::exchange(); the receiver scales them by
// m_boundary[0] when Communicator::ipe(0) == 0.
//
// The helpers mult_udagv_r / mult_udagv_i read exactly three colour
// components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_xm(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 0;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  int ix, nn;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  const int Nhalo = Nvc * 2 * Ny * Nz * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack U^+ h1, U^+ h2 of the x = Nx-1 slice
  ix = 0;
  nn = Nx - 1;
  for (int iyzt = 0; iyzt < (Ny * Nz * Nt); iyzt++) {
    int in  = Nvc * ND * (nn + iyzt * Nx);
    int ig  = Ndf * (nn + iyzt * Nx + idir * Nst);
    int ix1 = Nvc * 2 * iyzt;
    int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ic++) {
      vt1[2 * ic]     = v1[2 * ic + id1 + in] + v1[2 * ic + 1 + id4 + in];
      vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + id4 + in];

      vt2[2 * ic]     = v1[2 * ic + id2 + in] + v1[2 * ic + 1 + id3 + in];
      vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] - v1[2 * ic + id3 + in];
    }

    for (int ic = 0; ic < Ncol; ic++) {
      int icr = 2 * ic;   // start of column ic in the link matrix
      vcp1[icr + ix1]     = mult_udagv_r(&u[icr + ig], vt1);
      vcp1[icr + 1 + ix1] = mult_udagv_i(&u[icr + ig], vt1);
      vcp1[icr + ix2]     = mult_udagv_r(&u[icr + ig], vt2);
      vcp1[icr + 1 + ix2] = mult_udagv_i(&u[icr + ig], vt2);
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 0, -1, 2);

  // accumulate the received vectors into the x = 0 slice
  ix = 0;
  nn = Nx - 1;
  for (int iyzt = 0; iyzt < (Ny * Nz * Nt); iyzt++) {
    int iv  = Nvc * ND * (ix + iyzt * Nx);
    int ix1 = Nvc * 2 * iyzt;
    int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ic++) {
      int icr = 2 * ic;
      int ici = 2 * ic + 1;
      v2[icr + id1 + iv] += bc2 * vcp2[icr + ix1];
      v2[ici + id1 + iv] += bc2 * vcp2[ici + ix1];
      v2[icr + id2 + iv] += bc2 * vcp2[icr + ix2];
      v2[ici + id2 + iv] += bc2 * vcp2[ici + ix2];
      v2[icr + id3 + iv] += -bc2 * vcp2[ici + ix2];
      v2[ici + id3 + iv] += +bc2 * vcp2[icr + ix2];
      v2[icr + id4 + iv] += -bc2 * vcp2[ici + ix1];
      v2[ici + id4 + iv] += +bc2 * vcp2[icr + ix1];
    }
  }

  //- bulk part
  for (int iyzt = 0; iyzt < Ny * Nz * Nt; iyzt++) {
    for (int ix = 1; ix < Nx; ix++) {
      nn = ix - 1;

      int iv = Nvc * ND * (ix + (iyzt) * Nx);
      int in = Nvc * ND * (nn + (iyzt) * Nx);
      int ig = Ndf * (nn + iyzt * Nx + idir * Nst);

      for (int ic = 0; ic < Ncol; ic++) {
        vt1[2 * ic]     = v1[2 * ic + id1 + in] + v1[2 * ic + 1 + id4 + in];
        vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + id4 + in];

        vt2[2 * ic]     = v1[2 * ic + id2 + in] + v1[2 * ic + 1 + id3 + in];
        vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] - v1[2 * ic + id3 + in];
      }

      for (int ic = 0; ic < Ncol; ic++) {
        int icr = 2 * ic;   // start of column ic in the link matrix
        int ici = 2 * ic + 1;

        wt1r = mult_udagv_r(&u[icr + ig], vt1);
        wt1i = mult_udagv_i(&u[icr + ig], vt1);
        wt2r = mult_udagv_r(&u[icr + ig], vt2);
        wt2i = mult_udagv_i(&u[icr + ig], vt2);

        v2[icr + id1 + iv] += wt1r;
        v2[ici + id1 + iv] += wt1i;
        v2[icr + id2 + iv] += wt2r;
        v2[ici + id2 + iv] += wt2i;
        v2[icr + id3 + iv] += -wt2i;
        v2[ici + id3 + iv] += +wt2r;
        v2[icr + id4 + iv] += -wt1i;
        v2[ici + id4 + iv] += +wt1r;
      }
    }
  }
}


//====================================================================
// Forward hopping kernel in the y direction (idir = 1); ADDS to w.
//
// For every site the neighbour at y+1 is read from f, reduced to
//     h1 = f_1 + f_4,   h2 = f_2 - f_3          (f_s = spin block s),
// each multiplied by the link matrix stored for the site itself, and
// accumulated into the four spin blocks of w as
//     ( U h1,  U h2,  -U h2,  U h1 ).
//
// Sites at y = Ny-1 take their neighbour from the y = 0 slice delivered by
// Communicator::exchange(); the sender scales that slice by m_boundary[1]
// when Communicator::ipe(1) == 0.
//
// The helpers mult_uv_r / mult_uv_i read exactly three colour components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_yp(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 1;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  int iy, nn;

  const int Nhalo = Nvc * 2 * Nx * Nz * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack h1, h2 of the y = 0 slice, scaled by the boundary factor
  iy = Ny - 1;
  nn = 0;
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int ix = 0; ix < Nx; ix++) {
      int in  = Nvc * ND * (ix + nn * Nx + izt * Nx * Ny);
      int ix1 = Nvc * 2 * (ix + izt * Nx);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        vcp1[2 * ic + ix1]     = bc2 * (v1[2 * ic + id1 + in] + v1[2 * ic + id4 + in]);
        vcp1[2 * ic + 1 + ix1] = bc2 * (v1[2 * ic + 1 + id1 + in] + v1[2 * ic + 1 + id4 + in]);
        vcp1[2 * ic + ix2]     = bc2 * (v1[2 * ic + id2 + in] - v1[2 * ic + id3 + in]);
        vcp1[2 * ic + 1 + ix2] = bc2 * (v1[2 * ic + 1 + id2 + in] - v1[2 * ic + 1 + id3 + in]);
      }
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 1, 1, 3);

  // multiply the received vectors by the links of the y = Ny-1 slice
  iy = Ny - 1;
  nn = 0;
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int ix = 0; ix < Nx; ix++) {
      int iv  = Nvc * ND * (ix + iy * Nx + izt * Nx * Ny);
      int ig  = Ndf * (ix + iy * Nx + izt * Nx * Ny + idir * Nst);
      int ix1 = Nvc * 2 * (ix + izt * Nx);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        int ic2 = ic * Nvc;   // start of row ic in the link matrix

        wt1r = mult_uv_r(&u[ic2 + ig], &vcp2[ix1]);
        wt1i = mult_uv_i(&u[ic2 + ig], &vcp2[ix1]);
        wt2r = mult_uv_r(&u[ic2 + ig], &vcp2[ix2]);
        wt2i = mult_uv_i(&u[ic2 + ig], &vcp2[ix2]);

        v2[2 * ic + id1 + iv]     += wt1r;
        v2[2 * ic + 1 + id1 + iv] += wt1i;
        v2[2 * ic + id2 + iv]     += wt2r;
        v2[2 * ic + 1 + id2 + iv] += wt2i;
        v2[2 * ic + id3 + iv]     += -wt2r;
        v2[2 * ic + 1 + id3 + iv] += -wt2i;
        v2[2 * ic + id4 + iv]     += wt1r;
        v2[2 * ic + 1 + id4 + iv] += wt1i;
      }
    }
  }

  //- bulk part
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int iy = 0; iy < (Ny - 1); iy++) {
      int nn = iy + 1;
      for (int ix = 0; ix < Nx; ix++) {
        int iv = Nvc * ND * (ix + iy * Nx + izt * Nx * Ny);
        int in = Nvc * ND * (ix + nn * Nx + izt * Nx * Ny);
        int ig = Ndf * (ix + iy * Nx + izt * Nx * Ny + idir * Nst);

        for (int ic = 0; ic < Ncol; ic++) {
          vt1[2 * ic]     = v1[2 * ic + id1 + in] + v1[2 * ic + id4 + in];
          vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] + v1[2 * ic + 1 + id4 + in];
          vt2[2 * ic]     = v1[2 * ic + id2 + in] - v1[2 * ic + id3 + in];
          vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] - v1[2 * ic + 1 + id3 + in];
        }

        for (int ic = 0; ic < Ncol; ic++) {
          int ic2 = ic * Nvc;   // start of row ic in the link matrix

          wt1r = mult_uv_r(&u[ic2 + ig], vt1);
          wt1i = mult_uv_i(&u[ic2 + ig], vt1);
          wt2r = mult_uv_r(&u[ic2 + ig], vt2);
          wt2i = mult_uv_i(&u[ic2 + ig], vt2);

          v2[2 * ic + id1 + iv]     += wt1r;
          v2[2 * ic + 1 + id1 + iv] += wt1i;
          v2[2 * ic + id2 + iv]     += wt2r;
          v2[2 * ic + 1 + id2 + iv] += wt2i;
          v2[2 * ic + id3 + iv]     += -wt2r;
          v2[2 * ic + 1 + id3 + iv] += -wt2i;
          v2[2 * ic + id4 + iv]     += wt1r;
          v2[2 * ic + 1 + id4 + iv] += wt1i;
        }
      }
    }
  }
}


//====================================================================
// Backward hopping kernel in the y direction (idir = 1); ADDS to w.
//
// For every site the neighbour at y-1 is read from f, reduced to
//     h1 = f_1 - f_4,   h2 = f_2 + f_3          (f_s = spin block s),
// each multiplied by the conjugate-transposed link stored at the
// neighbour site, and accumulated into the four spin blocks of w as
//     ( U^+ h1,  U^+ h2,  U^+ h2,  -U^+ h1 ).
//
// Sites at y = 0 receive the already multiplied vectors of the y = Ny-1
// slice through Communicator::exchange(); the receiver scales them by
// m_boundary[1] when Communicator::ipe(1) == 0.
//
// The helpers mult_udagv_r / mult_udagv_i read exactly three colour
// components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_ym(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 1;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  int iy, nn;

  const int Nhalo = Nvc * 2 * Nx * Nz * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack U^+ h1, U^+ h2 of the y = Ny-1 slice
  iy = 0;
  nn = Ny - 1;
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int ix = 0; ix < Nx; ix++) {
      int in  = Nvc * ND * (ix + nn * Nx + izt * Nx * Ny);
      int ig  = Ndf * (ix + nn * Nx + izt * Nx * Ny + idir * Nst);
      int ix1 = Nvc * 2 * (ix + izt * Nx);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        vt1[2 * ic]     = v1[2 * ic + id1 + in] - v1[2 * ic + id4 + in];
        vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + 1 + id4 + in];

        vt2[2 * ic]     = v1[2 * ic + id2 + in] + v1[2 * ic + id3 + in];
        vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] + v1[2 * ic + 1 + id3 + in];
      }

      for (int ic = 0; ic < Ncol; ic++) {
        int icr = 2 * ic;   // start of column ic in the link matrix
        int ici = 2 * ic + 1;
        vcp1[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1);
        vcp1[ici + ix1] = mult_udagv_i(&u[icr + ig], vt1);
        vcp1[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2);
        vcp1[ici + ix2] = mult_udagv_i(&u[icr + ig], vt2);
      }
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 1, -1, 4);

  // accumulate the received vectors into the y = 0 slice
  iy = 0;
  nn = Ny - 1;
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int ix = 0; ix < Nx; ix++) {
      int iv  = Nvc * ND * (ix + iy * Nx + izt * Nx * Ny);
      int ix1 = Nvc * 2 * (ix + izt * Nx);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        int icr = 2 * ic;
        int ici = 2 * ic + 1;
        v2[icr + id1 + iv] += bc2 * vcp2[icr + ix1];
        v2[ici + id1 + iv] += bc2 * vcp2[ici + ix1];
        v2[icr + id2 + iv] += bc2 * vcp2[icr + ix2];
        v2[ici + id2 + iv] += bc2 * vcp2[ici + ix2];
        v2[icr + id3 + iv] += bc2 * vcp2[icr + ix2];
        v2[ici + id3 + iv] += bc2 * vcp2[ici + ix2];
        v2[icr + id4 + iv] += -bc2 * vcp2[icr + ix1];
        v2[ici + id4 + iv] += -bc2 * vcp2[ici + ix1];
      }
    }
  }

  //- bulk part
  for (int izt = 0; izt < (Nz * Nt); izt++) {
    for (int iy = 1; iy < Ny; iy++) {
      int nn = iy - 1;
      for (int ix = 0; ix < Nx; ix++) {
        int iv = Nvc * ND * (ix + iy * Nx + izt * Nx * Ny);
        int in = Nvc * ND * (ix + nn * Nx + izt * Nx * Ny);
        int ig = Ndf * (ix + nn * Nx + izt * Nx * Ny + idir * Nst);

        for (int ic = 0; ic < Ncol; ic++) {
          vt1[2 * ic]     = v1[2 * ic + id1 + in] - v1[2 * ic + id4 + in];
          vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + 1 + id4 + in];

          vt2[2 * ic]     = v1[2 * ic + id2 + in] + v1[2 * ic + id3 + in];
          vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] + v1[2 * ic + 1 + id3 + in];
        }

        for (int ic = 0; ic < Ncol; ic++) {
          int icr = 2 * ic;   // start of column ic in the link matrix
          int ici = 2 * ic + 1;

          wt1r = mult_udagv_r(&u[icr + ig], vt1);
          wt1i = mult_udagv_i(&u[icr + ig], vt1);
          wt2r = mult_udagv_r(&u[icr + ig], vt2);
          wt2i = mult_udagv_i(&u[icr + ig], vt2);

          v2[icr + id1 + iv] += wt1r;
          v2[ici + id1 + iv] += wt1i;
          v2[icr + id2 + iv] += wt2r;
          v2[ici + id2 + iv] += wt2i;
          v2[icr + id3 + iv] += wt2r;
          v2[ici + id3 + iv] += wt2i;
          v2[icr + id4 + iv] += -wt1r;
          v2[ici + id4 + iv] += -wt1i;
        }
      }
    }
  }
}


//====================================================================
// Forward hopping kernel in the z direction (idir = 2); ADDS to w.
//
// For every site the neighbour at z+1 is read from f, reduced to
//     h1 = f_1 + i f_3,   h2 = f_2 - i f_4      (f_s = spin block s),
// each multiplied by the link matrix stored for the site itself, and
// accumulated into the four spin blocks of w as
//     ( U h1,  U h2,  -i U h1,  +i U h2 ).
//
// Sites at z = Nz-1 take their neighbour from the z = 0 slice delivered by
// Communicator::exchange(); the sender scales that slice by m_boundary[2]
// when Communicator::ipe(2) == 0.
//
// The helpers mult_uv_r / mult_uv_i read exactly three colour components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_zp(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 2;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  int iz, nn;

  const int Nhalo = Nvc * 2 * Nx * Ny * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack h1, h2 of the z = 0 slice, scaled by the boundary factor
  iz = Nz - 1;
  nn = 0;
  for (int it = 0; it < Nt; it++) {
    for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
      int in  = Nvc * ND * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz);
      int ix1 = Nvc * 2 * (ixy + it * Nx * Ny);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        vcp1[2 * ic + ix1]     = bc2 * (v1[2 * ic + id1 + in] - v1[2 * ic + 1 + id3 + in]);
        vcp1[2 * ic + 1 + ix1] = bc2 * (v1[2 * ic + 1 + id1 + in] + v1[2 * ic + id3 + in]);
        vcp1[2 * ic + ix2]     = bc2 * (v1[2 * ic + id2 + in] + v1[2 * ic + 1 + id4 + in]);
        vcp1[2 * ic + 1 + ix2] = bc2 * (v1[2 * ic + 1 + id2 + in] - v1[2 * ic + id4 + in]);
      }
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 2, 1, 5);

  // multiply the received vectors by the links of the z = Nz-1 slice
  iz = Nz - 1;
  nn = 0;
  for (int it = 0; it < Nt; it++) {
    for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
      int iv  = Nvc * ND * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz);
      int ig  = Ndf * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz + idir * Nst);
      int ix1 = Nvc * 2 * (ixy + it * Nx * Ny);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        int ic2 = ic * Nvc;   // start of row ic in the link matrix

        wt1r = mult_uv_r(&u[ic2 + ig], &vcp2[ix1]);
        wt1i = mult_uv_i(&u[ic2 + ig], &vcp2[ix1]);
        wt2r = mult_uv_r(&u[ic2 + ig], &vcp2[ix2]);
        wt2i = mult_uv_i(&u[ic2 + ig], &vcp2[ix2]);

        v2[2 * ic + id1 + iv]     += wt1r;
        v2[2 * ic + 1 + id1 + iv] += wt1i;
        v2[2 * ic + id2 + iv]     += wt2r;
        v2[2 * ic + 1 + id2 + iv] += wt2i;
        v2[2 * ic + id3 + iv]     += wt1i;
        v2[2 * ic + 1 + id3 + iv] += -wt1r;
        v2[2 * ic + id4 + iv]     += -wt2i;
        v2[2 * ic + 1 + id4 + iv] += wt2r;
      }
    }
  }

  //- bulk part
  for (int it = 0; it < Nt; it++) {
    for (int iz = 0; iz < (Nz - 1); iz++) {
      int nn = iz + 1;
      for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
        int iv = Nvc * ND * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz);
        int in = Nvc * ND * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz);
        int ig = Ndf * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz + idir * Nst);

        for (int ic = 0; ic < Ncol; ic++) {
          vt1[2 * ic]     = v1[2 * ic + id1 + in] - v1[2 * ic + 1 + id3 + in];
          vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] + v1[2 * ic + id3 + in];
          vt2[2 * ic]     = v1[2 * ic + id2 + in] + v1[2 * ic + 1 + id4 + in];
          vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] - v1[2 * ic + id4 + in];
        }

        for (int ic = 0; ic < Ncol; ic++) {
          int ic2 = ic * Nvc;   // start of row ic in the link matrix

          wt1r = mult_uv_r(&u[ic2 + ig], vt1);
          wt1i = mult_uv_i(&u[ic2 + ig], vt1);
          wt2r = mult_uv_r(&u[ic2 + ig], vt2);
          wt2i = mult_uv_i(&u[ic2 + ig], vt2);

          v2[2 * ic + id1 + iv]     += wt1r;
          v2[2 * ic + 1 + id1 + iv] += wt1i;
          v2[2 * ic + id2 + iv]     += wt2r;
          v2[2 * ic + 1 + id2 + iv] += wt2i;
          v2[2 * ic + id3 + iv]     += wt1i;
          v2[2 * ic + 1 + id3 + iv] += -wt1r;
          v2[2 * ic + id4 + iv]     += -wt2i;
          v2[2 * ic + 1 + id4 + iv] += wt2r;
        }
      }
    }
  }
}


//====================================================================
// Backward hopping kernel in the z direction (idir = 2); ADDS to w.
//
// For every site the neighbour at z-1 is read from f, reduced to
//     h1 = f_1 - i f_3,   h2 = f_2 + i f_4      (f_s = spin block s),
// each multiplied by the conjugate-transposed link stored at the
// neighbour site, and accumulated into the four spin blocks of w as
//     ( U^+ h1,  U^+ h2,  +i U^+ h1,  -i U^+ h2 ).
//
// Sites at z = 0 receive the already multiplied vectors of the z = Nz-1
// slice through Communicator::exchange(); the receiver scales them by
// m_boundary[2] when Communicator::ipe(2) == 0.
//
// The helpers mult_udagv_r / mult_udagv_i read exactly three colour
// components.
void Fopr_Wilson::Fopr_Wilson_impl::mult_zm(Field& w, const Field& f)
{
  int Ncol = CommonParameters::Nc();
  int Nvc  = 2 * Ncol;
  int Ndf  = 2 * Ncol * Ncol;
  int ND   = CommonParameters::Nd();
  int Nx   = CommonParameters::Nx();
  int Ny   = CommonParameters::Ny();
  int Nz   = CommonParameters::Nz();
  int Nt   = CommonParameters::Nt();
  int Nst  = Nx * Ny * Nz * Nt;

  double *v1;
  double *v2;
  double *u;

  v2 = w.ptr(0);
  v1 = const_cast<Field *>(&f)->ptr(0);
  u  = const_cast<Field_G *>(m_U)->ptr(0);

  int idir = 2;

  // offsets of the four spin blocks inside one site
  int id1 = 0;
  int id2 = Nvc;
  int id3 = Nvc * 2;
  int id4 = Nvc * 3;

  // Work space lives on the heap: runtime-sized stack arrays are not
  // standard C++ and the halo buffers grow with the local face volume.
  std::valarray<double> vt1_store(Nvc), vt2_store(Nvc);
  double *vt1 = &vt1_store[0];
  double *vt2 = &vt2_store[0];

  double wt1r, wt1i, wt2r, wt2i;
  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  int iz, nn;

  const int Nhalo = Nvc * 2 * Nx * Ny * Nt;   // two colour vectors per face site
  std::valarray<double> vcp1_store(Nhalo), vcp2_store(Nhalo);
  double *vcp1 = &vcp1_store[0];              // send buffer
  double *vcp2 = &vcp2_store[0];              // receive buffer

  //- boundary part
  // pack U^+ h1, U^+ h2 of the z = Nz-1 slice
  iz = 0;
  nn = Nz - 1;
  for (int it = 0; it < Nt; it++) {
    for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
      int in  = Nvc * ND * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz);
      int ig  = Ndf * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz + idir * Nst);
      int ix1 = Nvc * 2 * (ixy + it * Nx * Ny);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        vt1[2 * ic]     = v1[2 * ic + id1 + in] + v1[2 * ic + 1 + id3 + in];
        vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + id3 + in];
        vt2[2 * ic]     = v1[2 * ic + id2 + in] - v1[2 * ic + 1 + id4 + in];
        vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] + v1[2 * ic + id4 + in];
      }

      for (int ic = 0; ic < Ncol; ic++) {
        int icr = 2 * ic;   // start of column ic in the link matrix
        int ici = 2 * ic + 1;
        vcp1[icr + ix1] = mult_udagv_r(&u[icr + ig], vt1);
        vcp1[ici + ix1] = mult_udagv_i(&u[icr + ig], vt1);
        vcp1[icr + ix2] = mult_udagv_r(&u[icr + ig], vt2);
        vcp1[ici + ix2] = mult_udagv_i(&u[icr + ig], vt2);
      }
    }
  }

  // presumably (size, recv, send, direction, sense, tag) -- see Communicator
  Communicator::exchange(Nhalo, vcp2, vcp1, 2, -1, 6);

  // accumulate the received vectors into the z = 0 slice
  iz = 0;
  nn = Nz - 1;
  for (int it = 0; it < Nt; it++) {
    for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
      int iv  = Nvc * ND * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz);
      int ix1 = Nvc * 2 * (ixy + it * Nx * Ny);
      int ix2 = ix1 + Nvc;

      for (int ic = 0; ic < Ncol; ic++) {
        int icr = 2 * ic;
        int ici = 2 * ic + 1;
        v2[icr + id1 + iv] += bc2 * vcp2[icr + ix1];
        v2[ici + id1 + iv] += bc2 * vcp2[ici + ix1];
        v2[icr + id2 + iv] += bc2 * vcp2[icr + ix2];
        v2[ici + id2 + iv] += bc2 * vcp2[ici + ix2];
        v2[icr + id3 + iv] += -bc2 * vcp2[ici + ix1];
        v2[ici + id3 + iv] += bc2 * vcp2[icr + ix1];
        v2[icr + id4 + iv] += bc2 * vcp2[ici + ix2];
        v2[ici + id4 + iv] += -bc2 * vcp2[icr + ix2];
      }
    }
  }

  //- bulk part
  for (int it = 0; it < Nt; it++) {
    for (int iz = 1; iz < Nz; iz++) {
      int nn = iz - 1;
      for (int ixy = 0; ixy < (Nx * Ny); ixy++) {
        int iv = Nvc * ND * (ixy + iz * Nx * Ny + it * Nx * Ny * Nz);
        int in = Nvc * ND * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz);
        int ig = Ndf * (ixy + nn * Nx * Ny + it * Nx * Ny * Nz + idir * Nst);

        for (int ic = 0; ic < Ncol; ic++) {
          vt1[2 * ic]     = v1[2 * ic + id1 + in] + v1[2 * ic + 1 + id3 + in];
          vt1[2 * ic + 1] = v1[2 * ic + 1 + id1 + in] - v1[2 * ic + id3 + in];
          vt2[2 * ic]     = v1[2 * ic + id2 + in] - v1[2 * ic + 1 + id4 + in];
          vt2[2 * ic + 1] = v1[2 * ic + 1 + id2 + in] + v1[2 * ic + id4 + in];
        }

        for (int ic = 0; ic < Ncol; ic++) {
          int icr = 2 * ic;   // start of column ic in the link matrix
          int ici = 2 * ic + 1;

          wt1r = mult_udagv_r(&u[icr + ig], vt1);
          wt1i = mult_udagv_i(&u[icr + ig], vt1);
          wt2r = mult_udagv_r(&u[icr + ig], vt2);
          wt2i = mult_udagv_i(&u[icr + ig], vt2);

          v2[icr + id1 + iv] += wt1r;
          v2[ici + id1 + iv] += wt1i;
          v2[icr + id2 + iv] += wt2r;
          v2[ici + id2 + iv] += wt2i;
          v2[icr + id3 + iv] += -wt1i;
          v2[ici + id3 + iv] += wt1r;
          v2[icr + id4 + iv] += wt2i;
          v2[ici + id4 + iv] += -wt2r;
        }
      }
    }
  }
}


//====================================================================
/*!
    Accumulates (+=) into w the direction-3 hopping term that reads f at
    the neighbouring site it+1 and multiplies it by the link variable
    stored at the local site (mult_uv_r / mult_uv_i).

    Only spinor components 3 and 4 of f are read, scaled by 2.0, and only
    components 3 and 4 of w are updated.  (Presumably this is the
    projected hop of the Dirac representation -- confirm against the
    gamma-matrix convention.)

    The last time slice (it = Nt-1) takes its neighbour from the buffer
    returned by Communicator::exchange(); the it = 0 slice of f is packed
    into the send buffer, scaled by bc2.  bc2 is m_boundary[3] when
    Communicator::ipe(3) == 0 and 1.0 otherwise.

    The work buffers are std::valarray objects rather than runtime-sized
    automatic arrays: the latter are a compiler extension in C++ and the
    communication buffers grow with Nx*Ny*Nz.

    NB. the file-local helpers mult_uv_r / mult_uv_i read exactly six
    doubles per colour vector, i.e. they hard-code three colours.

    @param w  field that receives the contribution (accumulated in place).
    @param f  input field (read only).
*/
void Fopr_Wilson::Fopr_Wilson_impl::mult_tp_dirac(Field& w, const Field& f)
{
  const int Ncol = CommonParameters::Nc();
  const int Nvc  = 2 * Ncol;
  const int Ndf  = 2 * Ncol * Ncol;
  const int ND   = CommonParameters::Nd();
  const int Nx   = CommonParameters::Nx();
  const int Ny   = CommonParameters::Ny();
  const int Nz   = CommonParameters::Nz();
  const int Nt   = CommonParameters::Nt();
  const int Nst  = Nx * Ny * Nz * Nt;
  const int Nxyz = Nx * Ny * Nz;

  double *v2 = w.ptr(0);
  double *v1 = const_cast<Field *>(&f)->ptr(0);
  double *u  = const_cast<Field_G *>(m_U)->ptr(0);

  const int idir = 3;

  //- offsets of the spinor components touched by this term
  const int id3 = Nvc * 2;
  const int id4 = Nvc * 3;

  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  //- heap-backed work space (replaces variable-length arrays)
  const int             Nbuf = Nvc * 2 * Nxyz;
  std::valarray<double> send_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> recv_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> work1(static_cast<std::size_t>(Nvc));
  std::valarray<double> work2(static_cast<std::size_t>(Nvc));

  double *vcp1 = &send_buf[0];
  double *vcp2 = &recv_buf[0];
  double *vt1  = &work1[0];
  double *vt2  = &work2[0];

  //- boundary part: pack the it = 0 slice of f
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int in  = Nvc * ND * ixyz;
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      vcp1[ivc + ix1] = 2.0 * bc2 * v1[ivc + id3 + in];
      vcp1[ivc + ix2] = 2.0 * bc2 * v1[ivc + id4 + in];
    }
  }

  Communicator::exchange(Nbuf, vcp2, vcp1, 3, 1, 7);

  //- boundary part: apply the received slice at it = Nt-1
  const int it_bnd = Nt - 1;
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int iv  = Nvc * ND * (ixyz + it_bnd * Nxyz);
    const int ig  = Ndf * (ixyz + it_bnd * Nxyz + idir * Nst);
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ++ic) {
      double *urow = &u[ic * Nvc + ig];

      const double wt1r = mult_uv_r(urow, &vcp2[ix1]);
      const double wt1i = mult_uv_i(urow, &vcp2[ix1]);
      const double wt2r = mult_uv_r(urow, &vcp2[ix2]);
      const double wt2i = mult_uv_i(urow, &vcp2[ix2]);

      v2[2 * ic + id3 + iv]     += wt1r;
      v2[2 * ic + 1 + id3 + iv] += wt1i;
      v2[2 * ic + id4 + iv]     += wt2r;
      v2[2 * ic + 1 + id4 + iv] += wt2i;
    }
  }

  //- bulk part: neighbour it+1 is local
  for (int it = 0; it < (Nt - 1); ++it) {
    const int nn = it + 1;
    for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
      const int iv = Nvc * ND * (ixyz + it * Nxyz);
      const int in = Nvc * ND * (ixyz + nn * Nxyz);
      const int ig = Ndf * (ixyz + it * Nxyz + idir * Nst);

      for (int ivc = 0; ivc < Nvc; ++ivc) {
        vt1[ivc] = 2.0 * v1[ivc + id3 + in];
        vt2[ivc] = 2.0 * v1[ivc + id4 + in];
      }

      for (int ic = 0; ic < Ncol; ++ic) {
        double *urow = &u[ic * Nvc + ig];

        const double wt1r = mult_uv_r(urow, vt1);
        const double wt1i = mult_uv_i(urow, vt1);
        const double wt2r = mult_uv_r(urow, vt2);
        const double wt2i = mult_uv_i(urow, vt2);

        v2[2 * ic + id3 + iv]     += wt1r;
        v2[2 * ic + 1 + id3 + iv] += wt1i;
        v2[2 * ic + id4 + iv]     += wt2r;
        v2[2 * ic + 1 + id4 + iv] += wt2i;
      }
    }
  }
}


//====================================================================
/*!
    Accumulates (+=) into w the direction-3 hopping term that reads f at
    the neighbouring site it+1 and multiplies it by the link variable
    stored at the local site (mult_uv_r / mult_uv_i).

    Two colour vectors are formed from f, (component 1 + component 3) and
    (component 2 + component 4); after the link multiplication the first
    is added to components 1 and 3 of w and the second to components 2
    and 4.  (Presumably this is the projected hop of the chiral
    representation -- confirm against the gamma-matrix convention.)

    The last time slice (it = Nt-1) takes its neighbour from the buffer
    returned by Communicator::exchange(); the it = 0 slice of f is packed
    into the send buffer, scaled by bc2.  bc2 is m_boundary[3] when
    Communicator::ipe(3) == 0 and 1.0 otherwise.

    The work buffers are std::valarray objects rather than runtime-sized
    automatic arrays: the latter are a compiler extension in C++ and the
    communication buffers grow with Nx*Ny*Nz.

    NB. the file-local helpers mult_uv_r / mult_uv_i read exactly six
    doubles per colour vector, i.e. they hard-code three colours.

    @param w  field that receives the contribution (accumulated in place).
    @param f  input field (read only).
*/
void Fopr_Wilson::Fopr_Wilson_impl::mult_tp_chiral(Field& w, const Field& f)
{
  const int Ncol = CommonParameters::Nc();
  const int Nvc  = 2 * Ncol;
  const int Ndf  = 2 * Ncol * Ncol;
  const int ND   = CommonParameters::Nd();
  const int Nx   = CommonParameters::Nx();
  const int Ny   = CommonParameters::Ny();
  const int Nz   = CommonParameters::Nz();
  const int Nt   = CommonParameters::Nt();
  const int Nst  = Nx * Ny * Nz * Nt;
  const int Nxyz = Nx * Ny * Nz;

  double *v2 = w.ptr(0);
  double *v1 = const_cast<Field *>(&f)->ptr(0);
  double *u  = const_cast<Field_G *>(m_U)->ptr(0);

  const int idir = 3;

  //- offsets of the four spinor components
  const int id1 = 0;
  const int id2 = Nvc;
  const int id3 = Nvc * 2;
  const int id4 = Nvc * 3;

  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  //- heap-backed work space (replaces variable-length arrays)
  const int             Nbuf = Nvc * 2 * Nxyz;
  std::valarray<double> send_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> recv_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> work1(static_cast<std::size_t>(Nvc));
  std::valarray<double> work2(static_cast<std::size_t>(Nvc));

  double *vcp1 = &send_buf[0];
  double *vcp2 = &recv_buf[0];
  double *vt1  = &work1[0];
  double *vt2  = &work2[0];

  //- boundary part: pack the it = 0 slice of f
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int in  = Nvc * ND * ixyz;
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      vcp1[ivc + ix1] = bc2 * (v1[ivc + id1 + in] + v1[ivc + id3 + in]);
      vcp1[ivc + ix2] = bc2 * (v1[ivc + id2 + in] + v1[ivc + id4 + in]);
    }
  }

  Communicator::exchange(Nbuf, vcp2, vcp1, 3, 1, 7);

  //- boundary part: apply the received slice at it = Nt-1
  const int it_bnd = Nt - 1;
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int iv  = Nvc * ND * (ixyz + it_bnd * Nxyz);
    const int ig  = Ndf * (ixyz + it_bnd * Nxyz + idir * Nst);
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ic = 0; ic < Ncol; ++ic) {
      double *urow = &u[ic * Nvc + ig];

      const double wt1r = mult_uv_r(urow, &vcp2[ix1]);
      const double wt1i = mult_uv_i(urow, &vcp2[ix1]);
      const double wt2r = mult_uv_r(urow, &vcp2[ix2]);
      const double wt2i = mult_uv_i(urow, &vcp2[ix2]);

      v2[2 * ic + id1 + iv]     += wt1r;
      v2[2 * ic + 1 + id1 + iv] += wt1i;
      v2[2 * ic + id2 + iv]     += wt2r;
      v2[2 * ic + 1 + id2 + iv] += wt2i;
      v2[2 * ic + id3 + iv]     += wt1r;
      v2[2 * ic + 1 + id3 + iv] += wt1i;
      v2[2 * ic + id4 + iv]     += wt2r;
      v2[2 * ic + 1 + id4 + iv] += wt2i;
    }
  }

  //- bulk part: neighbour it+1 is local
  for (int it = 0; it < (Nt - 1); ++it) {
    const int nn = it + 1;
    for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
      const int iv = Nvc * ND * (ixyz + it * Nxyz);
      const int in = Nvc * ND * (ixyz + nn * Nxyz);
      const int ig = Ndf * (ixyz + it * Nxyz + idir * Nst);

      for (int ivc = 0; ivc < Nvc; ++ivc) {
        vt1[ivc] = v1[ivc + id1 + in] + v1[ivc + id3 + in];
        vt2[ivc] = v1[ivc + id2 + in] + v1[ivc + id4 + in];
      }

      for (int ic = 0; ic < Ncol; ++ic) {
        double *urow = &u[ic * Nvc + ig];

        const double wt1r = mult_uv_r(urow, vt1);
        const double wt1i = mult_uv_i(urow, vt1);
        const double wt2r = mult_uv_r(urow, vt2);
        const double wt2i = mult_uv_i(urow, vt2);

        v2[2 * ic + id1 + iv]     += wt1r;
        v2[2 * ic + 1 + id1 + iv] += wt1i;
        v2[2 * ic + id2 + iv]     += wt2r;
        v2[2 * ic + 1 + id2 + iv] += wt2i;
        v2[2 * ic + id3 + iv]     += wt1r;
        v2[2 * ic + 1 + id3 + iv] += wt1i;
        v2[2 * ic + id4 + iv]     += wt2r;
        v2[2 * ic + 1 + id4 + iv] += wt2i;
      }
    }
  }
}


//====================================================================
/*!
    Accumulates (+=) into w the direction-3 hopping term that reads f at
    the neighbouring site it-1 and multiplies it by the link variable
    stored at that neighbouring site, applied through mult_udagv_r /
    mult_udagv_i.

    Only spinor components 1 and 2 of f are read, scaled by 2.0, and only
    components 1 and 2 of w are updated.  (Presumably this is the
    projected hop of the Dirac representation -- confirm against the
    gamma-matrix convention.)

    For the first time slice (it = 0) the product is formed on the
    it = Nt-1 slice, passed through Communicator::exchange(), and the
    received values are added scaled by bc2.  bc2 is m_boundary[3] when
    Communicator::ipe(3) == 0 and 1.0 otherwise.

    The work buffers are std::valarray objects rather than runtime-sized
    automatic arrays: the latter are a compiler extension in C++ and the
    communication buffers grow with Nx*Ny*Nz.

    NB. the file-local helpers mult_udagv_r / mult_udagv_i use fixed
    offsets 0, 6 and 12 into the link data, i.e. they hard-code three
    colours.

    @param w  field that receives the contribution (accumulated in place).
    @param f  input field (read only).
*/
void Fopr_Wilson::Fopr_Wilson_impl::mult_tm_dirac(Field& w, const Field& f)
{
  const int Ncol = CommonParameters::Nc();
  const int Nvc  = 2 * Ncol;
  const int Ndf  = 2 * Ncol * Ncol;
  const int ND   = CommonParameters::Nd();
  const int Nx   = CommonParameters::Nx();
  const int Ny   = CommonParameters::Ny();
  const int Nz   = CommonParameters::Nz();
  const int Nt   = CommonParameters::Nt();
  const int Nst  = Nx * Ny * Nz * Nt;
  const int Nxyz = Nx * Ny * Nz;

  double *v2 = w.ptr(0);
  double *v1 = const_cast<Field *>(&f)->ptr(0);
  double *u  = const_cast<Field_G *>(m_U)->ptr(0);

  const int idir = 3;

  //- offsets of the spinor components touched by this term
  const int id1 = 0;
  const int id2 = Nvc;

  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  //- heap-backed work space (replaces variable-length arrays)
  const int             Nbuf = Nvc * 2 * Nxyz;
  std::valarray<double> send_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> recv_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> work1(static_cast<std::size_t>(Nvc));
  std::valarray<double> work2(static_cast<std::size_t>(Nvc));

  double *vcp1 = &send_buf[0];
  double *vcp2 = &recv_buf[0];
  double *vt1  = &work1[0];
  double *vt2  = &work2[0];

  //- boundary part: form the product on the it = Nt-1 slice and pack it
  const int nn_bnd = Nt - 1;
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int in  = Nvc * ND * (ixyz + nn_bnd * Nxyz);
    const int ig  = Ndf * (ixyz + nn_bnd * Nxyz + idir * Nst);
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      vt1[ivc] = 2.0 * v1[ivc + id1 + in];
      vt2[ivc] = 2.0 * v1[ivc + id2 + in];
    }

    for (int ic = 0; ic < Ncol; ++ic) {
      const int icr  = 2 * ic;
      const int ici  = 2 * ic + 1;
      double    *ucol = &u[icr + ig];

      vcp1[icr + ix1] = mult_udagv_r(ucol, vt1);
      vcp1[ici + ix1] = mult_udagv_i(ucol, vt1);
      vcp1[icr + ix2] = mult_udagv_r(ucol, vt2);
      vcp1[ici + ix2] = mult_udagv_i(ucol, vt2);
    }
  }

  Communicator::exchange(Nbuf, vcp2, vcp1, 3, -1, 8);

  //- boundary part: add the received slice at it = 0
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int iv  = Nvc * ND * ixyz;
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      v2[ivc + id1 + iv] += bc2 * vcp2[ivc + ix1];
      v2[ivc + id2 + iv] += bc2 * vcp2[ivc + ix2];
    }
  }

  //- bulk part: neighbour it-1 is local
  for (int it = 1; it < Nt; ++it) {
    const int nn = it - 1;
    for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
      const int iv = Nvc * ND * (ixyz + it * Nxyz);
      const int in = Nvc * ND * (ixyz + nn * Nxyz);
      const int ig = Ndf * (ixyz + nn * Nxyz + idir * Nst);

      for (int ivc = 0; ivc < Nvc; ++ivc) {
        vt1[ivc] = 2.0 * v1[ivc + id1 + in];
        vt2[ivc] = 2.0 * v1[ivc + id2 + in];
      }

      for (int ic = 0; ic < Ncol; ++ic) {
        const int icr  = 2 * ic;
        const int ici  = 2 * ic + 1;
        double    *ucol = &u[icr + ig];

        const double wt1r = mult_udagv_r(ucol, vt1);
        const double wt1i = mult_udagv_i(ucol, vt1);
        const double wt2r = mult_udagv_r(ucol, vt2);
        const double wt2i = mult_udagv_i(ucol, vt2);

        v2[icr + id1 + iv] += wt1r;
        v2[ici + id1 + iv] += wt1i;
        v2[icr + id2 + iv] += wt2r;
        v2[ici + id2 + iv] += wt2i;
      }
    }
  }
}


//====================================================================
/*!
    Accumulates into w the direction-3 hopping term that reads f at the
    neighbouring site it-1 and multiplies it by the link variable stored
    at that neighbouring site, applied through mult_udagv_r /
    mult_udagv_i.

    Two colour vectors are formed from f, (component 1 - component 3) and
    (component 2 - component 4); after the link multiplication the first
    is added to component 1 and subtracted from component 3 of w, the
    second is added to component 2 and subtracted from component 4.
    (Presumably this is the projected hop of the chiral representation --
    confirm against the gamma-matrix convention.)

    For the first time slice (it = 0) the product is formed on the
    it = Nt-1 slice, passed through Communicator::exchange(), and the
    received values are applied scaled by bc2.  bc2 is m_boundary[3] when
    Communicator::ipe(3) == 0 and 1.0 otherwise.

    The work buffers are std::valarray objects rather than runtime-sized
    automatic arrays: the latter are a compiler extension in C++ and the
    communication buffers grow with Nx*Ny*Nz.

    NB. the file-local helpers mult_udagv_r / mult_udagv_i use fixed
    offsets 0, 6 and 12 into the link data, i.e. they hard-code three
    colours.

    @param w  field that receives the contribution (updated in place).
    @param f  input field (read only).
*/
void Fopr_Wilson::Fopr_Wilson_impl::mult_tm_chiral(Field& w, const Field& f)
{
  const int Ncol = CommonParameters::Nc();
  const int Nvc  = 2 * Ncol;
  const int Ndf  = 2 * Ncol * Ncol;
  const int ND   = CommonParameters::Nd();
  const int Nx   = CommonParameters::Nx();
  const int Ny   = CommonParameters::Ny();
  const int Nz   = CommonParameters::Nz();
  const int Nt   = CommonParameters::Nt();
  const int Nst  = Nx * Ny * Nz * Nt;
  const int Nxyz = Nx * Ny * Nz;

  double *v2 = w.ptr(0);
  double *v1 = const_cast<Field *>(&f)->ptr(0);
  double *u  = const_cast<Field_G *>(m_U)->ptr(0);

  const int idir = 3;

  //- offsets of the four spinor components
  const int id1 = 0;
  const int id2 = Nvc;
  const int id3 = Nvc * 2;
  const int id4 = Nvc * 3;

  double bc2 = 1.0;
  if (Communicator::ipe(idir) == 0) bc2 = m_boundary[idir];

  //- heap-backed work space (replaces variable-length arrays)
  const int             Nbuf = Nvc * 2 * Nxyz;
  std::valarray<double> send_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> recv_buf(static_cast<std::size_t>(Nbuf));
  std::valarray<double> work1(static_cast<std::size_t>(Nvc));
  std::valarray<double> work2(static_cast<std::size_t>(Nvc));

  double *vcp1 = &send_buf[0];
  double *vcp2 = &recv_buf[0];
  double *vt1  = &work1[0];
  double *vt2  = &work2[0];

  //- boundary part: form the product on the it = Nt-1 slice and pack it
  const int nn_bnd = Nt - 1;
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int in  = Nvc * ND * (ixyz + nn_bnd * Nxyz);
    const int ig  = Ndf * (ixyz + nn_bnd * Nxyz + idir * Nst);
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      vt1[ivc] = v1[ivc + id1 + in] - v1[ivc + id3 + in];
      vt2[ivc] = v1[ivc + id2 + in] - v1[ivc + id4 + in];
    }

    for (int ic = 0; ic < Ncol; ++ic) {
      const int icr  = 2 * ic;
      const int ici  = 2 * ic + 1;
      double    *ucol = &u[icr + ig];

      vcp1[icr + ix1] = mult_udagv_r(ucol, vt1);
      vcp1[ici + ix1] = mult_udagv_i(ucol, vt1);
      vcp1[icr + ix2] = mult_udagv_r(ucol, vt2);
      vcp1[ici + ix2] = mult_udagv_i(ucol, vt2);
    }
  }

  Communicator::exchange(Nbuf, vcp2, vcp1, 3, -1, 8);

  //- boundary part: apply the received slice at it = 0
  for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
    const int iv  = Nvc * ND * ixyz;
    const int ix1 = Nvc * 2 * ixyz;
    const int ix2 = ix1 + Nvc;

    for (int ivc = 0; ivc < Nvc; ++ivc) {
      v2[ivc + id1 + iv] += bc2 * vcp2[ivc + ix1];
      v2[ivc + id2 + iv] += bc2 * vcp2[ivc + ix2];
      v2[ivc + id3 + iv] -= bc2 * vcp2[ivc + ix1];
      v2[ivc + id4 + iv] -= bc2 * vcp2[ivc + ix2];
    }
  }

  //- bulk part: neighbour it-1 is local
  for (int it = 1; it < Nt; ++it) {
    const int nn = it - 1;
    for (int ixyz = 0; ixyz < Nxyz; ++ixyz) {
      const int iv = Nvc * ND * (ixyz + it * Nxyz);
      const int in = Nvc * ND * (ixyz + nn * Nxyz);
      const int ig = Ndf * (ixyz + nn * Nxyz + idir * Nst);

      for (int ivc = 0; ivc < Nvc; ++ivc) {
        vt1[ivc] = v1[ivc + id1 + in] - v1[ivc + id3 + in];
        vt2[ivc] = v1[ivc + id2 + in] - v1[ivc + id4 + in];
      }

      for (int ic = 0; ic < Ncol; ++ic) {
        const int icr  = 2 * ic;
        const int ici  = 2 * ic + 1;
        double    *ucol = &u[icr + ig];

        const double wt1r = mult_udagv_r(ucol, vt1);
        const double wt1i = mult_udagv_i(ucol, vt1);
        const double wt2r = mult_udagv_r(ucol, vt2);
        const double wt2i = mult_udagv_i(ucol, vt2);

        v2[icr + id1 + iv] += wt1r;
        v2[ici + id1 + iv] += wt1i;
        v2[icr + id2 + iv] += wt2r;
        v2[ici + id2 + iv] += wt2i;
        v2[icr + id3 + iv] -= wt1r;
        v2[ici + id3 + iv] -= wt1i;
        v2[icr + id4 + iv] -= wt2r;
        v2[ici + id4 + iv] -= wt2i;
      }
    }
  }
}


//====================================================================
//============================================================END=====
