Unverified Commit 12b958fd authored by Andreas Marek's avatar Andreas Marek
Browse files

Fortran interfaces for the C/C++ SSE/AVX/AVX2 kernels

In order to increase type safty all ELPA2 kernels provide now
an interface. The interfaces for the C/C++ kernels are automatically
generated during the configure step
parent 16ed9380
...@@ -12,6 +12,7 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO ...@@ -12,6 +12,7 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/mod_mpi.F90 \ src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \ src/mod_mpi_stubs.F90 \
src/elpa2_kernels/mod_fortran_interfaces.F90 \
src/elpa_utilities.F90 \ src/elpa_utilities.F90 \
src/elpa1_compute.F90 \ src/elpa1_compute.F90 \
src/elpa1.F90 \ src/elpa1.F90 \
......
...@@ -852,6 +852,10 @@ echo "Generating elpa/elpa_generated.h..." ...@@ -852,6 +852,10 @@ echo "Generating elpa/elpa_generated.h..."
mkdir -p elpa mkdir -p elpa
grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_generated.h || exit 1 grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_generated.h || exit 1
echo "Generating Fortran interfaces for C kernels"
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.c | sed 's/^ *!f>//;' > elpa/elpa_generated_fortran_interfaces.h || exit 1
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.cpp | sed 's/^ *!f>//;' >> elpa/elpa_generated_fortran_interfaces.h || exit 1
echo "Generating test/shared_sources/generated.h..." echo "Generating test/shared_sources/generated.h..."
mkdir -p test/shared_sources mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1 grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
......
...@@ -89,57 +89,20 @@ static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<doubl ...@@ -89,57 +89,20 @@ static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<doubl
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0 /*
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) !f>#ifdef HAVE_AVX
{ !f> interface
std::complex<double> x0; !f> subroutine single_hh_trafo_complex_avx_avx2_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_avx_avx2_1hv")
std::complex<double> x1; !f> use, intrinsic :: iso_c_binding
std::complex<double> x2; !f> integer(kind=c_int) :: pnb, pnq, pldq
std::complex<double> x3; !f> complex(kind=c_double) :: q(*)
std::complex<double> h0; !f> complex(kind=c_double) :: hh(pnb,2)
std::complex<double> tau0; !f> end subroutine
int i=0; !f> end interface
!f>#endif
x0 = q[0]; */
x1 = q[1];
x2 = q[2]; void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
x3 = q[3];
for (i = 1; i < nb; i++)
{
h0 = conj(hh[i]);
x0 += (q[(i*ldq)+0] * h0);
x1 += (q[(i*ldq)+1] * h0);
x2 += (q[(i*ldq)+2] * h0);
x3 += (q[(i*ldq)+3] * h0);
}
tau0 = hh[0];
h0 = (-1.0)*tau0;
x0 *= h0;
x1 *= h0;
x2 *= h0;
x3 *= h0;
q[0] += x0;
q[1] += x1;
q[2] += x2;
q[3] += x3;
for (i = 1; i < nb; i++)
{
h0 = hh[i];
q[(i*ldq)+0] += (x0*h0);
q[(i*ldq)+1] += (x1*h0);
q[(i*ldq)+2] += (x2*h0);
q[(i*ldq)+3] += (x3*h0);
}
}
#endif // if 0
void single_hh_trafo_complex_avx_avx2_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -90,105 +90,20 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double> ...@@ -90,105 +90,20 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0 /*
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) !f>#ifdef HAVE_AVX
{ !f> interface
std::complex<double> x1; !f> subroutine double_hh_trafo_complex_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_avx_avx2_2hv")
std::complex<double> x2; !f> use, intrinsic :: iso_c_binding
std::complex<double> x3; !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
std::complex<double> x4; !f> complex(kind=c_double) :: q(*)
std::complex<double> y1; !f> complex(kind=c_double) :: hh(pnb,2)
std::complex<double> y2; !f> end subroutine
std::complex<double> y3; !f> end interface
std::complex<double> y4; !f>#endif
std::complex<double> h1; */
std::complex<double> h2;
std::complex<double> tau1; void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
std::complex<double> tau2;
int i=0;
x1 = q[ldq+0];
x2 = q[ldq+1];
x3 = q[ldq+2];
x4 = q[ldq+3];
h2 = conj(hh[ldh+1]);
y1 = q[0] + (x1*h2);
y2 = q[1] + (x2*h2);
y3 = q[2] + (x3*h2);
y4 = q[3] + (x4*h2);
for (i = 2; i < nb; i++)
{
h1 = conj(hh[i-1]);
h2 = conj(hh[ldh+i]);
x1 += (q[(i*ldq)+0] * h1);
y1 += (q[(i*ldq)+0] * h2);
x2 += (q[(i*ldq)+1] * h1);
y2 += (q[(i*ldq)+1] * h2);
x3 += (q[(i*ldq)+2] * h1);
y3 += (q[(i*ldq)+2] * h2);
x4 += (q[(i*ldq)+3] * h1);
y4 += (q[(i*ldq)+3] * h2);
}
h1 = conj(hh[nb-1]);
x1 += (q[(nb*ldq)+0] * h1);
x2 += (q[(nb*ldq)+1] * h1);
x3 += (q[(nb*ldq)+2] * h1);
x4 += (q[(nb*ldq)+3] * h1);
tau1 = hh[0];
tau2 = hh[ldh];
h1 = (-1.0)*tau1;
x1 *= h1;
x2 *= h1;
x3 *= h1;
x4 *= h1;
h1 = (-1.0)*tau2;
h2 = (-1.0)*tau2;
h2 *= s;
y1 = y1*h1 +x1*h2;
y2 = y2*h1 +x2*h2;
y3 = y3*h1 +x3*h2;
y4 = y4*h1 +x4*h2;
q[0] += y1;
q[1] += y2;
q[2] += y3;
q[3] += y4;
h2 = hh[ldh+1];
q[ldq+0] += (x1 + (y1*h2));
q[ldq+1] += (x2 + (y2*h2));
q[ldq+2] += (x3 + (y3*h2));
q[ldq+3] += (x4 + (y4*h2));
for (i = 2; i < nb; i++)
{
h1 = hh[i-1];
h2 = hh[ldh+i];
q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
}
h1 = hh[nb-1];
q[(nb*ldq)+0] += (x1*h1);
q[(nb*ldq)+1] += (x2*h1);
q[(nb*ldq)+2] += (x3*h1);
q[(nb*ldq)+3] += (x4*h1);
}
#endif
void double_hh_trafo_complex_avx_avx2_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -79,57 +79,20 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double> ...@@ -79,57 +79,20 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0 /*
static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) !f>#ifdef HAVE_SSE
{ !f> interface
std::complex<double> x0; !f> subroutine single_hh_trafo_complex_sse_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_sse_1hv")
std::complex<double> x1; !f> use, intrinsic :: iso_c_binding
std::complex<double> x2; !f> integer(kind=c_int) :: pnb, pnq, pldq
std::complex<double> x3; !f> complex(kind=c_double) :: q(*)
std::complex<double> h0; !f> complex(kind=c_double) :: hh(pnb,2)
std::complex<double> tau0; !f> end subroutine
int i=0; !f> end interface
!f>#endif
x0 = q[0]; */
x1 = q[1];
x2 = q[2]; void single_hh_trafo_complex_sse_1hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
x3 = q[3];
for (i = 1; i < nb; i++)
{
h0 = conj(hh[i]);
x0 += (q[(i*ldq)+0] * h0);
x1 += (q[(i*ldq)+1] * h0);
x2 += (q[(i*ldq)+2] * h0);
x3 += (q[(i*ldq)+3] * h0);
}
tau0 = hh[0];
h0 = (-1.0)*tau0;
x0 *= h0;
x1 *= h0;
x2 *= h0;
x3 *= h0;
q[0] += x0;
q[1] += x1;
q[2] += x2;
q[3] += x3;
for (i = 1; i < nb; i++)
{
h0 = hh[i];
q[(i*ldq)+0] += (x0*h0);
q[(i*ldq)+1] += (x1*h0);
q[(i*ldq)+2] += (x2*h0);
q[(i*ldq)+3] += (x3*h0);
}
}
#endif // if 0
void single_hh_trafo_complex_sse_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -78,105 +78,20 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double> ...@@ -78,105 +78,20 @@ static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0 /*
static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) !f>#ifdef HAVE_SSE
{ !f> interface
std::complex<double> x1; !f> subroutine double_hh_trafo_complex_sse_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_complex_sse_2hv")
std::complex<double> x2; !f> use, intrinsic :: iso_c_binding
std::complex<double> x3; !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
std::complex<double> x4; !f> complex(kind=c_double) :: q(*)
std::complex<double> y1; !f> complex(kind=c_double) :: hh(pnb,2)
std::complex<double> y2; !f> end subroutine
std::complex<double> y3; !f> end interface
std::complex<double> y4; !f>#endif
std::complex<double> h1; */
std::complex<double> h2;
std::complex<double> tau1; void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
std::complex<double> tau2;
int i=0;
x1 = q[ldq+0];
x2 = q[ldq+1];
x3 = q[ldq+2];
x4 = q[ldq+3];
h2 = conj(hh[ldh+1]);
y1 = q[0] + (x1*h2);
y2 = q[1] + (x2*h2);
y3 = q[2] + (x3*h2);
y4 = q[3] + (x4*h2);
for (i = 2; i < nb; i++)
{
h1 = conj(hh[i-1]);
h2 = conj(hh[ldh+i]);
x1 += (q[(i*ldq)+0] * h1);
y1 += (q[(i*ldq)+0] * h2);
x2 += (q[(i*ldq)+1] * h1);
y2 += (q[(i*ldq)+1] * h2);
x3 += (q[(i*ldq)+2] * h1);
y3 += (q[(i*ldq)+2] * h2);
x4 += (q[(i*ldq)+3] * h1);
y4 += (q[(i*ldq)+3] * h2);
}
h1 = conj(hh[nb-1]);
x1 += (q[(nb*ldq)+0] * h1);
x2 += (q[(nb*ldq)+1] * h1);
x3 += (q[(nb*ldq)+2] * h1);
x4 += (q[(nb*ldq)+3] * h1);
tau1 = hh[0];
tau2 = hh[ldh];
h1 = (-1.0)*tau1;
x1 *= h1;
x2 *= h1;
x3 *= h1;
x4 *= h1;
h1 = (-1.0)*tau2;
h2 = (-1.0)*tau2;
h2 *= s;
y1 = y1*h1 +x1*h2;
y2 = y2*h1 +x2*h2;
y3 = y3*h1 +x3*h2;
y4 = y4*h1 +x4*h2;
q[0] += y1;
q[1] += y2;
q[2] += y3;
q[3] += y4;
h2 = hh[ldh+1];
q[ldq+0] += (x1 + (y1*h2));
q[ldq+1] += (x2 + (y2*h2));
q[ldq+2] += (x3 + (y3*h2));
q[ldq+3] += (x4 + (y4*h2));
for (i = 2; i < nb; i++)
{
h1 = hh[i-1];
h2 = hh[ldh+i];
q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
}
h1 = hh[nb-1];
q[(nb*ldq)+0] += (x1*h1);
q[(nb*ldq)+1] += (x2*h1);
q[(nb*ldq)+2] += (x3*h1);
q[(nb*ldq)+3] += (x4*h1);
}
#endif
void double_hh_trafo_complex_sse_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -86,12 +86,22 @@ __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ...@@ -86,12 +86,22 @@ __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); /*
#if 0 !f>#ifdef HAVE_AVX
void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); !f> interface
#endif !f> subroutine double_hh_trafo_real_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_avx_avx2_2hv")
!f> use, intrinsic :: iso_c_binding
void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> real(kind=c_double) :: q(*)
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -143,41 +153,6 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn ...@@ -143,41 +153,6 @@ void double_hh_trafo_real_avx_avx2_2hv_(double* q, double* hh, int* pnb, int* pn
hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s); hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
} }
} }
#if 0
void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
// calculating scalar product to compute
// 2 householder vectors simultaneously
double s = hh[(ldh)+1]*1.0;
#pragma ivdep
for (i = 2; i < nb; i++)
{
s += hh[i-1] * hh[(i+ldh)];
}
// Production level kernel calls with padding
#ifdef __AVX__
for (i = 0; i < nq; i+=24)
{
hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
}
#else
for (i = 0; i < nq; i+=12)
{
hh_trafo_kernel_12_SSE_2hv(&q[i], hh, nb, ldq, ldh, s);
}
#endif
}
#endif
/** /**
* Unrolled kernel that computes * Unrolled kernel that computes
* 24 rows of Q simultaneously, a * 24 rows of Q simultaneously, a
......
...@@ -88,12 +88,22 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int ...@@ -88,12 +88,22 @@ __forceinline void hh_trafo_kernel_4_AVX_4hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_8_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
__forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_12_AVX_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); /*
#if 0 !f>#ifdef HAVE_AVX
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); !f> interface
#endif !f> subroutine quad_hh_trafo_real_avx_avx2_4hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="quad_hh_trafo_real_avx_avx2_4hv")
!f> use, intrinsic :: iso_c_binding
void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) !f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> real(kind=c_double) :: q(*)
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
void quad_hh_trafo_real_avx_avx2_4hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -137,13 +147,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, ...@@ -137,13 +147,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq,
s_1_4 += hh[i-3] * hh[i+(ldh*3)]; s_1_4 += hh[i-3] * hh[i+(ldh*3)];
} }
// printf("s_1_2: %f\n", s_1_2);
// printf("s_1_3: %f\n", s_1_3);
// printf("s_2_3: %f\n", s_2_3);
// printf("s_1_4: %f\n", s_1_4);
// printf("s_2_4: %f\n", s_2_4);
// printf("s_3_4: %f\n", s_3_4);
// Production level kernel calls with padding // Production level kernel calls with padding
#ifdef __AVX__ #ifdef __AVX__
for (i = 0; i < nq-8; i+=12) for (i = 0; i < nq-8; i+=12)
...@@ -188,66 +191,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq, ...@@ -188,66 +191,6 @@ void quad_hh_trafo_real_avx_avx2_4hv_(double* q, double* hh, int* pnb, int* pnq,
#endif #endif
} }
#if 0
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
// calculating scalar products to compute
// 4 householder vectors simultaneously
double s_1_2 = hh[(ldh)+1];
double s_1_3 = hh[(ldh*2)+2];
double s_2_3 = hh[(ldh*2)+1];
double s_1_4 = hh[(ldh*3)+3];
double s_2_4 = hh[(ldh*3)+2];
double s_3_4 = hh[(ldh*3)+1];
// calculate scalar product of first and fourth householder vector
// loop counter = 2
s_1_2 += hh[2-1] * hh[(2+ldh)];
s_2_3 += hh[(ldh)+2-1] * hh[2+(ldh*2)];
s_3_4 += hh[(ldh*2)+2-1] * hh[2+(ldh*3)];
// loop counter = 3
s_1_2 += hh[3-1] * hh[(3+ldh)];
s_2_3 += hh[(ldh)+3-1] * hh[3+(ldh*2)];
s_3_4 += hh[(ldh*2)+3-1] * hh[3+(ldh*3)];
s_1_3 += hh[3-2] * hh[3+(ldh*2)];
s_2_4 += hh[(ldh*1)+3-2] * hh[3+(ldh*3)];
#pragma ivdep
for (i = 4; i < nb; i++)
{
s_1_2 += hh[i-1] * hh[(i+ldh)];
s_2_3 += hh[(ldh)+i-1] * hh[i+(ldh*2)];
s_3_4 += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
s_1_3 += hh[i-2] * hh[i+(ldh*2)];