Remove C++ dependency

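The C++ kernels can be written as plain C kernels (using C99 <complex.h>
and `double complex` in place of std::complex<double>), which removes the
need for a C++ compiler and simplifies the build procedure.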
parent 12b958fd
...@@ -106,19 +106,19 @@ if WITH_REAL_AVX_BLOCK6_KERNEL ...@@ -106,19 +106,19 @@ if WITH_REAL_AVX_BLOCK6_KERNEL
endif endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv.c
endif endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv.c
endif endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv.c
endif endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv.c
endif endif
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
......
...@@ -43,10 +43,10 @@ if test x$_cv_gnu_make_command = x ; then ...@@ -43,10 +43,10 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make]) AC_MSG_ERROR([Need GNU Make])
fi fi
AC_CHECK_PROG(CPP_FOUND,cpp,yes,no) #AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
if test x"${CPP_FOUND}" = xno; then #if test x"${CPP_FOUND}" = xno; then
AC_MSG_ERROR([no cpp found]) # AC_MSG_ERROR([no cpp found])
fi #fi
# gnu-make fortran module dependencies # gnu-make fortran module dependencies
m4_include([fdep/fortran_dependencies.m4]) m4_include([fdep/fortran_dependencies.m4])
...@@ -112,17 +112,17 @@ if test x"${enable_openmp}" = x"yes"; then ...@@ -112,17 +112,17 @@ if test x"${enable_openmp}" = x"yes"; then
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS" FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi fi
# C++ ## C++
AC_LANG([C++]) #AC_LANG([C++])
AC_PROG_CXX #AC_PROG_CXX
#
if test x"${enable_openmp}" = x"yes"; then #if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP # AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then # if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS]) # AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi # fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS" # CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi #fi
...@@ -240,26 +240,26 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([ ...@@ -240,26 +240,26 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
) )
AC_MSG_RESULT([${can_compile_avx}]) AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then #if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++]) # AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++]) # AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ # AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h> # #include <x86intrin.h>
int main(int argc, char **argv){ # int main(int argc, char **argv){
double* q; # double* q;
__m256d a1_1 = _mm256_load_pd(q); # __m256d a1_1 = _mm256_load_pd(q);
return 0; # return 0;
} # }
])], # ])],
[can_compile_avx=yes], # [can_compile_avx=yes],
[can_compile_avx=no] # [can_compile_avx=no]
) # )
AC_LANG_POP([C++]) # AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx}]) # AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "no" ; then # if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether]) # AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
fi # fi
fi #fi
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C]) AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...@@ -275,27 +275,27 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([ ...@@ -275,27 +275,27 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx2=no] [can_compile_avx2=no]
) )
AC_MSG_RESULT([${can_compile_avx2}]) AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "yes" ; then #if test "${can_compile_avx2}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++]) # AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
AC_LANG_PUSH([C++]) # AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ # AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h> # #include <x86intrin.h>
int main(int argc, char **argv){ # int main(int argc, char **argv){
double* q; # double* q;
__m256d q1 = _mm256_load_pd(q); # __m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1); # __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0; # return 0;
} # }
])], # ])],
[can_compile_avx2=yes], # [can_compile_avx2=yes],
[can_compile_avx2=no] # [can_compile_avx2=no]
) # )
AC_LANG_POP([C++]) # AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx2}]) # AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "no" ; then # if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX2!]) # AC_MSG_WARN([Cannot compile C++ with AVX2!])
fi # fi
fi #fi
if test "${can_compile_avx}" = "yes" ; then if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes install_real_avx_block2=yes
...@@ -854,7 +854,7 @@ grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elp ...@@ -854,7 +854,7 @@ grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elp
echo "Generating Fortran interfaces for C kernels" echo "Generating Fortran interfaces for C kernels"
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.c | sed 's/^ *!f>//;' > elpa/elpa_generated_fortran_interfaces.h || exit 1 grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.c | sed 's/^ *!f>//;' > elpa/elpa_generated_fortran_interfaces.h || exit 1
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.cpp | sed 's/^ *!f>//;' >> elpa/elpa_generated_fortran_interfaces.h || exit 1 #grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.cpp | sed 's/^ *!f>//;' >> elpa/elpa_generated_fortran_interfaces.h || exit 1
echo "Generating test/shared_sources/generated.h..." echo "Generating test/shared_sources/generated.h..."
mkdir -p test/shared_sources mkdir -p test/shared_sources
......
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h" #include "config-f90.h"
#include <complex> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -82,12 +82,10 @@ ...@@ -82,12 +82,10 @@
#endif #endif
extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
/* /*
!f>#ifdef HAVE_AVX !f>#ifdef HAVE_AVX
...@@ -102,7 +100,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double ...@@ -102,7 +100,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double
!f>#endif !f>#endif
*/ */
void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_avx_avx2_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -124,7 +122,7 @@ void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex< ...@@ -124,7 +122,7 @@ void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex<
} }
} }
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -319,7 +317,7 @@ void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex< ...@@ -319,7 +317,7 @@ void single_hh_trafo_complex_avx_avx2_1hv(std::complex<double>* q, std::complex<
} }
} }
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -464,7 +462,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double> ...@@ -464,7 +462,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -558,4 +556,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double> ...@@ -558,4 +556,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
} }
} }
} // extern C
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h" #include "config-f90.h"
#include <complex> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -82,13 +82,11 @@ ...@@ -82,13 +82,11 @@
#endif #endif
extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
/* /*
!f>#ifdef HAVE_AVX !f>#ifdef HAVE_AVX
...@@ -103,7 +101,7 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double> ...@@ -103,7 +101,7 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>
!f>#endif !f>#endif
*/ */
void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_complex_avx_avx2_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -111,7 +109,7 @@ void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex< ...@@ -111,7 +109,7 @@ void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex<
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
std::complex<double> s = conj(hh[(ldh)+1])*1.0; double complex s = conj(hh[(ldh)+1])*1.0;
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
s += hh[i-1] * conj(hh[(i+ldh)]); s += hh[i-1] * conj(hh[(i+ldh)]);
...@@ -142,7 +140,7 @@ void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex< ...@@ -142,7 +140,7 @@ void double_hh_trafo_complex_avx_avx2_2hv(std::complex<double>* q, std::complex<
#endif #endif
} }
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -575,7 +573,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double> ...@@ -575,7 +573,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4); _mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
} }
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -928,7 +926,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double> ...@@ -928,7 +926,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3); _mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
} }
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -1201,7 +1199,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double> ...@@ -1201,7 +1199,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2); _mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
} }
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -1393,4 +1391,3 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double> ...@@ -1393,4 +1391,3 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv(std::complex<double>
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1); _mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
} }
} // extern C
...@@ -62,7 +62,7 @@ ...@@ -62,7 +62,7 @@
#include "config-f90.h" #include "config-f90.h"
#include <complex> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -72,12 +72,10 @@ ...@@ -72,12 +72,10 @@
#endif #endif
extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq); static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE
...@@ -92,7 +90,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double> ...@@ -92,7 +90,7 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>
!f>#endif !f>#endif
*/ */
void single_hh_trafo_complex_sse_1hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_sse_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -114,7 +112,7 @@ void single_hh_trafo_complex_sse_1hv(std::complex<double>* q, std::complex<doubl ...@@ -114,7 +112,7 @@ void single_hh_trafo_complex_sse_1hv(std::complex<double>* q, std::complex<doubl
} }
} }
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -309,7 +307,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double> ...@@ -309,7 +307,7 @@ static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -454,7 +452,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double> ...@@ -454,7 +452,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv(std::complex<double>
} }
} }
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq) static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(double complex* q, double complex* hh, int nb, int ldq)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -548,4 +546,3 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double> ...@@ -548,4 +546,3 @@ static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*i*ldq)+2], q2); _mm_store_pd(&q_dbl[(2*i*ldq)+2], q2);
} }
} }
} // extern C
...@@ -61,7 +61,7 @@ ...@@ -61,7 +61,7 @@
// -------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------
#include "config-f90.h" #include "config-f90.h"
#include <complex> #include <complex.h>
#include <x86intrin.h> #include <x86intrin.h>
#define __forceinline __attribute__((always_inline)) #define __forceinline __attribute__((always_inline))
...@@ -70,13 +70,12 @@ ...@@ -70,13 +70,12 @@
#undef __AVX__ #undef __AVX__
#endif #endif
extern "C" {
//Forward declaration //Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_3_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_2_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s); static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
/* /*
!f>#ifdef HAVE_SSE !f>#ifdef HAVE_SSE
...@@ -91,7 +90,7 @@ static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double> ...@@ -91,7 +90,7 @@ static __forceinline void hh_trafo_complex_kernel_1_SSE_2hv(std::complex<double>
!f>#endif !f>#endif
*/ */
void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_complex_sse_2hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
...@@ -99,7 +98,7 @@ void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<doubl ...@@ -99,7 +98,7 @@ void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<doubl
int ldq = *pldq; int ldq = *pldq;
int ldh = *pldh; int ldh = *pldh;
std::complex<double> s = conj(hh[(ldh)+1])*1.0; double complex s = conj(hh[(ldh)+1])*1.0;
for (i = 2; i < nb; i++) for (i = 2; i < nb; i++)
{ {
s += hh[i-1] * conj(hh[(i+ldh)]); s += hh[i-1] * conj(hh[(i+ldh)]);
...@@ -126,7 +125,7 @@ void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<doubl ...@@ -126,7 +125,7 @@ void double_hh_trafo_complex_sse_2hv(std::complex<double>* q, std::complex<doubl
#endif #endif
} }
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s) static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{ {
double* q_dbl = (double*)q; double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh; double* hh_dbl = (double*)hh;
...@@ -559,7 +558,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double> ...@@ -559,7 +558,7 @@ static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv(std::complex<double>
_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4); _mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
} }