Commit 62fe6edc authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents f568656c 7423daeb
......@@ -12,6 +12,7 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO
libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/elpa2_kernels/mod_fortran_interfaces.F90 \
src/elpa_utilities.F90 \
src/elpa1_compute.F90 \
src/elpa1.F90 \
......@@ -139,30 +140,30 @@ endif
endif
if WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.c
endif
endif
if WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.c
endif
endif
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.c
if WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.cpp
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.c
endif
endif
......
......@@ -43,10 +43,10 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make])
fi
AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
if test x"${CPP_FOUND}" = xno; then
AC_MSG_ERROR([no cpp found])
fi
#AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
#if test x"${CPP_FOUND}" = xno; then
# AC_MSG_ERROR([no cpp found])
#fi
# gnu-make fortran module dependencies
m4_include([fdep/fortran_dependencies.m4])
......@@ -120,17 +120,17 @@ if test x"${enable_openmp}" = x"yes"; then
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
## C++
#AC_LANG([C++])
#AC_PROG_CXX
#
#if test x"${enable_openmp}" = x"yes"; then
# AX_ELPA_OPENMP
# if test "$ac_cv_prog_cxx_openmp" = unsupported; then
# AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
# fi
# CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
#fi
......@@ -268,26 +268,26 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
)
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d a1_1 = _mm256_load_pd(q);
return 0;
}
])],
[can_compile_avx=yes],
[can_compile_avx=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
fi
fi
#if test "${can_compile_avx}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d a1_1 = _mm256_load_pd(q);
# return 0;
# }
# ])],
# [can_compile_avx=yes],
# [can_compile_avx=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx}])
# if test "${can_compile_avx}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
# fi
#fi
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
......@@ -303,27 +303,27 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx2=no]
)
AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX2!])
fi
fi
#if test "${can_compile_avx2}" = "yes" ; then
# AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
# AC_LANG_PUSH([C++])
# AC_COMPILE_IFELSE([AC_LANG_SOURCE([
# #include <x86intrin.h>
# int main(int argc, char **argv){
# double* q;
# __m256d q1 = _mm256_load_pd(q);
# __m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
# return 0;
# }
# ])],
# [can_compile_avx2=yes],
# [can_compile_avx2=no]
# )
# AC_LANG_POP([C++])
# AC_MSG_RESULT([${can_compile_avx2}])
# if test "${can_compile_avx2}" = "no" ; then
# AC_MSG_WARN([Cannot compile C++ with AVX2!])
# fi
#fi
if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes
......@@ -941,6 +941,10 @@ echo "Generating elpa/elpa_generated.h..."
mkdir -p elpa
grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_generated.h || exit 1
echo "Generating Fortran interfaces for C kernels"
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.c | sed 's/^ *!f>//;' > elpa/elpa_generated_fortran_interfaces.h || exit 1
#grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.cpp | sed 's/^ *!f>//;' >> elpa/elpa_generated_fortran_interfaces.h || exit 1
echo "Generating test/shared_sources/generated.h..."
mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
......
......@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
......@@ -82,64 +82,26 @@
#endif
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq);
#if 0
// Plain scalar (non-intrinsic) variant of the 4-wide 1hv kernel.
//
// Works on the first four entries of each of the nb rows of q, where
// consecutive rows are ldq elements apart:
//   acc     = q(row 0) + sum_{row=1..nb-1} q(row) * conj(hh[row])
//   acc    *= -hh[0]
//   q(row 0) += acc
//   q(row)   += acc * hh[row]      for row = 1..nb-1
static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
    enum { WIDTH = 4 };

    std::complex<double> acc[WIDTH];
    std::complex<double> coeff;
    int row;
    int col;

    // Seed the accumulators with row 0 of q.
    for (col = 0; col < WIDTH; ++col)
    {
        acc[col] = q[col];
    }

    // Accumulate the remaining rows weighted by the conjugated hh entries.
    for (row = 1; row < nb; ++row)
    {
        std::complex<double>* q_row = q + (row*ldq);

        coeff = conj(hh[row]);
        for (col = 0; col < WIDTH; ++col)
        {
            acc[col] += (q_row[col] * coeff);
        }
    }

    // Scale by -hh[0] and fold the result back into row 0.
    coeff = (-1.0)*hh[0];
    for (col = 0; col < WIDTH; ++col)
    {
        acc[col] *= coeff;
    }
    for (col = 0; col < WIDTH; ++col)
    {
        q[col] += acc[col];
    }

    // Update every other row; hh[row] is read before the row is written.
    for (row = 1; row < nb; ++row)
    {
        std::complex<double>* q_row = q + (row*ldq);

        coeff = hh[row];
        for (col = 0; col < WIDTH; ++col)
        {
            q_row[col] += (acc[col]*coeff);
        }
    }
}
#endif // if 0
void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> complex(kind=c_double) :: q(*)
!f> complex(kind=c_double) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void single_hh_trafo_complex_avx_avx2_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
......@@ -161,7 +123,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::
}
}
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -356,7 +318,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::
}
}
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -501,7 +463,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<
}
}
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -595,4 +557,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<
_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
}
}
} // extern C
......@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
......@@ -82,14 +82,26 @@
#endif
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq);
void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex<float>* q, std::complex<float>* hh, int* pnb, int* pnq, int* pldq)
// Forward declarations of the single-precision AVX kernels defined further down.
// NOTE(review): with <complex.h>, bare `complex` expands to `_Complex` without a
// base type; GCC accepts that only as an extension meaning `double _Complex`.
// The definitions immediately cast q and hh to float*, so `float complex*`
// looks like the intended parameter type -- confirm, and if so change these
// prototypes together with the matching definitions.
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> complex(kind=c_float) :: q(*)
!f> complex(kind=c_float) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void single_hh_trafo_complex_avx_avx2_1hv_single(complex* q, complex* hh, int* pnb, int* pnq, int* pldq)
{
int i;
int nb = *pnb;
......@@ -130,7 +142,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex<float>* q, std::c
}
}
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq)
{
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
......@@ -331,7 +343,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex<float>* q, std::c
}
}
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq)
{
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
......@@ -482,7 +494,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex<
}
}
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq)
static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(complex* q, complex* hh, int nb, int ldq)
{
float* q_dbl = (float*)q;
float* hh_dbl = (float*)hh;
......@@ -580,4 +592,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex<
// _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
}
}
} // extern C
......@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
......@@ -82,113 +82,27 @@
#endif
extern "C" {
//Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s);
#if 0
// Plain scalar (non-intrinsic) variant of the 4-wide 2hv kernel.
//
// Works on the first four entries of rows 0..nb of q (nb+1 rows, ldq
// elements apart) using two coefficient columns of hh: hh[0..nb-1] and
// hh[ldh..ldh+nb-1]. hh[0] and hh[ldh] supply the two scaling factors and
// s is the extra factor applied to the x contribution when y is finalised.
static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
    enum { WIDTH = 4 };

    std::complex<double> x[WIDTH];
    std::complex<double> y[WIDTH];
    std::complex<double> h1;
    std::complex<double> h2;
    std::complex<double> tau1;
    std::complex<double> tau2;
    int row;
    int col;

    // x starts from row 1; y starts from row 0 plus row 1 * conj(hh[ldh+1]).
    for (col = 0; col < WIDTH; ++col)
    {
        x[col] = q[ldq+col];
    }
    h2 = conj(hh[ldh+1]);
    for (col = 0; col < WIDTH; ++col)
    {
        y[col] = q[col] + (x[col]*h2);
    }

    // Rows 2..nb-1 contribute to both accumulators.
    for (row = 2; row < nb; ++row)
    {
        std::complex<double>* q_row = q + (row*ldq);

        h1 = conj(hh[row-1]);
        h2 = conj(hh[ldh+row]);
        for (col = 0; col < WIDTH; ++col)
        {
            x[col] += (q_row[col] * h1);
            y[col] += (q_row[col] * h2);
        }
    }

    // The last row (index nb) contributes to x only.
    h1 = conj(hh[nb-1]);
    for (col = 0; col < WIDTH; ++col)
    {
        x[col] += (q[(nb*ldq)+col] * h1);
    }

    tau1 = hh[0];
    tau2 = hh[ldh];

    // x *= -tau1
    h1 = (-1.0)*tau1;
    for (col = 0; col < WIDTH; ++col)
    {
        x[col] *= h1;
    }

    // y = y*(-tau2) + x*(-tau2*s)
    h1 = (-1.0)*tau2;
    h2 = (-1.0)*tau2;
    h2 *= s;
    for (col = 0; col < WIDTH; ++col)
    {
        y[col] = y[col]*h1 +x[col]*h2;
    }

    // Row 0 receives y only.
    for (col = 0; col < WIDTH; ++col)
    {
        q[col] += y[col];
    }

    // Row 1 receives x plus y * hh[ldh+1].
    h2 = hh[ldh+1];
    for (col = 0; col < WIDTH; ++col)
    {
        q[ldq+col] += (x[col] + (y[col]*h2));
    }

    // Rows 2..nb-1 receive both contributions; coefficients are read
    // before the row is written.
    for (row = 2; row < nb; ++row)
    {
        std::complex<double>* q_row = q + (row*ldq);

        h1 = hh[row-1];
        h2 = hh[ldh+row];
        for (col = 0; col < WIDTH; ++col)
        {
            q_row[col] += ((x[col]*h1) + (y[col]*h2));
        }
    }

    // The last row (index nb) receives the x contribution only.
    h1 = hh[nb-1];
    for (col = 0; col < WIDTH; ++col)
    {
        q[(nb*ldq)+col] += (x[col]*h1);
    }
}
#endif
void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> complex(kind=c_double) :: q(*)
!f> complex(kind=c_double) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void double_hh_trafo_complex_avx_avx2_2hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
int i;
int nb = *pnb;
......@@ -196,7 +110,7 @@ void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::
int ldq = *pldq;
int ldh = *pldh;
std::complex<double> s = conj(hh[(ldh)+1])*1.0;
double complex s = conj(hh[(ldh)+1])*1.0;
for (i = 2; i < nb; i++)
{
s += hh[i-1] * conj(hh[(i+ldh)]);
......@@ -227,7 +141,7 @@ void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::
#endif
}
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -660,7 +574,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<
_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1013,7 +927,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<
_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
}
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1286,7 +1200,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<
_mm256_store_pd(&q_dbl[(2*nb*ldq)+4], q2);
}
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(double complex* q, double complex* hh, int nb, int ldq, int ldh, double complex s)
{
double* q_dbl = (double*)q;
double* hh_dbl = (double*)hh;
......@@ -1478,4 +1392,3 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<
_mm256_store_pd(&q_dbl[(2*nb*ldq)+0], q1);
}
} // extern C
......@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#include <complex>
#include <complex.h>
#include <x86intrin.h>
#define __forceinline __attribute__((always_inline))
......@@ -82,15 +82,28 @@