Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
62fe6edc
Commit
62fe6edc
authored
Apr 20, 2016
by
Andreas Marek
Browse files
Merge branch 'master' into ELPA_GPU
parents
f568656c
7423daeb
Changes
25
Hide whitespace changes
Inline
Side-by-side
Makefile.am
View file @
62fe6edc
...
...
@@ -12,6 +12,7 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO
libelpa@SUFFIX@
_la_SOURCES
=
src/mod_precision.F90
\
src/mod_mpi.F90
\
src/mod_mpi_stubs.F90
\
src/elpa2_kernels/mod_fortran_interfaces.F90
\
src/elpa_utilities.F90
\
src/elpa1_compute.F90
\
src/elpa1.F90
\
...
...
@@ -139,30 +140,30 @@ endif
endif
if
WITH_COMPLEX_SSE_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_1hv_single_precision.c
endif
endif
if
WITH_COMPLEX_AVX_BLOCK1_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.c
endif
endif
if
WITH_COMPLEX_SSE_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_sse_2hv_single_precision.c
endif
endif
if
WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.c
if
WANT_SINGLE_PRECISION_COMPLEX
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.c
pp
libelpa@SUFFIX@
_la_SOURCES
+=
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.c
endif
endif
...
...
configure.ac
View file @
62fe6edc
...
...
@@ -43,10 +43,10 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make])
fi
AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
if test x"${CPP_FOUND}" = xno; then
AC_MSG_ERROR([no cpp found])
fi
#
AC_CHECK_PROG(CPP_FOUND,cpp,yes,no)
#
if test x"${CPP_FOUND}" = xno; then
#
AC_MSG_ERROR([no cpp found])
#
fi
# gnu-make fortran module dependencies
m4_include([fdep/fortran_dependencies.m4])
...
...
@@ -120,17 +120,17 @@ if test x"${enable_openmp}" = x"yes"; then
FCFLAGS="$OPENMP_FCFLAGS $FCFLAGS"
fi
# C++
AC_LANG([C++])
AC_PROG_CXX
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
fi
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
fi
#
# C++
#
AC_LANG([C++])
#
AC_PROG_CXX
#
#
if test x"${enable_openmp}" = x"yes"; then
#
AX_ELPA_OPENMP
#
if test "$ac_cv_prog_cxx_openmp" = unsupported; then
#
AC_MSG_ERROR([Could not compile a C++ program with OpenMP, adjust CXXFLAGS])
#
fi
#
CXXFLAGS="$OPENMP_CXXFLAGS $CXXFLAGS"
#
fi
...
...
@@ -268,26 +268,26 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
)
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d a1_1 = _mm256_load_pd(q);
return 0;
}
])],
[can_compile_avx=yes],
[can_compile_avx=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx}])
if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
fi
fi
#
if test "${can_compile_avx}" = "yes" ; then
#
AC_MSG_CHECKING([whether we can compile AVX intrinsics in C++])
#
AC_LANG_PUSH([C++])
#
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#
#include <x86intrin.h>
#
int main(int argc, char **argv){
#
double* q;
#
__m256d a1_1 = _mm256_load_pd(q);
#
return 0;
#
}
#
])],
#
[can_compile_avx=yes],
#
[can_compile_avx=no]
#
)
#
AC_LANG_POP([C++])
#
AC_MSG_RESULT([${can_compile_avx}])
#
if test "${can_compile_avx}" = "no" ; then
#
AC_MSG_WARN([Cannot compile C++ with AVX: disabling AVX alltogether])
#
fi
#
fi
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...
...
@@ -303,27 +303,27 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
[can_compile_avx2=no]
)
AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "yes" ; then
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
AC_LANG_PUSH([C++])
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
double* q;
__m256d q1 = _mm256_load_pd(q);
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
return 0;
}
])],
[can_compile_avx2=yes],
[can_compile_avx2=no]
)
AC_LANG_POP([C++])
AC_MSG_RESULT([${can_compile_avx2}])
if test "${can_compile_avx2}" = "no" ; then
AC_MSG_WARN([Cannot compile C++ with AVX2!])
fi
fi
#
if test "${can_compile_avx2}" = "yes" ; then
#
AC_MSG_CHECKING([whether we can compile AVX2 intrinsics in C++])
#
AC_LANG_PUSH([C++])
#
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#
#include <x86intrin.h>
#
int main(int argc, char **argv){
#
double* q;
#
__m256d q1 = _mm256_load_pd(q);
#
__m256d y1 = _mm256_fmadd_pd(q1, q1, q1);
#
return 0;
#
}
#
])],
#
[can_compile_avx2=yes],
#
[can_compile_avx2=no]
#
)
#
AC_LANG_POP([C++])
#
AC_MSG_RESULT([${can_compile_avx2}])
#
if test "${can_compile_avx2}" = "no" ; then
#
AC_MSG_WARN([Cannot compile C++ with AVX2!])
#
fi
#
fi
if test "${can_compile_avx}" = "yes" ; then
install_real_avx_block2=yes
...
...
@@ -941,6 +941,10 @@ echo "Generating elpa/elpa_generated.h..."
mkdir -p elpa
grep -h "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_generated.h || exit 1
echo "Generating Fortran interfaces for C kernels"
grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.c | sed 's/^ *!f>//;' > elpa/elpa_generated_fortran_interfaces.h || exit 1
#grep -h "^ *!f>" $srcdir/src/elpa2_kernels/*.cpp | sed 's/^ *!f>//;' >> elpa/elpa_generated_fortran_interfaces.h || exit 1
echo "Generating test/shared_sources/generated.h..."
mkdir -p test/shared_sources
grep -h "^ *!c>" $srcdir/test/shared_sources/*.F90 | sed 's/^ *!c>//;' > test/shared_sources/generated.h || exit 1
...
...
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.c
pp
→
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_double_precision.c
View file @
62fe6edc
...
...
@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex
.h
>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -82,64 +82,26 @@
#endif
extern
"C"
{
//Forward declaration
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_1hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq)
{
std::complex<double> x0;
std::complex<double> x1;
std::complex<double> x2;
std::complex<double> x3;
std::complex<double> h0;
std::complex<double> tau0;
int i=0;
x0 = q[0];
x1 = q[1];
x2 = q[2];
x3 = q[3];
for (i = 1; i < nb; i++)
{
h0 = conj(hh[i]);
x0 += (q[(i*ldq)+0] * h0);
x1 += (q[(i*ldq)+1] * h0);
x2 += (q[(i*ldq)+2] * h0);
x3 += (q[(i*ldq)+3] * h0);
}
tau0 = hh[0];
h0 = (-1.0)*tau0;
x0 *= h0;
x1 *= h0;
x2 *= h0;
x3 *= h0;
q[0] += x0;
q[1] += x1;
q[2] += x2;
q[3] += x3;
for (i = 1; i < nb; i++)
{
h0 = hh[i];
q[(i*ldq)+0] += (x0*h0);
q[(i*ldq)+1] += (x1*h0);
q[(i*ldq)+2] += (x2*h0);
q[(i*ldq)+3] += (x3*h0);
}
}
#endif // if 0
void
single_hh_trafo_complex_avx_avx2_1hv_double_
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
)
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_double(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> complex(kind=c_double) :: q(*)
!f> complex(kind=c_double) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void
single_hh_trafo_complex_avx_avx2_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
)
{
int
i
;
int
nb
=
*
pnb
;
...
...
@@ -161,7 +123,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::
}
}
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -356,7 +318,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_double_(std::complex<double>* q, std::
}
}
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -501,7 +463,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_double(std::complex<
}
}
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -595,4 +557,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_double(std::complex<
_mm256_store_pd
(
&
q_dbl
[(
2
*
i
*
ldq
)
+
4
],
q2
);
}
}
}
// extern C
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.c
pp
→
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_1hv_single_precision.c
View file @
62fe6edc
...
...
@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex
.h
>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -82,14 +82,26 @@
#endif
extern
"C"
{
//Forward declaration
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>*
hh
,
int
nb
,
int
ldq
);
void
single_hh_trafo_complex_avx_avx2_1hv_single_
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
)
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
!f> bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq
!f> complex(kind=c_float) :: q(*)
!f> complex(kind=c_float) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void
single_hh_trafo_complex_avx_avx2_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
)
{
int
i
;
int
nb
=
*
pnb
;
...
...
@@ -130,7 +142,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex<float>* q, std::c
}
}
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_12_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
)
{
float
*
q_dbl
=
(
float
*
)
q
;
float
*
hh_dbl
=
(
float
*
)
hh
;
...
...
@@ -331,7 +343,7 @@ void single_hh_trafo_complex_avx_avx2_1hv_single_(std::complex<float>* q, std::c
}
}
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
)
{
float
*
q_dbl
=
(
float
*
)
q
;
float
*
hh_dbl
=
(
float
*
)
hh
;
...
...
@@ -482,7 +494,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(std::complex<
}
}
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_single
(
std
::
complex
<
float
>*
q
,
std
::
complex
<
float
>
*
hh
,
int
nb
,
int
ldq
)
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_1hv_single
(
complex
*
q
,
complex
*
hh
,
int
nb
,
int
ldq
)
{
float
*
q_dbl
=
(
float
*
)
q
;
float
*
hh_dbl
=
(
float
*
)
hh
;
...
...
@@ -580,4 +592,3 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(std::complex<
// _mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
}
}
}
// extern C
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.c
pp
→
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_double_precision.c
View file @
62fe6edc
...
...
@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex
.h
>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -82,113 +82,27 @@
#endif
extern
"C"
{
//Forward declaration
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
);
static
__forceinline
void
hh_trafo_complex_kernel_6_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
);
static
__forceinline
void
hh_trafo_complex_kernel_2_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
);
#if 0
static __forceinline void hh_trafo_complex_kernel_4_C_2hv_double(std::complex<double>* q, std::complex<double>* hh, int nb, int ldq, int ldh, std::complex<double> s)
{
std::complex<double> x1;
std::complex<double> x2;
std::complex<double> x3;
std::complex<double> x4;
std::complex<double> y1;
std::complex<double> y2;
std::complex<double> y3;
std::complex<double> y4;
std::complex<double> h1;
std::complex<double> h2;
std::complex<double> tau1;
std::complex<double> tau2;
int i=0;
x1 = q[ldq+0];
x2 = q[ldq+1];
x3 = q[ldq+2];
x4 = q[ldq+3];
h2 = conj(hh[ldh+1]);
y1 = q[0] + (x1*h2);
y2 = q[1] + (x2*h2);
y3 = q[2] + (x3*h2);
y4 = q[3] + (x4*h2);
for (i = 2; i < nb; i++)
{
h1 = conj(hh[i-1]);
h2 = conj(hh[ldh+i]);
x1 += (q[(i*ldq)+0] * h1);
y1 += (q[(i*ldq)+0] * h2);
x2 += (q[(i*ldq)+1] * h1);
y2 += (q[(i*ldq)+1] * h2);
x3 += (q[(i*ldq)+2] * h1);
y3 += (q[(i*ldq)+2] * h2);
x4 += (q[(i*ldq)+3] * h1);
y4 += (q[(i*ldq)+3] * h2);
}
h1 = conj(hh[nb-1]);
x1 += (q[(nb*ldq)+0] * h1);
x2 += (q[(nb*ldq)+1] * h1);
x3 += (q[(nb*ldq)+2] * h1);
x4 += (q[(nb*ldq)+3] * h1);
tau1 = hh[0];
tau2 = hh[ldh];
h1 = (-1.0)*tau1;
x1 *= h1;
x2 *= h1;
x3 *= h1;
x4 *= h1;
h1 = (-1.0)*tau2;
h2 = (-1.0)*tau2;
h2 *= s;
y1 = y1*h1 +x1*h2;
y2 = y2*h1 +x2*h2;
y3 = y3*h1 +x3*h2;
y4 = y4*h1 +x4*h2;
q[0] += y1;
q[1] += y2;
q[2] += y3;
q[3] += y4;
h2 = hh[ldh+1];
q[ldq+0] += (x1 + (y1*h2));
q[ldq+1] += (x2 + (y2*h2));
q[ldq+2] += (x3 + (y3*h2));
q[ldq+3] += (x4 + (y4*h2));
for (i = 2; i < nb; i++)
{
h1 = hh[i-1];
h2 = hh[ldh+i];
q[(i*ldq)+0] += ((x1*h1) + (y1*h2));
q[(i*ldq)+1] += ((x2*h1) + (y2*h2));
q[(i*ldq)+2] += ((x3*h1) + (y3*h2));
q[(i*ldq)+3] += ((x4*h1) + (y4*h2));
}
h1 = hh[nb-1];
q[(nb*ldq)+0] += (x1*h1);
q[(nb*ldq)+1] += (x2*h1);
q[(nb*ldq)+2] += (x3*h1);
q[(nb*ldq)+3] += (x4*h1);
}
#endif
void
double_hh_trafo_complex_avx_avx2_2hv_double_
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
);
static
__forceinline
void
hh_trafo_complex_kernel_6_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
);
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
);
static
__forceinline
void
hh_trafo_complex_kernel_2_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
);
/*
!f>#ifdef HAVE_AVX
!f> interface
!f> subroutine double_hh_trafo_complex_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> complex(kind=c_double) :: q(*)
!f> complex(kind=c_double) :: hh(pnb,2)
!f> end subroutine
!f> end interface
!f>#endif
*/
void
double_hh_trafo_complex_avx_avx2_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
*
pnb
,
int
*
pnq
,
int
*
pldq
,
int
*
pldh
)
{
int
i
;
int
nb
=
*
pnb
;
...
...
@@ -196,7 +110,7 @@ void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::
int
ldq
=
*
pldq
;
int
ldh
=
*
pldh
;
std
::
complex
<
double
>
s
=
conj
(
hh
[(
ldh
)
+
1
])
*
1.0
;
double
complex
s
=
conj
(
hh
[(
ldh
)
+
1
])
*
1
.
0
;
for
(
i
=
2
;
i
<
nb
;
i
++
)
{
s
+=
hh
[
i
-
1
]
*
conj
(
hh
[(
i
+
ldh
)]);
...
...
@@ -227,7 +141,7 @@ void double_hh_trafo_complex_avx_avx2_2hv_double_(std::complex<double>* q, std::
#endif
}
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
static
__forceinline
void
hh_trafo_complex_kernel_8_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -660,7 +574,7 @@ static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_double(std::complex<
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
12
],
q4
);
}
static
__forceinline
void
hh_trafo_complex_kernel_6_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
static
__forceinline
void
hh_trafo_complex_kernel_6_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -1013,7 +927,7 @@ static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_double(std::complex<
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
8
],
q3
);
}
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
static
__forceinline
void
hh_trafo_complex_kernel_4_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -1286,7 +1200,7 @@ static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_double(std::complex<
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
4
],
q2
);
}
static
__forceinline
void
hh_trafo_complex_kernel_2_AVX_2hv_double
(
std
::
complex
<
double
>*
q
,
std
::
complex
<
double
>
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
std
::
complex
<
double
>
s
)
static
__forceinline
void
hh_trafo_complex_kernel_2_AVX_2hv_double
(
double
complex
*
q
,
double
complex
*
hh
,
int
nb
,
int
ldq
,
int
ldh
,
double
complex
s
)
{
double
*
q_dbl
=
(
double
*
)
q
;
double
*
hh_dbl
=
(
double
*
)
hh
;
...
...
@@ -1478,4 +1392,3 @@ static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_double(std::complex<
_mm256_store_pd
(
&
q_dbl
[(
2
*
nb
*
ldq
)
+
0
],
q1
);
}
}
// extern C
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.c
pp
→
src/elpa2_kernels/elpa2_kernels_complex_avx-avx2_2hv_single_precision.c
View file @
62fe6edc
...
...
@@ -61,7 +61,7 @@
// --------------------------------------------------------------------------------------------------
#include
"config-f90.h"
#include
<complex>
#include
<complex
.h
>
#include
<x86intrin.h>
#define __forceinline __attribute__((always_inline))
...
...
@@ -82,15 +82,28 @@
#endif
extern
"C"
{