Commit 53f2f2c6 authored by Andreas Marek's avatar Andreas Marek
Browse files

ELPA_2014.06 prepare release

Now it is possible
- to choose the kernel (real and complex independently) at run-time
  via environment variables, or
- to specify the kernel (real and complex independently) at run-time
  by specifying the kernel in the call to ELPA

This has a few implications
1) The ELPA 2014.06 release has a change in the API and is thus not
   binary compatible with previous versions
2) if no kernels are specified, a default kernel is chosen
3) if a wrong kernel is specified, a default kernel is chosen

For the sake of simplicity it is still possible to build ELPA with
support for only one kernel, as in previous versions. However, it is
still not binary compatible with previous versions
parent c090a89f
...@@ -23,123 +23,120 @@ else ...@@ -23,123 +23,120 @@ else
libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90 libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90
endif endif
if WITH_GENERIC_SIMPLE if WITH_REAL_GENERIC_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
src/elpa2_kernels/elpa2_kernels_real_simple.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
src/elpa2_kernels/elpa2_kernels_real_simple.f90
endif endif
endif endif
if WITH_GENERIC if WITH_COMPLEX_GENERIC_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
src/elpa2_kernels/elpa2_kernels_real.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
src/elpa2_kernels/elpa2_kernels_real.f90
endif endif
endif endif
if WITH_BGP if WITH_REAL_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90 endif
endif
endif endif
if WITH_BGQ if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
endif endif
endif endif
if WITH_SSE_AS if WITH_REAL_BGP_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
endif endif
endif endif
if WITH_AVX_SANDYBRIDGE if WITH_REAL_BGQ_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif endif
endif endif
if WITH_AMD_BULLDOZER if WITH_REAL_SSE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ endif
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp else
if WITH_COMPLEX_SSE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
endif endif
endif endif
if WITH_AVX_COMPLEX_BLOCK1
if WITH_REAL_AVX_BLOCK2_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
if WITH_AVX_COMPLEX_BLOCK2 if WITH_REAL_AVX_BLOCK4_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif endif
endif endif
if WITH_AVX_REAL_BLOCK2 if WITH_REAL_AVX_BLOCK6_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
endif endif
endif endif
if WITH_AVX_REAL_BLOCK4 if WITH_COMPLEX_AVX_BLOCK1_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
if WITH_AVX_REAL_BLOCK6 if WITH_COMPLEX_AVX_BLOCK2_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
#if WITH_AVX_SANDYBRIDGE
#if WITH_OPENMP
# libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#else
# libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
#endif
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
else else
...@@ -156,12 +153,24 @@ files_DATA = \ ...@@ -156,12 +153,24 @@ files_DATA = \
test/read_real.F90 \ test/read_real.F90 \
test/read_real_gen.F90 \ test/read_real_gen.F90 \
test/test_complex2.F90 \ test/test_complex2.F90 \
test/test_complex2_default_kernel.F90 \
test/test_complex2_choose_kernel_with_api.F90 \
test/test_complex.F90 \ test/test_complex.F90 \
test/test_complex_gen.F90 \ test/test_complex_gen.F90 \
test/test_real2.F90 \ test/test_real2.F90 \
test/test_real2_default_kernel.F90 \
test/test_real2_choose_kernel_with_api.F90 \
test/print_available_elpa2_kernels.F90 \
test/test_real.F90 \ test/test_real.F90 \
test/test_real_gen.F90 test/test_real_gen.F90
# test programs
if WITH_OPENMP
build_lib = libelpa_mt.la
else
build_lib = libelpa.la
endif
# pkg-config stuff # pkg-config stuff
pkgconfigdir = $(libdir)/pkgconfig pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = elpa.pc pkgconfig_DATA = elpa.pc
...@@ -174,7 +183,8 @@ build_lib = libelpa.la ...@@ -174,7 +183,8 @@ build_lib = libelpa.la
endif endif
#bindir = $(abs_top_builddir) #bindir = $(abs_top_builddir)
bin_PROGRAMS = test_real test_real2 test_complex test_complex2 bin_PROGRAMS = test_real test_real2 test_real2_default_kernel test_complex test_complex2 test_complex2_default_kernel test_real2_choose_kernel_with_api test_complex2_choose_kernel_with_api print_available_elpa2_kernels
test_real_SOURCES = test/test_real.F90 test_real_SOURCES = test/test_real.F90
test_real_LDADD = $(build_lib) test_real_LDADD = $(build_lib)
...@@ -182,14 +192,28 @@ test_real_LDADD = $(build_lib) ...@@ -182,14 +192,28 @@ test_real_LDADD = $(build_lib)
test_real2_SOURCES = test/test_real2.F90 test_real2_SOURCES = test/test_real2.F90
test_real2_LDADD = $(build_lib) test_real2_LDADD = $(build_lib)
test_real2_default_kernel_SOURCES = test/test_real2_default_kernel.F90
test_real2_default_kernel_LDADD = $(build_lib)
test_real2_choose_kernel_with_api_SOURCES = test/test_real2_choose_kernel_with_api.F90
test_real2_choose_kernel_with_api_LDADD = $(build_lib)
test_complex_SOURCES = test/test_complex.F90 test_complex_SOURCES = test/test_complex.F90
test_complex_LDADD = $(build_lib) test_complex_LDADD = $(build_lib)
test_complex2_SOURCES = test/test_complex2.F90 test_complex2_SOURCES = test/test_complex2.F90
test_complex2_LDADD = $(build_lib) test_complex2_LDADD = $(build_lib)
test_complex2_default_kernel_SOURCES = test/test_complex2_default_kernel.F90
test_complex2_default_kernel_LDADD = $(build_lib)
test_complex2_choose_kernel_with_api_SOURCES = test/test_complex2_choose_kernel_with_api.F90
test_complex2_choose_kernel_with_api_LDADD = $(build_lib)
print_available_elpa2_kernels_SOURCES = test/print_available_elpa2_kernels.F90
print_available_elpa2_kernels_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh check_SCRIPTS = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels.sh
TESTS = $(check_SCRIPTS) TESTS = $(check_SCRIPTS)
test_real.sh: test_real.sh:
...@@ -200,6 +224,14 @@ test_real2.sh: ...@@ -200,6 +224,14 @@ test_real2.sh:
echo "mpiexec -n 2 ./test_real2 > /dev/null 2>&1" > test_real2.sh echo "mpiexec -n 2 ./test_real2 > /dev/null 2>&1" > test_real2.sh
chmod +x test_real2.sh chmod +x test_real2.sh
test_real2_default_kernel.sh:
echo "mpiexec -n 2 ./test_real2_default_kernel > /dev/null 2>&1" > test_real2_default_kernel.sh
chmod +x test_real2_default_kernel.sh
test_real2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_real2_choose_kernel_with_api > /dev/null 2>&1" > test_real2_choose_kernel_with_api.sh
chmod +x test_real2_choose_kernel_with_api.sh
test_complex.sh: test_complex.sh:
echo "mpiexec -n 2 ./test_complex > /dev/null 2>&1" > test_complex.sh echo "mpiexec -n 2 ./test_complex > /dev/null 2>&1" > test_complex.sh
chmod +x test_complex.sh chmod +x test_complex.sh
...@@ -207,8 +239,25 @@ test_complex.sh: ...@@ -207,8 +239,25 @@ test_complex.sh:
test_complex2.sh: test_complex2.sh:
echo "mpiexec -n 2 ./test_complex2 > /dev/null 2>&1" > test_complex2.sh echo "mpiexec -n 2 ./test_complex2 > /dev/null 2>&1" > test_complex2.sh
chmod +x test_complex2.sh chmod +x test_complex2.sh
test_complex2_default_kernel.sh:
echo "mpiexec -n 2 ./test_complex2_default_kernel > /dev/null 2>&1" > test_complex2_default_kernel.sh
chmod +x test_complex2_default_kernel.sh
test_complex2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_complex2_choose_kernel_with_api > /dev/null 2>&1" > test_complex2_choose_kernel_with_api.sh
chmod +x test_complex2_choose_kernel_with_api.sh
print_available_elpa2_kernels.sh:
echo "./print_available_elpa2_kernels" > print_available_elpa2_kernels.sh
chmod +x print_available_elpa2_kernels.sh
elpa2.i: $(top_srcdir)/src/elpa2.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = test_real.sh test_real2.sh test_complex.sh test_complex2.sh CLEANFILES = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels
@FORTRAN_MODULE_DEPS@ @FORTRAN_MODULE_DEPS@
This diff is collapsed.
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
/* Define to 1 if you have the <dlfcn.h> header file. */ /* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H #undef HAVE_DLFCN_H
/* Fortran can query environment variables */
#undef HAVE_ENVIRONMENT_CHECKING
/* Define to 1 if you have the <inttypes.h> header file. */ /* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H #undef HAVE_INTTYPES_H
...@@ -67,45 +70,47 @@ ...@@ -67,45 +70,47 @@
/* Version number of package */ /* Version number of package */
#undef VERSION #undef VERSION
/* use kernel tuned for AVX on AMD Bulldozer (written in gcc assembler) */ /* can use complex_avx_block1 kernel */
#undef WITH_AMD_BULLDOZER #undef WITH_COMPLEX_AVX_BLOCK1_KERNEL
/* use AVX optimized complex kernel with blocking 1 (written in gcc assembler) /* can use complex_avx_block2 kernel */
*/ #undef WITH_COMPLEX_AVX_BLOCK2_KERNEL
#undef WITH_AVX_COMPLEX_BLOCK1
/* use AVX optimized complex kernel with blocking 2 (written in gcc assembler) /* can use complex generic kernel */
*/ #undef WITH_COMPLEX_GENERIC_KERNEL
#undef WITH_AVX_COMPLEX_BLOCK2
/* use AVX optimized real kernel with blocking 2 (written in gcc assembler) */ /* can use complex generic-simple kernel */
#undef WITH_AVX_REAL_BLOCK2 #undef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
/* use AVX optimized real kernel with blocking 4 (written in gcc assembler) */ /* can use complex SSE kernel */
#undef WITH_AVX_REAL_BLOCK4 #undef WITH_COMPLEX_SSE_KERNEL
/* use AVX optimized real kernel with blocking 6 (written in gcc assembler) */ /* use OpenMP threading */
#undef WITH_AVX_REAL_BLOCK6 #undef WITH_OPENMP
/* use kernel tuned for AVX on Intel Sandybridge (written in gcc assembler) */ /* can use real_avx_block2 kernel */
#undef WITH_AVX_SANDYBRIDGE #undef WITH_REAL_AVX_BLOCK2_KERNEL
/* use optimized kernel for IBM BG/P */ /* can use real_avx_block4 kernel */
#undef WITH_BGP #undef WITH_REAL_AVX_BLOCK4_KERNEL
/* use optimized kernel for IBM BG/Q */ /* can use real_avx_block6 kernel */
#undef WITH_BGQ #undef WITH_REAL_AVX_BLOCK6_KERNEL
/* use generic kernel for all architectures (with some hand-coded /* can use real BGP kernel */
optimizations) */ #undef WITH_REAL_BGP_KERNEL
#undef WITH_GENERIC
/* use generic simple kernel for all architectures (without any hand-coded /* can use real BGQ kernel */
optimizations) */ #undef WITH_REAL_BGQ_KERNEL
#undef WITH_GENERIC_SIMPLE
/* use OpenMP threading */ /* can use real generic kernel */
#undef WITH_OPENMP #undef WITH_REAL_GENERIC_KERNEL
/* can use real generic-simple kernel */
#undef WITH_REAL_GENERIC_SIMPLE_KERNEL
/* can use real SSE kernel */
#undef WITH_REAL_SSE_KERNEL
/* use kernel tuned for SSE (written in gcc assembler) */ /* use specific real kernel */
#undef WITH_SSE_AS #undef WITH_SPECIFIC_COMPLEX_KERNEL
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -130,7 +130,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>* ...@@ -130,7 +130,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_1hv(std::complex<double>*
} }
#endif // if 0 #endif // if 0
void single_hh_trafo_complex_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq) void single_hh_trafo_complex_sse_avx_1hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -179,7 +179,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>* ...@@ -179,7 +179,7 @@ static __forceinline void hh_trafo_complex_kernel_4_C_2hv(std::complex<double>*
} }
#endif #endif
void double_hh_trafo_complex_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_complex_sse_avx_2hv_(std::complex<double>* q, std::complex<double>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -77,12 +77,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int ...@@ -77,12 +77,12 @@ __forceinline void hh_trafo_kernel_8_SSE_2hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s); __forceinline void hh_trafo_kernel_12_SSE_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif #endif
void double_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_sse_avx_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#if 0 #if 0
void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
void double_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_real_sse_avx_2hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -75,12 +75,12 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int ...@@ -75,12 +75,12 @@ __forceinline void hh_trafo_kernel_4_SSE_4hv(double* q, double* hh, int nb, int
__forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4); __forceinline void hh_trafo_kernel_6_SSE_4hv(double* q, double* hh, int nb, int ldq, int ldh, double s_1_2, double s_1_3, double s_2_3, double s_1_4, double s_2_4, double s_3_4);
#endif #endif
void quad_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_real_sse_avx_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#if 0 #if 0
void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void quad_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
void quad_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) void quad_hh_trafo_real_sse_avx_4hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
...@@ -73,12 +73,12 @@ static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, in ...@@ -73,12 +73,12 @@ static void hh_trafo_kernel_2_SSE_6hv(double* q, double* hh, int nb, int ldq, in
static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods); static void hh_trafo_kernel_4_SSE_6hv(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
#endif #endif
void hexa_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_real_sse_avx_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#if 0 #if 0
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh); void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
void hexa_hh_trafo_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) void hexa_hh_trafo_real_sse_avx_6hv_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{ {
int i; int i;
int nb = *pnb; int nb = *pnb;
......
This diff is collapsed.
...@@ -161,30 +161,23 @@ program test_complex2 ...@@ -161,30 +161,23 @@ program test_complex2
if (myid .eq. 0) then if (myid .eq. 0) then
print *," " print *," "
print *,"This ELPA2 is build with" print *,"This ELPA2 is build with"
#ifdef WITH_AVX_COMPLEX_BLOCK2 #ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for complex matrices" print *,"AVX optimized kernel (2 blocking) for complex matrices"
#endif #endif
#ifdef WITH_AVX_COMPLEX_BLOCK1 #ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
print *,"AVX optimized kernel (1 blocking) for complex matrices" print *,"AVX optimized kernel (1 blocking) for complex matrices"
#endif #endif
#ifdef WITH_AVX_SANDYBRIDGE
print *,"AVX SANDYBRIDGE optimized kernel for complex matrices" #ifdef WITH_COMPLEX_GENERIC_KERNEL
#endif
#ifdef WITH_GENERIC
print *,"GENERIC kernel for complex matrices" print *,"GENERIC kernel for complex matrices"
#endif #endif
#ifdef WITH_GENERIC_SIMPLE #ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
print *,"GENERIC SIMPLE kernel for complex matrices" print *,"GENERIC SIMPLE kernel for complex matrices"
#endif #endif
#ifdef WITH_SSE_AS #ifdef WITH_COMPLEX_SSE_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices" print *,"SSE ASSEMBLER kernel for complex matrices"
#endif #endif
#ifdef WITH_BGP