Commit c0d57a28 authored by Andreas Marek

Merge branch 'elpa_interface'

parents d84a25fe 88156480
before_script:
- export LANG=C
- ulimit -s unlimited
- if [ $HOST != "hydra03" -a $HOST != "hydra04" -a $HOST != "hydra05" -a $HOST != "hydra06" -a $HOST != "hydra07" ] ; then module load impi/5.1.3 intel/16.0 gcc/4.9 mkl/11.3 autotools pkg-config ; fi
- module list
- export MKL_INTEL_SCALAPACK_MPI_NO_OMP_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm"
......
......@@ -17,6 +17,7 @@ libelpa@SUFFIX@_public_la_SOURCES = \
src/elpa1/elpa1_auxiliary.F90 \
src/elpa1/elpa1_utilities.F90 \
src/elpa2/elpa2_utilities.F90 \
src/elpa_t.F90 \
src/elpa_utilities.F90
# internal parts
......@@ -43,7 +44,8 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/qr/qr_utils.F90 \
src/elpa2/qr/elpa_qrkernels.F90 \
src/elpa2/qr/elpa_pdlarfb.F90 \
src/elpa2/qr/elpa_pdgeqrf.F90
src/elpa2/qr/elpa_pdgeqrf.F90 \
src/elpa_options.c
EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1/elpa_reduce_add_vectors.X90 \
......@@ -308,7 +310,7 @@ BUILT_SOURCES = $(generated_headers)
# install public Fortran modules files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
nobase_elpa_include_HEADERS = $(wildcard modules/*)
nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h
nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_solver_constants.h elpa/elpa_constants.h elpa/elpa_generated.h
dist_man_MANS = \
man/solve_evp_real.3 \
......@@ -380,6 +382,7 @@ dist_files_DATA = \
test/Fortran/test_invert_trm_real.F90 \
test/Fortran/test_cholesky_complex.F90 \
test/Fortran/test_invert_trm_complex.F90 \
test/Fortran/test_new_interface.F90 \
test/Fortran/elpa_tests.F90 \
src/elpa2/elpa2_print_kernels.F90
......@@ -407,7 +410,6 @@ noinst_PROGRAMS = \
elpa2_test_complex@SUFFIX@ \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_banded@SUFFIX@ \
elpa_driver_real@SUFFIX@ \
elpa_driver_complex@SUFFIX@ \
......@@ -424,6 +426,7 @@ noinst_PROGRAMS = \
elpa2_test_real_c_version@SUFFIX@ \
elpa2_test_complex_c_version@SUFFIX@ \
elpa_driver_real_c_version@SUFFIX@ \
elpa_test_new_interface@SUFFIX@ \
elpa_driver_complex_c_version@SUFFIX@
if WANT_SINGLE_PRECISION_COMPLEX
......@@ -496,6 +499,11 @@ libelpatest@SUFFIX@_la_SOURCES += \
test/shared/redir.c \
test/shared/redirect.F90
endif
# elpa_test_new_interface: test program built from a single Fortran source and
# linked against $(build_lib) and $(FCLIBS).
# FCFLAGS adds "private_modules" through the @FC_MODOUT@ / @FC_MODINC@
# substitutions -- presumably the compiler's module-output and module-include
# flags; TODO confirm against configure.ac.
# The .X90 file is listed as an extra dependency only; it is not in _SOURCES.
elpa_test_new_interface@SUFFIX@_SOURCES = test/Fortran/test_new_interface.F90
elpa_test_new_interface@SUFFIX@_LDADD = $(build_lib) $(FCLIBS)
elpa_test_new_interface@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa_test_new_interface@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_real_c_version@SUFFIX@_SOURCES = test/C/elpa1_test_real_c_version.c
elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) $(FCLIBS)
......@@ -826,6 +834,7 @@ check_SCRIPTS = \
elpa2_test_real_c_version@SUFFIX@.sh \
elpa2_test_complex_c_version@SUFFIX@.sh \
elpa_driver_real_c_version@SUFFIX@.sh \
elpa_test_new_interface@SUFFIX@.sh \
elpa_driver_complex_c_version@SUFFIX@.sh
if WANT_SINGLE_PRECISION_REAL
......
#include <elpa/elpa_kernel_constants.h>
#ifndef ELPA_H
#define ELPA_H
#include <limits.h>
#include <elpa/elpa_constants.h>
#include <elpa/elpa_generated.h>
#endif
#define ELPA_INVALID_INT INT_MIN
#define ELPA_C_ERROR 0
#define ELPA_C_OK 1
#ifdef ELPA_H /* un-prefixed aliases, only when ELPA_H is defined; note the polarity: ERROR is 0, OK is 1 */
#define ELPA_ERROR ELPA_C_ERROR
#define ELPA_OK ELPA_C_OK
#endif /* ELPA_H */
#include <elpa/elpa_kernel_constants.h>
#include <elpa/elpa_solver_constants.h>
#define ELPA2_REAL_KERNEL_GENERIC 1
#define ELPA2_REAL_KERNEL_GENERIC_SIMPLE 2
#define ELPA2_REAL_KERNEL_BGP 3
#define ELPA2_REAL_KERNEL_BGQ 4
#define ELPA2_REAL_KERNEL_SSE 5
#define ELPA2_REAL_KERNEL_SSE_BLOCK2 6
#define ELPA2_REAL_KERNEL_SSE_BLOCK4 7
#define ELPA2_REAL_KERNEL_SSE_BLOCK6 8
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 9
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 10
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 11
#define ELPA2_REAL_KERNEL_AVX2_BLOCK2 12
#define ELPA2_REAL_KERNEL_AVX2_BLOCK4 13
#define ELPA2_REAL_KERNEL_AVX2_BLOCK6 14
#define ELPA2_REAL_KERNEL_AVX512_BLOCK2 15
#define ELPA2_REAL_KERNEL_AVX512_BLOCK4 16
#define ELPA2_REAL_KERNEL_AVX512_BLOCK6 17
#define ELPA2_REAL_KERNEL_GPU 18
#define ELPA2_NUMBER_OF_REAL_KERNELS 18
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
#define ELPA2_COMPLEX_KERNEL_GENERIC_SIMPLE 2
#define ELPA2_COMPLEX_KERNEL_BGP 3
#define ELPA2_COMPLEX_KERNEL_BGQ 4
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_SSE_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 8
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 9
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK1 10
#define ELPA2_COMPLEX_KERNEL_AVX2_BLOCK2 11
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK1 12
#define ELPA2_COMPLEX_KERNEL_AVX512_BLOCK2 13
#define ELPA2_COMPLEX_KERNEL_GPU 14
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 14
#define ELPA_C_2STAGE_REAL_GENERIC 1
#define ELPA_C_2STAGE_REAL_GENERIC_SIMPLE 2
#define ELPA_C_2STAGE_REAL_BGP 3
#define ELPA_C_2STAGE_REAL_BGQ 4
#define ELPA_C_2STAGE_REAL_SSE 5
#define ELPA_C_2STAGE_REAL_SSE_BLOCK2 6
#define ELPA_C_2STAGE_REAL_SSE_BLOCK4 7
#define ELPA_C_2STAGE_REAL_SSE_BLOCK6 8
#define ELPA_C_2STAGE_REAL_AVX_BLOCK2 9
#define ELPA_C_2STAGE_REAL_AVX_BLOCK4 10
#define ELPA_C_2STAGE_REAL_AVX_BLOCK6 11
#define ELPA_C_2STAGE_REAL_AVX2_BLOCK2 12
#define ELPA_C_2STAGE_REAL_AVX2_BLOCK4 13
#define ELPA_C_2STAGE_REAL_AVX2_BLOCK6 14
#define ELPA_C_2STAGE_REAL_AVX512_BLOCK2 15
#define ELPA_C_2STAGE_REAL_AVX512_BLOCK4 16
#define ELPA_C_2STAGE_REAL_AVX512_BLOCK6 17
#define ELPA_C_2STAGE_REAL_GPU 18
#define ELPA_C_2STAGE_NUMBER_OF_REAL_KERNELS 18
#define ELPA_C_2STAGE_COMPLEX_GENERIC 1
#define ELPA_C_2STAGE_COMPLEX_GENERIC_SIMPLE 2
#define ELPA_C_2STAGE_COMPLEX_BGP 3
#define ELPA_C_2STAGE_COMPLEX_BGQ 4
#define ELPA_C_2STAGE_COMPLEX_SSE 5
#define ELPA_C_2STAGE_COMPLEX_SSE_BLOCK1 6
#define ELPA_C_2STAGE_COMPLEX_SSE_BLOCK2 7
#define ELPA_C_2STAGE_COMPLEX_AVX_BLOCK1 8
#define ELPA_C_2STAGE_COMPLEX_AVX_BLOCK2 9
#define ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK1 10
#define ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK2 11
#define ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK1 12
#define ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK2 13
#define ELPA_C_2STAGE_COMPLEX_GPU 14
#define ELPA_C_2STAGE_NUMBER_OF_COMPLEX_KERNELS 14
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX2_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK6
#else
#ifdef WITH_REAL_AVX2_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK4
#else
#ifdef WITH_REAL_AVX2_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK6
#else
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK4
#else
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL /* BGP kernel selected: default to ELPA_C_2STAGE_REAL_BGP (no *_AVX_BGP ID exists) */
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_BGP
#endif /* WITH_REAL_BGP_KERNEL */
#ifdef WITH_REAL_BGQ_KERNEL /* BGQ kernel selected: default to ELPA_C_2STAGE_REAL_BGQ (no *_AVX_BGQ ID exists) */
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_BGQ
#endif /* WITH_REAL_BGQ_KERNEL */
#ifdef WITH_GPU_VERSION
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else /* WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE
#endif
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#ifdef WITH_REAL_SSE_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK6
#else
#ifdef WITH_REAL_SSE_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK4
#else
#ifdef WITH_REAL_SSE_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_SSE_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_SSE_BLOCK2_KERNEL) || defined(WITH_REAL_SSE_BLOCK4_KERNEL) || defined(WITH_REAL_SSE_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK6
#else
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK4
#else
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX2_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK6
#else
#ifdef WITH_REAL_AVX2_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK4
#else
#ifdef WITH_REAL_AVX2_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX2_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX2_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL) */
#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#ifdef WITH_REAL_AVX512_BLOCK6_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK6
#else
#ifdef WITH_REAL_AVX512_BLOCK4_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK4
#else
#ifdef WITH_REAL_AVX512_BLOCK2_KERNEL
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_AVX512_BLOCK2
#endif
#endif
#endif
#endif /* #if defined(WITH_REAL_AVX512_BLOCK2_KERNEL) || defined(WITH_REAL_AVX512_BLOCK4_KERNEL) || defined(WITH_REAL_AVX512_BLOCK6_KERNEL) */
#ifdef WITH_REAL_BGP_KERNEL /* BGP kernel selected: default to ELPA_C_2STAGE_REAL_BGP (no *_AVX_BGP ID exists) */
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_BGP
#endif /* WITH_REAL_BGP_KERNEL */
#ifdef WITH_REAL_BGQ_KERNEL /* BGQ kernel selected: default to ELPA_C_2STAGE_REAL_BGQ (no *_AVX_BGQ ID exists) */
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_BGQ
#endif /* WITH_REAL_BGQ_KERNEL */
#ifdef WITH_GPU_VERSION
#define ELPA_C_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_GENERIC_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE_BLOCK2
#else /* BLOCK2 not enabled: fall back to BLOCK1 if it is enabled */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE_BLOCK1
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX2_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX2_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX512_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX512_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#ifdef WITH_COMPLEX_GENERIC_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE
#endif
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_SSE_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE_BLOCK2
#else /* BLOCK2 not enabled: fall back to BLOCK1 if it is enabled */
#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_SSE_BLOCK1
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL) || defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX2_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX2_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL) */
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#ifdef WITH_COMPLEX_AVX512_BLOCK2_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK2
#else
#ifdef WITH_COMPLEX_AVX512_BLOCK1_KERNEL
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK1
#endif
#endif
#endif /* defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) */
#ifdef WITH_GPU_VERSION
#define ELPA_C_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifdef ELPA_H /* only when ELPA_H is defined: map each ELPA_2STAGE_* name onto its ELPA_C_2STAGE_* counterpart */
#define ELPA_2STAGE_REAL_GENERIC ELPA_C_2STAGE_REAL_GENERIC
#define ELPA_2STAGE_REAL_GENERIC_SIMPLE ELPA_C_2STAGE_REAL_GENERIC_SIMPLE
#define ELPA_2STAGE_REAL_BGP ELPA_C_2STAGE_REAL_BGP
#define ELPA_2STAGE_REAL_BGQ ELPA_C_2STAGE_REAL_BGQ
#define ELPA_2STAGE_REAL_SSE ELPA_C_2STAGE_REAL_SSE
#define ELPA_2STAGE_REAL_SSE_BLOCK2 ELPA_C_2STAGE_REAL_SSE_BLOCK2
#define ELPA_2STAGE_REAL_SSE_BLOCK4 ELPA_C_2STAGE_REAL_SSE_BLOCK4
#define ELPA_2STAGE_REAL_SSE_BLOCK6 ELPA_C_2STAGE_REAL_SSE_BLOCK6
#define ELPA_2STAGE_REAL_AVX_BLOCK2 ELPA_C_2STAGE_REAL_AVX_BLOCK2
#define ELPA_2STAGE_REAL_AVX_BLOCK4 ELPA_C_2STAGE_REAL_AVX_BLOCK4
#define ELPA_2STAGE_REAL_AVX_BLOCK6 ELPA_C_2STAGE_REAL_AVX_BLOCK6
#define ELPA_2STAGE_REAL_AVX2_BLOCK2 ELPA_C_2STAGE_REAL_AVX2_BLOCK2
#define ELPA_2STAGE_REAL_AVX2_BLOCK4 ELPA_C_2STAGE_REAL_AVX2_BLOCK4
#define ELPA_2STAGE_REAL_AVX2_BLOCK6 ELPA_C_2STAGE_REAL_AVX2_BLOCK6
#define ELPA_2STAGE_REAL_AVX512_BLOCK2 ELPA_C_2STAGE_REAL_AVX512_BLOCK2
#define ELPA_2STAGE_REAL_AVX512_BLOCK4 ELPA_C_2STAGE_REAL_AVX512_BLOCK4
#define ELPA_2STAGE_REAL_AVX512_BLOCK6 ELPA_C_2STAGE_REAL_AVX512_BLOCK6
#define ELPA_2STAGE_REAL_GPU ELPA_C_2STAGE_REAL_GPU
#define ELPA_2STAGE_REAL_DEFAULT ELPA_C_2STAGE_REAL_DEFAULT
#define ELPA_2STAGE_NUMBER_OF_REAL_KERNELS ELPA_C_2STAGE_NUMBER_OF_REAL_KERNELS
#define ELPA_2STAGE_COMPLEX_GENERIC ELPA_C_2STAGE_COMPLEX_GENERIC
#define ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE ELPA_C_2STAGE_COMPLEX_GENERIC_SIMPLE
#define ELPA_2STAGE_COMPLEX_BGP ELPA_C_2STAGE_COMPLEX_BGP
#define ELPA_2STAGE_COMPLEX_BGQ ELPA_C_2STAGE_COMPLEX_BGQ
#define ELPA_2STAGE_COMPLEX_SSE ELPA_C_2STAGE_COMPLEX_SSE
#define ELPA_2STAGE_COMPLEX_SSE_BLOCK1 ELPA_C_2STAGE_COMPLEX_SSE_BLOCK1
#define ELPA_2STAGE_COMPLEX_SSE_BLOCK2 ELPA_C_2STAGE_COMPLEX_SSE_BLOCK2
#define ELPA_2STAGE_COMPLEX_AVX_BLOCK1 ELPA_C_2STAGE_COMPLEX_AVX_BLOCK1
#define ELPA_2STAGE_COMPLEX_AVX_BLOCK2 ELPA_C_2STAGE_COMPLEX_AVX_BLOCK2
#define ELPA_2STAGE_COMPLEX_AVX2_BLOCK1 ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK1
#define ELPA_2STAGE_COMPLEX_AVX2_BLOCK2 ELPA_C_2STAGE_COMPLEX_AVX2_BLOCK2
#define ELPA_2STAGE_COMPLEX_AVX512_BLOCK1 ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK1
#define ELPA_2STAGE_COMPLEX_AVX512_BLOCK2 ELPA_C_2STAGE_COMPLEX_AVX512_BLOCK2
#define ELPA_2STAGE_COMPLEX_GPU ELPA_C_2STAGE_COMPLEX_GPU
#define ELPA_2STAGE_COMPLEX_DEFAULT ELPA_C_2STAGE_COMPLEX_DEFAULT
#define ELPA_2STAGE_NUMBER_OF_COMPLEX_KERNELS ELPA_C_2STAGE_NUMBER_OF_COMPLEX_KERNELS
#endif /* ELPA_H */
#define ELPA_C_SOLVER_1STAGE 1
#define ELPA_C_SOLVER_2STAGE 2
#define ELPA_C_NUMBER_OF_SOLVERS 2
#ifdef ELPA_H /* only when ELPA_H is defined: un-prefixed aliases for the three solver constants above */
#define ELPA_SOLVER_1STAGE ELPA_C_SOLVER_1STAGE
#define ELPA_SOLVER_2STAGE ELPA_C_SOLVER_2STAGE
#define ELPA_NUMBER_OF_SOLVERS ELPA_C_NUMBER_OF_SOLVERS
#endif /* ELPA_H */
......@@ -249,8 +249,8 @@
! Rearrange eigenvectors
call resort_ev_&
&PRECISION &
(idx, na)
&PRECISION &
(idx, na)
call timer%stop("merge_systems" // PRECISION_SUFFIX)
......@@ -343,8 +343,8 @@
qtrans(1,1) = C; qtrans(1,2) =-S
qtrans(2,1) = S; qtrans(2,2) = C
call transform_columns_&
&PRECISION &
(idx(i), idx1(na1))
&PRECISION &
(idx(i), idx1(na1))
if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
......@@ -385,22 +385,22 @@
if (na1==1) then
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2
call timer%start("blas")
call timer%start("blas")
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call timer%stop("blas")
call timer%stop("blas")
call transform_columns_&
&PRECISION&
&(idx1(1), idx1(2))
&PRECISION&
&(idx1(1), idx1(2))
endif
! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2)
! Calculate arrangement of all eigenvalues in output
call timer%start("blas")
call timer%start("blas")
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call timer%stop("blas")
call timer%stop("blas")
! Rearrange eigenvalues
tmp = d
......@@ -418,8 +418,8 @@
endif
enddo
call resort_ev_&
&PRECISION&
&(idxq1, na)
&PRECISION&
&(idxq1, na)
else if (na1>2) then
! Solve secular equation
......@@ -441,16 +441,16 @@
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call timer%start("blas")
call timer%start("blas")
call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
call timer%stop("blas")
call timer%stop("blas")
if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
! use the more stable bisection algorithm in solve_secular_equation
! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
call solve_secular_equation_&
&PRECISION&
&(na1, i, d1, z1, delta, rho, s)