...
 
Commits (75)
This diff is collapsed.
...@@ -2,7 +2,8 @@ Changelog for upcoming release ...@@ -2,7 +2,8 @@ Changelog for upcoming release
- not yet decided - not yet decided
Changelog for ELPA 2019.05.001.rc1 Changelog for ELPA 2019.05.001
- elpa_print_kernels supports GPU usage - elpa_print_kernels supports GPU usage
- fix an error if PAPI measurements are activated - fix an error if PAPI measurements are activated
- new simple real kernels: block4 and block6 - new simple real kernels: block4 and block6
...@@ -21,6 +22,9 @@ been introduced a year ago. Removed routines: ...@@ -21,6 +22,9 @@ been introduced a year ago. Removed routines:
-> cholesky_real -> cholesky_real
-> cholesky_complex -> cholesky_complex
-> solve_tridi -> solve_tridi
- new kernels for ARM arch64 added
- fix an out-of-bound-error in elpa2
Changelog for ELPA 2018.11.001 Changelog for ELPA 2018.11.001
......
...@@ -893,62 +893,61 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \ ...@@ -893,62 +893,61 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ @top_srcdir@/src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
@top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ @top_srcdir@/src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
@top_srcdir@/src/elpa2/kernels/simple_template.F90 \ @top_srcdir@/src/elpa2/kernels/simple_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_template.F90 \ @top_srcdir@/src/elpa2/kernels/real_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/complex_template.F90 \ @top_srcdir@/src/elpa2/kernels/complex_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \ @top_srcdir@/src/elpa2/kernels/asm_x86_64_double_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_sparc64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \ @top_srcdir@/src/elpa2/kernels/asm_x86_64_single_precision.s \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_simple.F90 \ @top_srcdir@/src/elpa2/kernels/complex_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real.F90 \ @top_srcdir@/src/elpa2/kernels/real.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx-avx2_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \ @top_srcdir@/src/elpa2/kernels/mod_single_hh_trafo_real.F90 \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_template.c \ @top_srcdir@/src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_bgq.f90 \ @top_srcdir@/src/elpa2/kernels/real_bgq.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_2hv_template.c \ @top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_simple.F90 \ @top_srcdir@/src/elpa2/kernels/real_simple.F90 \
@top_srcdir@/src/elpa2/kernels/complex.F90 \ @top_srcdir@/src/elpa2/kernels/complex.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_2hv_template.c \
@top_srcdir@/src/elpa2/kernels/complex_avx512_1hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_4hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_bgp.f90 \ @top_srcdir@/src/elpa2/kernels/real_bgp.f90 \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/complex_sse_2hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_avx512_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \ @top_srcdir@/src/elpa2/kernels/real_sse_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sparc64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c \
@top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \ @top_srcdir@/src/elpa2/elpa2_compute_complex_template.F90 \
@top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \ @top_srcdir@/src/elpa2/elpa2_bandred_template.F90 \
@top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \ @top_srcdir@/src/elpa2/pack_unpack_gpu.F90 \
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
## Preamble ## ## Preamble ##
This file provides documentation on how to build the *ELPA* library in **version ELPA-2019.05.001.rc1**. This file provides documentation on how to build the *ELPA* library in **version ELPA-2019.05.001**.
With release of **version ELPA-2017.05.001** the build process has been significantly simplified, With release of **version ELPA-2017.05.001** the build process has been significantly simplified,
which makes it easier to install the *ELPA* library. which makes it easier to install the *ELPA* library.
...@@ -16,7 +16,7 @@ With release ELPA 2019.05.001 the legacy API is disabled by default, however, ...@@ -16,7 +16,7 @@ With release ELPA 2019.05.001 the legacy API is disabled by default, however,
can be still switched on at build time. can be still switched on at build time.
Most likely with the release ELPA 2019.11.001 the legacy API will be deprecated and not supported anymore. Most likely with the release ELPA 2019.11.001 the legacy API will be deprecated and not supported anymore.
The release of ELPA 2019.05.001.rc1 changes the ABI and API, since it allows to also build the C-functions with optional error arguments The release of ELPA 2019.05.001 changes the ABI and API, since it allows to also build the C-functions with optional error arguments
## How to install *ELPA* ## ## How to install *ELPA* ##
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
For more details and recent updates please visit the online [issue system] (https://gitlab.mpcdf.mpg.de/elpa/elpa/issues) For more details and recent updates please visit the online [issue system] (https://gitlab.mpcdf.mpg.de/elpa/elpa/issues)
Issues which are not mentioned in a newer release are (considered as) solved. Issues which are not mentioned in a newer release are (considered as) solved.
### ELPA 2019.11.001.rc1 release ### ### ELPA 2019.11.001 release ###
- same issues as in ELPA 2017.11.001 - same issues as in ELPA 2017.11.001
### ELPA 2018.11.001 release ### ### ELPA 2018.11.001 release ###
......
...@@ -110,6 +110,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -110,6 +110,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/complex_template.F90 \ src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \ src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \ src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/kernels/blas_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \ src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \ src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \ src/elpa2/compute_hh_trafo.F90 \
...@@ -194,9 +196,14 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL ...@@ -194,9 +196,14 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif endif
#if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
#endif endif
if WITH_REAL_BLAS_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_blas_block4.F90
endif
if WITH_REAL_BGP_KERNEL if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90 libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif endif
...@@ -227,6 +234,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL ...@@ -227,6 +234,13 @@ if WITH_REAL_SPARC64_BLOCK2_KERNEL
#endif #endif
endif endif
if WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_2hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK2_KERNEL if WITH_REAL_VSX_BLOCK2_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_2hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -269,6 +283,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL ...@@ -269,6 +283,13 @@ if WITH_REAL_SPARC64_BLOCK4_KERNEL
#endif #endif
endif endif
if WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_4hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK4_KERNEL if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -311,6 +332,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL ...@@ -311,6 +332,13 @@ if WITH_REAL_SPARC64_BLOCK6_KERNEL
#endif #endif
endif endif
if WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_neon_arch64_6hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL if WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL if WANT_SINGLE_PRECISION_REAL
...@@ -768,28 +796,16 @@ EXTRA_DIST = \ ...@@ -768,28 +796,16 @@ EXTRA_DIST = \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \ src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \ src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \ src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/kernels/complex_avx-avx2_1hv_template.c \ src/elpa2/kernels/complex_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/complex_avx-avx2_2hv_template.c \
src/elpa2/kernels/complex_avx512_1hv_template.c \
src/elpa2/kernels/complex_avx512_2hv_template.c \
src/elpa2/kernels/complex_sse_1hv_template.c \
src/elpa2/kernels/complex_sse_2hv_template.c \
src/elpa2/kernels/complex_template.F90 \ src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/real_avx-avx2_2hv_template.c \
src/elpa2/kernels/real_avx-avx2_4hv_template.c \
src/elpa2/kernels/real_avx-avx2_6hv_template.c \
src/elpa2/kernels/real_avx512_2hv_template.c \
src/elpa2/kernels/real_avx512_4hv_template.c \
src/elpa2/kernels/real_avx512_6hv_template.c \
src/elpa2/kernels/real_vsx_2hv_template.c \
src/elpa2/kernels/real_vsx_4hv_template.c \ src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \ src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_sse_2hv_template.c \ src/elpa2/kernels/real_128bit_256bit_512bit_BLOCK_template.c \
src/elpa2/kernels/real_sse_4hv_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_template.F90 \ src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \ src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \ src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/kernels/blas_block4_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \ src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \ src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \ src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
## Current Release ## ## Current Release ##
The current release is ELPA 2019.05.001.rc1 The current supported API version The current release is ELPA 2019.05.001 The current supported API version
is 20190501. This release supports the earliest API version 20170403. is 20190501. This release supports the earliest API version 20170403.
The old, obsolete legacy API will be deprecated in the future ! The old, obsolete legacy API will be deprecated in the future !
...@@ -76,6 +76,8 @@ No other conditions have to be met. ...@@ -76,6 +76,8 @@ No other conditions have to be met.
Nonetheless, we are grateful if you cite the following publications: Nonetheless, we are grateful if you cite the following publications:
If you use ELPA in general:
T. Auckenthaler, V. Blum, H.-J. Bungartz, T. Huckle, R. Johanni, T. Auckenthaler, V. Blum, H.-J. Bungartz, T. Huckle, R. Johanni,
L. Kr\"amer, B. Lang, H. Lederer, and P. R. Willems, L. Kr\"amer, B. Lang, H. Lederer, and P. R. Willems,
"Parallel solution of partial symmetric eigenvalue problems from "Parallel solution of partial symmetric eigenvalue problems from
...@@ -90,12 +92,20 @@ Nonetheless, we are grateful if you cite the following publications: ...@@ -90,12 +92,20 @@ Nonetheless, we are grateful if you cite the following publications:
Journal of Physics Condensed Matter, 26 (2014) Journal of Physics Condensed Matter, 26 (2014)
doi:10.1088/0953-8984/26/21/213201 doi:10.1088/0953-8984/26/21/213201
If you use the GPU version of ELPA:
Kus, P; Marek, A.; Lederer, H. Kus, P; Marek, A.; Lederer, H.
"GPU Optimization of Large-Scale Eigenvalue Solver", "GPU Optimization of Large-Scale Eigenvalue Solver",
In: Radu F., Kumar K., Berre I., Nordbotten J., Pop I. (eds) In: Radu F., Kumar K., Berre I., Nordbotten J., Pop I. (eds)
Numerical Mathematics and Advanced Applications ENUMATH 2017. ENUMATH 2017. Numerical Mathematics and Advanced Applications ENUMATH 2017. ENUMATH 2017.
Lecture Notes in Computational Science and Engineering, vol 126. Springer, Cham Lecture Notes in Computational Science and Engineering, vol 126. Springer, Cham
If you use the new API and/or autotuning:
Kus, P.; Marek, A.; Koecher, S. S.; Kowalski, H.-H.; Carbogno, Ch.; Scheurer, Ch.; Reuter, K.; Scheffler, M.; Lederer, H.
"Optimizations of the Eigensolvers in the ELPA Library",
Parallel Computing 85, 167-177 (2019)
## Installation of the *ELPA* library ## Installation of the *ELPA* library
...@@ -115,7 +125,7 @@ the possible configure options. ...@@ -115,7 +125,7 @@ the possible configure options.
## Using *ELPA* ## Using *ELPA*
Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online] Please have a look at the "**USERS_GUIDE**" file, to get a documentation or at the [online]
(http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) doxygen (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) doxygen
documentation, where you find the definition of the interfaces. documentation, where you find the definition of the interfaces.
## Contributing to *ELPA* ## Contributing to *ELPA*
......
This file contains the release notes for the ELPA 2019.05.001.rc1 version This file contains the release notes for the ELPA 2019.05.001 version
What is new? What is new?
------------- -------------
...@@ -9,6 +9,7 @@ For detailed information about changes since release ELPA 2018.11 please have a ...@@ -9,6 +9,7 @@ For detailed information about changes since release ELPA 2018.11 please have a
- C functions can have an optional error argument, if compiler supports this - C functions can have an optional error argument, if compiler supports this
=> ABI and API change => ABI and API change
- as announced, removal of deprecated routines - as announced, removal of deprecated routines
- new kernels for Arm arch64
ABI change ABI change
--------------------- ---------------------
......
...@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst ...@@ -146,7 +146,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program which prints all For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program which prints all
the available kernels. the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html)
for each *ELPA* release is available. for each *ELPA* release is available.
...@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst ...@@ -13,7 +13,7 @@ Local documentation (via man pages) should be available (if *ELPA* has been inst
For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all For example "man elpa2_print_kernels" should provide the documentation for the *ELPA* program, which prints all
the available kernels. the available kernels.
Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) Also a [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html)
for each *ELPA* release is available. for each *ELPA* release is available.
...@@ -200,7 +200,7 @@ The following table gives a list of all supported parameters which can be used t ...@@ -200,7 +200,7 @@ The following table gives a list of all supported parameters which can be used t
## III) List of computational routines ## ## III) List of computational routines ##
The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) for details. The following compute routines are available in *ELPA*: Please have a look at the man pages or [online doxygen documentation] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) for details.
| Name | Purpose | since API version | | Name | Purpose | since API version |
......
...@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts: ...@@ -22,7 +22,7 @@ The *ELPA* library consists of two main parts:
Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices. Both variants of the *ELPA* solvers are available for real or complex single and double precision valued matrices.
Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001.rc1/html/index.html) for details): Thus *ELPA* provides the following user functions (see man pages or [online] (http://elpa.mpcdf.mpg.de/html/Documentation/ELPA-2019.05.001/html/index.html) for details):
- elpa_get_communicators : set the row / column communicators for *ELPA* - elpa_get_communicators : set the row / column communicators for *ELPA*
- elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver - elpa_solve_evp_complex_1stage_{single|double} : solve a {single|double} precision complex eigenvalue problem with the *ELPA 1stage* solver
......
This diff is collapsed.
...@@ -125,7 +125,7 @@ then ...@@ -125,7 +125,7 @@ then
CLUSTER="draco" CLUSTER="draco"
fi fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS" echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
# GPU runners # GPU runners
if [ "$CI_RUNNER_TAGS" == "gpu" ] if [ "$CI_RUNNER_TAGS" == "gpu" ]
...@@ -144,16 +144,17 @@ then ...@@ -144,16 +144,17 @@ then
cat ./run_${CLUSTER}_1node_2GPU.sh cat ./run_${CLUSTER}_1node_2GPU.sh
echo " " echo " "
echo "Submitting to SLURM" echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node_2GPU.sh if sbatch -W ./run_${CLUSTER}_1node_2GPU.sh; then
exitCode=$? exitCode=$?
else
exitCode=$?
echo "Submission exited with exitCode $exitCode"
fi
echo " " #if (( $exitCode > 0 ))
echo "Exit Code of sbatch: $exitCode" #then
echo " "
if (( $exitCode > 0 ))
then
cat ./ELPA_CI_2gpu.err.* cat ./ELPA_CI_2gpu.err.*
fi #fi
fi fi
...@@ -174,24 +175,31 @@ then ...@@ -174,24 +175,31 @@ then
cat ./run_${CLUSTER}_1node.sh cat ./run_${CLUSTER}_1node.sh
echo " " echo " "
echo "Submitting to SLURM" echo "Submitting to SLURM"
sbatch -W ./run_${CLUSTER}_1node.sh if sbatch -W ./run_${CLUSTER}_1node.sh; then
exitCode=$? exitCode=$?
else
exitCode=$?
echo "Submission excited with exitCode $exitCode"
fi
echo " " echo " "
echo "Exit Code of sbatch: $exitCode" echo "Exit Code of sbatch: $exitCode"
echo " " echo " "
cat ./ELPA_CI.out.* cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ] #if [ $exitCode -ne 0 ]
then #then
cat ./ELPA_CI.err.* cat ./ELPA_CI.err.*
fi #fi
fi fi
if [ $exitCode -ne 0 ] #if [ $exitCode -ne 0 ]
#then
if [ -f ./test-suite.log ]
then then
cat ./test-suite.log cat ./test-suite.log
fi fi
#fi
exit $exitCode exit $exitCode
......
...@@ -102,7 +102,7 @@ then ...@@ -102,7 +102,7 @@ then
CLUSTER="draco" CLUSTER="draco"
fi fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS" echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#distcheck #distcheck
if [[ "$CI_RUNNER_TAGS" =~ "distcheck" ]] if [[ "$CI_RUNNER_TAGS" =~ "distcheck" ]]
...@@ -126,17 +126,21 @@ then ...@@ -126,17 +126,21 @@ then
echo "Exit Code of sbatch: $exitCode" echo "Exit Code of sbatch: $exitCode"
echo " " echo " "
cat ./ELPA_CI.out.* cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ] #if [ $exitCode -ne 0 ]
then #then
cat ./ELPA_CI.err.* cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi fi
fi fi
if [ $exitCode -ne 0 ] #if [ $exitCode -ne 0 ]
then #then
cat ./test-suite.log cat ./test-suite.log
fi #fi
exit $exitCode exit $exitCode
......
...@@ -120,7 +120,7 @@ then ...@@ -120,7 +120,7 @@ then
CLUSTER="draco" CLUSTER="draco"
fi fi
echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS" echo "Running on $CLUSTER with runner $CI_RUNNER_DESCRIPTION with tag $CI_RUNNER_TAGS on $mpiTasks tasks"
#project_test #project_test
if [[ "$CI_RUNNER_TAGS" =~ "project_test" ]] if [[ "$CI_RUNNER_TAGS" =~ "project_test" ]]
...@@ -129,18 +129,18 @@ then ...@@ -129,18 +129,18 @@ then
echo "mkdir -p build" >> ./run_${CLUSTER}_1node.sh echo "mkdir -p build" >> ./run_${CLUSTER}_1node.sh
echo "pushd build" >> ./run_${CLUSTER}_1node.sh echo "pushd build" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$configureArgs" >> ./run_${CLUSTER}_1node.sh echo "../configure " "$configureArgs" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh echo "export TASKS=$mpiTasks" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make install" >> ./run_${CLUSTER}_1node.sh echo "#Running make install" >> ./run_${CLUSTER}_1node.sh
echo "make install" >> ./run_${CLUSTER}_1node.sh echo "make install" >> ./run_${CLUSTER}_1node.sh
echo "popd" >> ./run_${CLUSTER}_1node.sh echo "popd" >> ./run_${CLUSTER}_1node.sh
echo "mkdir -p $projectName/build" >> ./run_${CLUSTER}_1node.sh echo "mkdir -p $projectName/build" >> ./run_${CLUSTER}_1node.sh
...@@ -149,19 +149,19 @@ then ...@@ -149,19 +149,19 @@ then
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo " Testting project " >> ./run_${CLUSTER}_1node.sh echo " #Testting project " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running autogen " >> ./run_${CLUSTER}_1node.sh echo "#Running autogen " >> ./run_${CLUSTER}_1node.sh
echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh echo "../autogen.sh" >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running configure " >> ./run_${CLUSTER}_1node.sh echo "#Running configure " >> ./run_${CLUSTER}_1node.sh
echo "../configure " "$projectConfigureArgs " >> ./run_${CLUSTER}_1node.sh echo "../configure " "$projectConfigureArgs " >> ./run_${CLUSTER}_1node.sh
echo " " >> ./run_${CLUSTER}_1node.sh echo " " >> ./run_${CLUSTER}_1node.sh
echo "Running make " >> ./run_${CLUSTER}_1node.sh echo "#Running make " >> ./run_${CLUSTER}_1node.sh
echo "make -j 8" >> ./run_${CLUSTER}_1node.sh echo "make -j 8" >> ./run_${CLUSTER}_1node.sh
echo "export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:\$LD_LIBRARY_PATH" >> ./run_${CLUSTER}_1node.sh echo "export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:\$LD_LIBRARY_PATH" >> ./run_${CLUSTER}_1node.sh
echo "./$projectExecutable" >> ./run_${CLUSTER}_1node.sh echo "./$projectExecutable" >> ./run_${CLUSTER}_1node.sh
...@@ -184,18 +184,17 @@ then ...@@ -184,18 +184,17 @@ then
echo "Exit Code of sbatch: $exitCode" echo "Exit Code of sbatch: $exitCode"
echo " " echo " "
cat ./ELPA_CI.out.* cat ./ELPA_CI.out.*
if [ $exitCode -ne 0 ] #if [ $exitCode -ne 0 ]
then #then
cat ./ELPA_CI.err.* cat ./ELPA_CI.err.*
#fi
if [ -f ./test-suite.log ]
then
cat ./test-suite.log
fi fi
fi fi
if [ $exitCode -ne 0 ]
then
cat ./test-suite.log
fi
exit $exitCode exit $exitCode
fi fi
...@@ -50,13 +50,15 @@ if test x$_cv_gnu_make_command = x ; then ...@@ -50,13 +50,15 @@ if test x$_cv_gnu_make_command = x ; then
AC_MSG_ERROR([Need GNU Make]) AC_MSG_ERROR([Need GNU Make])
fi fi
enable_legacy=no
AC_MSG_CHECKING(whether legacy interface should be provided) AC_MSG_CHECKING(whether legacy interface should be provided)
AC_ARG_ENABLE([legacy-interface], AC_ARG_ENABLE([legacy-interface],
AS_HELP_STRING([--enable-legacy-interface], AS_HELP_STRING([--enable-legacy-interface],
[build legacy API, default no]), [build legacy API, default no]),
[ [
if test x"$enableval" = x"yes"; then if test x"$enableval" = x"yes"; then
enable_legayc=yes enable_legacy=yes
else else
enable_legacy=no enable_legacy=no
fi fi
...@@ -227,9 +229,9 @@ fi ...@@ -227,9 +229,9 @@ fi
dnl check which MPI binray invokes a MPI job dnl check which MPI binray invokes a MPI job
if test x"$with_mpi" = x"yes"; then if test x"$with_mpi" = x"yes"; then
AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun], [no]) AC_CHECK_PROGS([MPI_BINARY], [mpiexec.hydra mpiexec mpirun poe runjob srun aprun], [no])
if test x"$MPI_BINARY" = x"no"; then if test x"$MPI_BINARY" = x"no"; then
AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun]) AC_MSG_ERROR([Could not find either of the MPI binaries: mpiexec.hydra, mpiexec, mpirun, poe, runjob, srun, aprun])
fi fi
fi fi
...@@ -613,8 +615,10 @@ m4_define(elpa_m4_generic_kernels, [ ...@@ -613,8 +615,10 @@ m4_define(elpa_m4_generic_kernels, [
real_generic real_generic
real_generic_simple real_generic_simple
real_generic_simple_block4 real_generic_simple_block4
real_generic_simple_block6
complex_generic complex_generic
complex_generic_simple complex_generic_simple
real_blas_block4
]) ])
m4_define(elpa_m4_sse_assembly_kernels, [ m4_define(elpa_m4_sse_assembly_kernels, [
...@@ -636,6 +640,12 @@ m4_define(elpa_m4_sparc64_kernels, [ ...@@ -636,6 +640,12 @@ m4_define(elpa_m4_sparc64_kernels, [
real_sparc64_block6 real_sparc64_block6
]) ])
m4_define(elpa_m4_neon_arch64_kernels, [
real_neon_arch64_block2
real_neon_arch64_block4
real_neon_arch64_block6
])
m4_define(elpa_m4_vsx_kernels, [ m4_define(elpa_m4_vsx_kernels, [
real_vsx_block2 real_vsx_block2
real_vsx_block4 real_vsx_block4
...@@ -681,7 +691,7 @@ m4_define(elpa_m4_gpu_kernels, [ ...@@ -681,7 +691,7 @@ m4_define(elpa_m4_gpu_kernels, [
complex_gpu complex_gpu
]) ])
m4_define(elpa_m4_kernel_types, [generic sparc64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu]) m4_define(elpa_m4_kernel_types, [generic sparc64 neon_arch64 vsx sse sse_assembly avx avx2 avx512 bgp bgq gpu])
m4_define(elpa_m4_all_kernels, m4_define(elpa_m4_all_kernels,
m4_foreach_w([elpa_m4_type], m4_foreach_w([elpa_m4_type],
...@@ -715,6 +725,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [ ...@@ -715,6 +725,7 @@ AC_DEFUN([ELPA_SELECT_KERNELS], [
dnl Modify list of kernels with configure arguments dnl Modify list of kernels with configure arguments
ELPA_SELECT_KERNELS([generic],[enable]) ELPA_SELECT_KERNELS([generic],[enable])
ELPA_SELECT_KERNELS([sparc64],[disable]) ELPA_SELECT_KERNELS([sparc64],[disable])
ELPA_SELECT_KERNELS([neon_arch64],[disable])
ELPA_SELECT_KERNELS([vsx],[disable]) ELPA_SELECT_KERNELS([vsx],[disable])
ELPA_SELECT_KERNELS([sse],[enable]) ELPA_SELECT_KERNELS([sse],[enable])
ELPA_SELECT_KERNELS([sse_assembly],[enable]) ELPA_SELECT_KERNELS([sse_assembly],[enable])
...@@ -730,7 +741,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -730,7 +741,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
]) ])
if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then if test x"${enable_bgp}" = x"yes" -o x"$enable_bgq" = x"yes"; then
m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [ m4_foreach_w([elpa_m4_kernel], elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_sse_kernels elpa_m4_avx_kernels elpa_m4_avx2_kernels elpa_m4_avx512_kernels, [
if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then if x"$use_[]elpa_m4_kernel[]" = x"yes" ; then
echo "Disabling elpa_m4_kernel due to BGP/BGQ option" echo "Disabling elpa_m4_kernel due to BGP/BGQ option"
fi fi
...@@ -790,7 +801,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[ ...@@ -790,7 +801,7 @@ AC_DEFUN([ELPA_KERNEL_DEPENDS],[
]) ])
fi fi
]) ])
m4_foreach_w([elpa_m4_arch],[sparc64 vsx sse avx avx2 avx512],[ m4_foreach_w([elpa_m4_arch],[sparc64 neon_arch64 vsx sse avx avx2 avx512],[
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block6], [real_]elpa_m4_arch[_block4 real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2]) ELPA_KERNEL_DEPENDS([real_]elpa_m4_arch[_block4], [real_]elpa_m4_arch[_block2])
ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1]) ELPA_KERNEL_DEPENDS([complex_]elpa_m4_arch[_block2], [complex_]elpa_m4_arch[_block1])
...@@ -848,7 +859,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -848,7 +859,7 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kind],[real complex],[ m4_foreach_w([elpa_m4_kind],[real complex],[
m4_foreach_w([elpa_m4_kernel], m4_foreach_w([elpa_m4_kernel],
m4_foreach_w([elpa_m4_cand_kernel], m4_foreach_w([elpa_m4_cand_kernel],
elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels, elpa_m4_avx512_kernels elpa_m4_avx2_kernels elpa_m4_avx_kernels elpa_m4_sse_kernels elpa_m4_sse_assembly_kernels elpa_m4_sparc64_kernels elpa_m4_neon_arch64_kernels elpa_m4_vsx_kernels elpa_m4_generic_kernels,
[m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ), [m4_bmatch(elpa_m4_cand_kernel,elpa_m4_kind,elpa_m4_cand_kernel)] ),
[ [
if test -z "$default_[]elpa_m4_kind[]_kernel"; then if test -z "$default_[]elpa_m4_kind[]_kernel"; then
...@@ -895,7 +906,6 @@ int main(int argc, char **argv) { ...@@ -895,7 +906,6 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU]) AC_DEFINE([HAVE_VSX_SSE],[1],[Altivec VSX intrinsics are supported on this CPU])
fi fi
if test x"${need_sparc64}" = x"yes"; then if test x"${need_sparc64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C) AC_MSG_CHECKING(whether we can compile SPARC64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...@@ -917,6 +927,27 @@ int main(int argc, char **argv) { ...@@ -917,6 +927,27 @@ int main(int argc, char **argv) {
AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU]) AC_DEFINE([HAVE_SPARC64_SSE],[1],[SPARC64 intrinsics are supported on this CPU])
fi fi
if test x"${need_neon_arch64}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile NEON ARCH64 with intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <arm_neon.h>
int main(int argc, char **argv) {
__Float64x2_t x1, x2, x3, x4;
x4 = vfmaq_f64(x1, x2, x3);
return 0;
}
])],
[can_compile_neon_arch64=yes],
[can_compile_neon_arch64=no]
)
AC_MSG_RESULT([${can_compile_neon_arch64}])
if test x"$can_compile_neon_arch64" != x"yes"; then
AC_MSG_ERROR([Could not compile test program, try with --disable-neon_arch64, or adjust the C compiler or CFLAGS])
fi
AC_DEFINE([HAVE_NEON_ARCH64_SSE],[1],[NEON_ARCH64 intrinsics are supported on this CPU])
fi
if test x"${need_sse}" = x"yes"; then if test x"${need_sse}" = x"yes"; then
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C) AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([ AC_COMPILE_IFELSE([AC_LANG_SOURCE([
...@@ -1502,12 +1533,12 @@ m4_foreach_w([elpa_m4_kind],[real complex],[ ...@@ -1502,12 +1533,12 @@ m4_foreach_w([elpa_m4_kind],[real complex],[
#echo "***********************************************************************" #echo "***********************************************************************"
#echo " " #echo " "
#echo " " #echo " "
echo "***********************************************************************" #echo "***********************************************************************"
echo "* This is a the first release candidate of ELPA 2019.05.001.rc1 *" #echo "* This is a the first release candidate of ELPA 2019.05.001.rc2 *"
echo "* There might be still some changes until the final release of *" #echo "* There might be still some changes until the final release of *"
echo "* ELPA 2019.05.001 *" #echo "* ELPA 2019.05.001 *"
echo "***********************************************************************" #echo "***********************************************************************"
echo " " #echo " "
if test x"$enable_kcomputer" = x"yes" ; then if test x"$enable_kcomputer" = x"yes" ; then
echo " " echo " "
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
%define with_openmp 0 %define with_openmp 0
Name: elpa Name: elpa
Version: 2019.05.001.rc1 Version: 2019.05.001
Release: 1 Release: 1
Summary: A massively parallel eigenvector solver Summary: A massively parallel eigenvector solver
License: LGPL-3.0 License: LGPL-3.0
......
...@@ -44,10 +44,15 @@ enum ELPA_SOLVERS { ...@@ -44,10 +44,15 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK2, 19, @ELPA_2STAGE_REAL_SPARC64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK4, 20, @ELPA_2STAGE_REAL_SPARC64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_SPARC64_BLOCK6, 21, @ELPA_2STAGE_REAL_SPARC64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2, 22, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4, 23, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \ X(ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6, 24, @ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) X(ELPA_2STAGE_REAL_VSX_BLOCK2, 25, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BLAS_BLOCK4, 30, @ELPA_2STAGE_REAL_BLAS_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \ #define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \ ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
This diff is collapsed.
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#if REALCASE==1
  ! Apply a block of four Householder vectors to q using BLAS calls only.
  !
  ! The four vectors are packed (negated) as the rows of the 4 x (nb+3)
  ! matrix h_mat, each row shifted by one column against the next one.
  ! The routine then performs
  !     q(1:nq, 1:nb+3) = q(1:nq, 1:nb+3) + matmul(w_comb, h_mat)
  ! where the nq x 4 matrix w_comb is assembled from
  ! -matmul(q, transpose(h_mat)) and the pairwise products kept in s_mat.
  !
  ! Arguments:
  !   useGPU    not referenced in this routine
  !   q         matrix updated in place; rows 1..nq and columns 1..nb+3
  !             are read and written
  !   hh        column i describes vector i: hh(1,i) is used as its scalar
  !             factor, hh(2:nb,i) as the entries following the unit entry
  !   nb        length of each Householder vector
  !   nq        number of rows of q that take part in the update
  !   ldq, ldh  leading dimensions of q and hh
  subroutine quad_hh_trafo_&
    &MATH_DATATYPE&
    &_blas_4hv_&
    &PRECISION&
    & (useGPU, q, hh, nb, nq, ldq, ldh)

    use precision
    implicit none
#include "../../general/precision_kinds.F90"

    logical, intent(in)                       :: useGPU
    integer(kind=ik), intent(in)              :: nb, nq, ldq, ldh

#ifdef USE_ASSUMED_SIZE
    real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
    real(kind=C_DATATYPE_KIND), intent(in)    :: hh(ldh,*)
#else
    real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+3)
    real(kind=C_DATATYPE_KIND), intent(in)    :: hh(1:ldh,1:6)
#endif

    real(kind=C_DATATYPE_KIND)                :: w_comb(nq, 4)
    real(kind=C_DATATYPE_KIND)                :: h_mat(4, nb+3)
    real(kind=C_DATATYPE_KIND)                :: s_mat(4, 4)

    integer(kind=ik)                          :: i

    ! Pack the four negated Householder vectors into h_mat.
    ! Row i gets -1 in column 5-i, followed by -hh(2:nb,i); every other
    ! entry stays zero. This reproduces the shifts
    !   row 1 -> columns 4..nb+3,  row 2 -> columns 3..nb+2,
    !   row 3 -> columns 2..nb+1,  row 4 -> columns 1..nb
    h_mat(:,:) = 0.0_rk
    do i = 1, 4
      h_mat(i, 5-i)        = -1.0_rk
      h_mat(i, 6-i:nb+4-i) = -hh(2:nb, i)
    enddo

    ! Pairwise dot products of the packed vectors.
    ! Only the lower triangle of s_mat is written by SYRK, and only its
    ! strictly lower part is read further down.
    ! TODO we do not need the diagonal, but how to do it with BLAS?
    !s_mat = - matmul(h_mat, transpose(h_mat))
    call PRECISION_SYRK('L', 'N', 4, nb+3, &
                        -ONE, h_mat, 4, &
                        ZERO, s_mat, 4)

    ! w_comb = - matmul(q(1:nq, 1:nb+3), transpose(h_mat))
    call PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
                        -ONE, q, ldq, &
                        h_mat, 4, &
                        ZERO, w_comb, nq)

    ! Scale the first column by the factor of the first vector
    !w_comb(1:nq,1) = hh(1,1) * w_comb(1:nq, 1)
    call PRECISION_SCAL(nq, hh(1,1), w_comb(1:nq, 1), 1)

    ! Columns 2..4 must be processed in increasing order: column i uses the
    ! already finished columns 1..i-1.
    do i = 2, 4
      ! w_comb(1:nq,i) = matmul(w_comb(1:nq,1:i-1), hh(1,i) * s_mat(i,1:i-1)) + hh(1,i) * w_comb(1:nq, i)
      call PRECISION_GEMV('N', nq, i-1, &
                          hh(1,i), w_comb(1:nq, 1:i-1), nq, &
                          s_mat(i,1:i-1), 1, &
                          hh(1,i), w_comb(1:nq,i), 1)
    enddo

    ! Accumulate the combined update into q
    !q(1:nq, 1:nb+3) = matmul(w_comb, h_mat) + q(1:nq, 1:nb+3)
    call PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
                        ONE, w_comb, nq, &
                        h_mat, 4, &
                        ONE, q, ldq)

  end subroutine
#endif
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_256
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx-avx2_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_256
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx-avx2_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx-avx2_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define VEC_SET AVX_256
#define BLOCK2 1
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx-avx2_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx512_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx512_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK1
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx512_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET AVX_512
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_avx512_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SSE_128
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_sse_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define BLOCK1 1
#define VEC_SET SSE_128
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_sse_1hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK1
#undef VEC_SET
#undef COMPLEXCASE #undef COMPLEXCASE
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_sse_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define VEC_SET SSE_128
#define BLOCK2 1
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "complex_sse_2hv_template.c" #include "complex_128bit_256bit_512bit_BLOCK_template.c"
#undef VEC_SET
#undef BLOCK2
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef COMPLEXCASE #undef COMPLEXCASE
This diff is collapsed.
This diff is collapsed.
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define REALCASE 1 #define REALCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "real_avx-avx2_2hv_template.c" #include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
#undef VEC_SET
#undef REALCASE #undef REALCASE
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
...@@ -48,8 +48,12 @@ ...@@ -48,8 +48,12 @@
#define REALCASE 1 #define REALCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#define BLOCK2 1
#define VEC_SET 256
#include "../../general/precision_macros.h" #include "../../general/precision_macros.h"
#include "real_avx-avx2_2hv_template.c" #include "real_128bit_256bit_512bit_BLOCK_template.c"
#undef BLOCK2
#undef VEC_SET
#undef REALCASE #undef REALCASE
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
...@@ -148,37 +148,6 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i ...@@ -148,37 +148,6 @@ void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, i
void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh); void double_hh_trafo_real_avx_avx2_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif #endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_real_avx_avx2_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx_avx2_2hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f> subroutine double_hh_trafo_real_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="double_hh_trafo_real_avx_avx2_2hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh) void double_hh_trafo_real_avx_avx2_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif #endif
......
This diff is collapsed.
This diff is collapsed.