Commit b6e1b918 authored by Andreas Marek

Merge branch 'master_fix_up' into ELPA_KNL

parents aebf900d 303c02e6
...@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90 \ src/elpa1_compute_template.X90 \
src/elpa2_compute_real_template.X90 \ src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \ src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_real_template.X90 \ src/elpa2_template.X90 \
src/elpa2_bandred_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \ src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \ src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \ src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \ src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_bandred_complex_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \ src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \ src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \ src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \ src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \ src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \ src/redist_band.X90 \
src/precision_macros.h \ src/precision_macros.h
src/precision_macros_complex.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION) libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION)
...@@ -356,6 +353,7 @@ dist_files_DATA = \ ...@@ -356,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_real2_default.F90 \ test/Fortran/test_real2_default.F90 \
test/Fortran/test_real2_qr.F90 \ test/Fortran/test_real2_qr.F90 \
test/Fortran/test_real2_api.F90 \ test/Fortran/test_real2_api.F90 \
test/Fortran/test_real2_banded.F90 \
test/Fortran/test_real.F90 \ test/Fortran/test_real.F90 \
test/Fortran/test_real_with_c.F90 \ test/Fortran/test_real_with_c.F90 \
test/Fortran/test_toeplitz.F90 \ test/Fortran/test_toeplitz.F90 \
...@@ -386,6 +384,7 @@ noinst_PROGRAMS = \ ...@@ -386,6 +384,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@ \ elpa2_test_real_default@SUFFIX@ \
elpa2_test_real_qr@SUFFIX@ \ elpa2_test_real_qr@SUFFIX@ \
elpa2_test_real_api@SUFFIX@ \ elpa2_test_real_api@SUFFIX@ \
elpa2_test_real_banded@SUFFIX@ \
elpa2_test_complex@SUFFIX@ \ elpa2_test_complex@SUFFIX@ \
elpa2_test_complex_default@SUFFIX@ \ elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \ elpa2_test_complex_api@SUFFIX@ \
...@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib) ...@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules elpa2_test_real_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90 EXTRA_elpa2_test_real_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@_SOURCES = test/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@_SOURCES = test/Fortran/test_complex.F90 elpa1_test_complex@SUFFIX@_SOURCES = test/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@_LDADD = $(build_lib) elpa1_test_complex@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules elpa1_test_complex@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
...@@ -772,6 +776,7 @@ check_SCRIPTS = \ ...@@ -772,6 +776,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh \ elpa2_test_real_qr@SUFFIX@.sh \
elpa2_test_complex_default@SUFFIX@.sh \ elpa2_test_complex_default@SUFFIX@.sh \
elpa2_test_real_api@SUFFIX@.sh \ elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \ elpa2_test_complex_api@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \ elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \ elpa_driver_complex@SUFFIX@.sh \
...@@ -942,18 +947,15 @@ EXTRA_DIST = \ ...@@ -942,18 +947,15 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90 \ src/elpa1_tridiag_template.X90 \
src/elpa2_compute_real_template.X90 \ src/elpa2_compute_real_template.X90 \
src/elpa2_compute_complex_template.X90 \ src/elpa2_compute_complex_template.X90 \
src/elpa2_bandred_complex_template.X90 \ src/elpa2_bandred_template.X90 \
src/elpa2_bandred_real_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \ src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_symm_matrix_allreduce_real_template.X90 \ src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_complex_template.X90 \ src/elpa2_template.X90 \
src/elpa2_trans_ev_band_to_full_real_template.X90 \ src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \ src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \ src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_tridiag_band_complex_template.X90 \
src/elpa2_tridiag_band_real_template.X90 \
src/precision_macros.h \ src/precision_macros.h \
src/precision_macros_complex.h \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \ src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \ src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \ src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
......
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
import sys import sys
simple_tokens = [ simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION", "elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION", "elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION", "bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION", "trans_ev_band_to_full_NUMBER_PRECISION",
"tridiag_band_NUMBER_PRECISION", "tridiag_band_NUMBER_PRECISION",
...@@ -15,10 +15,11 @@ simple_tokens = [ ...@@ -15,10 +15,11 @@ simple_tokens = [
"solve_tridi_PRECISION", "solve_tridi_PRECISION",
"solve_tridi_col_PRECISION", "solve_tridi_col_PRECISION",
"solve_tridi_single_problem_PRECISION", "solve_tridi_single_problem_PRECISION",
"solve_evp_NUMBER_2stage_PRECISION",
"qr_pdgeqrf_2dcomm_PRECISION", "qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION", "hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION", "symm_matrix_allreduce_PRECISION",
"herm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION", "redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION", "unpack_row_NUMBER_cpu_PRECISION",
"unpack_row_NUMBER_cpu_openmp_PRECISION", "unpack_row_NUMBER_cpu_openmp_PRECISION",
...@@ -45,9 +46,19 @@ simple_tokens = [ ...@@ -45,9 +46,19 @@ simple_tokens = [
"global_product_PRECISION", "global_product_PRECISION",
"add_tmp_PRECISION", "add_tmp_PRECISION",
"v_add_s_PRECISION", "v_add_s_PRECISION",
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION",
"compute_hh_trafo_NUMBER_gpu_PRECISION",
"launch_my_pack_c_kernel_NUMBER_PRECISION",
"launch_my_unpack_c_kernel_NUMBER_PRECISION",
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS",
"UPCASENUMBER_ELPA_KERNEL_GENERIC",
"DEFAULT_UPCASENUMBER_ELPA_KERNEL",
"UPCASENUMBER_ELPA_KERNEL_NAMES",
"UPCASENUMBER_ELPA_KERNEL_GPU",
] ]
blas_tokens = [ blas_tokens = [
"PRECISION_GEMV", "PRECISION_GEMV",
"PRECISION_TRMV", "PRECISION_TRMV",
...@@ -57,6 +68,8 @@ blas_tokens = [ ...@@ -57,6 +68,8 @@ blas_tokens = [
"PRECISION_SYRK", "PRECISION_SYRK",
"PRECISION_SYMV", "PRECISION_SYMV",
"PRECISION_SYMM", "PRECISION_SYMM",
"PRECISION_HEMV",
"PRECISION_HER2",
"PRECISION_SYR2", "PRECISION_SYR2",
"PRECISION_SYR2K", "PRECISION_SYR2K",
"PRECISION_GEQRF", "PRECISION_GEQRF",
...@@ -75,6 +88,7 @@ blas_tokens = [ ...@@ -75,6 +88,7 @@ blas_tokens = [
explicit_tokens_complex = [ explicit_tokens_complex = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""), ("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"), ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_COMPLEX_EXPLICIT_PRECISION", "MPI_COMPLEX16", "MPI_COMPLEX8"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"), ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"), ("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"), ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
...@@ -82,8 +96,15 @@ explicit_tokens_complex = [ ...@@ -82,8 +96,15 @@ explicit_tokens_complex = [
("PRECISION_REAL", "DREAL", "REAL"), ("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"), ("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"), ("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("CONST_REAL_0_5", "0.5_rk8", "0.5_rk4"),
("CONST_COMPLEX_PAIR_0_0", "(0.0_rk8,0.0_rk8)", "(0.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_1_0", "(1.0_rk8,0.0_rk8)", "(1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_1_0", "(-1.0_rk8,0.0_rk8)", "(-1.0_rk4,0.0_rk4)"),
("CONST_COMPLEX_PAIR_NEGATIVE_0_5", "(-0.5_rk8,0.0_rk8)", "(-0.5_rk4,0.0_rk4)"),
("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"), ("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
("CONST_COMPLEX_1_0", "1.0_ck8", "1.0_ck4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"), ("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
("C_DATATYPE_KIND", "c_double", "c_float"),
] ]
explicit_tokens_real = [ explicit_tokens_real = [
...@@ -95,6 +116,7 @@ explicit_tokens_real = [ ...@@ -95,6 +116,7 @@ explicit_tokens_real = [
("CONST_8_0", "8.0_rk8", "8.0_rk4"), ("CONST_8_0", "8.0_rk8", "8.0_rk4"),
("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"), ("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"), ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("C_DATATYPE_KIND", "c_double", "c_float"),
] ]
...@@ -103,7 +125,10 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s ...@@ -103,7 +125,10 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def print_variant(number, precision, explicit): def print_variant(number, precision, explicit):
for token in simple_tokens: for token in simple_tokens:
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number) print "#define ", token, token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number)
print "#define ", token + "_STR", "'" + token.replace("PRECISION", precision).replace("UPCASENUMBER", number.upper()).replace("NUMBER", number) + "'"
if("NUMBER" in token):
print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
for token in blas_tokens: for token in blas_tokens:
print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)]) print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
for token in explicit: for token in explicit:
...@@ -111,28 +136,51 @@ def print_variant(number, precision, explicit): ...@@ -111,28 +136,51 @@ def print_variant(number, precision, explicit):
def print_undefs(number, explicit): def print_undefs(number, explicit):
for token in simple_tokens: for token in simple_tokens:
print "#undef ", token.replace("NUMBER", number) print "#undef ", token
print "#undef ", token + "_STR"
if("NUMBER" in token):
print "#undef ", token.replace("NUMBER", number)
for token in blas_tokens: for token in blas_tokens:
print "#undef ", token print "#undef ", token
for token in explicit: for token in explicit:
print "#undef ", token[0] print "#undef ", token[0]
if(sys.argv[1] == "complex"): print "#ifdef REALCASE"
print "#ifdef DOUBLE_PRECISION_COMPLEX" print "#undef MATH_DATATYPE"
print_undefs("complex", explicit_tokens_complex) print "#define MATH_DATATYPE real"
print_variant("complex", "double", explicit_tokens_complex) print_undefs("real", explicit_tokens_real)
print "#else" #print_undefs("complex", explicit_tokens_complex)
print_undefs("complex", explicit_tokens_complex) print "#ifdef DOUBLE_PRECISION"
print_variant("complex", "single", explicit_tokens_complex) print_variant("real", "double", explicit_tokens_real)
print "#endif" print "#endif"
elif(sys.argv[1] == "real"): print "#ifdef SINGLE_PRECISION"
print "#ifdef DOUBLE_PRECISION_REAL" print_variant("real", "single", explicit_tokens_real)
print_undefs("real", explicit_tokens_real) print "#endif"
print_variant("real", "double", explicit_tokens_real) print "#endif"
print "#else"
print_undefs("real", explicit_tokens_real) print "#ifdef COMPLEXCASE"
print_variant("real", "single", explicit_tokens_real) print "#undef MATH_DATATYPE"
print "#endif" print "#define MATH_DATATYPE complex"
else: #print_undefs("real", explicit_tokens_real)
assert(False) print_undefs("complex", explicit_tokens_complex)
\ No newline at end of file print "#ifdef DOUBLE_PRECISION"
print_variant("complex", "double", explicit_tokens_complex)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
print "#endif"
#print "#elif MACROS_TYPE == COMPLEX_DOUBLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "double", explicit_tokens_complex)
#print "#elif MACROS_TYPE == COMPLEX_SINGLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "single", explicit_tokens_complex)
#print "#endif"
...@@ -159,12 +159,14 @@ module ELPA1_COMPUTE ...@@ -159,12 +159,14 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk8) #define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8 #define BYTESIZE 8
#define REALCASE 1 #define REALCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef DOUBLE_PRECISION
! single precision ! single precision
#ifdef WANT_SINGLE_PRECISION_REAL #ifdef WANT_SINGLE_PRECISION_REAL
...@@ -173,11 +175,13 @@ module ELPA1_COMPUTE ...@@ -173,11 +175,13 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk4) #define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4 #define BYTESIZE 4
#define REALCASE 1 #define REALCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef SINGLE_PRECISION
#endif #endif
...@@ -187,11 +191,13 @@ module ELPA1_COMPUTE ...@@ -187,11 +191,13 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck8) #define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16 #define BYTESIZE 16
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef COMPLEXCASE #undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX #ifdef WANT_SINGLE_PRECISION_COMPLEX
...@@ -200,11 +206,13 @@ module ELPA1_COMPUTE ...@@ -200,11 +206,13 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4) #define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef COMPLEXCASE #undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */ #endif /* WANT_SINGLE_PRECISION_COMPLEX */
...@@ -213,6 +221,9 @@ module ELPA1_COMPUTE ...@@ -213,6 +221,9 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk8 #define REAL_DATATYPE rk8
#define REALCASE 1 #define REALCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef COMPLEXCASE #undef COMPLEXCASE
#include "elpa1_compute_template.X90" #include "elpa1_compute_template.X90"
...@@ -220,6 +231,7 @@ module ELPA1_COMPUTE ...@@ -220,6 +231,7 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE #undef REAL_DATATYPE
#undef REALCASE #undef REALCASE
#undef DOUBLE_PRECISION
! real single precision ! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL) #if defined(WANT_SINGLE_PRECISION_REAL)
...@@ -228,11 +240,15 @@ module ELPA1_COMPUTE ...@@ -228,11 +240,15 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4 #define REAL_DATATYPE rk4
#define REALCASE 1 #define REALCASE 1
#define SINGLE_PRECISION 1
!remove? :
#undef COMPLEXCASE #undef COMPLEXCASE
#include "elpa1_compute_template.X90" #include "elpa1_compute_template.X90"
#undef REALCASE #undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE #undef REAL_DATATYPE
...@@ -245,10 +261,14 @@ module ELPA1_COMPUTE ...@@ -245,10 +261,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck8 #define COMPLEX_DATATYPE ck8
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef REALCASE #undef REALCASE
#include "elpa1_compute_template.X90" #include "elpa1_compute_template.X90"
#undef COMPLEXCASE #undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE #undef REAL_DATATYPE
...@@ -263,11 +283,14 @@ module ELPA1_COMPUTE ...@@ -263,11 +283,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck4 #define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1
!remove ? :
#undef REALCASE #undef REALCASE
#include "elpa1_compute_template.X90" #include "elpa1_compute_template.X90"
#undef COMPLEXCASE #undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE #undef COMPLEX_DATATYPE
......
...@@ -52,13 +52,7 @@ ...@@ -52,13 +52,7 @@
! distributed along with the original code in the file "COPYING". ! distributed along with the original code in the file "COPYING".
#endif #endif
#if REALCASE == 1
#include "precision_macros.h" #include "precision_macros.h"
#endif
#if COMPLEXCASE == 1
#include "precision_macros_complex.h"
#endif
#if REALCASE == 1 #if REALCASE == 1
......
...@@ -86,12 +86,11 @@ ...@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used !> \param useGPU If true, GPU version of the subroutine will be used
!> !>
#if REALCASE == 1 subroutine trans_ev_&
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) &MATH_DATATYPE&
#endif &_&
#if COMPLEXCASE == 1 &PRECISION &
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU) (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
use cuda_functions use cuda_functions
use iso_c_binding use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -128,11 +127,18 @@ ...@@ -128,11 +127,18 @@
integer(kind=ik) :: max_stored_rows integer(kind=ik) :: max_stored_rows
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
real(kind=rk8), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
real(kind=rk4), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX #ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8) complex(kind=ck8), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else #else
complex(kind=ck4), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4) complex(kind=ck4), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif #endif
#endif #endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
...@@ -155,12 +161,11 @@ ...@@ -155,12 +161,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA logical :: successCUDA
#if REALCASE == 1 call timer%start("trans_ev_&
call timer%start("trans_ev_real" // PRECISION_SUFFIX) &MATH_DATATYPE&
#endif &_" // &
#if COMPLEXCASE == 1 &PRECISION_SUFFIX &
call timer%start("trans_ev_complex" // PRECISION_SUFFIX) )
#endif
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -179,54 +184,39 @@ ...@@ -179,54 +184,39 @@
max_stored_rows = (63/nblk+1)*nblk max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmat", istat, errorMessage) &MATH_DATATYPE&
#endif &", "tmat", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h1", istat, errorMessage) &MATH_DATATYPE&
#endif &", "h1", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "h2", istat, errorMessage) &MATH_DATATYPE&
#endif &", "h2", istat, errorMessage)
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1 call check_alloc("trans_ev_&
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)