Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
b6e1b918
Commit
b6e1b918
authored
Feb 18, 2017
by
Andreas Marek
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master_fix_up' into ELPA_KNL
parents
aebf900d
303c02e6
Changes
24
Pipelines
2
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
5199 additions
and
4906 deletions
+5199
-4906
Makefile.am
Makefile.am
+17
-15
generate/generate_precision.py
generate/generate_precision.py
+71
-23
src/elpa1_compute_private.F90
src/elpa1_compute_private.F90
+23
-0
src/elpa1_compute_template.X90
src/elpa1_compute_template.X90
+0
-6
src/elpa1_trans_ev_template.X90
src/elpa1_trans_ev_template.X90
+169
-250
src/elpa1_tridiag_template.X90
src/elpa1_tridiag_template.X90
+188
-144
src/elpa2.F90
src/elpa2.F90
+30
-1316
src/elpa2_bandred_complex_template.X90
src/elpa2_bandred_complex_template.X90
+0
-1186
src/elpa2_bandred_template.X90
src/elpa2_bandred_template.X90
+836
-75
src/elpa2_compute.F90
src/elpa2_compute.F90
+18
-2
src/elpa2_compute_complex_template.X90
src/elpa2_compute_complex_template.X90
+24
-75
src/elpa2_compute_real_template.X90
src/elpa2_compute_real_template.X90
+8
-4
src/elpa2_herm_matrix_allreduce_complex_template.X90
src/elpa2_herm_matrix_allreduce_complex_template.X90
+7
-34
src/elpa2_template.X90
src/elpa2_template.X90
+382
-0
src/elpa2_trans_ev_band_to_full_complex_template.X90
src/elpa2_trans_ev_band_to_full_complex_template.X90
+0
-380
src/elpa2_trans_ev_band_to_full_template.X90
src/elpa2_trans_ev_band_to_full_template.X90
+535
-72
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
+131
-686
src/elpa2_tridiag_band_complex_template.X90
src/elpa2_tridiag_band_complex_template.X90
+55
-270
src/elpa2_tridiag_band_template.X90
src/elpa2_tridiag_band_template.X90
+1455
-0
src/mod_precision.F90
src/mod_precision.F90
+1
-1
src/precision_macros.h
src/precision_macros.h
+885
-72
src/precision_macros_complex.h
src/precision_macros_complex.h
+0
-295
test/Fortran/test_real2_banded.F90
test/Fortran/test_real2_banded.F90
+359
-0
test/shared/blacs_infrastructure.F90
test/shared/blacs_infrastructure.F90
+5
-0
No files found.
Makefile.am
View file @
b6e1b918
...
...
@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_bandred_real_template.X90
\
src/elpa2_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_real_template.X90
\
src/elpa2_tridiag_band_real_template.X90
\
src/elpa2_trans_ev_band_to_full_template.X90
\
src/elpa2_tridiag_band_template.X90
\
src/elpa2_trans_ev_tridi_to_band_real_template.X90
\
src/elpa2_bandred_complex_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_tridiag_band_complex_template.X90
\
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_real_template.X90
\
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/redist_band.X90
\
src/precision_macros.h
\
src/precision_macros_complex.h
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION)
...
...
@@ -356,6 +353,7 @@ dist_files_DATA = \
test
/Fortran/test_real2_default.F90
\
test
/Fortran/test_real2_qr.F90
\
test
/Fortran/test_real2_api.F90
\
test
/Fortran/test_real2_banded.F90
\
test
/Fortran/test_real.F90
\
test
/Fortran/test_real_with_c.F90
\
test
/Fortran/test_toeplitz.F90
\
...
...
@@ -386,6 +384,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@
\
elpa2_test_real_qr@SUFFIX@
\
elpa2_test_real_api@SUFFIX@
\
elpa2_test_real_banded@SUFFIX@
\
elpa2_test_complex@SUFFIX@
\
elpa2_test_complex_default@SUFFIX@
\
elpa2_test_complex_api@SUFFIX@
\
...
...
@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@
_SOURCES
=
test
/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@
_LDADD
=
$(build_lib)
elpa2_test_real_banded@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@
_SOURCES
=
test
/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@
_LDADD
=
$(build_lib)
elpa1_test_complex@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
...
...
@@ -772,6 +776,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh
\
elpa2_test_complex_default@SUFFIX@.sh
\
elpa2_test_real_api@SUFFIX@.sh
\
elpa2_test_real_banded@SUFFIX@.sh
\
elpa2_test_complex_api@SUFFIX@.sh
\
elpa_driver_real@SUFFIX@.sh
\
elpa_driver_complex@SUFFIX@.sh
\
...
...
@@ -942,18 +947,15 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_bandred_complex_template.X90
\
src/elpa2_bandred_real_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_real_template.X90
\
src/elpa2_template.X90
\
src/elpa2_tridiag_band_template.X90
\
src/elpa2_trans_ev_band_to_full_template.X90
\
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
\
src/elpa2_trans_ev_tridi_to_band_real_template.X90
\
src/elpa2_tridiag_band_complex_template.X90
\
src/elpa2_tridiag_band_real_template.X90
\
src/precision_macros.h
\
src/precision_macros_complex.h
\
src/elpa2_kernels/elpa2_kernels_real_template.X90
\
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
...
...
generate/generate_precision.py
View file @
b6e1b918
...
...
@@ -2,9 +2,9 @@
import
sys
simple_tokens
=
[
"PRECISION"
,
"elpa_transpose_vectors_NUMBER_PRECISION"
,
"elpa_reduce_add_vectors_NUMBER_PRECISION"
,
"bandred_NUMBER_PRECISION"
,
"trans_ev_band_to_full_NUMBER_PRECISION"
,
"tridiag_band_NUMBER_PRECISION"
,
...
...
@@ -15,10 +15,11 @@ simple_tokens = [
"solve_tridi_PRECISION"
,
"solve_tridi_col_PRECISION"
,
"solve_tridi_single_problem_PRECISION"
,
"solve_evp_NUMBER_2stage_PRECISION"
,
"qr_pdgeqrf_2dcomm_PRECISION"
,
"hh_transform_NUMBER_PRECISION"
,
"symm_matrix_allreduce_PRECISION"
,
"herm_matrix_allreduce_PRECISION"
,
"redist_band_NUMBER_PRECISION"
,
"unpack_row_NUMBER_cpu_PRECISION"
,
"unpack_row_NUMBER_cpu_openmp_PRECISION"
,
...
...
@@ -45,9 +46,19 @@ simple_tokens = [
"global_product_PRECISION"
,
"add_tmp_PRECISION"
,
"v_add_s_PRECISION"
,
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION"
,
"compute_hh_trafo_NUMBER_gpu_PRECISION"
,
"launch_my_pack_c_kernel_NUMBER_PRECISION"
,
"launch_my_unpack_c_kernel_NUMBER_PRECISION"
,
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION"
,
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION"
,
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS"
,
"UPCASENUMBER_ELPA_KERNEL_GENERIC"
,
"DEFAULT_UPCASENUMBER_ELPA_KERNEL"
,
"UPCASENUMBER_ELPA_KERNEL_NAMES"
,
"UPCASENUMBER_ELPA_KERNEL_GPU"
,
]
blas_tokens
=
[
"PRECISION_GEMV"
,
"PRECISION_TRMV"
,
...
...
@@ -57,6 +68,8 @@ blas_tokens = [
"PRECISION_SYRK"
,
"PRECISION_SYMV"
,
"PRECISION_SYMM"
,
"PRECISION_HEMV"
,
"PRECISION_HER2"
,
"PRECISION_SYR2"
,
"PRECISION_SYR2K"
,
"PRECISION_GEQRF"
,
...
...
@@ -75,6 +88,7 @@ blas_tokens = [
explicit_tokens_complex
=
[
(
"PRECISION_SUFFIX"
,
"
\"
_double
\"
"
,
"
\"
_single
\"
"
),
(
"MPI_COMPLEX_PRECISION"
,
"MPI_DOUBLE_COMPLEX"
,
"MPI_COMPLEX"
),
(
"MPI_COMPLEX_EXPLICIT_PRECISION"
,
"MPI_COMPLEX16"
,
"MPI_COMPLEX8"
),
(
"MPI_REAL_PRECISION"
,
"MPI_REAL8"
,
"MPI_REAL4"
),
(
"KIND_PRECISION"
,
"rk8"
,
"rk4"
),
(
"PRECISION_CMPLX"
,
"DCMPLX"
,
"CMPLX"
),
...
...
@@ -82,8 +96,15 @@ explicit_tokens_complex = [
(
"PRECISION_REAL"
,
"DREAL"
,
"REAL"
),
(
"CONST_REAL_0_0"
,
"0.0_rk8"
,
"0.0_rk4"
),
(
"CONST_REAL_1_0"
,
"1.0_rk8"
,
"1.0_rk4"
),
(
"CONST_REAL_0_5"
,
"0.5_rk8"
,
"0.5_rk4"
),
(
"CONST_COMPLEX_PAIR_0_0"
,
"(0.0_rk8,0.0_rk8)"
,
"(0.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_1_0"
,
"(1.0_rk8,0.0_rk8)"
,
"(1.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_NEGATIVE_1_0"
,
"(-1.0_rk8,0.0_rk8)"
,
"(-1.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_NEGATIVE_0_5"
,
"(-0.5_rk8,0.0_rk8)"
,
"(-0.5_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_0_0"
,
"0.0_ck8"
,
"0.0_ck4"
),
(
"CONST_COMPLEX_1_0"
,
"1.0_ck8"
,
"1.0_ck4"
),
(
"size_of_PRECISION_complex"
,
"size_of_double_complex_datatype"
,
"size_of_single_complex_datatype"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
explicit_tokens_real
=
[
...
...
@@ -95,6 +116,7 @@ explicit_tokens_real = [
(
"CONST_8_0"
,
"8.0_rk8"
,
"8.0_rk4"
),
(
"size_of_PRECISION_real"
,
"size_of_double_real_datatype"
,
"size_of_single_real_datatype"
),
(
"MPI_REAL_PRECISION"
,
"MPI_REAL8"
,
"MPI_REAL4"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
...
...
@@ -103,7 +125,10 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def
print_variant
(
number
,
precision
,
explicit
):
for
token
in
simple_tokens
:
print
"#define "
,
token
.
replace
(
"NUMBER"
,
number
),
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"NUMBER"
,
number
)
print
"#define "
,
token
,
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"UPCASENUMBER"
,
number
.
upper
()).
replace
(
"NUMBER"
,
number
)
print
"#define "
,
token
+
"_STR"
,
"'"
+
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"UPCASENUMBER"
,
number
.
upper
()).
replace
(
"NUMBER"
,
number
)
+
"'"
if
(
"NUMBER"
in
token
):
print
"#define "
,
token
.
replace
(
"NUMBER"
,
number
),
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#define "
,
token
,
token
.
replace
(
"PRECISION_"
,
blas_prefixes
[(
number
,
precision
)])
for
token
in
explicit
:
...
...
@@ -111,28 +136,51 @@ def print_variant(number, precision, explicit):
def
print_undefs
(
number
,
explicit
):
for
token
in
simple_tokens
:
print
"#undef "
,
token
.
replace
(
"NUMBER"
,
number
)
print
"#undef "
,
token
print
"#undef "
,
token
+
"_STR"
if
(
"NUMBER"
in
token
):
print
"#undef "
,
token
.
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#undef "
,
token
for
token
in
explicit
:
print
"#undef "
,
token
[
0
]
if
(
sys
.
argv
[
1
]
==
"complex"
):
print
"#ifdef DOUBLE_PRECISION_COMPLEX"
print_undefs
(
"complex"
,
explicit_tokens_complex
)
print_variant
(
"complex"
,
"double"
,
explicit_tokens_complex
)
print
"#else"
print_undefs
(
"complex"
,
explicit_tokens_complex
)
print_variant
(
"complex"
,
"single"
,
explicit_tokens_complex
)
print
"#endif"
elif
(
sys
.
argv
[
1
]
==
"real"
):
print
"#ifdef DOUBLE_PRECISION_REAL"
print_undefs
(
"real"
,
explicit_tokens_real
)
print_variant
(
"real"
,
"double"
,
explicit_tokens_real
)
print
"#else"
print_undefs
(
"real"
,
explicit_tokens_real
)
print_variant
(
"real"
,
"single"
,
explicit_tokens_real
)
print
"#endif"
else
:
assert
(
False
)
\ No newline at end of file
print "#ifdef REALCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE real"
print_undefs("real", explicit_tokens_real)
#print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("real", "double", explicit_tokens_real)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("real", "single", explicit_tokens_real)
print "#endif"
print "#endif"
print "#ifdef COMPLEXCASE"
print "#undef MATH_DATATYPE"
print "#define MATH_DATATYPE complex"
#print_undefs("real", explicit_tokens_real)
print_undefs("complex", explicit_tokens_complex)
print "#ifdef DOUBLE_PRECISION"
print_variant("complex", "double", explicit_tokens_complex)
print "#endif"
print "#ifdef SINGLE_PRECISION"
print_variant("complex", "single", explicit_tokens_complex)
print "#endif"
print "#endif"
#print "#elif MACROS_TYPE == COMPLEX_DOUBLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "double", explicit_tokens_complex)
#print "#elif MACROS_TYPE == COMPLEX_SINGLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "single", explicit_tokens_complex)
#print "#endif"
src/elpa1_compute_private.F90
View file @
b6e1b918
...
...
@@ -159,12 +159,14 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
...
...
@@ -173,11 +175,13 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#endif
...
...
@@ -187,11 +191,13 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -200,11 +206,13 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
...
...
@@ -213,6 +221,9 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
...
...
@@ -220,6 +231,7 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
...
...
@@ -228,11 +240,15 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
!remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
...
...
@@ -245,10 +261,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck8
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
...
...
@@ -263,11 +283,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
!remove ? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
...
...
src/elpa1_compute_template.X90
View file @
b6e1b918
...
...
@@ -52,13 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
#if REALCASE == 1
#include "precision_macros.h"
#endif
#if COMPLEXCASE == 1
#include "precision_macros_complex.h"
#endif
#if REALCASE == 1
...
...
src/elpa1_trans_ev_template.X90
View file @
b6e1b918
This diff is collapsed.
Click to expand it.
src/elpa1_tridiag_template.X90
View file @
b6e1b918
This diff is collapsed.
Click to expand it.
src/elpa2.F90
View file @
b6e1b918
This diff is collapsed.
Click to expand it.
src/elpa2_bandred_complex_template.X90
deleted
100644 → 0
View file @
aebf900d
This diff is collapsed.
Click to expand it.
src/elpa2_bandred_real_template.X90
→
src/elpa2_bandred_template.X90
View file @
b6e1b918
This diff is collapsed.
Click to expand it.
src/elpa2_compute.F90
View file @
b6e1b918
...
...
@@ -167,8 +167,12 @@ module ELPA2_compute
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_real_template.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
...
...
@@ -179,8 +183,12 @@ module ELPA2_compute
#undef DOUBLE_PRECISION_REAL
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_real_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
...
...
@@ -192,8 +200,12 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define COMPLEX_DATATYPE ck8
#include "precision_macros_complex.h"
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_complex_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
...
...
@@ -207,8 +219,12 @@ module ELPA2_compute
#define REAL_DATATYPE rk4
#define COMPLEX_DATATYPE ck4
#include "precision_macros_complex.h"
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa2_compute_complex_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
...
...
src/elpa2_compute_complex_template.X90
View file @
b6e1b918
...
...
@@ -61,35 +61,28 @@
! distributed along with the original code in the file "COPYING".
#endif
#include "elpa2_bandred_complex_template.X90"
#define COMPLEXCASE 1
#undef REALCASE
#include "elpa2_bandred_template.X90"
#undef COMPLEXCASE
#include "elpa2_herm_matrix_allreduce_complex_template.X90"
#include "elpa2_trans_ev_band_to_full_complex_template.X90"
#include "elpa2_tridiag_band_complex_template.X90"
#define COMPLEXCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef COMPLEXCASE
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_dot_products_complex_gpu_double(nbw, n)
#else
subroutine compute_hh_dot_products_complex_gpu_single(nbw, n)
#endif
subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n)
#else
call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n)
#endif
call launch_compute_hh_dotp_c_kernel_complex_PRECISION( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine pack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
...
...
@@ -99,28 +92,18 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
call launch_my_pack_c_kernel_complex_PRECISION(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, &
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#else
call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine unpack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
...
...
@@ -131,31 +114,17 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , &
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_PRECISION_complex , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#else
call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
call launch_my_unpack_c_kernel_complex_PRECISION( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force)
#else
subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force)
#endif
subroutine unpack_and_prepare_row_group_complex_gpu_PRECISION(next_unpack_idx, force)
use precision
implicit none
...
...
@@ -168,11 +137,7 @@
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#else
call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#endif
call unpack_row_group_complex_gpu_PRECISION(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
...
...
@@ -184,11 +149,7 @@
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#else
subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#endif
subroutine compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
...
...
@@ -205,27 +166,15 @@
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype
dev_offset_2 =( off-1 )*size_of_double_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_double(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#else
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype
dev_offset_2 =( off-1 )*size_of_single_complex_datatype
size_of_PRECISION_complex
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_PRECISION_complex
dev_offset_2 =( off-1 )*size_of_PRECISION_complex
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_single(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
call launch_compute_hh_trafo_c_kernel_complex_PRECISION(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
...
...
src/elpa2_compute_real_template.X90
View file @
b6e1b918
...
...
@@ -60,11 +60,15 @@
! distributed along with the original code in the file "COPYING".
#endif
#include "elpa2_bandred_real_template.X90"
#define REALCASE 1
#undef COMPLEXCASE
#include "elpa2_bandred_template.X90"
#undef REALCASE
#include "elpa2_symm_matrix_allreduce_real_template.X90"
#include "elpa2_trans_ev_band_to_full_real_template.X90"
#include "elpa2_tridiag_band_real_template.X90"
#define REALCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef REALCASE
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"
...
...
src/elpa2_herm_matrix_allreduce_complex_template.X90
View file @
b6e1b918
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine herm_matrix_allreduce_double(n,a,lda,ldb,comm)
#else
subroutine herm_matrix_allreduce_single(n,a,lda,ldb,comm)
#endif
subroutine herm_matrix_allreduce_PRECISION(n,a,lda,ldb,comm)
!-------------------------------------------------------------------------------
! herm_matrix_allreduce: Does an mpi_allreduce for a hermitian matrix A.
! On entry, only the upper half of A needs to be set
! On exit, the complete matrix is set
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
...
...
@@ -20,13 +18,7 @@
integer(kind=ik) :: i, nc, mpierr
complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("herm_matrix_allreduce_double")
#else
call timer%start("herm_matrix_allreduce_single")
#endif