Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
b6e1b918
Commit
b6e1b918
authored
Feb 18, 2017
by
Andreas Marek
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'master_fix_up' into ELPA_KNL
parents
aebf900d
303c02e6
Changes
24
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
5199 additions
and
4906 deletions
+5199
-4906
Makefile.am
Makefile.am
+17
-15
generate/generate_precision.py
generate/generate_precision.py
+71
-23
src/elpa1_compute_private.F90
src/elpa1_compute_private.F90
+23
-0
src/elpa1_compute_template.X90
src/elpa1_compute_template.X90
+0
-6
src/elpa1_trans_ev_template.X90
src/elpa1_trans_ev_template.X90
+169
-250
src/elpa1_tridiag_template.X90
src/elpa1_tridiag_template.X90
+188
-144
src/elpa2.F90
src/elpa2.F90
+30
-1316
src/elpa2_bandred_complex_template.X90
src/elpa2_bandred_complex_template.X90
+0
-1186
src/elpa2_bandred_template.X90
src/elpa2_bandred_template.X90
+836
-75
src/elpa2_compute.F90
src/elpa2_compute.F90
+18
-2
src/elpa2_compute_complex_template.X90
src/elpa2_compute_complex_template.X90
+24
-75
src/elpa2_compute_real_template.X90
src/elpa2_compute_real_template.X90
+8
-4
src/elpa2_herm_matrix_allreduce_complex_template.X90
src/elpa2_herm_matrix_allreduce_complex_template.X90
+7
-34
src/elpa2_template.X90
src/elpa2_template.X90
+382
-0
src/elpa2_trans_ev_band_to_full_complex_template.X90
src/elpa2_trans_ev_band_to_full_complex_template.X90
+0
-380
src/elpa2_trans_ev_band_to_full_template.X90
src/elpa2_trans_ev_band_to_full_template.X90
+535
-72
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
+131
-686
src/elpa2_tridiag_band_complex_template.X90
src/elpa2_tridiag_band_complex_template.X90
+55
-270
src/elpa2_tridiag_band_template.X90
src/elpa2_tridiag_band_template.X90
+1455
-0
src/mod_precision.F90
src/mod_precision.F90
+1
-1
src/precision_macros.h
src/precision_macros.h
+885
-72
src/precision_macros_complex.h
src/precision_macros_complex.h
+0
-295
test/Fortran/test_real2_banded.F90
test/Fortran/test_real2_banded.F90
+359
-0
test/shared/blacs_infrastructure.F90
test/shared/blacs_infrastructure.F90
+5
-0
No files found.
Makefile.am
View file @
b6e1b918
...
...
@@ -52,22 +52,19 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_bandred_real_template.X90
\
src/elpa2_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_
real_
template.X90
\
src/elpa2_tridiag_band_
real_
template.X90
\
src/elpa2_trans_ev_band_to_full_template.X90
\
src/elpa2_tridiag_band_template.X90
\
src/elpa2_trans_ev_tridi_to_band_real_template.X90
\
src/elpa2_bandred_complex_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_tridiag_band_complex_template.X90
\
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_real_template.X90
\
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/redist_band.X90
\
src/precision_macros.h
\
src/precision_macros_complex.h
src/precision_macros.h
lib_LTLIBRARIES
=
libelpa@SUFFIX@.la
libelpa@SUFFIX@
_la_LINK
=
$(FCLINK)
$(AM_LDFLAGS)
-version-info
$(ELPA_SO_VERSION)
...
...
@@ -356,6 +353,7 @@ dist_files_DATA = \
test
/Fortran/test_real2_default.F90
\
test
/Fortran/test_real2_qr.F90
\
test
/Fortran/test_real2_api.F90
\
test
/Fortran/test_real2_banded.F90
\
test
/Fortran/test_real.F90
\
test
/Fortran/test_real_with_c.F90
\
test
/Fortran/test_toeplitz.F90
\
...
...
@@ -386,6 +384,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@
\
elpa2_test_real_qr@SUFFIX@
\
elpa2_test_real_api@SUFFIX@
\
elpa2_test_real_banded@SUFFIX@
\
elpa2_test_complex@SUFFIX@
\
elpa2_test_complex_default@SUFFIX@
\
elpa2_test_complex_api@SUFFIX@
\
...
...
@@ -573,6 +572,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@
_SOURCES
=
test
/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@
_LDADD
=
$(build_lib)
elpa2_test_real_banded@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@
_DEPENDENCIES
=
test
/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@
_SOURCES
=
test
/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@
_LDADD
=
$(build_lib)
elpa1_test_complex@SUFFIX@
_FCFLAGS
=
$(AM_FCFLAGS)
@FC_MODOUT@private_modules @FC_MODINC@private_modules
...
...
@@ -772,6 +776,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh
\
elpa2_test_complex_default@SUFFIX@.sh
\
elpa2_test_real_api@SUFFIX@.sh
\
elpa2_test_real_banded@SUFFIX@.sh
\
elpa2_test_complex_api@SUFFIX@.sh
\
elpa_driver_real@SUFFIX@.sh
\
elpa_driver_complex@SUFFIX@.sh
\
...
...
@@ -942,18 +947,15 @@ EXTRA_DIST = \
src/elpa1_tridiag_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_bandred_complex_template.X90
\
src/elpa2_bandred_real_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_real_template.X90
\
src/elpa2_template.X90
\
src/elpa2_tridiag_band_template.X90
\
src/elpa2_trans_ev_band_to_full_template.X90
\
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
\
src/elpa2_trans_ev_tridi_to_band_real_template.X90
\
src/elpa2_tridiag_band_complex_template.X90
\
src/elpa2_tridiag_band_real_template.X90
\
src/precision_macros.h
\
src/precision_macros_complex.h
\
src/elpa2_kernels/elpa2_kernels_real_template.X90
\
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
...
...
generate/generate_precision.py
View file @
b6e1b918
...
...
@@ -2,9 +2,9 @@
import
sys
simple_tokens
=
[
"PRECISION"
,
"elpa_transpose_vectors_NUMBER_PRECISION"
,
"elpa_reduce_add_vectors_NUMBER_PRECISION"
,
"bandred_NUMBER_PRECISION"
,
"trans_ev_band_to_full_NUMBER_PRECISION"
,
"tridiag_band_NUMBER_PRECISION"
,
...
...
@@ -15,10 +15,11 @@ simple_tokens = [
"solve_tridi_PRECISION"
,
"solve_tridi_col_PRECISION"
,
"solve_tridi_single_problem_PRECISION"
,
"solve_evp_NUMBER_2stage_PRECISION"
,
"qr_pdgeqrf_2dcomm_PRECISION"
,
"hh_transform_NUMBER_PRECISION"
,
"symm_matrix_allreduce_PRECISION"
,
"herm_matrix_allreduce_PRECISION"
,
"redist_band_NUMBER_PRECISION"
,
"unpack_row_NUMBER_cpu_PRECISION"
,
"unpack_row_NUMBER_cpu_openmp_PRECISION"
,
...
...
@@ -45,9 +46,19 @@ simple_tokens = [
"global_product_PRECISION"
,
"add_tmp_PRECISION"
,
"v_add_s_PRECISION"
,
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION"
,
"compute_hh_trafo_NUMBER_gpu_PRECISION"
,
"launch_my_pack_c_kernel_NUMBER_PRECISION"
,
"launch_my_unpack_c_kernel_NUMBER_PRECISION"
,
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION"
,
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION"
,
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS"
,
"UPCASENUMBER_ELPA_KERNEL_GENERIC"
,
"DEFAULT_UPCASENUMBER_ELPA_KERNEL"
,
"UPCASENUMBER_ELPA_KERNEL_NAMES"
,
"UPCASENUMBER_ELPA_KERNEL_GPU"
,
]
blas_tokens
=
[
"PRECISION_GEMV"
,
"PRECISION_TRMV"
,
...
...
@@ -57,6 +68,8 @@ blas_tokens = [
"PRECISION_SYRK"
,
"PRECISION_SYMV"
,
"PRECISION_SYMM"
,
"PRECISION_HEMV"
,
"PRECISION_HER2"
,
"PRECISION_SYR2"
,
"PRECISION_SYR2K"
,
"PRECISION_GEQRF"
,
...
...
@@ -75,6 +88,7 @@ blas_tokens = [
explicit_tokens_complex
=
[
(
"PRECISION_SUFFIX"
,
"
\"
_double
\"
"
,
"
\"
_single
\"
"
),
(
"MPI_COMPLEX_PRECISION"
,
"MPI_DOUBLE_COMPLEX"
,
"MPI_COMPLEX"
),
(
"MPI_COMPLEX_EXPLICIT_PRECISION"
,
"MPI_COMPLEX16"
,
"MPI_COMPLEX8"
),
(
"MPI_REAL_PRECISION"
,
"MPI_REAL8"
,
"MPI_REAL4"
),
(
"KIND_PRECISION"
,
"rk8"
,
"rk4"
),
(
"PRECISION_CMPLX"
,
"DCMPLX"
,
"CMPLX"
),
...
...
@@ -82,8 +96,15 @@ explicit_tokens_complex = [
(
"PRECISION_REAL"
,
"DREAL"
,
"REAL"
),
(
"CONST_REAL_0_0"
,
"0.0_rk8"
,
"0.0_rk4"
),
(
"CONST_REAL_1_0"
,
"1.0_rk8"
,
"1.0_rk4"
),
(
"CONST_REAL_0_5"
,
"0.5_rk8"
,
"0.5_rk4"
),
(
"CONST_COMPLEX_PAIR_0_0"
,
"(0.0_rk8,0.0_rk8)"
,
"(0.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_1_0"
,
"(1.0_rk8,0.0_rk8)"
,
"(1.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_NEGATIVE_1_0"
,
"(-1.0_rk8,0.0_rk8)"
,
"(-1.0_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_PAIR_NEGATIVE_0_5"
,
"(-0.5_rk8,0.0_rk8)"
,
"(-0.5_rk4,0.0_rk4)"
),
(
"CONST_COMPLEX_0_0"
,
"0.0_ck8"
,
"0.0_ck4"
),
(
"CONST_COMPLEX_1_0"
,
"1.0_ck8"
,
"1.0_ck4"
),
(
"size_of_PRECISION_complex"
,
"size_of_double_complex_datatype"
,
"size_of_single_complex_datatype"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
explicit_tokens_real
=
[
...
...
@@ -95,6 +116,7 @@ explicit_tokens_real = [
(
"CONST_8_0"
,
"8.0_rk8"
,
"8.0_rk4"
),
(
"size_of_PRECISION_real"
,
"size_of_double_real_datatype"
,
"size_of_single_real_datatype"
),
(
"MPI_REAL_PRECISION"
,
"MPI_REAL8"
,
"MPI_REAL4"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
...
...
@@ -103,6 +125,9 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def
print_variant
(
number
,
precision
,
explicit
):
for
token
in
simple_tokens
:
print
"#define "
,
token
,
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"UPCASENUMBER"
,
number
.
upper
()).
replace
(
"NUMBER"
,
number
)
print
"#define "
,
token
+
"_STR"
,
"'"
+
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"UPCASENUMBER"
,
number
.
upper
()).
replace
(
"NUMBER"
,
number
)
+
"'"
if
(
"NUMBER"
in
token
):
print
"#define "
,
token
.
replace
(
"NUMBER"
,
number
),
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#define "
,
token
,
token
.
replace
(
"PRECISION_"
,
blas_prefixes
[(
number
,
precision
)])
...
...
@@ -111,6 +136,9 @@ def print_variant(number, precision, explicit):
def
print_undefs
(
number
,
explicit
):
for
token
in
simple_tokens
:
print
"#undef "
,
token
print
"#undef "
,
token
+
"_STR"
if
(
"NUMBER"
in
token
):
print
"#undef "
,
token
.
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#undef "
,
token
...
...
@@ -118,21 +146,41 @@ def print_undefs(number, explicit):
print
"#undef "
,
token
[
0
]
if
(
sys
.
argv
[
1
]
==
"complex"
):
print
"#ifdef DOUBLE_PRECISION_COMPLEX"
print_undefs
(
"complex"
,
explicit_tokens_complex
)
print_variant
(
"complex"
,
"double"
,
explicit_tokens_complex
)
print
"#else"
print_undefs
(
"complex"
,
explicit_tokens_complex
)
print_variant
(
"complex"
,
"single"
,
explicit_tokens_complex
)
print
"#endif"
elif
(
sys
.
argv
[
1
]
==
"real"
):
print
"#ifdef DOUBLE_PRECISION_REAL"
print_undefs
(
"real"
,
explicit_tokens_real
)
print_variant
(
"real"
,
"double"
,
explicit_tokens_real
)
print
"#else"
print_undefs
(
"real"
,
explicit_tokens_real
)
print_variant
(
"real"
,
"single"
,
explicit_tokens_real
)
print
"#endif"
else
:
assert
(
False
)
\ No newline at end of file
print
"#ifdef REALCASE"
print
"#undef MATH_DATATYPE"
print
"#define MATH_DATATYPE real"
print_undefs
(
"real"
,
explicit_tokens_real
)
#print_undefs("complex", explicit_tokens_complex)
print
"#ifdef DOUBLE_PRECISION"
print_variant
(
"real"
,
"double"
,
explicit_tokens_real
)
print
"#endif"
print
"#ifdef SINGLE_PRECISION"
print_variant
(
"real"
,
"single"
,
explicit_tokens_real
)
print
"#endif"
print
"#endif"
print
"#ifdef COMPLEXCASE"
print
"#undef MATH_DATATYPE"
print
"#define MATH_DATATYPE complex"
#print_undefs("real", explicit_tokens_real)
print_undefs
(
"complex"
,
explicit_tokens_complex
)
print
"#ifdef DOUBLE_PRECISION"
print_variant
(
"complex"
,
"double"
,
explicit_tokens_complex
)
print
"#endif"
print
"#ifdef SINGLE_PRECISION"
print_variant
(
"complex"
,
"single"
,
explicit_tokens_complex
)
print
"#endif"
print
"#endif"
#print "#elif MACROS_TYPE == COMPLEX_DOUBLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "double", explicit_tokens_complex)
#print "#elif MACROS_TYPE == COMPLEX_SINGLE"
#print "#undef NUMBER"
#print_undefs("complex", explicit_tokens_complex)
#print "#define NUMBER complex"
#print_variant("complex", "single", explicit_tokens_complex)
#print "#endif"
src/elpa1_compute_private.F90
View file @
b6e1b918
...
...
@@ -159,12 +159,14 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
...
...
@@ -173,11 +175,13 @@ module ELPA1_COMPUTE
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#endif
...
...
@@ -187,11 +191,13 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -200,11 +206,13 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
...
...
@@ -213,6 +221,9 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
...
...
@@ -220,6 +231,7 @@ module ELPA1_COMPUTE
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
...
...
@@ -228,11 +240,15 @@ module ELPA1_COMPUTE
#define REAL_DATATYPE rk4
#define REALCASE 1
#define SINGLE_PRECISION 1
!remove? :
#undef COMPLEXCASE
#include "elpa1_compute_template.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
...
...
@@ -245,10 +261,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck8
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef REAL_DATATYPE
...
...
@@ -263,11 +283,14 @@ module ELPA1_COMPUTE
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
!remove ? :
#undef REALCASE
#include "elpa1_compute_template.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
...
...
src/elpa1_compute_template.X90
View file @
b6e1b918
...
...
@@ -52,13 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
#if REALCASE == 1
#include "precision_macros.h"
#endif
#if COMPLEXCASE == 1
#include "precision_macros_complex.h"
#endif
#if REALCASE == 1
...
...
src/elpa1_trans_ev_template.X90
View file @
b6e1b918
...
...
@@ -86,12 +86,11 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
#if REALCASE == 1
subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
subroutine trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
...
...
@@ -128,11 +127,18 @@
integer(kind=ik) :: max_stored_rows
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
real(kind=rk8), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
real(kind=rk4), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=ck8), parameter ::
CZERO = (0.0_rk8,0.0_rk8), C
ONE = (1.0_rk8,0.0_rk8)
complex(kind=ck8), parameter ::
ZERO = (0.0_rk8,0.0_rk8),
ONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=ck4), parameter ::
CZERO = (0.0_rk4,0.0_rk4), C
ONE = (1.0_rk4,0.0_rk4)
complex(kind=ck4), parameter ::
ZERO = (0.0_rk4,0.0_rk4),
ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
...
...
@@ -155,12 +161,11 @@
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
#if REALCASE == 1
call timer%start("trans_ev_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
#endif
call timer%start("trans_ev_&
&MATH_DATATYPE&
&_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...
...
@@ -179,54 +184,39 @@
max_stored_rows = (63/nblk+1)*nblk
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmat", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmat", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmat", istat, errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h1", istat, errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "h2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "h2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp1", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "tmp2", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvn", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvb", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvn", istat, errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm", istat, errorMessage)
#endif
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm", istat, errorMessage)
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
...
...
@@ -241,66 +231,52 @@
#if COMPLEXCASE == 1
! In the complex case tau(2) /= 0
if (my_prow == prow(1, nblk, np_rows)) then
q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(
C
ONE-tau(2))
q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
endif
#endif
if (useGPU) then
! todo: this is used only for copying hmv to device.. it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
#endif
#if COMPLEXCASE == 1
call check_alloc("trans_ev_complex", "hvm1", istat, errorMessage)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_complex)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_complex)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_complex)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if REALCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
check_alloc_cuda("trans_ev", successCUDA)
#endif
! q_dev = q_mat
#if REALCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
endif ! useGPU
do istep = 1, na, nblk
...
...
@@ -330,12 +306,15 @@
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
...
...
@@ -360,13 +339,12 @@
call timer%start("blas")
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', nstor, l_rows, &
CONST_1_0, hvm, ubound(hvm,dim=1), &
CONST_0_0, tmat, max_stored_rows)
call PRECISION_SYRK('U', 'T', &
#endif
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C',
nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows)
call PRECISION_HERK('U', 'C',
&
#endif
nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
call timer%stop("blas")
nc = 0
do n = 1, nstor-1
...
...
@@ -375,12 +353,14 @@
enddo
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
...
...
@@ -394,22 +374,23 @@
do n = 1, nstor-1
call timer%start("blas")
#if REALCASE == 1
call PRECISION_TRMV('L', 'T', 'N', n, &
tmat, max_stored_rows, &