Commit 2ede0ba8, authored Dec 01, 2016 by Andreas Marek
Merge branch 'master' of gitlab.mpcdf.mpg.de:elpa/elpa
Parents: 69695acb, 50b3ab16
17 changed files, 1 pipeline
generate/generate_precision.py (new file, mode 100755)
#!/usr/bin/python
import sys

simple_tokens = [
    "elpa_transpose_vectors_NUMBER_PRECISION",
    "elpa_reduce_add_vectors_NUMBER_PRECISION",
    "bandred_NUMBER_PRECISION",
    "trans_ev_band_to_full_NUMBER_PRECISION",
    "tridiag_band_NUMBER_PRECISION",
    "trans_ev_tridi_to_band_NUMBER_PRECISION",
    "band_band_NUMBER_PRECISION",
    "tridiag_NUMBER_PRECISION",
    "trans_ev_NUMBER_PRECISION",
    "solve_tridi_PRECISION",
    "solve_tridi_col_PRECISION",
    "solve_tridi_single_problem_PRECISION",
    "qr_pdgeqrf_2dcomm_PRECISION",
    "hh_transform_NUMBER_PRECISION",
    "symm_matrix_allreduce_PRECISION",
    "redist_band_NUMBER_PRECISION",
    "unpack_row_NUMBER_cpu_PRECISION",
    "unpack_row_NUMBER_cpu_openmp_PRECISION",
    "unpack_and_prepare_row_group_NUMBER_gpu_PRECISION",
    "extract_hh_tau_NUMBER_gpu_PRECISION",
    "compute_hh_dot_products_NUMBER_gpu_PRECISION",
    "compute_hh_trafo_NUMBER_cpu_openmp_PRECISION",
    "compute_hh_trafo_NUMBER_cpu_PRECISION",
    "pack_row_group_NUMBER_gpu_PRECISION",
    "pack_row_NUMBER_cpu_openmp_PRECISION",
    "pack_row_NUMBER_cpu_PRECISION",
    "wy_gen_PRECISION",
    "wy_right_PRECISION",
    "wy_left_PRECISION",
    "wy_symm_PRECISION",
    "merge_recursive_PRECISION",
    "merge_systems_PRECISION",
    "distribute_global_column_PRECISION",
    "check_monotony_PRECISION",
    "global_gather_PRECISION",
    "resort_ev_PRECISION",
    "transform_columns_PRECISION",
    "solve_secular_equation_PRECISION",
    "global_product_PRECISION",
    "add_tmp_PRECISION",
    "v_add_s_PRECISION",
]

blas_tokens = [
    "PRECISION_GEMV",
    "PRECISION_TRMV",
    "PRECISION_GEMM",
    "PRECISION_TRMM",
    "PRECISION_HERK",
    "PRECISION_SYRK",
    "PRECISION_SYMV",
    "PRECISION_SYMM",
    "PRECISION_SYR2",
    "PRECISION_SYR2K",
    "PRECISION_GEQRF",
    "PRECISION_STEDC",
    "PRECISION_STEQR",
    "PRECISION_LAMRG",
    "PRECISION_LAMCH",
    "PRECISION_LAPY2",
    "PRECISION_LAED4",
    "PRECISION_LAED5",
    "cublas_PRECISION_GEMM",
    "cublas_PRECISION_TRMM",
    "cublas_PRECISION_GEMV",
]

explicit_tokens_complex = [
    ("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
    ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
    ("KIND_PRECISION", "rk8", "rk4"),
    ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
    ("PRECISION_IMAG", "DIMAG", "AIMAG"),
    ("PRECISION_REAL", "DREAL", "REAL"),
    ("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
    ("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
    ("CONST_COMPLEX_0_0", "0.0_ck8", "0.0_ck4"),
    ("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]

explicit_tokens_real = [
    ("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
    ("CONST_0_0", "0.0_rk8", "0.0_rk4"),
    ("CONST_0_5", "0.5_rk8", "0.5_rk4"),
    ("CONST_1_0", "1.0_rk8", "1.0_rk4"),
    ("CONST_2_0", "2.0_rk8", "2.0_rk4"),
    ("CONST_8_0", "8.0_rk8", "8.0_rk4"),
    ("size_of_PRECISION_real", "size_of_double_real_datatype", "size_of_single_real_datatype"),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
]

explicit_order = {"single": 2, "double": 1}
blas_prefixes = {("real", "single"): "S", ("real", "double"): "D", ("complex", "single"): "C", ("complex", "double"): "Z"}

def print_variant(number, precision, explicit):
    for token in simple_tokens:
        print "#define ", token.replace("NUMBER", number), token.replace("PRECISION", precision).replace("NUMBER", number)
    for token in blas_tokens:
        print "#define ", token, token.replace("PRECISION_", blas_prefixes[(number, precision)])
    for token in explicit:
        print "#define ", token[0], token[explicit_order[precision]]

def print_undefs(number, explicit):
    for token in simple_tokens:
        print "#undef ", token.replace("NUMBER", number)
    for token in blas_tokens:
        print "#undef ", token
    for token in explicit:
        print "#undef ", token[0]

if (sys.argv[1] == "complex"):
    print "#ifdef DOUBLE_PRECISION_COMPLEX"
    print_undefs("complex", explicit_tokens_complex)
    print_variant("complex", "double", explicit_tokens_complex)
    print "#else"
    print_undefs("complex", explicit_tokens_complex)
    print_variant("complex", "single", explicit_tokens_complex)
    print "#endif"
elif (sys.argv[1] == "real"):
    print "#ifdef DOUBLE_PRECISION_REAL"
    print_undefs("real", explicit_tokens_real)
    print_variant("real", "double", explicit_tokens_real)
    print "#else"
    print_undefs("real", explicit_tokens_real)
    print_variant("real", "single", explicit_tokens_real)
    print "#endif"
else:
    assert(False)
\ No newline at end of file
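As a quick illustration (not part of the commit), the replace chain in print_variant turns each token into one preprocessor define. A minimal Python sketch of two representative lines of the real/double output, using only names from the lists above:

# Illustrative sketch only; it mirrors the substitutions done in print_variant.
token = "solve_tridi_PRECISION"
print("#define " + token.replace("NUMBER", "real") + " "
      + token.replace("PRECISION", "double").replace("NUMBER", "real"))
# prints: #define solve_tridi_PRECISION solve_tridi_double

blas = "PRECISION_GEMM"
print("#define " + blas + " " + blas.replace("PRECISION_", "D"))  # "D" = blas_prefixes[("real", "double")]
# prints: #define PRECISION_GEMM DGEMM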
generate/generate_precission.py (deleted, was mode 100755)
#!/usr/bin/python

simple_tokens = [
    "tridiag_complex_PRECISION",
    "trans_ev_complex_PRECISION",
    "solve_complex_PRECISION",
    "hh_transform_complex_PRECISION",
    "elpa_transpose_vectors_complex_PRECISION",
    "elpa_reduce_add_vectors_complex_PRECISION",
]

blas_tokens = [
    "PRECISION_GEMV",
    "PRECISION_TRMV",
    "PRECISION_GEMM",
    "PRECISION_TRMM",
    "PRECISION_HERK",
    "cublas_PRECISION_gemm",
    "cublas_PRECISION_trmm",
    "cublas_PRECISION_gemv",
]

explicit_tokens = [("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
                   ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
                   ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
                   ("KIND_PRECISION", "rk8", "rk4"),
                   ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
                   ("PRECISION_IMAG", "DIMAG", "AIMAG"),
                   ("PRECISION_REAL", "DREAL", "REAL"),
                   ("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
                   ("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
                   ("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]

print "#ifdef DOUBLE_PRECISION_COMPLEX"
for token in simple_tokens:
    print "#define ", token, token.replace("PRECISION", "double")
for token in blas_tokens:
    print "#define ", token, token.replace("PRECISION_", "Z")
for token in explicit_tokens:
    print "#define ", token[0], token[1]
print "#else"
for token in simple_tokens:
    print "#undef ", token
for token in blas_tokens:
    print "#undef ", token
for token in explicit_tokens:
    print "#undef ", token[0]
for token in simple_tokens:
    print "#define ", token, token.replace("PRECISION", "single")
for token in blas_tokens:
    print "#define ", token, token.replace("PRECISION_", "C")
for token in explicit_tokens:
    print "#define ", token[0], token[2]
print "#endif"
src/elpa1_merge_systems_real_template.X90 (diff collapsed, not shown)
src/elpa1_solve_tridi_real_template.X90
@@ -52,7 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
-subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
+subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success )
#ifdef HAVE_DETAILED_TIMINGS
@@ -81,7 +81,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi" //
M_
PRECISION_SUFFIX)
call timer%start("solve_tridi" // PRECISION_SUFFIX)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
@@ -96,7 +96,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
! Set Q to 0
-q(1:l_rows, 1:l_cols) = M_CONST_0_0
+q(1:l_rows, 1:l_cols) = CONST_0_0
! Get the limits of the subdivisons, each subdivison has as many cols
! as fit on the respective processor column
@@ -116,7 +116,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
! Scalapack supports it but delivers no results for these columns,
! which is rather annoying
if (nc==0) then
call timer%stop("solve_tridi" //
M_
PRECISION_SUFFIX)
call timer%stop("solve_tridi" // PRECISION_SUFFIX)
if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
success = .false.
return
@@ -141,10 +141,10 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
else
nev1 = MIN(nev,l_cols)
endif
-call M_solve_tridi_col_PRECISION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
+call solve_tridi_col_PRECISION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then
-call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
+call timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
endif
! If there is only 1 processor column, we are done
@@ -156,7 +156,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
stop
endif
-call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
+call timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
endif
@@ -215,9 +215,9 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
enddo
! Recursively merge sub problems
-call M_merge_recursive_PRECISION(0, np_cols, wantDebug, success)
+call merge_recursive_PRECISION(0, np_cols, wantDebug, success)
if (.not.(success)) then
-call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
+call timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
endif
@@ -227,11 +227,11 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
stop
endif
-call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
+call timer%stop("solve_tridi" // PRECISION_SUFFIX)
return
contains
-recursive subroutine M_merge_recursive_PRECISION(np_off, nprocs, wantDebug, success)
+recursive subroutine merge_recursive_PRECISION(np_off, nprocs, wantDebug, success)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
@@ -264,9 +264,9 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
np1 = nprocs/2
np2 = nprocs-np1
-if (np1 > 1) call M_merge_recursive_PRECISION(np_off, np1, wantDebug, success)
+if (np1 > 1) call merge_recursive_PRECISION(np_off, np1, wantDebug, success)
if (.not.(success)) return
-if (np2 > 1) call M_merge_recursive_PRECISION(np_off+np1, np2, wantDebug, success)
+if (np2 > 1) call merge_recursive_PRECISION(np_off+np1, np2, wantDebug, success)
if (.not.(success)) return
noff = limits(np_off)
@@ -277,7 +277,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
call timer%start("mpi_communication")
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
-call mpi_send(d(noff+1), nmid, M_MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
+call mpi_send(d(noff+1), nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
enddo
endif
call timer%stop("mpi_communication")
@@ -286,7 +286,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
if (my_pcol>=np_off+np1 .and. my_pcol<np_off+nprocs) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
-call mpi_recv(d(noff+1), nmid, M_MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
+call mpi_recv(d(noff+1), nmid, MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! d(noff+1:noff+1+nmid-1) = d(noff+1:noff+1+nmid-1)
@@ -297,7 +297,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
do n=np_off,np_off+np1-1
#ifdef WITH_MPI
call timer%start("mpi_communication")
-call mpi_send(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
+call mpi_send(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
@@ -306,7 +306,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
if (my_pcol>=np_off .and. my_pcol<np_off+np1) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
-call mpi_recv(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
+call mpi_recv(d(noff+nmid+1), nlen-nmid, MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! d(noff+nmid+1:noff+nmid+1+nlen-nmid-1) = d(noff+nmid+1:noff+nmid+1+nlen-nmid-1)
@@ -316,22 +316,22 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
! Last merge, result distribution must be block cyclic, noff==0,
! p_col_bc is set so that only nev eigenvalues are calculated
-call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
+call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success )
if (.not.(success)) return
else
! Not last merge, leave dense column distribution
-call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
+call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success )
if (.not.(success)) return
endif
-end subroutine M_merge_recursive_PRECISION
+end subroutine merge_recursive_PRECISION
-end subroutine M_solve_tridi_PRECISION
+end subroutine solve_tridi_PRECISION
-subroutine M_solve_tridi_col_PRECISION( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
+subroutine solve_tridi_col_PRECISION( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
@@ -365,7 +365,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi_col" //
M_
PRECISION_SUFFIX)
call timer%start("solve_tridi_col" // PRECISION_SUFFIX)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
@@ -427,7 +427,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
noff = limits(n) ! Start of subproblem
nlen = limits(n+1)-noff ! Size of subproblem
-call M_solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), &
+call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), &
q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
if (.not.(success)) return
@@ -456,7 +456,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
noff = limits(my_prow) ! Start of subproblem
nlen = limits(my_prow+1)-noff ! Size of subproblem
-call M_solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, &
+call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, &
ubound(qmat1,dim=1), wantDebug, success)
if (.not.(success)) return
@@ -470,9 +470,9 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
nlen = limits(np+1)-noff
#ifdef WITH_MPI
call timer%start("mpi_communication")
-call MPI_Bcast(d(noff+1), nlen, M_MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
+call MPI_Bcast(d(noff+1), nlen, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
qmat2 = qmat1
-call MPI_Bcast(qmat2, max_size*max_size, M_MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
+call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! qmat2 = qmat1 ! is this correct
@@ -480,9 +480,9 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
do i=1,nlen
#ifdef WITH_MPI
-call M_distribute_global_column_PRECISION(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+call distribute_global_column_PRECISION(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
-call M_distribute_global_column_PRECISION(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+call distribute_global_column_PRECISION(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
enddo
@@ -525,7 +525,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
p_col_o(nev+1:na) = -1
endif
-call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
+call merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success)
if (.not.(success)) return
@@ -542,11 +542,11 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
stop
endif
call timer%stop("solve_tridi_col" //
M_
PRECISION_SUFFIX)
call timer%stop("solve_tridi_col" // PRECISION_SUFFIX)
end subroutine
M_
solve_tridi_col_PRECISION
end subroutine solve_tridi_col_PRECISION
recursive subroutine
M_
solve_tridi_single_problem_PRECISION(nlen, d, e, q, ldq, wantDebug, success)
recursive subroutine solve_tridi_single_problem_PRECISION(nlen, d, e, q, ldq, wantDebug, success)
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
@@ -572,7 +572,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi_single" //
M_
PRECISION_SUFFIX)
call timer%start("solve_tridi_single" // PRECISION_SUFFIX)
success = .true.
allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
@@ -596,7 +596,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
stop
endif
-call M_PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
+call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
if (info /= 0) then
@@ -606,7 +606,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
d(:) = ds(:)
e(:) = es(:)
-call M_PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
+call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
! If DSTEQR fails also, we don't know what to do further ...
if (info /= 0) then
@@ -666,7 +666,7 @@ subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi
endif
enddo
call timer%stop("solve_tridi_single" //
M_
PRECISION_SUFFIX)
call timer%stop("solve_tridi_single" // PRECISION_SUFFIX)
end subroutine
M_
solve_tridi_single_problem_PRECISION
end subroutine solve_tridi_single_problem_PRECISION
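The template changes in these files line up with the new generator: once the M_ prefix is gone, names such as solve_tridi_PRECISION and PRECISION_SUFFIX are exactly the tokens that generate_precision.py turns into #define lines, and the C preprocessor then selects the _double or _single variant. A rough Python sketch of that substitution (illustrative only; the defines shown correspond to the real/double case, and in the build the preprocessor does this work, not Python):

# Illustrative only: emulate two generated defines on template lines.
defines = {
    "solve_tridi_PRECISION": "solve_tridi_double",  # from print_variant("real", "double", ...)
    "PRECISION_SUFFIX": '"_double"',
}
lines = [
    'call timer%start("solve_tridi" // PRECISION_SUFFIX)',
    'call solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &',
]
for line in lines:
    for macro, value in defines.items():
        line = line.replace(macro, value)
    print(line)
# -> call timer%start("solve_tridi" // "_double")
# -> call solve_tridi_double( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &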
src/elpa1_tools_template.X90
@@ -54,16 +54,16 @@
#if REALCASE == 1
-subroutine M_v_add_s_PRECISION(v,n,s)
+subroutine v_add_s_PRECISION(v,n,s)
use precision
implicit none
integer(kind=ik) :: n
real(kind=REAL_DATATYPE) :: v(n),s
v(:) = v(:) + s
-end subroutine M_v_add_s_PRECISION
+end subroutine v_add_s_PRECISION
-subroutine M_distribute_global_column_PRECISION(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
+subroutine distribute_global_column_PRECISION(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
use precision
implicit none
@@ -88,9 +88,9 @@
l_col(l_off+js:l_off+je) = g_col(g_off+js-noff:g_off+je-noff)
enddo
-end subroutine M_distribute_global_column_PRECISION
+end subroutine distribute_global_column_PRECISION
-subroutine M_solve_secular_equation_PRECISION(n, i, d, z, delta, rho, dlam)
+subroutine solve_secular_equation_PRECISION(n, i, d, z, delta, rho, dlam)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
@@ -159,7 +159,7 @@
! Upper and lower bound of the shifted solution interval are a and b
call timer%start("solve_secular_equation" //
M_
PRECISION_SUFFIX)
call timer%start("solve_secular_equation" // PRECISION_SUFFIX)
if (i==n) then
! Special case: Last eigenvalue
@@ -169,15 +169,15 @@
dshift = d(n)
delta(:) = d(:) - dshift
-a = M_CONST_0_0 ! delta(n)
-b = rho*SUM(z(:)**2) + M_CONST_1_0 ! rho*SUM(z(:)**2) is the lower bound for the guess
+a = CONST_0_0 ! delta(n)
+b = rho*SUM(z(:)**2) + CONST_1_0 ! rho*SUM(z(:)**2) is the lower bound for the guess
else
! Other eigenvalues: lower bound is d(i), upper bound is d(i+1)
! We check the sign of the function in the midpoint of the interval
! in order to determine if eigenvalue is more close to d(i) or d(i+1)
-x = M_CONST_0_5*(d(i)+d(i+1))
-y = M_CONST_1_0 + rho*SUM(z(:)**2/(d(:)-x))
+x = CONST_0_5*(d(i)+d(i+1))
+y = CONST_1_0 + rho*SUM(z(:)**2/(d(:)-x))
if (y>0) then
! solution is next to d(i)
dshift = d(i)
@@ -197,7 +197,7 @@
do iter=1,200
! Interval subdivision
-x = M_CONST_0_5*(a+b)
+x = CONST_0_5*(a+b)
if (x==a .or. x==b) exit ! No further interval subdivisions possible
#ifdef DOUBLE_PRECISION_REAL
if (abs(x) < 1.e-200_rk8) exit ! x next to pole
@@ -223,14 +223,14 @@
dlam = x + dshift
delta(:) = delta(:) - x
call timer%stop("solve_secular_equation" //
M_
PRECISION_SUFFIX)
call timer%stop("solve_secular_equation" // PRECISION_SUFFIX)
end subroutine
M_
solve_secular_equation_PRECISION
end subroutine solve_secular_equation_PRECISION
!-------------------------------------------------------------------------------
#endif
#if REALCASE == 1
-subroutine M_hh_transform_real_PRECISION(alpha, xnorm_sq, xf, tau)
+subroutine hh_transform_real_PRECISION(alpha, xnorm_sq, xf, tau)
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
#endif
#if COMPLEXCASE == 1
@@ -266,7 +266,7 @@
real(kind=REAL_DATATYPE) :: BETA
#if REALCASE == 1
call timer%start("hh_transform_real" //
M_
PRECISION_SUFFIX )
call timer%start("hh_transform_real" // PRECISION_SUFFIX )
#endif
#if COMPLEXCASE == 1
call timer%start("hh_transform_complex" // PRECISION_SUFFIX )
@@ -332,14 +332,14 @@
endif
#if REALCASE == 1
call timer%stop("hh_transform_real" //
M_
PRECISION_SUFFIX )
call timer%stop("hh_transform_real" // PRECISION_SUFFIX )
#endif
#if COMPLEXCASE == 1
call timer%stop("hh_transform_complex" // PRECISION_SUFFIX )
#endif
#if REALCASE == 1
-end subroutine M_hh_transform_real_PRECISION
+end subroutine hh_transform_real_PRECISION
#endif
#if COMPLEXCASE == 1
end subroutine hh_transform_complex_PRECISION
src/elpa1_trans_ev_template.X90
@@ -87,7 +87,7 @@
!>
#if REALCASE == 1
-subroutine M_trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
+subroutine trans_ev_real_PRECISION (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
@@ -156,7 +156,7 @@
logical :: successCUDA
#if REALCASE == 1
call timer%start("trans_ev_real" //
M_
PRECISION_SUFFIX)
call timer%start("trans_ev_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("trans_ev_complex" // PRECISION_SUFFIX)
@@ -256,7 +256,7 @@
#endif
#if REALCASE == 1
-successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * M_size_of_PRECISION_real)
+successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
@@ -265,7 +265,7 @@
#endif
#if REALCASE == 1
-successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * M_size_of_PRECISION_real)
+successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
@@ -274,7 +274,7 @@
#endif
#if REALCASE == 1
-successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * M_size_of_PRECISION_real)
+successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
@@ -283,7 +283,7 @@
#endif
#if REALCASE == 1
-successCUDA = cuda_malloc(q_dev, ldq * matrixCols * M_size_of_PRECISION_real)
+successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
@@ -293,7 +293,7 @@
! q_dev = q_mat
#if REALCASE == 1
-successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
#endif
#if COMPLEXCASE == 1
...
call timer%start("mpi_communication")
if (nb>0) &
#if REALCASE == 1
-call MPI_Bcast(hvb, nb, M_MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
+call MPI_Bcast(hvb, nb, MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
call MPI_Bcast(hvb, nb, MPI_COMPLEX_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
@@ -359,9 +359,9 @@
tmat = 0
if (l_rows>0) &