Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
20892fc3
Commit
20892fc3
authored
Jan 11, 2017
by
Pavel Kus
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
repated code from elpa2.F90 moved to elpa2_template.X90
parent
263b7c83
Changes
5
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
795 additions
and
1327 deletions
+795
-1327
Makefile.am
Makefile.am
+2
-0
generate/generate_precision.py
generate/generate_precision.py
+10
-2
src/elpa2.F90
src/elpa2.F90
+30
-1325
src/elpa2_template.X90
src/elpa2_template.X90
+381
-0
src/precision_macros.h
src/precision_macros.h
+372
-0
No files found.
Makefile.am
View file @
20892fc3
...
...
@@ -52,6 +52,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1_compute_template.X90
\
src/elpa2_compute_real_template.X90
\
src/elpa2_compute_complex_template.X90
\
src/elpa2_template.X90
\
src/elpa2_bandred_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_trans_ev_band_to_full_real_template.X90
\
...
...
@@ -903,6 +904,7 @@ EXTRA_DIST = \
src/elpa2_bandred_template.X90
\
src/elpa2_herm_matrix_allreduce_complex_template.X90
\
src/elpa2_symm_matrix_allreduce_real_template.X90
\
src/elpa2_template.X90
\
src/elpa2_trans_ev_band_to_full_complex_template.X90
\
src/elpa2_trans_ev_band_to_full_real_template.X90
\
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
\
...
...
generate/generate_precision.py
View file @
20892fc3
...
...
@@ -5,7 +5,6 @@ simple_tokens = [
"PRECISION"
,
"elpa_transpose_vectors_NUMBER_PRECISION"
,
"elpa_reduce_add_vectors_NUMBER_PRECISION"
,
"bandred_NUMBER_PRECISION"
,
"trans_ev_band_to_full_NUMBER_PRECISION"
,
"tridiag_band_NUMBER_PRECISION"
,
...
...
@@ -16,7 +15,7 @@ simple_tokens = [
"solve_tridi_PRECISION"
,
"solve_tridi_col_PRECISION"
,
"solve_tridi_single_problem_PRECISION"
,
"solve_evp_NUMBER_2stage_PRECISION"
,
"qr_pdgeqrf_2dcomm_PRECISION"
,
"hh_transform_NUMBER_PRECISION"
,
"symm_matrix_allreduce_PRECISION"
,
...
...
@@ -53,6 +52,11 @@ simple_tokens = [
"launch_my_unpack_c_kernel_NUMBER_PRECISION"
,
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION"
,
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION"
,
"AVAILABLE_UPCASENUMBER_ELPA_KERNELS"
,
"UPCASENUMBER_ELPA_KERNEL_GENERIC"
,
"DEFAULT_UPCASENUMBER_ELPA_KERNEL"
,
"UPCASENUMBER_ELPA_KERNEL_NAMES"
,
"UPCASENUMBER_ELPA_KERNEL_GPU"
,
]
blas_tokens
=
[
...
...
@@ -100,6 +104,7 @@ explicit_tokens_complex = [
(
"CONST_COMPLEX_0_0"
,
"0.0_ck8"
,
"0.0_ck4"
),
(
"CONST_COMPLEX_1_0"
,
"1.0_ck8"
,
"1.0_ck4"
),
(
"size_of_PRECISION_complex"
,
"size_of_double_complex_datatype"
,
"size_of_single_complex_datatype"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
explicit_tokens_real
=
[
...
...
@@ -111,6 +116,7 @@ explicit_tokens_real = [
(
"CONST_8_0"
,
"8.0_rk8"
,
"8.0_rk4"
),
(
"size_of_PRECISION_real"
,
"size_of_double_real_datatype"
,
"size_of_single_real_datatype"
),
(
"MPI_REAL_PRECISION"
,
"MPI_REAL8"
,
"MPI_REAL4"
),
(
"C_DATATYPE_KIND"
,
"c_double"
,
"c_float"
),
]
...
...
@@ -119,6 +125,7 @@ blas_prefixes = {("real","single") : "S", ("real","double") : "D", ("complex","s
def
print_variant
(
number
,
precision
,
explicit
):
for
token
in
simple_tokens
:
print
"#define "
,
token
,
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"UPCASENUMBER"
,
number
.
upper
()).
replace
(
"NUMBER"
,
number
)
print
"#define "
,
token
.
replace
(
"NUMBER"
,
number
),
token
.
replace
(
"PRECISION"
,
precision
).
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#define "
,
token
,
token
.
replace
(
"PRECISION_"
,
blas_prefixes
[(
number
,
precision
)])
...
...
@@ -127,6 +134,7 @@ def print_variant(number, precision, explicit):
def
print_undefs
(
number
,
explicit
):
for
token
in
simple_tokens
:
print
"#undef "
,
token
print
"#undef "
,
token
.
replace
(
"NUMBER"
,
number
)
for
token
in
blas_tokens
:
print
"#undef "
,
token
...
...
src/elpa2.F90
View file @
20892fc3
This diff is collapsed.
Click to expand it.
src/elpa2_template.X90
0 → 100644
View file @
20892fc3
#if REALCASE == 1
function solve_evp_real_2stage_PRECISION (na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useQR, useGPU) result(success)
#elif COMPLEXCASE == 1
function solve_evp_complex_2stage_PRECISION(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useGPU) result(success)
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute
use elpa2_compute
use elpa_mpi
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
implicit none
logical, intent(in), optional :: useGPU
#if REALCASE == 1
logical, intent(in), optional :: useQR
#endif
logical :: useQRActual, useQREnvironment
integer(kind=c_int), intent(in), optional :: THIS_ELPA_KERNEL_API
integer(kind=c_int) :: THIS_ELPA_KERNEL
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all
integer(kind=c_int), intent(in) :: nblk
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*), q(ldq,*)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
real(kind=C_DATATYPE_KIND), intent(inout) :: ev(na)
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: hh_trans(:,:)
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
MATH_DATATYPE(kind=C_DATATYPE_KIND), allocatable :: tmat(:,:,:)
real(kind=C_DATATYPE_KIND), allocatable :: e(:)
#if COMPLEXCASE == 1
real(kind=C_DATATYPE_KIND), allocatable :: q_real(:,:)
#endif
integer(kind=c_intptr_t) :: tmat_dev, q_dev, a_dev
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=c_int) :: i
logical :: success
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=c_int) :: istat
character(200) :: errorMessage
logical :: do_useGPU
integer(kind=c_int) :: numberOfGPUDevices
call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call timer%stop("mpi_communication")
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
success = .true.
do_useGPU = .false.
#if REALCASE == 1
useQRActual = .false.
! set usage of qr decomposition via API call
if (present(useQR)) then
if (useQR) useQRActual = .true.
if (.not.(useQR)) useQRACtual = .false.
endif
! overwrite this with environment variable settings
if (qr_decomposition_via_environment_variable(useQREnvironment)) then
useQRActual = useQREnvironment
endif
if (useQRActual) then
if (mod(na,2) .ne. 0) then
if (wantDebug) then
write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
print *, "Do not use QR-decomposition for this matrix and blocksize."
success = .false.
return
endif
endif
#endif
if (present(useGPU)) then
if (useGPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
do_useGPU = .true.
! set the neccessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
else
print *,"GPUs are requested but not detected! Aborting..."
success = .false.
return
endif
endif
else
! check whether set by environment variable
do_useGPU = gpu_usage_via_environment_variable()
endif
if (present(THIS_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_ELPA_KERNEL = THIS_ELPA_KERNEL_API
else
! if kernel is not choosen via api
! check whether set by environment variable
THIS_ELPA_KERNEL = get_actual_&
&MATH_DATATYPE&
&_kernel()
endif
! check whether choosen kernel is allowed: function returns true if NOT allowed! change this
if (check_allowed_&
&MATH_DATATYPE&
&_kernels(THIS_ELPA_KERNEL)) then
if (my_pe == 0) then
write(error_unit,*) " "
write(error_unit,*) "The choosen kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(THIS_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:"
do i=1,size(UPCASENUMBER_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) UPCASENUMBER_ELPA_KERNEL_NAMES(i)
endif
enddo
write(error_unit,*) " "
! check whether generic kernel is defined
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then
write(error_unit,*) "The default kernel NUMBER_ELPA_KERNEL_GENERIC will be used !"
else
write(error_unit,*) "As default kernel ",UPCASENUMBER_ELPA_KERNEL_NAMES(DEFAULT_UPCASENUMBER_ELPA_KERNEL)," will be used"
endif
endif ! my_pe == 0
if (AVAILABLE_UPCASENUMBER_ELPA_KERNELS(UPCASENUMBER_ELPA_KERNEL_GENERIC) .eq. 1) then
THIS_ELPA_KERNEL = UPCASENUMBER_ELPA_KERNEL_GENERIC
else
THIS_ELPA_KERNEL = DEFAULT_UPCASENUMBER_ELPA_KERNEL
endif
endif
! check consistency between request for GPUs and defined kernel
if (do_useGPU) then
if (THIS_ELPA_KERNEL .ne. UPCASENUMBER_ELPA_KERNEL_GPU) then
write(error_unit,*) "GPU usage has been requested but compute kernel is defined as non-GPU! Aborting..."
success = .false.
return
endif
endif
if (do_useGPU) then
if (nblk .ne. 128) then
write(error_unit,*) "In case of GPU usage the blocksize for ELPA 2stage has to be 128"
success = .false.
return
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
! on this and maybe allow a run-time optimization here
if (do_useGPU) then
nbw = nblk
else
#if REALCASE == 1
nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
nbw = (31/nblk+1)*nblk
#endif
endif
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when allocating tmat "//errorMessage
stop
endif
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
#if REALCASE == 1
call bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#elif COMPLEXCASE == 1
call bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, do_useGPU, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time bandred_&
&MATH_DATATYPE&
& :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when allocating e "//errorMessage
stop
endif
ttt0 = MPI_Wtime()
call tridiag_band_NUMBER_PRECISION(na, nbw, nblk, a, lda, ev, e, matrixCols, &
hh_trans, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_&
&MATH_DATATYPE&
& :',ttt1-ttt0
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_bcast(ev, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, MPI_REAL_PRECISION, 0, mpi_comm_all, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
#if COMPLEXCASE == 1
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
stop
endif
#endif
! Solve tridiagonal system
ttt0 = MPI_Wtime()
#if REALCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#elif COMPLEXCASE == 1
call solve_tridi_PRECISION(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
#if REALCASE == 1
deallocate(e, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
stop
endif
#elif COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
deallocate(e, q_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
stop
endif
#endif
! Backtransform stage 1
ttt0 = MPI_Wtime()
#if REALCASE == 1
call trans_ev_tridi_to_band_real_PRECISION(na, nev, nblk, nbw, q, q_dev, ldq, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success, THIS_ELPA_KERNEL)
#elif COMPLEXCASE == 1
call trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, &
matrixCols, hh_trans, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, do_useGPU, success,THIS_ELPA_KERNEL)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&:',ttt1-ttt0
! We can now deallocate the stored householder vectors
deallocate(hh_trans, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating hh_trans "//errorMessage
stop
endif
! Backtransform stage 2
ttt0 = MPI_Wtime()
#if REALCASE == 1
call trans_ev_band_to_full_real_PRECISION(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, &
q, q_dev, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU, useQRActual)
#elif COMPLEXCASE == 1
call trans_ev_band_to_full_complex_PRECISION(na, nev, nblk, nbw, a, lda, tmat, &
q, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_band_to_full_&
&MATH_DATATYPE&
& :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_&
&MATH_DATATYPE&
&_2stage: error when deallocating tmat"//errorMessage
stop
endif
call timer%stop("solve_evp_real_2stage_&
&MATH_DATATYPE&
&")
1 format(a,f10.3)
end function solve_evp_NUMBER_2stage_PRECISION
src/precision_macros.h
View file @
20892fc3
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment