Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
089ba660
Commit
089ba660
authored
Jan 28, 2017
by
Andreas Marek
Browse files
Cleanup
parent
58923f0f
Changes
12
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
src/elpa1_compute_private.F90
View file @
089ba660
...
...
@@ -160,6 +160,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
...
...
@@ -167,7 +172,8 @@ module ELPA1_COMPUTE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
...
...
@@ -176,12 +182,19 @@ module ELPA1_COMPUTE
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
#endif
...
...
@@ -192,6 +205,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
...
...
@@ -199,6 +217,8 @@ module ELPA1_COMPUTE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION
#undef PRECISION_SUFFIX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
...
...
@@ -207,6 +227,11 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
...
...
src/elpa1_merge_systems_real_template.X90
View file @
089ba660
...
...
@@ -120,8 +120,7 @@
endif
#endif
call timer%start("merge_systems" // &
&PRECISION_SUFFIX)
call timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...
...
@@ -829,7 +828,7 @@
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
use timings_dummy
#endif
use precision
implicit none
...
...
src/elpa1_tools_template.X90
View file @
089ba660
...
...
@@ -244,7 +244,7 @@
#if COMPLEXCASE == 1
subroutine hh_transform_complex_&
#endif
&PRECISION &
&PRECISION &
(alpha, xnorm_sq, xf, tau)
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
...
...
@@ -282,8 +282,7 @@
call timer%start("hh_tranform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX )
#if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=REAL_DATATYPE )
...
...
@@ -347,8 +346,7 @@
call timer%stop("hh_transform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX )
#if REALCASE == 1
end subroutine hh_transform_real_&
...
...
@@ -356,4 +354,4 @@
#if COMPLEXCASE == 1
end subroutine hh_transform_complex_&
#endif
&PRECISION
&PRECISION
src/elpa2_bandred_template.X90
View file @
089ba660
...
...
@@ -258,7 +258,7 @@
na_rows = na
#endif
na_cols = na
#endif
#endif
/* WITH_MPI */
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* &
...
...
@@ -1326,9 +1326,9 @@
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
...
...
@@ -1401,7 +1401,7 @@
endif
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
...
...
@@ -1427,9 +1427,9 @@
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
...
...
@@ -1466,9 +1466,9 @@
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
...
...
@@ -1512,7 +1512,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
...
...
@@ -1522,9 +1522,9 @@
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
...
...
@@ -1588,9 +1588,9 @@
#if COMPLEXCASE == 1
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
...
...
@@ -1683,7 +1683,7 @@
endif
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
...
...
@@ -1699,7 +1699,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav,nbw,nbw,mpi_comm_cols)
endif
...
...
@@ -1723,9 +1723,9 @@
stop
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
...
...
@@ -1755,9 +1755,9 @@
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
...
...
@@ -1826,12 +1826,22 @@
! endif
!#endif
! this is not needed since a_dev is passed along from one subroutine to the other
successCUDA = cuda_memcpy ( &
#if REALCASE == 1
successCUDA = cuda_memcpy (
loc
(a),
a_dev, lda*na_cols*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
loc(a),
&
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy ( loc(a(1,1)), a_dev, lda*na_cols*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
loc(a(1,1)), &
#endif
a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE ==1
size_of_PRECISION_complex,&
#endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
...
...
src/elpa2_compute.F90
View file @
089ba660
...
...
@@ -116,25 +116,43 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#define PRECISION double
#include "redist_band.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION_SUFFIX
#undef PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define REAL_DATATYPE rk4
#define BYTESIZE 4
#define REALCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#undef REAL_DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef PRECISION_SUFFIX
#undef PRECISION
#endif
#endif
/* WANT_SINGLE_PRECISION_REAL */
! double precision
#define DOUBLE_PRECISION_COMPLEX 1
...
...
@@ -142,22 +160,40 @@ module ELPA2_compute
#define COMPLEX_DATATYPE ck8
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#undef PRECISION
#define PRECISION double
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION_SUFFIX
#undef PRECISION
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef PRECISION_SUFFIX
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
...
...
src/elpa2_compute_real_template.X90
View file @
089ba660
...
...
@@ -141,22 +141,6 @@
character(200) :: errorMessage
call timer%start("band_band_real" // PRECISION_SUFFIX)
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
! endif
! if (na .lt. 2*nb2) then
! print *,"na lt 2*nb2 ",na,2*nb2
! stop
! endif
! if (na .lt. nbCol) then
! print *,"na lt nbCol ",na,nbCol
! stop
! endif
! if (na .lt. nb2Col) then
! print *,"na lt nb2Col ",na,nb2Col
! stop
! endif
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
...
...
@@ -560,12 +544,12 @@
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
call timer%start("wy_right" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX)
call timer%start("wy_right" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX)
end subroutine
...
...
@@ -590,14 +574,14 @@
real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb
real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
call timer%start("wy_symm" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX)
call timer%start("wy_symm" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX)
end subroutine
src/elpa2_template.X90
View file @
089ba660
...
...
@@ -13,9 +13,9 @@
useGPU, bandwidth) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#else
use timings_dummy
use timings_dummy
#endif
use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute
...
...
@@ -69,7 +69,7 @@
call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage
_
" // &
&_2stage" // &
&PRECISION_SUFFIX &
)
...
...
@@ -483,7 +483,7 @@
call timer%stop("solve_evp_&
&MATH_DATATYPE&
&_2stage
_
" // &
&_2stage" // &
&PRECISION_SUFFIX &
)
1 format(a,f10.3)
...
...
src/elpa2_trans_ev_tridi_to_band_template.X90
View file @
089ba660
...
...
@@ -238,7 +238,7 @@
call timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&
_
" // &
&" // &
&PRECISION_SUFFIX &
)
...
...
@@ -275,7 +275,7 @@
na_cols = na
#endif
endif
#endif /* COMPLEXCASE
c
*/
#endif /* COMPLEXCASE
*/
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
...
...
@@ -370,7 +370,9 @@
#endif
endif ! useGPU
#else /* WITH_OPENMP */
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
...
...
@@ -553,7 +555,7 @@
#endif
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&
^
: error when allocating aIntern"//errorMessage
&: error when allocating aIntern"//errorMessage
stop
endif
...
...
@@ -721,9 +723,9 @@
#endif /* WITH_MPI */
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
#endif /* WITH_OPENMP */
...
...
@@ -776,9 +778,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
(aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
enddo
...
...
@@ -792,9 +794,9 @@
else
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
...
...
@@ -828,6 +830,7 @@
enddo
else if (my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
...
...
@@ -877,9 +880,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
(aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
...
...
@@ -944,9 +947,9 @@
#endif
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
...
...
@@ -1728,9 +1731,6 @@
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(0, current_local_n, i)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
...
...
@@ -1865,7 +1865,7 @@
!$omp end parallel do
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
!send_b
2
!send_b
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
...
...
@@ -1928,7 +1928,7 @@
#endif
!send_b
3
!send_b
#ifdef WITH_MPI
call timer%start("mpi_communication")
...
...
@@ -1995,7 +1995,6 @@
endif
#endif
#endif
! until here unify
!compute
#ifdef WITH_OPENMP
...
...
@@ -2549,9 +2548,7 @@
stop
endif
call timer%start("OpenMP parallel_&
&PRECISION&
&")
call timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
...
...
@@ -2562,9 +2559,8 @@
enddo
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_&
&PRECISION&
&")
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#else /* WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
...
...
src/elpa2_tridiag_band_template.X90
View file @
089ba660
...
...
@@ -174,7 +174,7 @@
call timer%start("tridiag_band_&
&MATH_DATATYPE&
&
_
" // &
&" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
...
...
@@ -505,7 +505,7 @@
#ifndef WITH_MPI
#if REALCASE == 1
startAddr
= ubound(hh_trans_real,dim=2)
startAddr = ubound(hh_trans_real,dim=2)
#endif
#if COMPLEXCASE == 1
startAddr = ubound(hh_trans_complex,dim=2)
...
...
@@ -779,7 +779,7 @@
#if COMPLEXCASE == 1
call hh_transform_complex_&
#endif
&PRECISION &
&PRECISION &
(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#if REALCASE == 1
...
...
@@ -1143,7 +1143,7 @@
#if COMPLEXCASE == 1
call hh_transform_complex_&
#endif
&PRECISION &
&PRECISION &
(ab(nb+1,ns),vnorm2,hf,tau_new)