Commit 089ba660 authored by Andreas Marek's avatar Andreas Marek
Browse files

Cleanup

parent 58923f0f
......@@ -160,6 +160,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
......@@ -167,7 +172,8 @@ module ELPA1_COMPUTE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
......@@ -176,12 +182,19 @@ module ELPA1_COMPUTE
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
#endif
......@@ -192,6 +205,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
......@@ -199,6 +217,8 @@ module ELPA1_COMPUTE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION
#undef PRECISION_SUFFIX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
......@@ -207,6 +227,11 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
......
......@@ -120,8 +120,7 @@
endif
#endif
call timer%start("merge_systems" // &
&PRECISION_SUFFIX)
call timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -829,7 +828,7 @@
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
use timings_dummy
#endif
use precision
implicit none
......
......@@ -244,7 +244,7 @@
#if COMPLEXCASE == 1
subroutine hh_transform_complex_&
#endif
&PRECISION &
&PRECISION &
(alpha, xnorm_sq, xf, tau)
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
......@@ -282,8 +282,7 @@
call timer%start("hh_tranform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX )
#if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=REAL_DATATYPE )
......@@ -347,8 +346,7 @@
call timer%stop("hh_transform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX )
#if REALCASE == 1
end subroutine hh_transform_real_&
......@@ -356,4 +354,4 @@
#if COMPLEXCASE == 1
end subroutine hh_transform_complex_&
#endif
&PRECISION
&PRECISION
......@@ -258,7 +258,7 @@
na_rows = na
#endif
na_cols = na
#endif
#endif /* WITH_MPI */
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* &
......@@ -1326,9 +1326,9 @@
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
......@@ -1401,7 +1401,7 @@
endif
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
......@@ -1427,9 +1427,9 @@
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
......@@ -1466,9 +1466,9 @@
! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
......@@ -1512,7 +1512,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call symm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
......@@ -1522,9 +1522,9 @@
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
......@@ -1588,9 +1588,9 @@
#if COMPLEXCASE == 1
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
......@@ -1683,7 +1683,7 @@
endif
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
......@@ -1699,7 +1699,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas")
call herm_matrix_allreduce_&
&PRECISION &
&PRECISION &
(n_cols,vav,nbw,nbw,mpi_comm_cols)
endif
......@@ -1723,9 +1723,9 @@
stop
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
......@@ -1755,9 +1755,9 @@
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
&MATH_DATATYPE&
&_&
&PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
......@@ -1826,12 +1826,22 @@
! endif
!#endif
! this is not needed since a_dev is passed along from one subroutine to the other
successCUDA = cuda_memcpy ( &
#if REALCASE == 1
successCUDA = cuda_memcpy ( loc (a), a_dev, lda*na_cols*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
loc(a), &
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy ( loc(a(1,1)), a_dev, lda*na_cols*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
loc(a(1,1)), &
#endif
a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE ==1
size_of_PRECISION_complex,&
#endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
......
......@@ -116,25 +116,43 @@ module ELPA2_compute
#define REAL_DATATYPE rk8
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#define PRECISION double
#include "redist_band.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION_SUFFIX
#undef PRECISION
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define REAL_DATATYPE rk4
#define BYTESIZE 4
#define REALCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#undef REAL_DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef PRECISION_SUFFIX
#undef PRECISION
#endif
#endif /* WANT_SINGLE_PRECISION_REAL */
! double precision
#define DOUBLE_PRECISION_COMPLEX 1
......@@ -142,22 +160,40 @@ module ELPA2_compute
#define COMPLEX_DATATYPE ck8
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#undef PRECISION
#define PRECISION double
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION_SUFFIX
#undef PRECISION
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90"
#undef COMPLEX_DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef PRECISION_SUFFIX
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
......
......@@ -141,22 +141,6 @@
character(200) :: errorMessage
call timer%start("band_band_real" // PRECISION_SUFFIX)
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
! endif
! if (na .lt. 2*nb2) then
! print *,"na lt 2*nb2 ",na,2*nb2
! stop
! endif
! if (na .lt. nbCol) then
! print *,"na lt nbCol ",na,nbCol
! stop
! endif
! if (na .lt. nb2Col) then
! print *,"na lt nb2Col ",na,nb2Col
! stop
! endif
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
......@@ -560,12 +544,12 @@
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
call timer%start("wy_right" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX)
call timer%start("wy_right" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX)
end subroutine
......@@ -590,14 +574,14 @@
real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb
real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
call timer%start("wy_symm" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX)
call timer%start("wy_symm" // PRECISION_SUFFIX)
call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX)
end subroutine
......@@ -13,9 +13,9 @@
useGPU, bandwidth) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
use timings
#else
use timings_dummy
use timings_dummy
#endif
use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute
......@@ -69,7 +69,7 @@
call timer%start("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&_2stage" // &
&PRECISION_SUFFIX &
)
......@@ -483,7 +483,7 @@
call timer%stop("solve_evp_&
&MATH_DATATYPE&
&_2stage_" // &
&_2stage" // &
&PRECISION_SUFFIX &
)
1 format(a,f10.3)
......
......@@ -238,7 +238,7 @@
call timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_" // &
&" // &
&PRECISION_SUFFIX &
)
......@@ -275,7 +275,7 @@
na_cols = na
#endif
endif
#endif /* COMPLEXCASEc*/
#endif /* COMPLEXCASE */
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
......@@ -370,7 +370,9 @@
#endif
endif ! useGPU
#else /* WITH_OPENMP */
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
......@@ -553,7 +555,7 @@
#endif
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&^: error when allocating aIntern"//errorMessage
&: error when allocating aIntern"//errorMessage
stop
endif
......@@ -721,9 +723,9 @@
#endif /* WITH_MPI */
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
#endif /* WITH_OPENMP */
......@@ -776,9 +778,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
(aIntern, row, i-limits(ip), my_thread, stripe_count, thread_width, stripe_width, l_nev)
enddo
......@@ -792,9 +794,9 @@
else
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
......@@ -828,6 +830,7 @@
enddo
else if (my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
......@@ -877,9 +880,9 @@
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION &
(aIntern, row, i-limits(my_prow), my_thread, stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
......@@ -944,9 +947,9 @@
#endif
call unpack_row_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
&MATH_DATATYPE&
&_cpu_&
&PRECISION &
(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
......@@ -1728,9 +1731,6 @@
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(0, current_local_n, i)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
......@@ -1865,7 +1865,7 @@
!$omp end parallel do
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
!send_b 2
!send_b
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
......@@ -1928,7 +1928,7 @@
#endif
!send_b 3
!send_b
#ifdef WITH_MPI
call timer%start("mpi_communication")
......@@ -1995,7 +1995,6 @@
endif
#endif
#endif
! until here unify
!compute
#ifdef WITH_OPENMP
......@@ -2549,9 +2548,7 @@
stop
endif
call timer%start("OpenMP parallel_&
&PRECISION&
&")
call timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -2562,9 +2559,8 @@
enddo
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_&
&PRECISION&
&")
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#else /* WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
......
......@@ -174,7 +174,7 @@
call timer%start("tridiag_band_&
&MATH_DATATYPE&
&_" // &
&" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
......@@ -505,7 +505,7 @@
#ifndef WITH_MPI
#if REALCASE == 1
startAddr = ubound(hh_trans_real,dim=2)
startAddr = ubound(hh_trans_real,dim=2)
#endif
#if COMPLEXCASE == 1
startAddr = ubound(hh_trans_complex,dim=2)
......@@ -779,7 +779,7 @@
#if COMPLEXCASE == 1
call hh_transform_complex_&
#endif
&PRECISION &
&PRECISION &
(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
#if REALCASE == 1
......@@ -1143,7 +1143,7 @@
#if COMPLEXCASE == 1
call hh_transform_complex_&
#endif
&PRECISION &