Commit 089ba660 authored by Andreas Marek's avatar Andreas Marek
Browse files

Cleanup

parent 58923f0f
...@@ -160,6 +160,11 @@ module ELPA1_COMPUTE ...@@ -160,6 +160,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 8 #define BYTESIZE 8
#define REALCASE 1 #define REALCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
...@@ -167,7 +172,8 @@ module ELPA1_COMPUTE ...@@ -167,7 +172,8 @@ module ELPA1_COMPUTE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
! single precision ! single precision
#ifdef WANT_SINGLE_PRECISION_REAL #ifdef WANT_SINGLE_PRECISION_REAL
...@@ -176,12 +182,19 @@ module ELPA1_COMPUTE ...@@ -176,12 +182,19 @@ module ELPA1_COMPUTE
#define BYTESIZE 4 #define BYTESIZE 4
#define REALCASE 1 #define REALCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef SINGLE_PRECISION #undef SINGLE_PRECISION
#undef PRECISION
#undef PRECISION_SUFFIX
#endif #endif
...@@ -192,6 +205,11 @@ module ELPA1_COMPUTE ...@@ -192,6 +205,11 @@ module ELPA1_COMPUTE
#define BYTESIZE 16 #define BYTESIZE 16
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION 1 #define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION double
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
...@@ -199,6 +217,8 @@ module ELPA1_COMPUTE ...@@ -199,6 +217,8 @@ module ELPA1_COMPUTE
#undef COMPLEXCASE #undef COMPLEXCASE
#undef DOUBLE_PRECISION #undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION
#undef PRECISION_SUFFIX
#ifdef WANT_SINGLE_PRECISION_COMPLEX #ifdef WANT_SINGLE_PRECISION_COMPLEX
...@@ -207,6 +227,11 @@ module ELPA1_COMPUTE ...@@ -207,6 +227,11 @@ module ELPA1_COMPUTE
#define DATATYPE COMPLEX(kind=ck4) #define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define SINGLE_PRECISION 1 #define SINGLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION
#define PRECISION single
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#include "elpa_transpose_vectors.X90" #include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90" #include "elpa_reduce_add_vectors.X90"
#undef DATATYPE #undef DATATYPE
......
...@@ -120,8 +120,7 @@ ...@@ -120,8 +120,7 @@
endif endif
#endif #endif
call timer%start("merge_systems" // & call timer%start("merge_systems" // PRECISION_SUFFIX)
&PRECISION_SUFFIX)
success = .true. success = .true.
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -829,7 +828,7 @@ ...@@ -829,7 +828,7 @@
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else #else
use timings_dummy use timings_dummy
#endif #endif
use precision use precision
implicit none implicit none
......
...@@ -244,7 +244,7 @@ ...@@ -244,7 +244,7 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
subroutine hh_transform_complex_& subroutine hh_transform_complex_&
#endif #endif
&PRECISION & &PRECISION &
(alpha, xnorm_sq, xf, tau) (alpha, xnorm_sq, xf, tau)
#if REALCASE == 1 #if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
...@@ -282,8 +282,7 @@ ...@@ -282,8 +282,7 @@
call timer%start("hh_tranform_& call timer%start("hh_tranform_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX )
)
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=REAL_DATATYPE ) ALPHR = real( ALPHA, kind=REAL_DATATYPE )
...@@ -347,8 +346,7 @@ ...@@ -347,8 +346,7 @@
call timer%stop("hh_transform_& call timer%stop("hh_transform_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX )
)
#if REALCASE == 1 #if REALCASE == 1
end subroutine hh_transform_real_& end subroutine hh_transform_real_&
...@@ -356,4 +354,4 @@ ...@@ -356,4 +354,4 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
end subroutine hh_transform_complex_& end subroutine hh_transform_complex_&
#endif #endif
&PRECISION &PRECISION
...@@ -258,7 +258,7 @@ ...@@ -258,7 +258,7 @@
na_rows = na na_rows = na
#endif #endif
na_cols = na na_cols = na
#endif #endif /* WITH_MPI */
! Here we convert the regular host array into a pinned host array ! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols* & successCUDA = cuda_malloc(a_dev, lda*na_cols* &
...@@ -1326,9 +1326,9 @@ ...@@ -1326,9 +1326,9 @@
if (tile_size < istep*nbw) then if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_& call elpa_reduce_add_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, & (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, & umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk) istep*nbw, n_cols, nblk)
...@@ -1401,7 +1401,7 @@ ...@@ -1401,7 +1401,7 @@
endif endif
call symm_matrix_allreduce_& call symm_matrix_allreduce_&
&PRECISION & &PRECISION &
(n_cols,vav, nbw,nbw,mpi_comm_cols) (n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice) successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
...@@ -1427,9 +1427,9 @@ ...@@ -1427,9 +1427,9 @@
! Transpose umc -> umr (stored in vmr, second half) ! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(umcCUDA, cur_l_cols, mpi_comm_cols, & (umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, & vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk) 1, istep*nbw, n_cols, nblk)
...@@ -1466,9 +1466,9 @@ ...@@ -1466,9 +1466,9 @@
! Or if we used the Algorithm 4 ! Or if we used the Algorithm 4
if (tile_size < istep*nbw .or. n_way > 1) then if (tile_size < istep*nbw .or. n_way > 1) then
call elpa_reduce_add_vectors_& call elpa_reduce_add_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk) istep*nbw, n_cols, nblk)
...@@ -1512,7 +1512,7 @@ ...@@ -1512,7 +1512,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1)) ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas") call timer%stop("blas")
call symm_matrix_allreduce_& call symm_matrix_allreduce_&
&PRECISION & &PRECISION &
(n_cols,vav, nbw, nbw ,mpi_comm_cols) (n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV ! U = U - 0.5 * V * VAV
...@@ -1522,9 +1522,9 @@ ...@@ -1522,9 +1522,9 @@
call timer%stop("blas") call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half) ! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk) 1, istep*nbw, n_cols, nblk)
...@@ -1588,9 +1588,9 @@ ...@@ -1588,9 +1588,9 @@
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if (tile_size < istep*nbw) then if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_& call elpa_reduce_add_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk) istep*nbw, n_cols, nblk)
...@@ -1683,7 +1683,7 @@ ...@@ -1683,7 +1683,7 @@
endif endif
call herm_matrix_allreduce_& call herm_matrix_allreduce_&
&PRECISION & &PRECISION &
(n_cols,vav, nbw, nbw,mpi_comm_cols) (n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice) successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
...@@ -1699,7 +1699,7 @@ ...@@ -1699,7 +1699,7 @@
ubound(tmat,dim=1), vav, ubound(vav,dim=1)) ubound(tmat,dim=1), vav, ubound(vav,dim=1))
call timer%stop("blas") call timer%stop("blas")
call herm_matrix_allreduce_& call herm_matrix_allreduce_&
&PRECISION & &PRECISION &
(n_cols,vav,nbw,nbw,mpi_comm_cols) (n_cols,vav,nbw,nbw,mpi_comm_cols)
endif endif
...@@ -1723,9 +1723,9 @@ ...@@ -1723,9 +1723,9 @@
stop stop
endif endif
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk) 1, istep*nbw, n_cols, nblk)
...@@ -1755,9 +1755,9 @@ ...@@ -1755,9 +1755,9 @@
call timer%stop("blas") call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half) ! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk) 1, istep*nbw, n_cols, nblk)
...@@ -1826,12 +1826,22 @@ ...@@ -1826,12 +1826,22 @@
! endif ! endif
!#endif !#endif
! this is not needed since a_dev is passed along from one subroutine to the other ! this is not needed since a_dev is passed along from one subroutine to the other
successCUDA = cuda_memcpy ( &
#if REALCASE == 1 #if REALCASE == 1
successCUDA = cuda_memcpy ( loc (a), a_dev, lda*na_cols*size_of_PRECISION_real,cudaMemcpyDeviceToHost) loc(a), &
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
successCUDA = cuda_memcpy ( loc(a(1,1)), a_dev, lda*na_cols*size_of_PRECISION_complex,cudaMemcpyDeviceToHost) loc(a(1,1)), &
#endif
a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE ==1
size_of_PRECISION_complex,&
#endif #endif
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"bandred_& print *,"bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
......
...@@ -116,25 +116,43 @@ module ELPA2_compute ...@@ -116,25 +116,43 @@ module ELPA2_compute
#define REAL_DATATYPE rk8 #define REAL_DATATYPE rk8
#define BYTESIZE 8 #define BYTESIZE 8
#define REALCASE 1 #define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#define PRECISION double
#include "redist_band.X90" #include "redist_band.X90"
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE #undef REAL_DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef DOUBLE_PRECISION
#undef PRECISION_SUFFIX
#undef PRECISION
! single precision ! single precision
#ifdef WANT_SINGLE_PRECISION_REAL #ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define REAL_DATATYPE rk4 #define REAL_DATATYPE rk4
#define BYTESIZE 4 #define BYTESIZE 4
#define REALCASE 1 #define REALCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90" #include "redist_band.X90"
#undef REAL_DATATYPE #undef REAL_DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef REALCASE #undef REALCASE
#undef PRECISION_SUFFIX
#undef PRECISION
#endif #endif /* WANT_SINGLE_PRECISION_REAL */
! double precision ! double precision
#define DOUBLE_PRECISION_COMPLEX 1 #define DOUBLE_PRECISION_COMPLEX 1
...@@ -142,22 +160,40 @@ module ELPA2_compute ...@@ -142,22 +160,40 @@ module ELPA2_compute
#define COMPLEX_DATATYPE ck8 #define COMPLEX_DATATYPE ck8
#define BYTESIZE 16 #define BYTESIZE 16
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#define DOUBLE_PRECISION
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_double"
#undef PRECISION
#define PRECISION double
#include "redist_band.X90" #include "redist_band.X90"
#undef COMPLEX_DATATYPE #undef COMPLEX_DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef COMPLEXCASE #undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#undef PRECISION_SUFFIX
#undef PRECISION
#ifdef WANT_SINGLE_PRECISION_COMPLEX #ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX #undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL #undef DOUBLE_PRECISION_REAL
#undef DOUBLE_PRECISION
#define COMPLEX_DATATYPE ck4 #define COMPLEX_DATATYPE ck4
#define COMPLEXCASE 1 #define COMPLEXCASE 1
#include "precision_macros.h"
#undef PRECISION_SUFFIX
#define PRECISION_SUFFIX "_single"
#undef PRECISION
#define PRECISION single
#include "redist_band.X90" #include "redist_band.X90"
#undef COMPLEX_DATATYPE #undef COMPLEX_DATATYPE
#undef BYTESIZE #undef BYTESIZE
#undef COMPLEXCASE #undef COMPLEXCASE
#undef PRECISION_SUFFIX
#endif /* WANT_SINGLE_PRECISION_COMPLEX */ #endif /* WANT_SINGLE_PRECISION_COMPLEX */
......
...@@ -141,22 +141,6 @@ ...@@ -141,22 +141,6 @@
character(200) :: errorMessage character(200) :: errorMessage
call timer%start("band_band_real" // PRECISION_SUFFIX) call timer%start("band_band_real" // PRECISION_SUFFIX)
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
! endif
! if (na .lt. 2*nb2) then
! print *,"na lt 2*nb2 ",na,2*nb2
! stop
! endif
! if (na .lt. nbCol) then
! print *,"na lt nbCol ",na,nbCol
! stop
! endif
! if (na .lt. nb2Col) then
! print *,"na lt nb2Col ",na,nb2Col
! stop
! endif
call timer%start("mpi_communication") call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm,my_pe,mpierr) call mpi_comm_rank(mpi_comm,my_pe,mpierr)
...@@ -560,12 +544,12 @@ ...@@ -560,12 +544,12 @@
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
call timer%start("wy_right" // PRECISION_SUFFIX) call timer%start("wy_right" // PRECISION_SUFFIX)
call timer%start("blas") call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n) call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda) call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
call timer%stop("blas") call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX) call timer%stop("wy_right" // PRECISION_SUFFIX)
end subroutine end subroutine
...@@ -590,14 +574,14 @@ ...@@ -590,14 +574,14 @@
real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb
real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
call timer%start("wy_symm" // PRECISION_SUFFIX) call timer%start("wy_symm" // PRECISION_SUFFIX)
call timer%start("blas") call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n) call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb) call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n) call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda) call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
call timer%stop("blas") call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX) call timer%stop("wy_symm" // PRECISION_SUFFIX)
end subroutine end subroutine
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
useGPU, bandwidth) result(success) useGPU, bandwidth) result(success)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
#else #else
use timings_dummy use timings_dummy
#endif #endif
use elpa1_utilities, only : gpu_usage_via_environment_variable use elpa1_utilities, only : gpu_usage_via_environment_variable
use elpa1_compute use elpa1_compute
...@@ -69,7 +69,7 @@ ...@@ -69,7 +69,7 @@
call timer%start("solve_evp_& call timer%start("solve_evp_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_2stage_" // & &_2stage" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
...@@ -483,7 +483,7 @@ ...@@ -483,7 +483,7 @@
call timer%stop("solve_evp_& call timer%stop("solve_evp_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_2stage_" // & &_2stage" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
1 format(a,f10.3) 1 format(a,f10.3)
......
...@@ -238,7 +238,7 @@ ...@@ -238,7 +238,7 @@
call timer%start("trans_ev_tridi_to_band_& call timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE& &MATH_DATATYPE&