Commit 6aabcd54 authored by Andreas Marek's avatar Andreas Marek

Introduce timings of blas/lapack calls

parent 50967ab5
......@@ -101,6 +101,8 @@
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa2_workload
use precision
......@@ -133,9 +135,7 @@
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("band_band_real" // PRECISION_SUFFIX)
#endif
! if (na .lt. 2*nb) then
! print *,"na lt 2*nb ",na,2*nb
! stop
......@@ -153,14 +153,10 @@
! stop
! endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
......@@ -193,9 +189,7 @@
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
ireq_ab2 = MPI_REQUEST_NULL
......@@ -205,9 +199,7 @@
call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, MPI_REAL_PRECISION, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr)
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! carefull the "recieve" has to be done at the corresponding send or wait
......@@ -238,14 +230,10 @@
ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s, (nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
endif
......@@ -262,7 +250,9 @@
if (istep < na/nb2) then
! Transform first block column of remaining matrix
call timer%start("blas")
call PRECISION_GEQRF(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
call timer%stop("blas")
do i=1,nb2
hv(i,i) = CONST_1_0
......@@ -285,13 +275,9 @@
dest = dest+1
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_send(ab_s2, 2*nb2*nb2, MPI_REAL_PRECISION, dest, 3, mpi_comm, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! do irecv here
......@@ -308,13 +294,9 @@
if (na>na_s+nb2-1) then
! Receive Householder vectors from previous task, from PE owning subdiagonal
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(hv, nb*nb2, MPI_REAL_PRECISION, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
......@@ -348,13 +330,9 @@
if (iblk==nblocks .and. nc==nb) then
!request last nb2 columns
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(ab_r,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
......@@ -368,7 +346,9 @@
if (nr>0) then
call wy_right_PRECISION(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
call timer%start("blas")
call PRECISION_GEQRF(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
call timer%stop("blas")
do i=1,nb2
hv_new(i,i) = CONST_1_0
hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
......@@ -378,14 +358,10 @@
!send hh-vector
if (iblk==nblocks) then
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
hv_s = hv_new
......@@ -393,13 +369,9 @@
hv_s(i,i) = tau_new(i)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(hv_s,nb*nb2, MPI_REAL_PRECISION, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
......@@ -412,27 +384,19 @@
if (my_pe>0 .and. iblk==1) then
!send first nb2 columns to previous PE
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
enddo
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_isend(ab_s,(nb+1)*nb2, MPI_REAL_PRECISION, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
......@@ -452,9 +416,7 @@
! Finish the last outstanding requests
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
......@@ -472,9 +434,7 @@
! endif
call mpi_barrier(mpi_comm,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
......@@ -496,9 +456,7 @@
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("band_band_real" // PRECISION_SUFFIX)
#endif
end subroutine
......@@ -506,6 +464,8 @@
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -519,25 +479,25 @@
integer(kind=ik) :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_gen" // PRECISION_SUFFIX)
#endif
W(1:n,1) = tau(1)*Y(1:n,1)
do i=2,nb
W(1:n,i) = tau(i)*Y(1:n,i)
call timer%start("blas")
call PRECISION_GEMV('T', n, i-1, CONST_1_0, Y, lda, W(1,i), 1, CONST_0_0, mem,1)
call PRECISION_GEMV('N', n, i-1, -CONST_1_0, W, lda, mem, 1, CONST_1_0, W(1,i),1)
call timer%stop("blas")
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_gen" // PRECISION_SUFFIX)
#endif
end subroutine
subroutine wy_left_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -551,22 +511,20 @@
real(kind=REAL_DATATYPE), intent(in) :: Y(m,nb) !blocked transformation matrix Y
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_left" // PRECISION_SUFFIX)
#endif
call timer%start("blas")
call PRECISION_GEMM('T', 'N', nb, n, m, CONST_1_0, W, lda2, A, lda, CONST_0_0, mem, nb)
call PRECISION_GEMM('N', 'N', m, n, nb, -CONST_1_0, Y, lda2, mem, nb, CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("blas")
call timer%stop("wy_left" // PRECISION_SUFFIX)
#endif
end subroutine
subroutine wy_right_PRECISION(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -581,16 +539,12 @@
real(kind=REAL_DATATYPE), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_right" // PRECISION_SUFFIX)
#endif
call timer%start("blas")
call PRECISION_GEMM('N', 'N', n, nb, m, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('N', 'T', n, m, nb, -CONST_1_0, mem, n, Y, lda2, CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("blas")
call timer%stop("wy_right" // PRECISION_SUFFIX)
#endif
end subroutine
......@@ -598,6 +552,8 @@
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
......@@ -611,18 +567,14 @@
real(kind=REAL_DATATYPE) :: mem(n,nb) !memory for a temporary matrix of size n x nb
real(kind=REAL_DATATYPE) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_symm" // PRECISION_SUFFIX)
#endif
call timer%start("blas")
call PRECISION_SYMM('L', 'L', n, nb, CONST_1_0, A, lda, W, lda2, CONST_0_0, mem, n)
call PRECISION_GEMM('T', 'N', nb, nb, n, CONST_1_0, mem, n, W, lda2, CONST_0_0, mem2, nb)
call PRECISION_GEMM('N', 'N', n, nb, nb, -CONST_0_5, Y, lda2, mem2, nb, CONST_1_0, mem, n)
call PRECISION_SYR2K('L', 'N', n, nb, -CONST_1_0, Y, lda2, mem, n, CONST_1_0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("blas")
call timer%stop("wy_symm" // PRECISION_SUFFIX)
#endif
end subroutine
......@@ -230,8 +230,11 @@
if (l_rows > 0) then
if (useGPU) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, &
q_dev, ldq, CZERO, tmp_dev, n_cols)
call timer%stop("cublas")
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
......@@ -240,8 +243,10 @@
stop
endif
else
call timer%start("blas")
call PRECISION_GEMM('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), &
q, ldq, CZERO, tmp1, n_cols)
call timer%stop("blas")
endif
else ! l_rows > 0
if (useGPU) then
......@@ -288,11 +293,15 @@
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
call timer%start("cublas")
call cublas_PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONE, q_dev, ldq)
else ! not useGPU
call timer%stop("cublas")
else ! not useGPU
call timer%start("blas")
#ifdef WITH_MPI
call PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
......@@ -301,6 +310,7 @@
call PRECISION_TRMM('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
call timer%stop("blas")
#endif /* WITH_MPI */
endif
endif
......
......@@ -165,7 +165,7 @@
endif
hvm = CONST_0_0 ! Must be set to 0 !!!
hvb = CONST_0_0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,(na-1)/nbw
......@@ -224,8 +224,10 @@
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('T', 'N', n_cols, l_cols, l_rows, CONST_1_0, hvm_dev, max_local_rows, &
q_dev, ldq , CONST_0_0, tmp_dev, n_cols)
call timer%stop("cublas")
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
......@@ -301,11 +303,11 @@
stop
endif
!#endif /* WITH_MPI */
call timer%start("cublas")
call cublas_PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm_dev, max_local_rows, &
tmp_dev, n_cols, CONST_1_0, q_dev, ldq)
call timer%stop("cublas")
! copy to host maybe this can be avoided
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_PRECISION_real),cudaMemcpyDeviceToHost)
......@@ -414,26 +416,32 @@
if (t_cols <= 0) exit
t_rows = (i - 1) * nbw
tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i)
call timer%start("blas")
if (i > 1) then
call PRECISION_GEMM('T', 'N', t_rows, t_cols, l_rows, CONST_1_0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, CONST_0_0, t_tmp, cwy_blocking)
call timer%stop("blas")
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
call timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, CONST_1_0, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp2, cwy_blocking)
call timer%stop("blas")
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call timer%start("blas")
call PRECISION_TRMM('L', 'U', 'N', 'N', t_rows, t_cols, CONST_1_0, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call PRECISION_TRMM('R', 'U', 'N', 'N', t_rows, t_cols, -CONST_1_0, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp, cwy_blocking)
call timer%stop("blas")
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
......@@ -450,8 +458,11 @@
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
call timer%start("blas")
call PRECISION_GEMM('T', 'N', n_cols, l_cols, l_rows, CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, CONST_0_0, tmp1, n_cols)
call timer%stop("blas")
else ! l_rows>0
......@@ -463,20 +474,22 @@
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows ,mpierr)
call timer%stop("mpi_communication")
call timer%start("blas")
if (l_rows>0) then
call PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_complete, cwy_blocking, tmp2, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm, ubound(hvm,dim=1), tmp2, n_cols, CONST_1_0, q, ldq)
endif
call timer%stop("blas")
#else /* WITH_MPI */
! tmp2 = tmp1
call timer%start("blas")
if (l_rows>0) then
call PRECISION_TRMM('L', 'U', 'T', 'N', n_cols, l_cols, CONST_1_0, tmat_complete, cwy_blocking, tmp1, n_cols)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, n_cols, -CONST_1_0, hvm, ubound(hvm,dim=1), tmp1, n_cols, CONST_1_0, q, ldq)
endif
call timer%stop("blas")
#endif /* WITH_MPI */
! if (l_rows>0) then
......
......@@ -2529,6 +2529,7 @@
if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_ROWS, mpierr)
kernel_flops = kernel_flops_recv
call MPI_ALLREDUCE(kernel_flops, kernel_flops_recv, 1, MPI_INTEGER8, MPI_SUM, MPI_COMM_COLS, mpierr)
......@@ -2538,6 +2539,8 @@
kernel_time_recv = kernel_time
call MPI_ALLREDUCE(kernel_time, kernel_time_recv, 1, MPI_REAL8, MPI_MAX, MPI_COMM_COLS, mpierr)
kernel_time_recv = kernel_time
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
......
......@@ -457,18 +457,22 @@
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
call timer%start("blas")
call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
call timer%stop("blas")
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - CONST_0_5*x*hv(1:nc)
call timer%start("blas")
call PRECISION_SYR2('L', nc, -CONST_1_0 ,hd, 1, hv, 1, ab(1,ns), 2*nb-1)
call timer%stop("blas")
hv_t(:,my_thread) = CONST_0_0
tau_t(my_thread) = CONST_0_0
if (nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
call timer%start("blas")
call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_0_0, hs, 1)
call timer%stop("blas")
if (nr>1) then
! complete (old) Householder transformation for first column
......@@ -487,7 +491,10 @@
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
call timer%start("blas")
call PRECISION_GEMV('T',nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, CONST_0_0, h(2), 1)
call timer%stop("blas")
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
......@@ -653,11 +660,13 @@
! Diagonal block, the contribution of the last element is added below!
ab(1,ne) = CONST_0_0
call timer%start("blas")
call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
! Subdiagonal block
if (nr>0) call PRECISION_GEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_0_0, hs, 1)
call timer%stop("blas")
! ... then request last column ...
#ifdef WITH_MPI
......@@ -681,8 +690,10 @@
else
! Normal matrix multiply
call timer%start("blas")
call PRECISION_SYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, CONST_0_0, hd, 1)
if (nr>0) call PRECISION_GEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, CONST_0_0, hs, 1)
call timer%stop("blas")
endif
! Calculate first column of subdiagonal block and calculate new
......@@ -763,17 +774,26 @@
#endif
! ... and calculate remaining columns with rank-2 update
call timer%start("blas")
if (nc>1) call PRECISION_SYR2('L', nc-1, -CONST_1_0, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
call timer%stop("blas")
else
! No need to send, just a rank-2 update
call timer%start("blas")
call PRECISION_SYR2('L', nc, -CONST_1_0, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
call timer%stop("blas")
endif
! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
if (nr>0) then
if (nr>1) then
call timer%start("blas")
call PRECISION_GEMV('T', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, CONST_0_0, h(2), 1)
call timer%stop("blas")
x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
......
......@@ -188,7 +188,7 @@ subroutine elpa_reduce_add_vectors_complex_single(vmat_s,ld_s,comm_s,vmat_t,ld_t
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
call timer%start("mpi_communication")
#endif
#if REALCASE==1
......@@ -212,7 +212,7 @@ subroutine elpa_reduce_add_vectors_complex_single(vmat_s,ld_s,comm_s,vmat_t,ld_t
#endif /* COMPLEXCASE == 1 */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
......
......@@ -204,7 +204,7 @@ subroutine elpa_transpose_vectors_complex_single(vmat_s,ld_s,comm_s,vmat_t,ld_t,
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
call timer%start("mpi_communication")
#endif
#if COMPLEXCASE==1
......@@ -229,7 +229,7 @@ subroutine elpa_transpose_vectors_complex_single(vmat_s,ld_s,comm_s,vmat_t,ld_t,
#endif /* REALCASE == 1 */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
......
......@@ -239,18 +239,16 @@ module ELPA_utilities
end function least_common_multiple
subroutine check_alloc(function_name, variable_name, istat, errorMessage)
use precision
implicit none
character(len=*), intent(in) :: function_name
character(len=*), intent(in) :: variable_name
integer(kind=ik), intent(in) :: istat
character(len=*), intent(in) :: errorMessage
if (istat .ne. 0) then
print *, function_name, ": error when allocating ", variable_name, " ", errorMessage
stop
......@@ -259,13 +257,13 @@ module ELPA_utilities
subroutine check_alloc_CUDA_f(file_name, line, successCUDA)
use precision
implicit none
character(len=*), intent(in) :: file_name
integer(kind=ik), intent(in) :: line
logical :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_malloc when allocating "
stop
......@@ -274,13 +272,13 @@ module ELPA_utilities
subroutine check_dealloc_CUDA_f(file_name, line, successCUDA)
use precision