Commit 939020cc authored by Andreas Marek

Reduced timings if wantDebug is not set

parent 9c352986
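
The pattern applied throughout is mechanical: every obj%timer%start/stop call on the hot path is guarded by the new wantDebug dummy argument, which is threaded through the affected routines (tridiag_*, hh_transform_*, compute_hh_trafo_* and the other routines shown below). A minimal sketch of the pattern, using a hypothetical subroutine name do_work (the timer API and elpa_abstract_impl_t are taken from the code in this diff):

    subroutine do_work(obj, wantDebug)
      use elpa_abstract_impl
      implicit none
      class(elpa_abstract_impl_t), intent(inout) :: obj
      logical, intent(in)                        :: wantDebug

      ! timing bookkeeping runs only when debugging is requested,
      ! so production runs skip the start/stop overhead entirely
      if (wantDebug) call obj%timer%start("mpi_communication")
      ! ... MPI / BLAS / cuBLAS work would go here ...
      if (wantDebug) call obj%timer%stop("mpi_communication")
    end subroutine do_work
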
......@@ -256,7 +256,7 @@ function elpa_solve_evp_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU, wantDebug)
call obj%timer%stop("forward")
endif !do_bandred
......
......@@ -248,7 +248,7 @@
subroutine hh_transform_complex_&
#endif
&PRECISION &
(obj, alpha, xnorm_sq, xf, tau)
(obj, alpha, xnorm_sq, xf, tau, wantDebug)
#if REALCASE == 1
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
#endif
......@@ -261,7 +261,8 @@
use precision
use elpa_abstract_impl
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: wantDebug
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(inout) :: alpha
#endif
......@@ -279,7 +280,7 @@
real(kind=REAL_DATATYPE) :: BETA
call obj%timer%start("hh_transform_&
if (wantDebug) call obj%timer%start("hh_transform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX )
......@@ -343,7 +344,7 @@
ALPHA = BETA
endif
call obj%timer%stop("hh_transform_&
if (wantDebug) call obj%timer%stop("hh_transform_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX )
......
......@@ -52,11 +52,21 @@
! distributed along with the original code in the file "COPYING".
#endif
#include "../general/sanity.X90"
#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
#else
#define SAVE_MATR(name, iteration)
#endif
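! For illustration, based on the definition above: with DOUBLE_PRECISION_REAL
! defined, the invocation that appears (commented out) further down,
!   SAVE_MATR("HH vec stored", na - istep + 1)
! expands to the prmat call with name = "HH vec stored" and
! iteration = na - istep + 1; in every other configuration the macro expands
! to nothing, so the debug dump compiles away.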
!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like the ScaLAPACK routine PDSYTRD)
!>
! Parameters
!
!> \param obj object of elpa_type
!> \param na Order of matrix
!>
!> \param a_mat(lda,matrixCols) Distributed matrix which should be reduced.
......@@ -80,23 +90,13 @@
!> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU If true, GPU version of the subroutine will be used
!> \param wantDebug If true, more debug information is printed
!>
#include "../general/sanity.X90"
#undef SAVE_MATR
#ifdef DOUBLE_PRECISION_REAL
#define SAVE_MATR(name, iteration) \
call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_cols,name,iteration)
#else
#define SAVE_MATR(name, iteration)
#endif
subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
(obj, na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU, wantDebug)
use cuda_functions
use iso_c_binding
use precision
......@@ -105,7 +105,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU
logical, intent(in) :: useGPU, wantDebug
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(out) :: tau(na)
......@@ -227,12 +227,12 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
)
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! a tile appears to be a square submatrix, consisting of several blocks
......@@ -393,7 +393,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
endif
if (n_stored_vecs > 0 .and. l_rows > 0) then
call obj%timer%start("blas")
if (wantDebug) call obj%timer%start("blas")
#if COMPLEXCASE == 1
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
......@@ -408,7 +408,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif
ONE, v_row, 1)
call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif
......@@ -421,10 +421,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
......@@ -439,7 +439,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
call hh_transform_complex_&
#endif
&PRECISION &
(obj, vrl, vnorm2, xf, tau(istep))
(obj, vrl, vnorm2, xf, tau(istep), wantDebug)
! Scale v_row and store Householder Vector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf
......@@ -456,13 +456,15 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! add tau after the end of the actual v_row, to be broadcast with it
v_row(l_rows+1) = tau(istep)
endif !(my_pcol == pcol(istep, nblk, np_cols))
! SAVE_MATR("HH vec stored", na - istep + 1)
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
! Broadcast the Householder Vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
!recover tau, which has been broadcasted together with v_row
......@@ -508,7 +510,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
n_iter = 0
! first calculate A*v part of (A + VU**T + UV**T)*v
! first calculate A*v part of (A + VU**T + UV**T)*v
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
......@@ -522,7 +524,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call obj%timer%start("blas")
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
......@@ -534,7 +536,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
ONE, ur_p(l_row_beg,my_thread), 1)
endif
call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
......@@ -543,7 +545,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! for GPU we introduced two other ways, either by stripes (more similar to the original
! CPU implementation) or by one large matrix-vector multiply
if (.not. useGPU) then
call obj%timer%start("blas")
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg, l_col_beg), lda, &
......@@ -557,7 +559,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
v_col(l_col_beg), 1, &
ONE, u_row(l_row_beg), 1)
endif
call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif ! not useGPU
#endif /* WITH_OPENMP */
......@@ -569,7 +571,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! Unlike for CPU, we (for each MPI thread) do just one large mat-vec multiplication
! this requires altering the algorithm when later explicitly updating the matrix
! after max_stored_uv is reached: we need to update all tiles, not only those above the diagonal
call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, l_rows,l_cols, &
ONE, a_dev, lda, &
v_row_dev , 1, &
......@@ -584,7 +586,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! ONE, u_row_dev + (l_row_beg - 1) * &
! size_of_datatype, 1)
! endif
call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("cublas")
else
!perform multiplication by stripes - it is faster than by blocks, since we call cublas with
......@@ -651,7 +653,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
if (n_stored_vecs > 0) then
call obj%timer%start("blas")
if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', &
#endif
......@@ -665,7 +667,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
aux, 1, ONE, u_col, 1)
call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif
endif ! (l_rows>0 .and. l_cols>0)
......@@ -691,10 +693,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (l_cols>0) then
tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
u_col = tmp
#endif /* WITH_MPI */
......@@ -720,14 +722,14 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
#if REALCASE == 1
call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#if REALCASE == 1
......@@ -795,7 +797,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if(.not. mat_vec_as_one_block) then
! if using mat-vec multiply by stripes, it is enough to update tiles above (or on) the diagonal only
! we then use the same calls as in the CPU version
call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * &
......@@ -804,29 +806,29 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
size_of_datatype, &
max_local_cols, ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * &
size_of_datatype , lda)
call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("cublas")
endif
else !useGPU
call obj%timer%start("blas")
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
ONE, a_mat(l_row_beg,l_col_beg), lda)
call obj%timer%stop("blas")
if (wantDebug) call obj%timer%stop("blas")
endif !useGPU
enddo
if (useGPU) then
if(mat_vec_as_one_block) then
!update whole (remaining) part of matrix, including tiles below diagonal
!update whole (remaining) part of matrix, including tiles below diagonal
!we can do that in one large cublas call
call obj%timer%start("cublas")
if (wantDebug) call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, l_rows, l_cols, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev, max_local_rows, &
uv_stored_cols_dev, max_local_cols, &
ONE, a_dev, lda)
call obj%timer%stop("cublas")
if (wantDebug) call obj%timer%stop("cublas")
endif
endif
......@@ -878,23 +880,23 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
endif !useGPU
call hh_transform_complex_&
&PRECISION &
(obj, vrl, CONST_REAL_0_0, xf, tau(2))
(obj, vrl, CONST_REAL_0_0, xf, tau(2), wantDebug)
e_vec(1) = vrl
a_mat(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
......@@ -989,7 +991,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
#if REALCASE == 1
tmp = d_vec
call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
......@@ -1010,7 +1012,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#if REALCASE == 1
......@@ -1037,5 +1039,3 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&MATH_DATATYPE&
&_&
&PRECISION
\ No newline at end of file
......@@ -201,7 +201,7 @@
endif
endif
call obj%timer%start("compute_hh_trafo_&
if (wantDebug) call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_openmp" // &
......@@ -247,7 +247,7 @@
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
call obj%timer%stop("compute_hh_trafo_&
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_openmp" // &
......@@ -1576,7 +1576,7 @@
endif
#endif
call obj%timer%stop("compute_hh_trafo_&
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_openmp" // &
......
......@@ -226,14 +226,14 @@
&" // &
&PRECISION_SUFFIX &
)
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
success = .true.
......@@ -676,7 +676,7 @@
endif
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
......@@ -685,7 +685,7 @@
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1 ! this should be optimized
......@@ -702,7 +702,7 @@
call hh_transform_complex_&
#endif
&PRECISION &
(obj, vrl, vnorm2, xf, tau)
(obj, vrl, vnorm2, xf, tau, wantDebug)
! Scale vr and store Householder Vector for back transformation
vr(1:lr) = vr(1:lr) * xf
......@@ -725,7 +725,7 @@
vr(lr+1) = tau
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Bcast(vr, lr+1, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
......@@ -734,7 +734,7 @@
MPI_COMPLEX_PRECISION, &
#endif
cur_pcol, mpi_comm_cols, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
......@@ -780,7 +780,7 @@
! Get global dot products
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
! Transform
......@@ -796,7 +796,7 @@
enddo
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! if (nlc>0) aux2=aux1
......@@ -854,7 +854,7 @@
!$omp barrier
!$omp single
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
......@@ -863,7 +863,7 @@
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (mynlc>0) aux2 = aux1
#endif /* WITH_MPI */
......@@ -907,7 +907,7 @@
! Get global dot products
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
......@@ -916,7 +916,7 @@
MPI_COMPLEX_PRECISION,&
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
......@@ -983,7 +983,7 @@
! This can be done in different ways; we use dsyrk
vav = 0
call obj%timer%start("blas")
call obj%timer%start("blas")
if (useGPU) then
if (l_rows>0) &
#if REALCASE == 1
......@@ -1010,7 +1010,7 @@
#endif
n_cols, l_rows, ONE, vmrCPU, ubound(vmrCPU,dim=1), ZERO, vav, ubound(vav,dim=1))
endif
call obj%timer%stop("blas")
call obj%timer%stop("blas")
#if REALCASE == 1
call symm_matrix_allreduce_&
#endif
......@@ -1020,7 +1020,7 @@
&PRECISION &
(obj, n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
call obj%timer%start("blas")
call obj%timer%start("blas")
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
......@@ -1040,7 +1040,7 @@
#endif
endif
enddo
call obj%timer%stop("blas")
call obj%timer%stop("blas")
#if REALCASE == 1
endif !useQR
#endif
......@@ -1297,17 +1297,17 @@
!C1 += [A11 A12] [B1
! B2]
if ( lre > lrs .and. l_cols > lcs ) then
call obj%timer%start("blas")
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
ONE, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
ZERO, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
call obj%timer%stop("blas")
call obj%timer%stop("blas")
endif
! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then
call obj%timer%start("blas")
call obj%timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMM('T', 'N', &
#endif
......@@ -1338,7 +1338,7 @@
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
call obj%timer%start("blas")
call obj%timer%start("blas")
lre = min(l_rows,(i+1)*l_rows_tile)
#if REALCASE == 1
call PRECISION_GEMM('T', 'N', &
......@@ -1348,13 +1348,13 @@
#endif
lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call obj%timer%stop("blas")
call obj%timer%stop("blas")
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call obj%timer%start("blas")
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call obj%timer%stop("blas")
enddo
endif
......@@ -1411,7 +1411,7 @@
stop 1
endif
call obj%timer%start("mpi_communication")
if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, &
#if REALCASE == 1
......@@ -1428,7 +1428,7 @@
#if COMPLEXCASE == 1
umcCUDA(1:l_cols,1:n_cols) = tmpCUDA(1:l_cols,1:n_cols)
#endif
call obj%timer%stop("mpi_communication")
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
......@@ -1469,7 +1469,7 @@
stop 1
endif
call obj%timer%start("cublas")
call obj%timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', &
#endif
......@@ -1477,7 +1477,7 @@
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', &
#endif
l_cols, n_cols, ONE, tmat_dev, nbw, umc_dev, cur_l_cols)
call obj%timer%stop("cublas")
call obj%timer%stop("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_datatype,cudaMemcpyHostToDevice)
......@@ -1487,7 +1487,7 @@
&: error in cudaMemcpy vav_dev 2"
stop 1
endif
call obj%timer%start("cublas")
call obj%timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', &
......@@ -1506,7 +1506,7 @@
call cublas_PRECISION_TRMM('Right', 'Upper', 'C', 'Nonunit', &
#endif
n_cols, n_cols, ONE, tmat_dev, nbw, vav_dev, nbw)
call obj%timer%stop("cublas")
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -1534,7 +1534,7 @@
endif
! U = U - 0.5 * V * VAV
call obj%timer%start("cublas")
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols,&
#if REALCASE == 1
......@@ -1547,7 +1547,7 @@
size_of_datatype), &
cur_l_cols, vav_dev,nbw, &
ONE, umc_dev, cur_l_cols)
call obj%timer%stop("cublas")
call obj%timer%stop("cublas")
successCUDA = cuda_memcpy( &
#if REALCASE == 1
......@@ -1617,7 +1617,7 @@
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle