Commit 50967ab5 authored by Andreas Marek's avatar Andreas Marek
Browse files

Start to introduce timings of blas/lapack calls

parent 2ede0ba8
This diff is collapsed.
......@@ -214,7 +214,9 @@
z = z/sqrt(CONST_2_0)
rho = CONST_2_0*beta
! Calculate index for merging both systems by ascending eigenvalues
call timer%start("blas")
call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx )
call timer%stop("blas")
! Calculate the allowable deflation tolerance
......@@ -365,9 +367,10 @@
if (na1==1) then
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2
call timer%start("blas")
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call timer%stop("blas")
call transform_columns_PRECISION(idx1(1), idx1(2))
endif
......@@ -375,7 +378,9 @@
d(na1+1:na) = d2(1:na2)
! Calculate arrangement of all eigenvalues in output
call timer%start("blas")
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call timer%stop("blas")
! Rearrange eigenvalues
tmp = d
......@@ -414,7 +419,9 @@
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call timer%start("blas")
call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
call timer%stop("blas")
if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
! use the more stable bisection algorithm in solve_secular_equation
......@@ -501,9 +508,10 @@
! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2)
call timer%start("blas")
! Calculate arrangement of all eigenvalues in output
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call timer%stop("blas")
! Rearrange eigenvalues
tmp = d
do i=1,na
......@@ -688,9 +696,11 @@
! call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1_dev,ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(1,1),ubound(qtmp2,1))
! else
call timer%start("blas")
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1))
call timer%stop("blas")
! endif ! useGPU
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q:
......@@ -711,9 +721,11 @@
! call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1_dev(l_rnm+1,1),ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1))
! else
call timer%start("blas")
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, CONST_1_0, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), CONST_1_0, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call timer%stop("blas")
! endif ! useGPU
! Put partial result into (output) Q
......
......@@ -236,7 +236,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
use timings_dummy
#endif
implicit none
......@@ -429,7 +429,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), &
q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
if (.not.(success)) return
enddo
......@@ -458,7 +458,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
nlen = limits(my_prow+1)-noff ! Size of subproblem
call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, &
ubound(qmat1,dim=1), wantDebug, success)
if (.not.(success)) return
endif
......@@ -595,8 +595,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
print *,"solve_tridi_single: error when allocating work "//errorMessage
stop
endif
call timer%start("blas")
call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
call timer%stop("blas")
if (info /= 0) then
......@@ -606,7 +607,10 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c
d(:) = ds(:)
e(:) = es(:)
call timer%start("blas")
call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
call timer%stop("blas")
! If DSTEQR fails also, we don't know what to do further ...
if (info /= 0) then
......
......@@ -357,16 +357,17 @@
! This can be done in different ways, we use dsyrk or zherk
tmat = 0
call timer%start("blas")
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', nstor, l_rows, &
call PRECISION_SYRK('U', 'T', nstor, l_rows, &
CONST_1_0, hvm, ubound(hvm,dim=1), &
CONST_0_0, tmat, max_stored_rows)
#endif
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows)
call PRECISION_HERK('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows)
#endif
call timer%stop("blas")
nc = 0
do n = 1, nstor-1
h1(nc+1:nc+n) = tmat(1:n,n+1)
......@@ -391,6 +392,7 @@
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n = 1, nstor-1
call timer%start("blas")
#if REALCASE == 1
call PRECISION_TRMV('L', 'T', 'N', n, &
tmat, max_stored_rows, &
......@@ -401,6 +403,7 @@
tmat, max_stored_rows, &
h2(nc+1),1)
#endif
call timer%stop("blas")
#if REALCASE == 1
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
#endif
......@@ -441,7 +444,8 @@
! Q = Q - V * T * V**T * Q
if (l_rows>0) then
if(useGPU) then
if (useGPU) then
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, &
CONST_1_0, hvm_dev, hvm_ubnd, &
......@@ -454,8 +458,9 @@
q_dev, ldq, &
CZERO, tmp_dev, nstor)
#endif
call timer%stop("cublas")
else ! useGPU
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, &
CONST_1_0, hvm, ubound(hvm,dim=1), &
......@@ -468,6 +473,7 @@
q_mat, ldq, &
CZERO, tmp1 ,nstor)
#endif
call timer%stop("blas")
endif ! useGPU
else !l_rows>0
......@@ -527,6 +533,7 @@
if (l_rows>0) then
if (useGPU) then
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONST_1_0, tmat_dev, max_stored_rows, &
......@@ -545,9 +552,11 @@
tmp_dev, nstor, &
CONE, q_dev, ldq)
#endif
call timer%stop("cublas")
else !useGPU
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call timer%start("blas")
#if REALCASE ==1
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONST_1_0, tmat, max_stored_rows, &
......@@ -568,9 +577,9 @@
tmp2, nstor, &
CONE, q_mat, ldq)
#endif
call timer%stop("blas")
#else /* WITH_MPI */
call timer%start("blas")
#if REALCASE == 1
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
CONST_1_0, tmat, max_stored_rows, &
......@@ -589,6 +598,7 @@
tmp1, nstor, &
CONE, q_mat, ldq)
#endif
call timer%stop("blas")
#endif /* WITH_MPI */
endif ! useGPU
endif ! l_rows>0
......
......@@ -450,14 +450,15 @@
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)*size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
cudaMemcpyDeviceToHost)
#endif
check_memcpy_cuda("tridiag", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
endif
if(n_stored_vecs > 0 .and. l_rows > 0) then
if (n_stored_vecs > 0 .and. l_rows > 0) then
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, &
CONST_1_0, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
......@@ -465,12 +466,14 @@
CONST_1_0, v_row, 1)
#endif
#if COMPLEXCASE == 1
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, &
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, &
CONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
aux, 1, &
CONE, v_row, 1)
#endif
call timer%stop("blas")
endif
if(my_prow == prow(istep-1, nblk, np_rows)) then
......@@ -643,6 +646,7 @@
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
CONST_1_0, a_mat(l_row_beg,l_col_beg), lda, &
......@@ -667,6 +671,7 @@
CONE, ur_p(l_row_beg,my_thread), 1)
endif
#endif
call timer%stop("blas")
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
......@@ -678,7 +683,7 @@
#if COMPLEXCASE == 1
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex
#endif
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONST_1_0,a_dev + a_offset, lda, &
......@@ -706,9 +711,9 @@
CONE, u_row_dev + (l_row_beg - 1) * size_of_PRECISION_complex, 1)
#endif
endif
call timer%stop("cublas")
else ! useGPU
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
CONST_1_0, a_mat(l_row_beg, l_col_beg), lda, &
......@@ -736,6 +741,7 @@
CONE, u_row(l_row_beg), 1)
#endif
endif
call timer%stop("blas")
endif ! useGPU
#endif /* WITH_OPENMP */
......@@ -777,6 +783,7 @@
#endif /* WITH_OPENMP */
if (n_stored_vecs > 0) then
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', l_rows, 2*n_stored_vecs, &
CONST_1_0, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
......@@ -797,6 +804,7 @@
aux, 1, &
CONE, u_col, 1)
#endif
call timer%stop("blas")
endif
endif ! (l_rows>0 .and. l_cols>0)
......@@ -937,6 +945,7 @@
cycle
if (useGPU) then
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
CONST_1_0, vu_stored_rows_dev + (l_row_beg - 1) * size_of_PRECISION_real, max_local_rows, &
......@@ -949,8 +958,9 @@
uv_stored_cols_dev + (l_col_beg - 1) * size_of_PRECISION_complex, max_local_cols, &
CONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex, lda)
#endif
call timer%stop("cublas")
else !useGPU
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
CONST_1_0, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
......@@ -963,6 +973,7 @@
uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
CONE, a_mat(l_row_beg,l_col_beg), lda)
#endif
call timer%stop("blas")
endif !useGPU
enddo
......
......@@ -591,13 +591,19 @@
endif
vav = 0
if (l_rows>0) &
if (l_rows>0) then
#ifdef DOUBLE_PRECISION_COMPLEX
call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
call timer%start("blas")
call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
call timer%stop("blas")
endif
call herm_matrix_allreduce_double(n_cols,vav, nbw,nbw,mpi_comm_rows)
#else
call timer%start("blas")
call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
call timer%stop("blas")
endif
call herm_matrix_allreduce_single(n_cols,vav, nbw,nbw,mpi_comm_rows)
#endif
......@@ -606,11 +612,13 @@
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#else
call ctrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif
call timer%stop("blas")
tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
endif
enddo
......@@ -676,6 +684,7 @@
lre = min(l_rows,(i+1)*l_rows_tile)
if (useGPU) then
call timer%start("cublas")
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_ZGEMM('C', 'N', lce-lcs+1, n_cols, lre, CONE, (a_dev + ((lcs-1)*lda* &
size_of_double_complex_datatype)), lda, &
......@@ -685,7 +694,9 @@
size_of_single_complex_datatype)), lda, &
vmr_dev, cur_l_rows, CONE, (umc_dev +(lcs-1)*size_of_single_complex_datatype), cur_l_cols)
#endif
call timer%stop("cublas")
else
call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMM('C', 'N', lce-lcs+1, n_cols, lre, CONE, a(1,lcs), ubound(a,dim=1), &
vmr, ubound(vmr,dim=1), CONE, umc(lcs,1), ubound(umc,dim=1))
......@@ -693,29 +704,38 @@
call CGEMM('C', 'N', lce-lcs+1, n_cols, lre, CONE, a(1,lcs), ubound(a,dim=1), &
vmr, ubound(vmr,dim=1), CONE, umc(lcs,1), ubound(umc,dim=1))
#endif
call timer%stop("blas")
endif
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call timer%start("cublas")
call cublas_ZGEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, (a_dev+((lcs-1)*lda* &
size_of_double_complex_datatype)),lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_double_complex_datatype), cur_l_cols,CONE, &
(vmr_dev+(cur_l_rows * n_cols)*size_of_double_complex_datatype), cur_l_rows)
call timer%stop("cublas")
else
call timer%start("blas")
call ZGEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, a(1,lcs), lda, &
umc(lcs,n_cols+1), ubound(umc,dim=1), CONE, vmr(1,n_cols+1), ubound(vmr,dim=1))
call timer%stop("blas")
endif
#else
if (useGPU) then
call timer%start("cublas")
call cublas_CGEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, (a_dev+((lcs-1)*lda* &
size_of_single_complex_datatype)),lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_single_complex_datatype), cur_l_cols,CONE, &
(vmr_dev+(cur_l_rows * n_cols)*size_of_single_complex_datatype), cur_l_rows)
call timer%stop("cublas")
else
call timer%start("blas")
call CGEMM('N', 'N', lre, n_cols, lce-lcs+1, CONE, a(1,lcs), lda, &
umc(lcs,n_cols+1), ubound(umc,dim=1), CONE, vmr(1,n_cols+1), ubound(vmr,dim=1))
call timer%stop("blas")
endif
#endif
enddo
......@@ -835,12 +855,15 @@
print *, "bandred_complex: cuad memcpy failed tmat_dev ", istat
stop
endif
call timer%start("cublas")
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#else
call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols)
#endif
call timer%stop("cublas")
else ! not useGPU
call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
umc, ubound(umc,dim=1))
......@@ -848,6 +871,7 @@
call ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), &
umc, ubound(umc,dim=1))
#endif
call timer%stop("blas")
endif
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
......@@ -861,6 +885,7 @@
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
call timer%start("cublas")
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) &
*size_of_double_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw)
......@@ -872,7 +897,7 @@
call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw)
#endif
call timer%stop("cublas")
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_double_complex_datatype,cudaMemcpyDeviceToHost)
#else
......@@ -898,7 +923,8 @@
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
else
else ! useGPU
call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), &
ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1))
......@@ -910,7 +936,7 @@
call ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#endif
call timer%stop("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call herm_matrix_allreduce_double(n_cols,vav,nbw,nbw,mpi_comm_cols)
#else
......@@ -921,6 +947,7 @@
! U = U - 0.5 * V * VAV
if (useGPU) then
call timer%start("cublas")
#ifdef DOUBLE_PRECISION_COMPLEX
call cublas_zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), (umc_dev + &
(cur_l_cols * n_cols )*size_of_double_complex_datatype), &
......@@ -930,6 +957,7 @@
(cur_l_cols * n_cols )*size_of_single_complex_datatype), &
cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols)
#endif
call timer%stop("cublas")
! Transpose umc -> umr (stored in vmr, second half)
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
......@@ -982,6 +1010,7 @@
stop
endif
else ! not useGPU
call timer%start("blas")
#ifdef DOUBLE_PRECISION_COMPLEX
call zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), umc(1,n_cols+1), ubound(umc,dim=1), &
vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
......@@ -989,6 +1018,7 @@
call cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), umc(1,n_cols+1), ubound(umc,dim=1), &
vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1))
#endif
call timer%stop("blas")
! Transpose umc -> umr (stored in vmr, second half)
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -1010,23 +1040,32 @@
if (lce<lcs .or. lre<1) cycle
#ifdef DOUBLE_PRECISION_COMPLEX
if (useGPU) then
call timer%start("cublas")
call cublas_zgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr_dev ,cur_l_rows, (umc_dev +(lcs-1)*size_of_double_complex_datatype),cur_l_cols, &
CONE, (a_dev + (lcs-1)*lda*size_of_double_complex_datatype),lda)
call timer%stop("cublas")
else
call timer%start("blas")
call zgemm('N', 'C', lre,lce-lcs+1, 2*n_cols, -CONE, &
vmr, ubound(vmr,dim=1), umc(lcs,1), ubound(umc,dim=1), &
CONE, a(1,lcs), lda)
call timer%stop("blas")
endif
#else
if (useGPU) then
call timer%start("cublas")
call cublas_cgemm('N', 'C', lre, lce-lcs+1, 2*n_cols, -CONE, &
vmr_dev ,cur_l_rows, (umc_dev +(lcs-1)*size_of_single_complex_datatype),cur_l_cols, &
CONE, (a_dev + (lcs-1)*lda*size_of_single_complex_datatype),lda)
call timer%stop("cublas")
else
call timer%start("blas")
call cgemm('N', 'C', lre,lce-lcs+1, 2*n_cols, -CONE, &
vmr, ubound(vmr,dim=1), umc(lcs,1), ubound(umc,dim=1), &
CONE, a(1,lcs), lda)
call timer%stop("blas")
endif
#endif
enddo
......
......@@ -644,7 +644,7 @@
! This can be done in different ways, we use dsyrk
vav = 0
call timer%start("blas")
if (useGPU) then
if (l_rows>0) &
call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCUDA, cur_l_rows, CONST_0_0, vav, ubound(vav,dim=1))
......@@ -652,10 +652,11 @@
if (l_rows>0) &
call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCPU, ubound(vmrCPU,dim=1), CONST_0_0, vav, ubound(vav,dim=1))
endif
call timer%stop("blas")
call symm_matrix_allreduce_PRECISION(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
call timer%start("blas")
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
......@@ -663,6 +664,7 @@
tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
endif
enddo
call timer%stop("blas")
endif
! Transpose vmr -> vmc (stored in umc, second half)
......@@ -705,7 +707,7 @@
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
call timer%start("cublas")
lre = min(l_rows,(i+1)*l_rows_tile)
call cublas_PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, &
CONST_1_0, (a_dev + ((lcs-1)*lda*size_of_PRECISION_real)), lda, vmr_dev,cur_l_rows, &
......@@ -716,6 +718,7 @@
CONST_1_0, (a_dev+ ((lcs-1)*lda*size_of_PRECISION_real)), lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_real), cur_l_cols, &
CONST_1_0, (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_real), cur_l_rows)
call timer%stop("cublas")
enddo
successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev,vmr_size*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
......@@ -781,18 +784,22 @@
!C1 += [A11 A12] [B1
! B2]
if ( lre > lrs .and. l_cols > lcs ) then
call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
CONST_1_0, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas")
endif
! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then
call timer%start("blas")
call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, &
CONST_1_0, a(1,lcs), ubound(a,dim=1), &
vmrCPU(1,1), ubound(vmrCPU,dim=1), &
CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas")
endif
enddo
endif ! l_cols>0 .and. l_rows>0
......@@ -805,15 +812,17 @@
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
call timer%start("blas")
lre = min(l_rows,(i+1)*l_rows_tile)
call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, CONST_1_0, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), CONST_1_0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas")
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, CONST_1_0, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), CONST_1_0, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas")
enddo
endif
endif ! n_way > 1
......@@ -875,20 +884,26 @@
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call timer%start("cublas")
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols)
call timer%start("cublas")
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call timer%start("cublas")
call cublas_PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, &
CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, &
CONST_0_0, vav_dev, nbw)
call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
CONST_1_0, tmat_dev, nbw, vav_dev, nbw)