Commit 50967ab5 by Andreas Marek

### Start to introduce timings of blas/lapack calls

parent 2ede0ba8
This diff is collapsed.
 ... ... @@ -214,7 +214,9 @@ z = z/sqrt(CONST_2_0) rho = CONST_2_0*beta ! Calculate index for merging both systems by ascending eigenvalues call timer%start("blas") call PRECISION_LAMRG( nm, na-nm, d, 1, 1, idx ) call timer%stop("blas") ! Calculate the allowable deflation tolerance ... ... @@ -365,9 +367,10 @@ if (na1==1) then d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation else ! na1==2 call timer%start("blas") call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1)) call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2)) call timer%stop("blas") call transform_columns_PRECISION(idx1(1), idx1(2)) endif ... ... @@ -375,7 +378,9 @@ d(na1+1:na) = d2(1:na2) ! Calculate arrangement of all eigenvalues in output call timer%start("blas") call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call timer%stop("blas") ! Rearrange eigenvalues tmp = d ... ... @@ -414,7 +419,9 @@ !\$OMP DO #endif DO i = my_proc+1, na1, n_procs ! work distributed over all processors call timer%start("blas") call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! call timer%stop("blas") if (info/=0) then ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) ! use the more stable bisection algorithm in solve_secular_equation ... ... @@ -501,9 +508,10 @@ ! Add the deflated eigenvalues d(na1+1:na) = d2(1:na2) call timer%start("blas") ! Calculate arrangement of all eigenvalues in output call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call timer%stop("blas") ! Rearrange eigenvalues tmp = d do i=1,na ... ... @@ -688,9 +696,11 @@ ! call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1_dev,ubound(qtmp1_dev,1),ev,ubound(ev,1), & ! 1.d0,qtmp2(1,1),ubound(qtmp2,1)) ! else call timer%start("blas") if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1)) call timer%stop("blas") ! endif ! useGPU ! Compute eigenvectors of the rank-1 modified matrix. ! Parts for multiplying with lower half of Q: ... ... @@ -711,9 +721,11 @@ ! call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1_dev(l_rnm+1,1),ubound(qtmp1_dev,1),ev,ubound(ev,1), & ! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1)) ! else call timer%start("blas") if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, CONST_1_0, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & ubound(ev,dim=1), CONST_1_0, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) call timer%stop("blas") ! endif ! useGPU ! Put partial result into (output) Q ... ...
 ... ... @@ -236,7 +236,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c #ifdef HAVE_DETAILED_TIMINGS use timings #else use timings_dummy use timings_dummy #endif implicit none ... ... @@ -429,7 +429,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), & q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) if (.not.(success)) return enddo ... ... @@ -458,7 +458,7 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c nlen = limits(my_prow+1)-noff ! Size of subproblem call solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, & ubound(qmat1,dim=1), wantDebug, success) if (.not.(success)) return endif ... ... @@ -595,8 +595,9 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c print *,"solve_tridi_single: error when allocating work "//errorMessage stop endif call timer%start("blas") call PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info) call timer%stop("blas") if (info /= 0) then ... ... @@ -606,7 +607,10 @@ subroutine solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_c d(:) = ds(:) e(:) = es(:) call timer%start("blas") call PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info) call timer%stop("blas") ! If DSTEQR fails also, we don't know what to do further ... if (info /= 0) then ... ...
 ... ... @@ -357,16 +357,17 @@ ! This can be done in different ways, we use dsyrk or zherk tmat = 0 call timer%start("blas") if (l_rows>0) & #if REALCASE == 1 call PRECISION_SYRK('U', 'T', nstor, l_rows, & call PRECISION_SYRK('U', 'T', nstor, l_rows, & CONST_1_0, hvm, ubound(hvm,dim=1), & CONST_0_0, tmat, max_stored_rows) #endif #if COMPLEXCASE == 1 call PRECISION_HERK('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) call PRECISION_HERK('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) #endif call timer%stop("blas") nc = 0 do n = 1, nstor-1 h1(nc+1:nc+n) = tmat(1:n,n+1) ... ... @@ -391,6 +392,7 @@ nc = 0 tmat(1,1) = tau(ice-nstor+1) do n = 1, nstor-1 call timer%start("blas") #if REALCASE == 1 call PRECISION_TRMV('L', 'T', 'N', n, & tmat, max_stored_rows, & ... ... @@ -401,6 +403,7 @@ tmat, max_stored_rows, & h2(nc+1),1) #endif call timer%stop("blas") #if REALCASE == 1 tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) #endif ... ... @@ -441,7 +444,8 @@ ! Q = Q - V * T * V**T * Q if (l_rows>0) then if(useGPU) then if (useGPU) then call timer%start("cublas") #if REALCASE == 1 call cublas_PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, & CONST_1_0, hvm_dev, hvm_ubnd, & ... ... @@ -454,8 +458,9 @@ q_dev, ldq, & CZERO, tmp_dev, nstor) #endif call timer%stop("cublas") else ! useGPU call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, & CONST_1_0, hvm, ubound(hvm,dim=1), & ... ... @@ -468,6 +473,7 @@ q_mat, ldq, & CZERO, tmp1 ,nstor) #endif call timer%stop("blas") endif ! useGPU else !l_rows>0 ... ... @@ -527,6 +533,7 @@ if (l_rows>0) then if (useGPU) then call timer%start("cublas") #if REALCASE == 1 call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, & CONST_1_0, tmat_dev, max_stored_rows, & ... ... @@ -545,9 +552,11 @@ tmp_dev, nstor, & CONE, q_dev, ldq) #endif call timer%stop("cublas") else !useGPU #ifdef WITH_MPI ! tmp2 = tmat * tmp2 call timer%start("blas") #if REALCASE ==1 call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, & CONST_1_0, tmat, max_stored_rows, & ... ... @@ -568,9 +577,9 @@ tmp2, nstor, & CONE, q_mat, ldq) #endif call timer%stop("blas") #else /* WITH_MPI */ call timer%start("blas") #if REALCASE == 1 call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, & CONST_1_0, tmat, max_stored_rows, & ... ... @@ -589,6 +598,7 @@ tmp1, nstor, & CONE, q_mat, ldq) #endif call timer%stop("blas") #endif /* WITH_MPI */ endif ! useGPU endif ! l_rows>0 ... ...
 ... ... @@ -450,14 +450,15 @@ #endif #if COMPLEXCASE == 1 successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)*size_of_PRECISION_complex, & cudaMemcpyDeviceToHost) cudaMemcpyDeviceToHost) #endif check_memcpy_cuda("tridiag", successCUDA) else v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1) endif if(n_stored_vecs > 0 .and. l_rows > 0) then if (n_stored_vecs > 0 .and. l_rows > 0) then call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, & CONST_1_0, vu_stored_rows, ubound(vu_stored_rows,dim=1), & ... ... @@ -465,12 +466,14 @@ CONST_1_0, v_row, 1) #endif #if COMPLEXCASE == 1 aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs)) call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, & aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs)) call PRECISION_GEMV('N', l_rows, 2*n_stored_vecs, & CONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), & aux, 1, & CONE, v_row, 1) #endif call timer%stop("blas") endif if(my_prow == prow(istep-1, nblk, np_rows)) then ... ... @@ -643,6 +646,7 @@ if (l_row_end < l_row_beg) cycle #ifdef WITH_OPENMP if (mod(n_iter,n_threads) == my_thread) then call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, & CONST_1_0, a_mat(l_row_beg,l_col_beg), lda, & ... ... @@ -667,6 +671,7 @@ CONE, ur_p(l_row_beg,my_thread), 1) endif #endif call timer%stop("blas") endif n_iter = n_iter+1 #else /* WITH_OPENMP */ ... ... @@ -678,7 +683,7 @@ #if COMPLEXCASE == 1 a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex #endif call timer%start("cublas") #if REALCASE == 1 call cublas_PRECISION_GEMV('T',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, & CONST_1_0,a_dev + a_offset, lda, & ... ... @@ -706,9 +711,9 @@ CONE, u_row_dev + (l_row_beg - 1) * size_of_PRECISION_complex, 1) #endif endif call timer%stop("cublas") else ! useGPU call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, & CONST_1_0, a_mat(l_row_beg, l_col_beg), lda, & ... ... @@ -736,6 +741,7 @@ CONE, u_row(l_row_beg), 1) #endif endif call timer%stop("blas") endif ! useGPU #endif /* WITH_OPENMP */ ... ... @@ -777,6 +783,7 @@ #endif /* WITH_OPENMP */ if (n_stored_vecs > 0) then call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMV('T', l_rows, 2*n_stored_vecs, & CONST_1_0, vu_stored_rows, ubound(vu_stored_rows,dim=1), & ... ... @@ -797,6 +804,7 @@ aux, 1, & CONE, u_col, 1) #endif call timer%stop("blas") endif endif ! (l_rows>0 .and. l_cols>0) ... ... @@ -937,6 +945,7 @@ cycle if (useGPU) then call timer%start("cublas") #if REALCASE == 1 call cublas_PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, & CONST_1_0, vu_stored_rows_dev + (l_row_beg - 1) * size_of_PRECISION_real, max_local_rows, & ... ... @@ -949,8 +958,9 @@ uv_stored_cols_dev + (l_col_beg - 1) * size_of_PRECISION_complex, max_local_cols, & CONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex, lda) #endif call timer%stop("cublas") else !useGPU call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, & CONST_1_0, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), & ... ... @@ -963,6 +973,7 @@ uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), & CONE, a_mat(l_row_beg,l_col_beg), lda) #endif call timer%stop("blas") endif !useGPU enddo ... ...
 ... ... @@ -591,13 +591,19 @@ endif vav = 0 if (l_rows>0) & if (l_rows>0) then #ifdef DOUBLE_PRECISION_COMPLEX call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1)) call timer%start("blas") call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1)) call timer%stop("blas") endif call herm_matrix_allreduce_double(n_cols,vav, nbw,nbw,mpi_comm_rows) #else call timer%start("blas") call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1)) call timer%stop("blas") endif call herm_matrix_allreduce_single(n_cols,vav, nbw,nbw,mpi_comm_rows) #endif ... ... @@ -606,11 +612,13 @@ do lc=n_cols,1,-1 tau = tmat(lc,lc,istep) if (lc umr (stored in vmr, second half) if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then ... ... @@ -982,6 +1010,7 @@ stop endif else ! not useGPU call timer%start("blas") #ifdef DOUBLE_PRECISION_COMPLEX call zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk8, 0.0_rk8), umc(1,n_cols+1), ubound(umc,dim=1), & vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1)) ... ... @@ -989,6 +1018,7 @@ call cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk4, 0.0_rk4), umc(1,n_cols+1), ubound(umc,dim=1), & vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1)) #endif call timer%stop("blas") ! Transpose umc -> umr (stored in vmr, second half) #ifdef DOUBLE_PRECISION_COMPLEX ... ... @@ -1010,23 +1040,32 @@ if (lce
 ... ... @@ -644,7 +644,7 @@ ! This can be done in different ways, we use dsyrk vav = 0 call timer%start("blas") if (useGPU) then if (l_rows>0) & call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCUDA, cur_l_rows, CONST_0_0, vav, ubound(vav,dim=1)) ... ... @@ -652,10 +652,11 @@ if (l_rows>0) & call PRECISION_SYRK('U', 'T', n_cols, l_rows, CONST_1_0, vmrCPU, ubound(vmrCPU,dim=1), CONST_0_0, vav, ubound(vav,dim=1)) endif call timer%stop("blas") call symm_matrix_allreduce_PRECISION(n_cols,vav, nbw, nbw,mpi_comm_rows) ! Calculate triangular matrix T for block Householder Transformation call timer%start("blas") do lc=n_cols,1,-1 tau = tmat(lc,lc,istep) if (lc vmc (stored in umc, second half) ... ... @@ -705,7 +707,7 @@ lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce lrs .and. l_cols > lcs ) then call timer%start("blas") call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, & CONST_1_0, a(lrs,lcs), ubound(a,dim=1), & umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), & CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1)) call timer%stop("blas") endif ! C1 += A10' B0 if ( lce > lcs .and. i > 0 ) then call timer%start("blas") call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, & CONST_1_0, a(1,lcs), ubound(a,dim=1), & vmrCPU(1,1), ubound(vmrCPU,dim=1), & CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1)) call timer%stop("blas") endif enddo endif ! l_cols>0 .and. l_rows>0 ... ... @@ -805,15 +812,17 @@ lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) if (lce 1 ... ... @@ -875,20 +884,26 @@ print *,"bandred_real: error in cudaMemcpy" stop endif call timer%start("cublas") call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, & CONST_1_0, tmat_dev, nbw, umc_dev, cur_l_cols) call timer%start("cublas") ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_PRECISION_real,cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"bandred_real: error in cudaMemcpy" stop endif call timer%start("cublas") call cublas_PRECISION_GEMM('T', 'N', n_cols, n_cols, l_cols, & CONST_1_0, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_PRECISION_real),cur_l_cols, & CONST_0_0, vav_dev, nbw) call cublas_PRECISION_TRMM('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &