Commit b75e563b authored by Andreas Marek's avatar Andreas Marek
Browse files

Refactor bandred

parent 9009c2b1
...@@ -1247,31 +1247,79 @@ ...@@ -1247,31 +1247,79 @@
endif ! l_cols>0 .and. l_rows>0 endif ! l_cols>0 .and. l_rows>0
#if REALCASE == 1 else ! do not useGPU version
#if 0
! original complex implemetation check for performance
umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
else ! do not useGPU version lre = min(l_rows,(i+1)*l_rows_tile)
call timer%start("blas")
call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas")
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas")
enddo
endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */
!Code for Algorithm 4 !Code for Algorithm 4
n_way = 1 n_way = 1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#if REALCASE == 1
! check whether this can also be done in complex case
n_way = omp_get_max_threads() n_way = omp_get_max_threads()
#endif
#endif #endif
!umcCPU(1:l_cols,1:n_cols) = 0.d0 !umcCPU(1:l_cols,1:n_cols) = 0.d0
!vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0 !vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#if REALCASE == 1
!$omp parallel private( i,lcs,lce,lrs,lre) !$omp parallel private( i,lcs,lce,lrs,lre)
#endif
#endif #endif
if (n_way > 1) then if (n_way > 1) then
#ifdef WITH_OPENMP
#if REALCASE == 1
!$omp do !$omp do
#endif
#endif
do i=1,min(l_cols_tile, l_cols) do i=1,min(l_cols_tile, l_cols)
#if REALCASE == 1
umcCPU(i,1:n_cols) = CONST_0_0 umcCPU(i,1:n_cols) = CONST_0_0
#endif
#if COMPLEXCASE == 1
umcCPU(i,1:n_cols) = CONST_COMPLEX_0_0
#endif
enddo enddo
#ifdef WITH_OPENMP
#if REALCASE == 1
!$omp do !$omp do
#endif
#endif
do i=1,l_rows do i=1,l_rows
#if REALCASE == 1
vmrCPU(i,n_cols+1:2*n_cols) = CONST_0_0 vmrCPU(i,n_cols+1:2*n_cols) = CONST_0_0
#endif
#if COMPLEXCASE == 1
vmrCPU(i,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
#endif
enddo enddo
if (l_cols>0 .and. l_rows>0) then if (l_cols>0 .and. l_rows>0) then
...@@ -1289,8 +1337,11 @@ ...@@ -1289,8 +1337,11 @@
!This algorithm chosen because in this algoirhtm, the loop around the dgemm calls !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls
!is easily parallelized, and regardless of choise of algorithm, !is easily parallelized, and regardless of choise of algorithm,
!the startup cost for parallelizing the dgemms inside the loop is too great !the startup cost for parallelizing the dgemms inside the loop is too great
#ifdef WITH_OPENMP
#if REALCASE == 1
!$omp do schedule(static,1) !$omp do schedule(static,1)
#endif
#endif
do i=0,(istep*nbw-1)/tile_size do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1 ! local column start lcs = i*l_cols_tile+1 ! local column start
lce = min(l_cols, (i+1)*l_cols_tile) ! local column end lce = min(l_cols, (i+1)*l_cols_tile) ! local column end
...@@ -1303,26 +1354,39 @@ ...@@ -1303,26 +1354,39 @@
if ( lre > lrs .and. l_cols > lcs ) then if ( lre > lrs .and. l_cols > lcs ) then
call timer%start("blas") call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, & call PRECISION_GEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
CONST_1_0, a(lrs,lcs), ubound(a,dim=1), & ONE, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), & umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
CONST_0_0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1)) ZERO, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas") call timer%stop("blas")
endif endif
! C1 += A10' B0 ! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then if ( lce > lcs .and. i > 0 ) then
call timer%start("blas") call timer%start("blas")
call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, & #if REALCASE == 1
CONST_1_0, a(1,lcs), ubound(a,dim=1), & call PRECISION_GEMM('T', 'N', &
vmrCPU(1,1), ubound(vmrCPU,dim=1), & #endif
CONST_0_0, umcCPU(lcs,1), ubound(umcCPU,dim=1)) #if COMPLEXCASE == 1
call timer%stop("blas") call PRECISION_GEMM('T', 'N', &
#endif
lce-lcs+1, n_cols, lrs-1, &
ONE, a(1,lcs), ubound(a,dim=1), &
vmrCPU(1,1), ubound(vmrCPU,dim=1), &
ZERO, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas")
endif endif
enddo enddo
endif ! l_cols>0 .and. l_rows>0 endif ! l_cols>0 .and. l_rows>0
else ! n_way > 1 else ! n_way > 1
#if REALCASE == 1
umcCPU(1:l_cols,1:n_cols) = CONST_0_0 umcCPU(1:l_cols,1:n_cols) = CONST_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_0_0 vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_0_0
#endif
#if COMPLEXCASE == 1
umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
#endif
if (l_cols>0 .and. l_rows>0) then if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size do i=0,(istep*nbw-1)/tile_size
...@@ -1331,117 +1395,32 @@ ...@@ -1331,117 +1395,32 @@
if (lce<lcs) cycle if (lce<lcs) cycle
call timer%start("blas") call timer%start("blas")
lre = min(l_rows,(i+1)*l_rows_tile) lre = min(l_rows,(i+1)*l_rows_tile)
call PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, CONST_1_0, a(1,lcs), ubound(a,dim=1), & #if REALCASE == 1
vmrCPU, ubound(vmrCPU,dim=1), CONST_1_0, umcCPU(lcs,1), ubound(umcCPU,dim=1)) call PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('C', 'N', &
#endif
lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas") call timer%stop("blas")
if (i==0) cycle if (i==0) cycle
lre = min(l_rows,i*l_rows_tile) lre = min(l_rows,i*l_rows_tile)
call timer%start("blas") call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, CONST_1_0, a(1,lcs), lda, & call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), CONST_1_0, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1)) umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, &
call timer%stop("blas") vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas")
enddo enddo
endif endif
endif ! n_way > 1 endif ! n_way > 1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#if REALCASE == 1
!$omp end parallel !$omp end parallel
#endif #endif
#endif /* REALCASE == 1 */ #endif
endif ! do not useGPU version endif ! do not useGPU version
#if COMPLEXCASE == 1
if (useGPU) then
!umcCUDA(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
!vmrCUDA(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
else
umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
endif
if (l_cols>0 .and. l_rows>0) then
if (useGPU) then
!! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
!! print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
!! stop
!! endif
! successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
! stop
! endif
! !if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
! ! print *,"bandred_complex: umc size 2 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
! ! stop
! !endif
! successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
! stop
! endif
endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
if (useGPU) then
!call timer%start("cublas")
!call cublas_PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, (a_dev + ((lcs-1)*lda* &
! size_of_PRECISION_complex)), lda, &
! vmr_dev, cur_l_rows, ONE, (umc_dev +(lcs-1)*size_of_PRECISION_complex), cur_l_cols)
!call timer%stop("cublas")
else
call timer%start("blas")
call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call timer%stop("blas")
endif
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
if (useGPU) then
!call timer%start("cublas")
!call cublas_PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, (a_dev+((lcs-1)*lda* &
! size_of_PRECISION_complex)),lda, &
! (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_complex), cur_l_cols,ONE, &
! (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_complex), cur_l_rows)
!call timer%stop("cublas")
else
call timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call timer%stop("blas")
endif
enddo
if (useGPU) then
!! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
!! print *,"bandred_complex: vmr size 3 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
!! stop
!! endif
! successCUDA = cuda_memcpy(loc(vmrCUDA(1,1)),vmr_dev,vmr_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
! if (.not.(successCUDA)) then
! print *, "bandred_complex: cuad memcpy failed vmrCUDA ", istat
! stop
! endif
! ! if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
! ! print *,"bandred_complex: umc size 3 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
! ! stop
! ! endif
! successCUDA = cuda_memcpy(loc(umcCUDA(1,1)), umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
! if (.not.(successCUDA)) then
! print *, "bandred_complex: cuad memcpy failed umcCUDA ", istat
! stop
! endif
endif ! useGPU
endif ! (l_cols>0 .and. l_rows>0)
#endif /* COMPLEXCASE == 1 */
! Sum up all ur(:) parts along rows and add them to the uc(:) parts ! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal ! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the ! This is only necessary if ur has been calculated, i.e. if the
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment