Commit 3ec9e836 authored by Andreas Marek's avatar Andreas Marek

Try to run QR and GPU code

parent 8596c33e
...@@ -159,9 +159,9 @@ ...@@ -159,9 +159,9 @@
#endif #endif
integer(kind=ik) :: ierr integer(kind=ik) :: ierr
integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=c_intptr_t) :: lc_start, lc_end integer(kind=c_intptr_t) :: lc_start, lc_end
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1 integer(kind=c_intptr_t) :: lce_1, lcs_1, lre_1
#endif #endif
integer(kind=ik) :: lr_end integer(kind=ik) :: lr_end
integer(kind=ik) :: na_cols integer(kind=ik) :: na_cols
...@@ -179,17 +179,32 @@ ...@@ -179,17 +179,32 @@
logical, intent(in) :: useQR logical, intent(in) :: useQR
#endif #endif
integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, & integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
ii, pp ii, pp, transformChunkSize
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION& &PRECISION&
&_& &_&
&MATH_DATATYPE &MATH_DATATYPE
logical :: useGPU_reduction_lower_block_to_tridiagonal
call obj%timer%start("bandred_& call obj%timer%start("bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
&" // & &" // &
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
if (useGPU) then
useGPU_reduction_lower_block_to_tridiagonal = .true.
#if REALCASE == 1
if (useQR) then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal = .false.
endif
#endif
endif
if (wantDebug) call obj%timer%start("mpi_communication") if (wantDebug) call obj%timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
...@@ -206,18 +221,18 @@ ...@@ -206,18 +221,18 @@
if (my_prow==0 .and. my_pcol==0) then if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_& write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk &: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_& write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk' &: ELPA2 works only for nbw==n*nblk'
endif endif
success = .false. success = .false.
return return
endif endif
endif endif
! na_rows is used nowhere; only na_cols ! na_rows is used nowhere; only na_cols
if (useGPU) then if (useGPU) then
#ifdef WITH_MPI #ifdef WITH_MPI
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
...@@ -268,11 +283,6 @@ ...@@ -268,11 +283,6 @@
#if REALCASE == 1 #if REALCASE == 1
if (useQR) then if (useQR) then
if (useGPU) then
print *,"qr decomposition at the moment not supported with GPU"
stop 1
endif
if (which_qr_decomposition == 1) then if (which_qr_decomposition == 1) then
call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') call qr_pqrparam_init(obj,pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage) allocate(tauvector(na), stat=istat, errmsg=errorMessage)
...@@ -533,6 +543,10 @@ ...@@ -533,6 +543,10 @@
! Reduce current block to lower triangular form ! Reduce current block to lower triangular form
#if REALCASE == 1 #if REALCASE == 1
if (useQR) then if (useQR) then
if (useGPU) then
! vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
endif
if (which_qr_decomposition == 1) then if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR #ifdef USE_ASSUMED_SIZE_QR
...@@ -637,7 +651,7 @@ ...@@ -637,7 +651,7 @@
#endif /* WITH_MPI */ #endif /* WITH_MPI */
if (useGPU) then if (useGPU_reduction_lower_block_to_tridiagonal) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else else
vmrCPU(1:lr,lc) = vr(1:lr) vmrCPU(1:lr,lc) = vr(1:lr)
...@@ -815,7 +829,7 @@ ...@@ -815,7 +829,7 @@
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
enddo ! lc enddo ! lc
if (useGPU) then if (useGPU_reduction_lower_block_to_tridiagonal) then
! store column tiles back to GPU ! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols) cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then if (my_pcol == cur_pcol) then
...@@ -841,7 +855,7 @@ ...@@ -841,7 +855,7 @@
vav = 0 vav = 0
call obj%timer%start("blas") call obj%timer%start("blas")
if (useGPU) then if (useGPU_reduction_lower_block_to_tridiagonal) then
if (l_rows>0) & if (l_rows>0) &
#if REALCASE == 1 #if REALCASE == 1
call PRECISION_SYRK('U', 'T', & call PRECISION_SYRK('U', 'T', &
...@@ -853,7 +867,7 @@ ...@@ -853,7 +867,7 @@
vmrCUDA, cur_l_rows, & vmrCUDA, cur_l_rows, &
ZERO, vav, ubound(vav,dim=1)) ZERO, vav, ubound(vav,dim=1))
else ! useGPU else ! useGPU_reduction_lower_block_to_tridiagonal
if (l_rows>0) & if (l_rows>0) &
#if REALCASE == 1 #if REALCASE == 1
call PRECISION_SYRK('U', 'T', & call PRECISION_SYRK('U', 'T', &
...@@ -892,6 +906,33 @@ ...@@ -892,6 +906,33 @@
#if REALCASE == 1 #if REALCASE == 1
endif !useQR endif !useQR
#endif #endif
#if REALCASE == 1
if (useGPU .and. useQR) then
! copy the data for further usage
! qr worked on *CPU arrays
!vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
int(((lc_start-1)*lda*size_of_datatype),kind=c_intptr_t)), &
int(lda*size_of_datatype,kind=c_intptr_t), loc(a(1,lc_start)), &
int(lda*size_of_datatype,kind=c_intptr_t), &
int(lr_end*size_of_datatype,kind=c_intptr_t), &
int((lc_end - lc_start+1),kind=c_intptr_t), &
int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop 1
endif
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half) ! Transpose vmr -> vmc (stored in umc, second half)
if (useGPU) then if (useGPU) then
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
...@@ -1517,7 +1558,7 @@ ...@@ -1517,7 +1558,7 @@
endif endif
endif !useGPU endif !useGPU
enddo ! istep enddo ! istep - loop
if (useGPU) then if (useGPU) then
successCUDA = cuda_free(vav_dev) successCUDA = cuda_free(vav_dev)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment