Start working on issue #9

The ELPA1 gpu version changes from branch "ELPA_development_version_GPU"
have been integrated in the branch "ELPA_GPU".

However, at the moment these changes exist only as a comment (and are
deactivated), due to the following missing points:

- the flag useGPU must be passed along the ELPA1 function calls
- the GPU device arrays must be allocated as pointers (and not use
  the deprecated Cuda Fortran style)
- the calls to cublas have to be changed from the cublas Fortran API
  to the cublas C API
parent a24a8d01
......@@ -216,6 +216,15 @@
stop
endif
! if (useGPU) then
! allocate(vr_dev(max_local_rows))
! allocate(ur_dev(max_local_rows))
! allocate(vc_dev(max_local_cols))
! allocate(uc_dev(max_local_cols))
! allocate(vur_dev(max_local_rows,2*max_stored_rows))
! allocate(uvc_dev(max_local_cols,2*max_stored_rows))
! endif
d(:) = 0
e(:) = 0
tau(:) = 0
......@@ -226,6 +235,11 @@
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols)
! if (useGPU) then
! allocate(a_dev(lda,na))
! a_dev = a
! endif
do istep=na,3,-1
! Calculate number of local rows and columns of the still remaining matrix
......@@ -242,7 +256,11 @@
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:l_rows) = a(1:l_rows,l_cols+1)
! if (useGPU) then
! vr(1:l_rows) = a_dev(1:l_rows,l_cols+1)
! else
vr(1:l_rows) = a(1:l_rows,l_cols+1)
! endif
if(nstor>0 .and. l_rows>0) then
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('N', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), &
......@@ -372,17 +390,47 @@
n_iter = n_iter+1
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk8, uc(lcs), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk8, ur(lrs), 1)
endif
! if (useGPU) then
! uc_dev(1:l_cols) = 0.
! ur_dev(1:l_rows) = 0.
! vc_dev(1:l_cols) = vc(1:l_cols)
! vr_dev(1:l_rows) = vr(1:l_rows)
!! do i=0,(istep-2)/tile_size
!! lcs = i*l_cols_tile+1
!! lce = min(l_cols,(i+1)*l_cols_tile)
!! if(lce<lcs) cycle
!! do j=0,i
!! lrs = j*l_rows_tile+1
!! lre = min(l_rows,(j+1)*l_rows_tile)
!! if(lre<lrs) cycle
!! if(mod(n_iter,n_threads) == my_thread) then
!! call cublasDGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a_dev(lrs,lcs),lda,vr_dev(lrs),1,1.d0,uc_dev(lcs),1)
!! if(i/=j) call cublasDGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a_dev(lrs,lcs),lda,vc_dev(lcs),1,1.d0,ur_dev(lrs),1)
!! endif
!! n_iter = n_iter+1
!! enddo
!! enddo
!!
! !--- for now, just use DSYMV!!!
! call DSYMV('U',l_cols,1.d0,a_dev,ubound(a_dev,1),vr_dev,1,0.d0,uc_dev,1)
! uc(1:l_cols) = uc_dev(1:l_cols)
! ur(1:l_rows) = ur_dev(1:l_rows)
! else
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk8, uc(lcs), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk8, ur(lrs), 1)
endif
#else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk4, uc(lcs), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk4, ur(lrs), 1)
endif
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk4, uc(lcs), 1)
if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk4, ur(lrs), 1)
endif
#endif
! endif ! useGPU
#endif /* WITH_OPENMP */
enddo
enddo
......@@ -395,12 +443,13 @@
call timer%stop("OpenMP parallel_single")
#endif
#endif
do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo
#endif
#endif /* WITH_OPENMP */
if (nstor>0) then
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), vr, 1, 0.0_rk8, aux, 1)
......@@ -492,22 +541,33 @@
if (nstor==max_stored_rows .or. istep==3) then
! if (useGPU) then
! vur_dev(:,:) = vur(:,:)
! uvc_dev(:,:) = uvc(:,:)
! endif
do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lrs = 1
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle
! if (useGPU) then
! call cublasdgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, &
! vur_dev(lrs,1),ubound(vur_dev,1),uvc_dev(lcs,1),ubound(uvc_dev,1), &
! 1.d0,a_dev(lrs,lcs),lda)
! else
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk8, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk8, a(lrs,lcs), lda)
call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk8, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk8, a(lrs,lcs), lda)
#else
call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk4, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk4, a(lrs,lcs), lda)
call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk4, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk4, a(lrs,lcs), lda)
#endif
! endif ! useGPU
enddo
nstor = 0
......@@ -515,17 +575,24 @@
endif
if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then
! if (useGPU) a(l_rows,l_cols) = a_dev(l_rows,l_cols)
if (nstor>0) a(l_rows,l_cols) = a(l_rows,l_cols) &
+ dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor))
d(istep-1) = a(l_rows,l_cols)
! if (useGPU) a_dev(l_rows,l_cols) = a(l_rows,l_cols)
endif
enddo
! Store e(1) and d(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
! if (useGPU) then
! if(my_prow==prow(1) .and. my_pcol==pcol(2)) e(1) = a_dev(1,l_cols) ! use last l_cols value of loop above
! if(my_prow==prow(1) .and. my_pcol==pcol(1)) d(1) = a_dev(1,1)
! else
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
! endif
deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -710,12 +777,26 @@
stop
endif
! if (useGPU) then
! allocate(hvm1(max_local_rows*max_stored_rows))
!
! allocate(tmat_dev(max_stored_rows,max_stored_rows))
! allocate(hvm_dev(max_local_rows*max_stored_rows))
! allocate(tmp_dev(max_local_cols*max_stored_rows))
! allocate(q_dev(ldq,nqc))
! q_dev = q
! endif
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
nstor = 0
! if (useGPU) then
! hvn_ubnd = 0
! endif
do istep=1,na,nblk
......@@ -755,6 +836,9 @@
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
! if (useGPU) then
! hvm_ubnd = l_rows
! endif
nstor = nstor+1
nb = nb+l_rows
enddo
......@@ -805,6 +889,13 @@
nc = nc+n
enddo
! if (useGPU) then
! hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!
! hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
! tmat_dev = tmat
! endif
! Q = Q - V * T * V**T * Q
if (l_rows>0) then
......@@ -817,16 +908,23 @@
#endif
else
tmp1(1:l_cols*nstor) = 0
! if (useGPU) then
! tmp_dev(1:l_cols*nstor) = 0
! else
tmp1(1:l_cols*nstor) = 0
! endif
endif
#ifdef DOUBLE_PRECISION_REAL
! if (useGPU) then
! else
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
......@@ -834,11 +932,14 @@
endif
#else /* DOUBLE_PRECISION_REAL */
! if (useGPU) then
! else
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
......@@ -857,6 +958,11 @@
stop
endif
! if (useGPU) then
! q = q_dev
! deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
! endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
call timer%stop("trans_ev_real_double")
......@@ -1814,9 +1920,9 @@
#endif
#ifdef DOUBLE_PRECISION_REAL
subroutine solve_tridi_single_problem_double(nlen, d, e, q, ldq, wantDebug, success)
recursive subroutine solve_tridi_single_problem_double(nlen, d, e, q, ldq, wantDebug, success)
#else
subroutine solve_tridi_single_problem_single(nlen, d, e, q, ldq, wantDebug, success)
recursive subroutine solve_tridi_single_problem_single(nlen, d, e, q, ldq, wantDebug, success)
#endif
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
......@@ -1963,6 +2069,33 @@
end subroutine solve_tridi_single_problem_single
#endif
!! RJ: Doesn't work !!!!
!
!subroutine merge_systems_single( na, nm, d, e, q, ldq )
!
! implicit none
! integer na, nm, ldq
! real*8 d(na), e, q(ldq,*)
!
! integer, allocatable :: l_col(:), p_col(:), l_col_out(:), p_col_out(:)
! integer :: i
!
! allocate(l_col(na), p_col(na), l_col_out(na), p_col_out(na))
! do i = 1,na
! l_col(i) = i
! p_col(i) = 0
! l_col_out(i) = i
! p_col_out(i) = 0
! enddo
!
! call merge_systems( na, nm, d, e, q, ldq, 0, 16, mpi_comm_self, mpi_comm_self, &
! l_col, p_col, l_col_out, p_col_out, 0, 1)
!
! deallocate(l_col, p_col, l_col_out, p_col_out)
!
!end subroutine merge_systems_single
#ifdef DOUBLE_PRECISION_REAL
subroutine merge_systems_double( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success)
......@@ -2645,6 +2778,10 @@
stop
endif
! if (useGPU) then
! allocate(qtmp1_dev(MAX(1,l_rows),max_local_cols))
! endif
! Gather nonzero upper/lower components of old matrix Q
! which are needed for multiplication with new eigenvectors
......@@ -2714,6 +2851,10 @@
#endif /* WITH_MPI */
endif
! if (useGPU) then
! qtmp1_dev(:,:) = qtmp1(:,:)
! endif
! Gather the parts in d1 and z which are fitting to qtmp1.
! This also delivers nnzu/nnzl for proc np_rem
......@@ -2776,14 +2917,20 @@
! Multiply old Q with eigenvectors (upper half)
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
! if (useGPU) then
! if(l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
! call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1_dev,ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(1,1),ubound(qtmp2,1))
! else
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk8, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk8, qtmp2(1,1), ubound(qtmp2,dim=1))
call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk8, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk8, qtmp2(1,1), ubound(qtmp2,dim=1))
#else
call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk4, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk4, qtmp2(1,1), ubound(qtmp2,dim=1))
call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk4, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk4, qtmp2(1,1), ubound(qtmp2,dim=1))
#endif
! endif ! useGPU
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q:
......@@ -2802,15 +2949,20 @@
! Multiply old Q with eigenvectors (lower half)
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
! if (useGPU) then
! if(l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
! call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1_dev(l_rnm+1,1),ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1))
! else
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk8, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk8, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk8, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk8, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
#else
call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk4, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk4, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk4, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk4, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
#endif
! endif ! useGPU
! Put partial result into (output) Q
do i = 1, ncnt
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment