Unverified Commit ed5d94d0 authored by Andreas Marek's avatar Andreas Marek
Browse files

Start to work on issue #9

The ELPA1 gpu version changes from branch "ELPA_development_version_GPU"
have been integrated in the branch "ELPA_GPU".

However, at the moment these changes exist only as a comment (and are
deactivated), due to the following missing points:

- the flag useGPU must be passed along the ELPA1 function calls
- the GPU device arrays must be allocated as pointers (and not use
  the deprecated Cuda Fortran style)
- the calls to cublas have to be changed from the cublasFortran API
  to the cublas C API
parent a24a8d01
...@@ -216,6 +216,15 @@ ...@@ -216,6 +216,15 @@
stop stop
endif endif
! if (useGPU) then
! allocate(vr_dev(max_local_rows))
! allocate(ur_dev(max_local_rows))
! allocate(vc_dev(max_local_cols))
! allocate(uc_dev(max_local_cols))
! allocate(vur_dev(max_local_rows,2*max_stored_rows))
! allocate(uvc_dev(max_local_cols,2*max_stored_rows))
! endif
d(:) = 0 d(:) = 0
e(:) = 0 e(:) = 0
tau(:) = 0 tau(:) = 0
...@@ -226,6 +235,11 @@ ...@@ -226,6 +235,11 @@
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols)
! if (useGPU) then
! allocate(a_dev(lda,na))
! a_dev = a
! endif
do istep=na,3,-1 do istep=na,3,-1
! Calculate number of local rows and columns of the still remaining matrix ! Calculate number of local rows and columns of the still remaining matrix
...@@ -242,7 +256,11 @@ ...@@ -242,7 +256,11 @@
! Get vector to be transformed; distribute last element and norm of ! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column ! remaining elements to all procs in current column
vr(1:l_rows) = a(1:l_rows,l_cols+1) ! if (useGPU) then
! vr(1:l_rows) = a_dev(1:l_rows,l_cols+1)
! else
vr(1:l_rows) = a(1:l_rows,l_cols+1)
! endif
if(nstor>0 .and. l_rows>0) then if(nstor>0 .and. l_rows>0) then
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call DGEMV('N', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), & call DGEMV('N', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), &
...@@ -372,17 +390,47 @@ ...@@ -372,17 +390,47 @@
n_iter = n_iter+1 n_iter = n_iter+1
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL ! if (useGPU) then
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk8, uc(lcs), 1) ! uc_dev(1:l_cols) = 0.
if (i/=j) then ! ur_dev(1:l_rows) = 0.
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk8, ur(lrs), 1) ! vc_dev(1:l_cols) = vc(1:l_cols)
endif ! vr_dev(1:l_rows) = vr(1:l_rows)
!! do i=0,(istep-2)/tile_size
!! lcs = i*l_cols_tile+1
!! lce = min(l_cols,(i+1)*l_cols_tile)
!! if(lce<lcs) cycle
!! do j=0,i
!! lrs = j*l_rows_tile+1
!! lre = min(l_rows,(j+1)*l_rows_tile)
!! if(lre<lrs) cycle
!! if(mod(n_iter,n_threads) == my_thread) then
!! call cublasDGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a_dev(lrs,lcs),lda,vr_dev(lrs),1,1.d0,uc_dev(lcs),1)
!! if(i/=j) call cublasDGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a_dev(lrs,lcs),lda,vc_dev(lcs),1,1.d0,ur_dev(lrs),1)
!! endif
!! n_iter = n_iter+1
!! enddo
!! enddo
!!
! !--- for now, just use DSYMV!!!
! call DSYMV('U',l_cols,1.d0,a_dev,ubound(a_dev,1),vr_dev,1,0.d0,uc_dev,1)
! uc(1:l_cols) = uc_dev(1:l_cols)
! ur(1:l_rows) = ur_dev(1:l_rows)
! else
#ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk8, uc(lcs), 1)
if (i/=j) then
call DGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk8, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk8, ur(lrs), 1)
endif
#else #else
call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk4, uc(lcs), 1) call SGEMV('T', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vr(lrs), 1, 1.0_rk4, uc(lcs), 1)
if (i/=j) then if (i/=j) then
call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk4, ur(lrs), 1) call SGEMV('N', lre-lrs+1, lce-lcs+1, 1.0_rk4, a(lrs,lcs), lda, vc(lcs), 1, 1.0_rk4, ur(lrs), 1)
endif endif
#endif #endif
! endif ! useGPU
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
enddo enddo
enddo enddo
...@@ -395,12 +443,13 @@ ...@@ -395,12 +443,13 @@
call timer%stop("OpenMP parallel_single") call timer%stop("OpenMP parallel_single")
#endif #endif
#endif #endif
do i=0,max_threads-1 do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i) uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i) ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo enddo
#endif #endif /* WITH_OPENMP */
if (nstor>0) then if (nstor>0) then
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call DGEMV('T', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), vr, 1, 0.0_rk8, aux, 1) call DGEMV('T', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), vr, 1, 0.0_rk8, aux, 1)
...@@ -492,22 +541,33 @@ ...@@ -492,22 +541,33 @@
if (nstor==max_stored_rows .or. istep==3) then if (nstor==max_stored_rows .or. istep==3) then
! if (useGPU) then
! vur_dev(:,:) = vur(:,:)
! uvc_dev(:,:) = uvc(:,:)
! endif
do i=0,(istep-2)/tile_size do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1 lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile) lce = min(l_cols,(i+1)*l_cols_tile)
lrs = 1 lrs = 1
lre = min(l_rows,(i+1)*l_rows_tile) lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<lrs) cycle if (lce<lcs .or. lre<lrs) cycle
! if (useGPU) then
! call cublasdgemm('N','T',lre-lrs+1,lce-lcs+1,2*nstor,1.d0, &
! vur_dev(lrs,1),ubound(vur_dev,1),uvc_dev(lcs,1),ubound(uvc_dev,1), &
! 1.d0,a_dev(lrs,lcs),lda)
! else
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk8, & call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk8, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), & vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk8, a(lrs,lcs), lda) 1.0_rk8, a(lrs,lcs), lda)
#else #else
call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk4, & call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk4, &
vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), & vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
1.0_rk4, a(lrs,lcs), lda) 1.0_rk4, a(lrs,lcs), lda)
#endif #endif
! endif ! useGPU
enddo enddo
nstor = 0 nstor = 0
...@@ -515,17 +575,24 @@ ...@@ -515,17 +575,24 @@
endif endif
if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then
! if (useGPU) a(l_rows,l_cols) = a_dev(l_rows,l_cols)
if (nstor>0) a(l_rows,l_cols) = a(l_rows,l_cols) & if (nstor>0) a(l_rows,l_cols) = a(l_rows,l_cols) &
+ dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor))
d(istep-1) = a(l_rows,l_cols) d(istep-1) = a(l_rows,l_cols)
! if (useGPU) a_dev(l_rows,l_cols) = a(l_rows,l_cols)
endif endif
enddo enddo
! Store e(1) and d(1) ! Store e(1) and d(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above ! if (useGPU) then
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) ! if(my_prow==prow(1) .and. my_pcol==pcol(2)) e(1) = a_dev(1,l_cols) ! use last l_cols value of loop above
! if(my_prow==prow(1) .and. my_pcol==pcol(1)) d(1) = a_dev(1,1)
! else
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1)
! endif
deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
...@@ -710,12 +777,26 @@ ...@@ -710,12 +777,26 @@
stop stop
endif endif
! if (useGPU) then
! allocate(hvm1(max_local_rows*max_stored_rows))
!
! allocate(tmat_dev(max_stored_rows,max_stored_rows))
! allocate(hvm_dev(max_local_rows*max_stored_rows))
! allocate(tmp_dev(max_local_cols*max_stored_rows))
! allocate(q_dev(ldq,nqc))
! q_dev = q
! endif
hvm = 0 ! Must be set to 0 !!! hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only hvb = 0 ! Safety only
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
nstor = 0 nstor = 0
! if (useGPU) then
! hvn_ubnd = 0
! endif
do istep=1,na,nblk do istep=1,na,nblk
...@@ -755,6 +836,9 @@ ...@@ -755,6 +836,9 @@
do ic=ics,ice do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
! if (useGPU) then
! hvm_ubnd = l_rows
! endif
nstor = nstor+1 nstor = nstor+1
nb = nb+l_rows nb = nb+l_rows
enddo enddo
...@@ -805,6 +889,13 @@ ...@@ -805,6 +889,13 @@
nc = nc+n nc = nc+n
enddo enddo
! if (useGPU) then
! hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!
! hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
! tmat_dev = tmat
! endif
! Q = Q - V * T * V**T * Q ! Q = Q - V * T * V**T * Q
if (l_rows>0) then if (l_rows>0) then
...@@ -817,16 +908,23 @@ ...@@ -817,16 +908,23 @@
#endif #endif
else else
tmp1(1:l_cols*nstor) = 0 ! if (useGPU) then
! tmp_dev(1:l_cols*nstor) = 0
! else
tmp1(1:l_cols*nstor) = 0
! endif
endif endif
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
! if (useGPU) then
! else
#ifdef WITH_MPI #ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else #else
tmp2 = tmp1 tmp2 = tmp1
#endif #endif
! endif ! useGPU
if (l_rows>0) then if (l_rows>0) then
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor) call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), & call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
...@@ -834,11 +932,14 @@ ...@@ -834,11 +932,14 @@
endif endif
#else /* DOUBLE_PRECISION_REAL */ #else /* DOUBLE_PRECISION_REAL */
! if (useGPU) then
! else
#ifdef WITH_MPI #ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else #else
tmp2 = tmp1 tmp2 = tmp1
#endif #endif
! endif ! useGPU
if (l_rows>0) then if (l_rows>0) then
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor) call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
...@@ -857,6 +958,11 @@ ...@@ -857,6 +958,11 @@
stop stop
endif endif
! if (useGPU) then
! q = q_dev
! deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
! endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call timer%stop("trans_ev_real_double") call timer%stop("trans_ev_real_double")
...@@ -1814,9 +1920,9 @@ ...@@ -1814,9 +1920,9 @@
#endif #endif
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
subroutine solve_tridi_single_problem_double(nlen, d, e, q, ldq, wantDebug, success) recursive subroutine solve_tridi_single_problem_double(nlen, d, e, q, ldq, wantDebug, success)
#else #else
subroutine solve_tridi_single_problem_single(nlen, d, e, q, ldq, wantDebug, success) recursive subroutine solve_tridi_single_problem_single(nlen, d, e, q, ldq, wantDebug, success)
#endif #endif
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor. ! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
...@@ -1963,6 +2069,33 @@ ...@@ -1963,6 +2069,33 @@
end subroutine solve_tridi_single_problem_single end subroutine solve_tridi_single_problem_single
#endif #endif
!! RJ: Doesn't work !!!!
!
!subroutine merge_systems_single( na, nm, d, e, q, ldq )
!
! implicit none
! integer na, nm, ldq
! real*8 d(na), e, q(ldq,*)
!
! integer, allocatable :: l_col(:), p_col(:), l_col_out(:), p_col_out(:)
! integer :: i
!
! allocate(l_col(na), p_col(na), l_col_out(na), p_col_out(na))
! do i = 1,na
! l_col(i) = i
! p_col(i) = 0
! l_col_out(i) = i
! p_col_out(i) = 0
! enddo
!
! call merge_systems( na, nm, d, e, q, ldq, 0, 16, mpi_comm_self, mpi_comm_self, &
! l_col, p_col, l_col_out, p_col_out, 0, 1)
!
! deallocate(l_col, p_col, l_col_out, p_col_out)
!
!end subroutine merge_systems_single
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
subroutine merge_systems_double( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, & subroutine merge_systems_double( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success) l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success)
...@@ -2645,6 +2778,10 @@ ...@@ -2645,6 +2778,10 @@
stop stop
endif endif
! if (useGPU) then
! allocate(qtmp1_dev(MAX(1,l_rows),max_local_cols))
! endif
! Gather nonzero upper/lower components of old matrix Q ! Gather nonzero upper/lower components of old matrix Q
! which are needed for multiplication with new eigenvectors ! which are needed for multiplication with new eigenvectors
...@@ -2714,6 +2851,10 @@ ...@@ -2714,6 +2851,10 @@
#endif /* WITH_MPI */ #endif /* WITH_MPI */
endif endif
! if (useGPU) then
! qtmp1_dev(:,:) = qtmp1(:,:)
! endif
! Gather the parts in d1 and z which are fitting to qtmp1. ! Gather the parts in d1 and z which are fitting to qtmp1.
! This also delivers nnzu/nnzl for proc np_rem ! This also delivers nnzu/nnzl for proc np_rem
...@@ -2776,14 +2917,20 @@ ...@@ -2776,14 +2917,20 @@
! Multiply old Q with eigenvectors (upper half) ! Multiply old Q with eigenvectors (upper half)
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & ! if (useGPU) then
! if(l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
! call dgemm('N','N',l_rnm,ncnt,nnzu,1.d0,qtmp1_dev,ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(1,1),ubound(qtmp2,1))
! else
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk8, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk8, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk8, qtmp2(1,1), ubound(qtmp2,dim=1)) 1.0_rk8, qtmp2(1,1), ubound(qtmp2,dim=1))
#else #else
call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk4, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk4, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
1.0_rk4, qtmp2(1,1), ubound(qtmp2,dim=1)) 1.0_rk4, qtmp2(1,1), ubound(qtmp2,dim=1))
#endif #endif
! endif ! useGPU
! Compute eigenvectors of the rank-1 modified matrix. ! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q: ! Parts for multiplying with lower half of Q:
...@@ -2802,15 +2949,20 @@ ...@@ -2802,15 +2949,20 @@
! Multiply old Q with eigenvectors (lower half) ! Multiply old Q with eigenvectors (lower half)
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & ! if (useGPU) then
! if(l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
! call dgemm('N','N',l_rows-l_rnm,ncnt,nnzl,1.d0,qtmp1_dev(l_rnm+1,1),ubound(qtmp1_dev,1),ev,ubound(ev,1), &
! 1.d0,qtmp2(l_rnm+1,1),ubound(qtmp2,1))
! else
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk8, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk8, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk8, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) ubound(ev,dim=1), 1.0_rk8, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
#else #else
call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk4, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk4, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), 1.0_rk4, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) ubound(ev,dim=1), 1.0_rk4, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
#endif #endif
! endif ! useGPU
! Put partial result into (output) Q ! Put partial result into (output) Q
do i = 1, ncnt do i = 1, ncnt
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment