Commit 56043bdc authored by Andreas Marek's avatar Andreas Marek

Single precision support for ELPA2

ELPA2 can now be built (like ELPA1) for single-precision calculations.
The ELPA2 kernels that are implemented in assembly, C, or C++ have NOT
yet been ported.

Thus, at the moment, only the GENERIC and GENERIC_SIMPLE kernels support
single-precision calculations.
parent de6a4fde
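
The precision is selected at compile time: throughout the diff below, the MPI datatypes and the BLAS/cuBLAS routine names are wrapped in DOUBLE_PRECISION_REAL (and, for the complex case, DOUBLE_PRECISION_COMPLEX) preprocessor guards. The following minimal sketch illustrates that dispatch pattern; the guard macro and the kind-suffixed literals (0.0_rk, 1.0_rk) mirror the ELPA sources, while the small stand-alone program wrapped around them is illustrative only and not part of the commit.

! Illustrative sketch (not part of the commit): compile-time precision dispatch
! as used in the diff below. Build with an MPI Fortran compiler and cpp
! preprocessing, e.g. with or without -DDOUBLE_PRECISION_REAL.
program precision_dispatch_sketch
  use mpi
  implicit none
#ifdef DOUBLE_PRECISION_REAL
  integer, parameter :: rk = kind(1.0d0)   ! double-precision build
#else
  integer, parameter :: rk = kind(1.0)     ! single-precision build
#endif
  real(kind=rk) :: ev(4)
  integer :: mpierr

  call mpi_init(mpierr)
  ev = 0.0_rk                              ! kind-suffixed literals instead of 0.d0
#ifdef DOUBLE_PRECISION_REAL
  ! double path: MPI_REAL8 plus the d-prefixed BLAS (dgemm, dsyrk, dtrmm, ...)
  call mpi_bcast(ev, size(ev), MPI_REAL8, 0, MPI_COMM_WORLD, mpierr)
#else
  ! single path: MPI_REAL4 plus the s-prefixed BLAS (sgemm, ssyrk, strmm, ...)
  call mpi_bcast(ev, size(ev), MPI_REAL4, 0, MPI_COMM_WORLD, mpierr)
#endif
  call mpi_finalize(mpierr)
end program precision_dispatch_sketch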
......@@ -146,6 +146,7 @@ contains
use precision
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
implicit none
logical, intent(in), optional :: useQR
logical :: useQRActual, useQREnvironment
......@@ -163,7 +164,7 @@ contains
integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: nbw, num_blocks
real(kind=rk), allocatable :: tmat(:,:,:), e(:)
real(kind=rk) :: ttt0, ttt1, ttts
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=ik) :: i
logical :: success
logical, save :: firstCall = .true.
......@@ -294,6 +295,7 @@ contains
ttts = ttt0
call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
......@@ -308,14 +310,19 @@ contains
endif
ttt0 = MPI_Wtime()
call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0
#ifdef DOUBLE_PRECISION_REAL
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#else
call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr)
#endif
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
......@@ -426,6 +433,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
use precision
use cuda_functions
use mod_check_for_gpu
use iso_c_binding
implicit none
integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL
......@@ -440,7 +448,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
integer(kind=ik) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
complex(kind=ck), allocatable :: tmat(:,:,:)
real(kind=rk), allocatable :: q_real(:,:), e(:)
real(kind=rk) :: ttt0, ttt1, ttts
real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double
integer(kind=ik) :: i
logical :: success, wantDebug
......@@ -562,9 +570,13 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr)
#else
call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr)
call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr)
#endif
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
......
......@@ -266,7 +266,7 @@ module ELPA2_compute
stop
endif
work_blocked = 0.0d0
work_blocked = 0.0_rk
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
......@@ -300,7 +300,7 @@ module ELPA2_compute
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_real_datatype,cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_real_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
......@@ -422,16 +422,16 @@ module ELPA2_compute
endif ! use GPU
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = 0.
vmrCUDA(1 : cur_l_rows * n_cols) = 0._rk
else
vmrCPU(1:l_rows,1:n_cols) = 0.
vmrCPU(1:l_rows,1:n_cols) = 0._rk
endif
vr(:) = 0
tmat(:,:,istep) = 0
if (useGPU) then
umcCUDA(1 : umc_size) = 0.
umcCUDA(1 : umc_size) = 0._rk
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
......@@ -507,10 +507,15 @@ module ELPA2_compute
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.
aux1(2) = 0._rk
endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
vnorm2 = aux2(1)
vrl = aux2(2)
......@@ -525,7 +530,7 @@ module ELPA2_compute
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = 1.
vr(lr) = 1._rk
else
a(1:lr,lch) = vr(1:lr)
endif
......@@ -535,7 +540,11 @@ module ELPA2_compute
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call MPI_Bcast(vr, lr+1, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr)
#else
call MPI_Bcast(vr, lr+1, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr)
#endif
if (useGPU) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
......@@ -577,7 +586,11 @@ module ELPA2_compute
! Get global dot products
!$omp barrier
!$omp single
if (mynlc>0) call mpi_allreduce(aux1,aux2,mynlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
!$omp end single
!$omp barrier
......@@ -610,8 +623,11 @@ module ELPA2_compute
enddo
! Get global dot products
if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
! Transform
nlc = 0
......@@ -646,13 +662,24 @@ module ELPA2_compute
vav = 0
#ifdef DOUBLE_PRECISION_REAL
if (useGPU) then
if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCUDA,cur_l_rows,0.d0,vav,ubound(vav,dim=1))
call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1))
else
if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCPU,ubound(vmrCPU,dim=1),0.d0,vav,ubound(vav,dim=1))
call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
endif
#else
if (useGPU) then
if (l_rows>0) &
call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1))
else
if (l_rows>0) &
call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
endif
#endif
call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
......@@ -660,7 +687,11 @@ module ELPA2_compute
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
#ifdef DOUBLE_PRECISION_REAL
call dtrmv('U', 'T', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#else
call strmv('U', 'T', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
#endif
tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
endif
enddo
......@@ -686,7 +717,7 @@ module ELPA2_compute
! here the GPU version and the CPU version diverged substantially, due to the newest
! optimizations from Intel. The GPU version has to be re-written
if (useGPU) then
umcCUDA(1 : l_cols * n_cols) = 0.d0
umcCUDA(1 : l_cols * n_cols) = 0.0_rk
vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0
if (l_cols>0 .and. l_rows>0) then
......@@ -709,18 +740,28 @@ module ELPA2_compute
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call cublas_dgemm('T','N',lce-lcs+1,n_cols,lre, &
1.d0, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
1.d0, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
#ifdef DOUBLE_PRECISION_REAL
call cublas_dgemm('T', 'N', lce-lcs+1, n_cols, lre, &
1.0_rk, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
1.0_rk, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
#else
call cublas_sgemm('T', 'N', lce-lcs+1, n_cols, lre, &
1.0_rk, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
1.0_rk, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
#endif
if(i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call cublas_dgemm('N','N',lre,n_cols,lce-lcs+1,&
1.d0, (a_dev+ ((lcs-1)*lda*size_of_real_datatype)),lda, &
#ifdef DOUBLE_PRECISION_REAL
call cublas_dgemm('N', 'N', lre,n_cols, lce-lcs+1,&
1.0_rk, (a_dev+ ((lcs-1)*lda*size_of_real_datatype)), lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_real_datatype), cur_l_cols, &
1.0_rk, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
#else
call cublas_sgemm('N', 'N', lre,n_cols, lce-lcs+1,&
1.0_rk, (a_dev+ ((lcs-1)*lda*size_of_real_datatype)), lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_real_datatype), cur_l_cols, &
1.d0, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
1.0_rk, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
#endif
enddo
successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev,vmr_size*size_of_real_datatype,cudaMemcpyDeviceToHost)
......@@ -752,11 +793,11 @@ module ELPA2_compute
if (n_way > 1) then
!$omp do
do i=1,min(l_cols_tile, l_cols)
umcCPU(i,1:n_cols) = 0.d0
umcCPU(i,1:n_cols) = 0.0_rk
enddo
!$omp do
do i=1,l_rows
vmrCPU(i,n_cols+1:2*n_cols) = 0.d0
vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rk
enddo
if (l_cols>0 .and. l_rows>0) then
......@@ -786,23 +827,37 @@ module ELPA2_compute
!C1 += [A11 A12] [B1
! B2]
if ( lre > lrs .and. l_cols > lcs ) then
call DGEMM('N','N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1.d0, a(lrs,lcs), ubound(a,dim=1), &
#ifdef DOUBLE_PRECISION_REAL
call DGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1.0_rk, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
#else
call SGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, &
1.0_rk, a(lrs,lcs), ubound(a,dim=1), &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
0.d0, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1))
#endif
endif
! C1 += A10' B0
if ( lce > lcs .and. i > 0 ) then
call DGEMM('T','N', lce-lcs+1, n_cols, lrs-1, &
1.d0, a(1,lcs), ubound(a,dim=1), &
#ifdef DOUBLE_PRECISION_REAL
call DGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, &
1.0_rk, a(1,lcs), ubound(a,dim=1), &
vmrCPU(1,1), ubound(vmrCPU,dim=1), &
0.d0, umcCPU(lcs,1), ubound(umcCPU,dim=1))
0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
#else
call SGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, &
1.0_rk, a(1,lcs), ubound(a,dim=1), &
vmrCPU(1,1), ubound(vmrCPU,dim=1), &
0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
#endif
endif
enddo
endif ! l_cols>0 .and. l_rows>0
else ! n_way > 1
umcCPU(1:l_cols,1:n_cols) = 0.d0
umcCPU(1:l_cols,1:n_cols) = 0.0_rk
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
......@@ -812,13 +867,25 @@ module ELPA2_compute
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call DGEMM('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
vmrCPU,ubound(vmrCPU,dim=1),1.d0,umcCPU(lcs,1),ubound(umcCPU,dim=1))
#ifdef DOUBLE_PRECISION_REAL
call DGEMM('T', 'N', lce-lcs+1, n_cols, lre, 1.0_rk, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), 1.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call DGEMM('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umcCPU(lcs,n_cols+1),ubound(umcCPU,dim=1),1.d0,vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1))
call DGEMM('N', 'N', lre, n_cols, lce-lcs+1, 1.0_rk, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), 1.0_rk, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
#else
call SGEMM('T', 'N', lce-lcs+1, n_cols, lre, 1.0_rk, a(1,lcs), ubound(a,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), 1.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call SGEMM('N', 'N', lre, n_cols, lce-lcs+1, 1.0_rk, a(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), 1.0_rk, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
#endif
enddo
endif
endif ! n_way > 1
......@@ -848,7 +915,11 @@ module ELPA2_compute
stop
endif
call mpi_allreduce(umcCUDA,tmpCUDA,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,ierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, ierr)
#else
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, ierr)
#endif
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
if (allocated(tmpCUDA)) then
......@@ -872,10 +943,13 @@ module ELPA2_compute
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call cublas_dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols, &
1.d0, tmat_dev,nbw,umc_dev,cur_l_cols)
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols)
#else
call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, &
1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols)
#endif
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
......@@ -883,13 +957,22 @@ module ELPA2_compute
print *,"bandred_real: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_REAL
call cublas_dgemm('T', 'N', n_cols, n_cols, l_cols, &
1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
0.0_rk, vav_dev, nbw)
call cublas_dgemm('T','N',n_cols,n_cols,l_cols, &
1.d0, umc_dev,cur_l_cols,(umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
0.d0, vav_dev,nbw)
call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
1.0_rk, tmat_dev, nbw, vav_dev, nbw)
#else
call cublas_sgemm('T', 'N', n_cols, n_cols, l_cols, &
1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
0.0_rk, vav_dev, nbw)
call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, &
1.0_rk, tmat_dev, nbw, vav_dev, nbw)
#endif
call cublas_dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols, &
1.d0, tmat_dev,nbw, vav_dev, nbw)
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_real_datatype, cudaMemcpyDeviceToHost)
......@@ -907,10 +990,15 @@ module ELPA2_compute
endif
! U = U - 0.5 * V * VAV
call cublas_dgemm('N','N',l_cols,n_cols,n_cols,&
-0.5d0, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&
1.0d0, umc_dev,cur_l_cols)
#ifdef DOUBLE_PRECISION_REAL
call cublas_dgemm('N', 'N', l_cols, n_cols, n_cols,&
-0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&
1.0_rk, umc_dev, cur_l_cols)
#else
call cublas_sgemm('N', 'N', l_cols, n_cols, n_cols,&
-0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&
1.0_rk, umc_dev, cur_l_cols)
#endif
successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
......@@ -940,10 +1028,15 @@ module ELPA2_compute
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
call cublas_dgemm('N', 'T', lre, lce-lcs+1, 2*n_cols, -1.d0, &
vmr_dev,cur_l_rows,(umc_dev +(lcs-1)*size_of_real_datatype),cur_l_cols, &
1.d0,(a_dev+(lcs-1)*lda*size_of_real_datatype),lda)
#ifdef DOUBLE_PRECISION_REAL
call cublas_dgemm('N', 'T', lre, lce-lcs+1, 2*n_cols, -1.0_rk, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*size_of_real_datatype), cur_l_cols, &
1.0_rk, (a_dev+(lcs-1)*lda*size_of_real_datatype), lda)
#else
call cublas_sgemm('N', 'T', lre, lce-lcs+1, 2*n_cols, -1.0_rk, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)*size_of_real_datatype), cur_l_cols, &
1.0_rk, (a_dev+(lcs-1)*lda*size_of_real_datatype), lda)
#endif
enddo
else ! do not useGPU
! Or if we used the Algorithm 4
......@@ -959,8 +1052,11 @@ module ELPA2_compute
print *,"bandred_real: error when allocating tmpCPU "//errorMessage
stop
endif
call mpi_allreduce(umcCPU,tmpCPU,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
......@@ -971,23 +1067,40 @@ module ELPA2_compute
endif
! U = U * Tmat**T
#ifdef DOUBLE_PRECISION_REAL
call dtrmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, 1.0_rk, tmat(1,1,istep), ubound(tmat,dim=1), &
umcCPU, ubound(umcCPU,dim=1))
call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep),ubound(tmat,dim=1), &
umcCPU,ubound(umcCPU,dim=1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call dgemm('T', 'N', n_cols, n_cols, l_cols, 1.0_rk, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
ubound(umcCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
call dtrmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, 1.0_rk, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#else
call strmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, 1.0_rk, tmat(1,1,istep), ubound(tmat,dim=1), &
umcCPU, ubound(umcCPU,dim=1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umcCPU,ubound(umcCPU,dim=1),umcCPU(1,n_cols+1), &
ubound(umcCPU,dim=1),0.d0,vav,ubound(vav,dim=1))
call sgemm('T', 'N', n_cols, n_cols, l_cols, 1.0_rk, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), &
ubound(umcCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1))
call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), &
ubound(tmat,dim=1),vav,ubound(vav,dim=1))
call strmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, 1.0_rk, tmat(1,1,istep), &
ubound(tmat,dim=1), vav, ubound(vav,dim=1))
#endif
call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols)
! U = U - 0.5 * V * VAV
call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umcCPU(1,n_cols+1),ubound(umcCPU,dim=1),vav, &
ubound(vav,dim=1),1.d0,umcCPU,ubound(umcCPU,dim=1))
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'N', l_cols, n_cols, n_cols, -0.5_rk, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
ubound(vav,dim=1), 1.0_rk, umcCPU, ubound(umcCPU,dim=1))
#else
call sgemm('N', 'N', l_cols, n_cols, n_cols, -0.5_rk, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
ubound(vav,dim=1), 1.0_rk, umcCPU, ubound(umcCPU,dim=1))
#endif
! Transpose umc -> umr (stored in vmr, second half)
......@@ -1024,10 +1137,15 @@ module ELPA2_compute
myend = mystart + work_per_thread - 1
if ( myend > lre ) myend = lre
if ( myend-mystart+1 < 1) cycle
call dgemm('N','T',myend-mystart+1, lce-lcs+1, 2*n_cols, -1.d0, &
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -1.0_rk, &
vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
1.d0,a(mystart,lcs),ubound(a,1))
1.0_rk, a(mystart,lcs), ubound(a,1))
#else
call sgemm('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -1.0_rk, &
vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), &
1.0_rk, a(mystart,lcs), ubound(a,1))
#endif
enddo
!$omp end parallel
......@@ -1037,9 +1155,16 @@ module ELPA2_compute
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
vmrCPU,ubound(vmrCPU,dim=1),umcCPU(lcs,1),ubound(umcCPU,dim=1), &
1.d0,a(1,lcs),lda)
#ifdef DOUBLE_PRECISION_REAL
call dgemm('N', 'T', lre,lce-lcs+1, 2*n_cols, -1.0_rk, &
vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
1.0_rk, a(1,lcs), lda)
#else
call sgemm('N', 'T', lre,lce-lcs+1, 2*n_cols, -1.0_rk, &
vmrCPU, ubound(vmrCPU,dim=1), umcCPU(lcs,1), ubound(umcCPU,dim=1), &
1.0_rk, a(1,lcs), lda)
#endif
enddo
#endif /* WITH_OPENMP */
......@@ -1204,9 +1329,11 @@ module ELPA2_compute
h1(nc+1:nc+i) = a(1:i,i)
nc = nc+i
enddo
call mpi_allreduce(h1,h2,nc,MPI_REAL8,MPI_SUM,comm,mpierr)
#ifdef DOUBLE_PRECISION_REAL
call mpi_allreduce(h1, h2, nc, MPI_REAL8, MPI_SUM, comm, mpierr)
#else
call mpi_allreduce(h1, h2, nc, MPI_REAL4, MPI_SUM, comm, mpierr)