Commit c2037d4e authored by Pavel Kus

elpa1 trans_ev_real ported to GPU

Conflicts:
	src/elpa1_trans_ev_real_template.X90
	src/elpa1_tridiag_real_template.X90
parent f7c5807e
@@ -637,9 +637,9 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#else
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
@@ -816,9 +816,9 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#else
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
@@ -54,6 +54,11 @@
#include "precision_macros.h"
! cannot use __FILE__ because a filename with path can be too long for gfortran (max line length)
#define check_memcpy_cuda(file, success) call check_memcpy_CUDA_f(file, __LINE__, success)
#define check_alloc_cuda(file, success) call check_alloc_CUDA_f(file, __LINE__, success)
#define check_dealloc_cuda(file, success) call check_dealloc_CUDA_f(file, __LINE__, success)
#include "elpa1_tridiag_real_template.X90"
#include "elpa1_trans_ev_real_template.X90"
#include "elpa1_solve_tridi_real_template.X90"
@@ -52,41 +52,46 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_trans_ev_real_PRECISSION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
!-------------------------------------------------------------------------------
! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back
! to the eigenvectors of the original matrix
! (like Scalapack Routine PDORMTR)
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nqc Number of columns of matrix q
!
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! tau(na) Factors of the Householder vectors
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
!> \brief Transforms the eigenvectors of a tridiagonal matrix back
!> to the eigenvectors of the original matrix
!> (like Scalapack Routine PDORMTR)
!>
! Parameters
!
!> \param na Order of matrix a, number of rows of matrix q
!>
!> \param nqc Number of columns of matrix q
!>
!> \param a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
!> Distribution is like in Scalapack.
!>
!> \param lda Leading dimension of a
!>
!> \param tau(na) Factors of the Householder vectors
!>
!> \param q On input: Eigenvectors of tridiagonal matrix
!> On output: Transformed eigenvectors
!> Distribution is like in Scalapack.
!>
!> \param ldq Leading dimension of q
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix a and q
!>
!> \param mpi_comm_rows MPI-Communicator for rows
!>
!> \param mpi_comm_cols MPI-Communicator for columns
!>
!> \param useGPU If true, the GPU version of the subroutine will be used
!>
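!> Note: each block of stored Householder vectors is applied to Q in blocked
!> form, i.e. as the update Q := Q - V * T * V**T * Q computed below.
!>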
subroutine M_trans_ev_real_PRECISSION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
@@ -98,21 +103,29 @@
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
real(kind=REAL_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:)
real(kind=REAL_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
integer(kind=C_intptr_T) :: q_dev, tmp_dev, hvm_dev, tmat_dev
logical :: successCUDA
!write(*,*) "na", na, "nqc", nqc, "lda", lda, "ldq", ldq, "matrixCols", matrixCols
call timer%start("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
@@ -133,57 +146,52 @@
max_stored_rows = (63/nblk+1)*nblk
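! i.e. the smallest multiple of nblk larger than 63 (integer division): 64 for nblk=16, 72 for nblk=24, ...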
allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating tmat "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "tmat", istat, errorMessage)
allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating h1 "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "h1", istat, errorMessage)
allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating h2 "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "h2", istat, errorMessage)
allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating tmp1 "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "tmp1", istat, errorMessage)
allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating tmp2 "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "tmp2", istat, errorMessage)
allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating hvn "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "hvn", istat, errorMessage)
allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when allocating hvm "//errorMessage
stop
endif
call check_alloc("trans_ev_real", "hvm", istat, errorMessage)
! if (useGPU) then
! allocate(hvm1(max_local_rows*max_stored_rows))
!
! allocate(tmat_dev(max_stored_rows,max_stored_rows))
! allocate(hvm_dev(max_local_rows*max_stored_rows))
! allocate(tmp_dev(max_local_cols*max_stored_rows))
! allocate(q_dev(ldq,nqc))
! q_dev = q
! endif
if (useGPU) then
! todo: this is used only for copying hvm to the device. It should be possible to do without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
! allocate(tmat_dev(max_stored_rows,max_stored_rows))
! allocate(hvm_dev(max_local_rows*max_stored_rows))
! allocate(tmp_dev(max_local_cols*max_stored_rows))
! allocate(q_dev(ldq,nqc))
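! note: cuda_malloc expects the buffer size in bytes, hence the factor M_size_of_PRECISSION_real below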
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * M_size_of_PRECISSION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * M_size_of_PRECISSION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * M_size_of_PRECISSION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * M_size_of_PRECISSION_real)
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q
successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
hvm = 0 ! Must be set to 0 !!!
hvb = 0 ! Safety only
@@ -191,12 +199,11 @@
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
nstor = 0
! if (useGPU) then
! hvn_ubnd = 0
! endif
if (useGPU) then
hvn_ubnd = 0
endif
do istep=1,na,nblk
ics = MAX(istep,3)
ice = MIN(istep+nblk-1,na)
if (ice<ics) cycle
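! ics..ice index the Householder vectors of this block; ics starts at 3, presumably because columns 1 and 2 carry no Householder vector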
@@ -234,9 +241,9 @@
do ic=ics,ice
l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
! if (useGPU) then
! hvm_ubnd = l_rows
! endif
if (useGPU) then
hvm_ubnd = l_rows
endif
nstor = nstor+1
nb = nb+l_rows
enddo
@@ -282,65 +289,109 @@
nc = nc+n
enddo
! if (useGPU) then
! hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
!
! hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
! tmat_dev = tmat
! endif
if (useGPU) then
! todo: is this reshape really necessary?
hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))
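! (hvm1 packs hvm(1:hvm_ubnd,1:nstor) contiguously; in hvm consecutive columns are max_local_rows apart, so a single linear memcpy needs this packing whenever hvm_ubnd < max_local_rows)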
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
! Q = Q - V * T * V**T * Q
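! computed in three steps: tmp = V**T * Q (gemm), tmp = T * tmp (trmm),
! and Q = Q - V * tmp (gemm); with useGPU the same sequence runs through cuBLAS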
if (l_rows>0) then
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, &
M_CONST_0_0, tmp1, nstor)
else
! if (useGPU) then
! tmp_dev(1:l_cols*nstor) = 0
! else
if(useGPU) then
call M_cublas_PRECISSION_gemm('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm_dev, hvm_ubnd, &
q_dev, ldq, &
M_CONST_0_0, tmp_dev, nstor)
else
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, &
M_CONST_0_0, tmp1, nstor)
endif
else !l_rows>0
if (useGPU) then
!tmp_dev(1:l_cols*nstor) = 0
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * M_size_of_PRECISSION_real)
check_memcpy_cuda("trans_ev", successCUDA)
else
tmp1(1:l_cols*nstor) = 0
! endif
endif
endif
! if (useGPU) then
! else
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
! In the legacy GPU version this allreduce was omitted, but it probably has to be done for GPU + MPI as well
! todo: does the whole buffer need to be copied? Wouldn't a part be sufficient?
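! (the reduction runs on the host: copy the cuBLAS result device->host, allreduce into tmp2, then copy tmp2 back to the device)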
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
! copy tmp2 back to the device after the reduction
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else
! tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
if(useGPU) then
call M_cublas_PRECISSION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call M_cublas_PRECISSION_gemm('N', 'N' ,l_rows ,l_cols ,nstor, &
-M_CONST_1_0, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
M_CONST_1_0, q_dev, ldq)
else
#ifdef WITH_MPI
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp2, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
M_CONST_1_0, q, ldq)
! tmp2 = tmat * tmp2
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp2, nstor)
!q = q - hvm*tmp2
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
M_CONST_1_0, q, ldq)
#else
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
M_CONST_1_0, q, ldq)
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
M_CONST_1_0, q, ldq)
#endif
endif
endif ! useGPU
endif ! l_rows>0
nstor = 0
endif
endif ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32))
enddo
@@ -350,13 +401,32 @@
stop
endif
! if (useGPU) then
! q = q_dev
! deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
! endif
if (useGPU) then
!q = q_dev
successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_real: error when deallocating hvm1 "//errorMessage
stop
endif
!deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
successCUDA = cuda_free(q_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmp_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(hvm_dev)
check_dealloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_free(tmat_dev)
check_dealloc_cuda("trans_ev", successCUDA)
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_real" // M_PRECISSION_SUFFIX)
#endif
end subroutine M_trans_ev_real_PRECISSION
@@ -52,7 +52,6 @@
! distributed along with the original code in the file "COPYING".
#endif
#define check_cuda(success) call check_memcpy_CUDA(__FILE__, __LINE__, success)
!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like Scalapack Routine PDSYTRD)
!>
@@ -79,6 +78,8 @@
!> \param e(na) Off-Diagonal elements (returned), identical on all processors
!>
!> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU If true, the GPU version of the subroutine will be used
!>
subroutine M_tridiag_real_PRECISSION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU)
@@ -260,22 +261,22 @@
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "v_row_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "u_row_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "v_col_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "u_col_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "vu_stored_rows_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "uv_stored_cols_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
endif
d(:) = 0
@@ -292,9 +293,9 @@
if (useGPU) then
! allocate memory for matrix A on the device and then copy the matrix
successCUDA = cuda_malloc(a_dev, lda * na_cols * M_size_of_PRECISSION_real)
call check_alloc_CUDA("tridiag_real", "a_dev", successCUDA)
check_alloc_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), lda * na_cols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
! main cycle of tridiagonalization
@@ -324,7 +325,7 @@
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)*M_size_of_PRECISSION_real, &
cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
else
v_row(1:l_rows) = a(1:l_rows,l_cols+1)
endif
@@ -404,14 +405,14 @@
if (l_rows>0 .and. l_cols>0) then
if(useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * M_size_of_PRECISSION_real)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * M_size_of_PRECISSION_real)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
! if (useGPU) then
!u_col_dev(1:l_cols) = 0.
@@ -515,9 +516,9 @@
if(useGPU) then
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
! call M_PRECISSION_SYMV('U', l_cols, &
@@ -618,11 +619,11 @@
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
max_local_rows * 2 * max_stored_uv * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
max_local_cols * 2 * max_stored_uv * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
! vu_stored_rows_dev(:,:) = vu_stored_rows(:,:)
! uv_stored_cols_dev(:,:) = uv_stored_cols(:,:)
@@ -660,7 +661,7 @@
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * M_size_of_PRECISSION_real
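! byte offset of element a(l_rows, l_cols) within the column-major array a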
successCUDA = cuda_memcpy(loc(a(l_rows, l_cols)), a_dev + a_offset, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
if (n_stored_vecs>0) then
@@ -673,7 +674,7 @@
!a_dev(l_rows,l_cols) = a(l_rows,l_cols)
successCUDA = cuda_memcpy(a_dev + a_offset, loc(a(l_rows, l_cols)), &
1 * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
endif
@@ -685,12 +686,12 @@
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
successCUDA = cuda_memcpy(loc(e(1)), a_dev + (lda * (l_cols - 1)) * M_size_of_PRECISSION_real, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
successCUDA = cuda_memcpy(loc(d(1)), a_dev, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_cuda(successCUDA)
check_memcpy_cuda("tridiag", successCUDA)
endif
else
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) &
@@ -708,25 +709,25 @@
if (useGPU) then
! todo: should we leave the matrix a on the device for further use?
successCUDA = cuda_free(a_dev)
call check_dealloc_CUDA("tridiag_real", "a_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(v_row_dev)
call check_dealloc_CUDA("tridiag_real", "v_row_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(u_row_dev)
call check_dealloc_CUDA("tridiag_real", "u_row_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(v_col_dev)
call check_dealloc_CUDA("tridiag_real", "v_col_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(u_col_dev)
call check_dealloc_CUDA("tridiag_real", "u_col_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(vu_stored_rows_dev)
call check_dealloc_CUDA("tridiag_real", "vu_stored_rows_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(uv_stored_cols_dev)
call check_dealloc_CUDA("tridiag_real", "uv_stored_cols_dev", successCUDA)
check_dealloc_cuda("tridiag", successCUDA)
endif
@@ -792,7 +793,7 @@
!
! if((my_prow == prow) .and. (my_pcol == pcol)) then
! successCUDA = cuda_memcpy(loc(tmp(1,1)), a_dev, lda * na_cols * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
! check_cuda(successCUDA)
! check_memcpy_cuda("tridiag", successCUDA)
!
! write(*, '(A,2I4.2)') "MATRIX A ON DEVICE:", prow, pcol
! do i=1,size(tmp,1)
@@ -59,7 +59,7 @@ module ELPA_utilities
private ! By default, all routines contained are private
public :: debug_messages_via_environment_variable, error_unit
public :: check_alloc, check_alloc_CUDA, check_memcpy_CUDA, check_dealloc_CUDA