Commit ba80938c authored by Andreas Marek

Merge branch 'cusolver_passing_ci' into master_pre_stage

parents 1bbf7864 6b96dfde
......@@ -62,7 +62,7 @@
__global__ void copy_double_a_tmatc_kernel(double *a_dev, double *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1, const int nblk){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows];
}
......@@ -86,7 +86,7 @@ extern "C" void copy_double_a_tmatc_FromC(double *a_dev, double *tmatc_dev, int
__global__ void copy_float_a_tmatc_kernel(float *a_dev, float *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1, const int nblk){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows];
}
......@@ -110,7 +110,7 @@ extern "C" void copy_float_a_tmatc_FromC(float *a_dev, float *tmatc_dev, int *nb
__global__ void copy_double_complex_a_tmatc_kernel(cuDoubleComplex *a_dev, cuDoubleComplex *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = cuConj(a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows]);
}
......@@ -137,7 +137,7 @@ extern "C" void copy_double_complex_a_tmatc_FromC(double _Complex *a_dev, double
__global__ void copy_float_complex_a_tmatc_kernel(cuFloatComplex *a_dev, cuFloatComplex *tmatc_dev, const int l_cols, const int matrixRows, const int l_colx, const int l_row1){
int ii_index = threadIdx.x +1; // range 1..nblk
int jj_index = blockIdx.x + 1; // range 1..l_colx-l_cols-1
int jj_index = blockIdx.x + 1; // range 1..l_cols-l_colx+1
tmatc_dev[l_colx-1+jj_index-1+(ii_index-1)*l_cols] = cuConjf(a_dev[l_row1-1+ii_index-1 + (l_colx-1+jj_index-1)*matrixRows]);
}
......
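The corrected range comments above reflect how these kernels are launched: one block per column to copy, so the grid dimension is l_cols-l_colx+1 and can legitimately be zero, which CUDA rejects as an invalid launch configuration (this is why the Fortran side below adds a guard before the call). A minimal host-side sketch of that launch pattern, using a hypothetical wrapper name rather than the actual *_FromC wrappers in this file:

```cuda
// Hypothetical launcher sketch, not ELPA's actual *_FromC wrapper.
// Grid size = l_cols - l_colx + 1 (one block per column), matching the
// corrected "range 1..l_cols-l_colx+1" comment; a zero-sized grid is an
// invalid launch configuration, hence the early return.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void copy_double_a_tmatc_kernel(double *a_dev, double *tmatc_dev,
                                           const int l_cols, const int matrixRows,
                                           const int l_colx, const int l_row1,
                                           const int nblk);

void launch_copy_double_a_tmatc(double *a_dev, double *tmatc_dev, int nblk,
                                int matrixRows, int l_cols, int l_colx, int l_row1) {
  int nblocks = l_cols - l_colx + 1;   // one block per column to copy
  if (nblocks <= 0) return;            // skip: a 0-block launch raises an error
  copy_double_a_tmatc_kernel<<<nblocks, nblk>>>(a_dev, tmatc_dev, l_cols,
                                                matrixRows, l_colx, l_row1, nblk);
  cudaError_t cuerr = cudaGetLastError();
  if (cuerr != cudaSuccess)
    printf("Error in executing copy_double_a_tmatc_kernel: %s\n",
           cudaGetErrorString(cuerr));
}
```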
......@@ -55,7 +55,6 @@
use mod_check_for_gpu
use invert_trm_cuda, only : copy_PRECISION_tmp1_tmp2, &
copy_PRECISION_a_tmp1
use cholesky_cuda
implicit none
#include "../general/precision_kinds.F90"
......@@ -498,8 +497,11 @@
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("cholesky: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
......@@ -559,7 +561,9 @@
if (useGPU) then
if (my_prow==prow(n, nblk, np_rows)) then
call copy_PRECISION_a_tmatc(a_dev, tmatc_dev, nblk, matrixRows, l_cols, l_colx, l_row1)
! if l_cols-l_colx+1 == 0 kernel launch with 0 blocks => raises error
if (l_cols-l_colx+1>0) &
call copy_PRECISION_a_tmatc(a_dev, tmatc_dev, nblk, matrixRows, l_cols, l_colx, l_row1)
endif
else ! useGPU
do i=1,nblk
......@@ -600,6 +604,11 @@
tmatc_mpi_dev = transfer(tmatc_dev, tmatc_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmatc_mpi_dev, tmatc_mpi_fortran_ptr, [l_cols,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("cholesky: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
do i=1,nblk
call obj%timer%start("mpi_cuda_communication")
......
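In the CUDA-aware MPI branch above, the newly added gpu_devicesynchronize() ensures that the kernels writing tmp1_dev and tmatc_dev have finished before the device buffers (aliased via transfer/c_f_pointer) are handed to MPI_Bcast. A hedged sketch of the same ordering in C, with an illustrative function name and assuming a CUDA-aware MPI build:

```cuda
// Illustrative sketch (not ELPA code) of the synchronize-then-broadcast
// pattern above, assuming a CUDA-aware MPI implementation that accepts
// device pointers directly.
#include <mpi.h>
#include <cuda_runtime.h>

void bcast_packed_triangle(double *tmp1_dev, int nblk, int root, MPI_Comm comm) {
  // ensure all kernels that filled tmp1_dev have completed before MPI reads it
  cudaDeviceSynchronize();
  // broadcast the packed upper triangle, nblk*(nblk+1)/2 elements, straight
  // from device memory
  MPI_Bcast(tmp1_dev, nblk * (nblk + 1) / 2, MPI_DOUBLE, root, comm);
}
```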
......@@ -614,3 +614,4 @@ extern "C" void copy_float_complex_a_tmp1_FromC(float _Complex *a_dev, float _Co
printf("Error in executing copy_float_complex_a_tmp1_kernel: %s\n",cudaGetErrorString(cuerr));
}
}
......@@ -94,13 +94,13 @@
integer(kind=c_int) :: gpu, numGPU
integer(kind=c_intptr_t) :: tmat1_dev, tmat2_dev, a_dev, tmp1_dev, tmp2_dev, zero_dev
type(c_ptr) :: tmp1_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: tmp1_mpi_fortran_ptr(:,:)
type(c_ptr) :: tmat1_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: tmat1_mpi_fortran_ptr(:,:)
MATH_DATATYPE(kind=rck), pointer :: tmp1_mpi_fortran_ptr(:)
type(c_ptr) :: tmat1_mpi_dev, tmat2_mpi_dev
MATH_DATATYPE(kind=rck), pointer :: tmat1_mpi_fortran_ptr(:,:), tmat2_mpi_fortran_ptr(:,:)
type(c_ptr) :: tmp2_mpi_dev, a_mpi_dev
integer(kind=c_intptr_t) :: a_off, tmat2_off, tmp1_off, tmp2_off
MATH_DATATYPE(kind=rck), pointer :: a_mpi_deviceptr(:,:)
MATH_DATATYPE(kind=rck), pointer :: a_mpi_deviceptr(:,:), initializer_ptr(:) !DEB
integer(kind=c_intptr_t) :: num
integer(kind=c_int) :: gpu_invert_trm
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
......@@ -399,9 +399,16 @@
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
if (wantDebug) call obj%timer%start("cuda_mpi_communication")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
......@@ -437,7 +444,6 @@
call gpublas_PRECISION_TRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, ONE, tmp2_dev, &
nblk, a_dev+a_off, matrixRows)
!successGPU = gpu_devicesynchronize()
endif
call obj%timer%stop("gpublas")
......@@ -497,7 +503,10 @@
tmat1_mpi_dev = transfer(tmat1_dev, tmat1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmat1_mpi_dev, tmat1_mpi_fortran_ptr, [l_rows,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
do i=1,nb
call MPI_Bcast(tmat1_mpi_fortran_ptr(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
......@@ -534,9 +543,6 @@
check_memcpy_gpu("elpa_invert_trm: tmat2_dev to tmat2", successGPU)
endif
endif
#else
#error "not yet implemented"
#endif
call obj%timer%start("mpi_communication")
if (l_cols-l_col1+1 > 0) &
......@@ -545,7 +551,6 @@
call obj%timer%stop("mpi_communication")
#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
if (l_cols-l_col1+1 > 0) then
num = nblk*l_cols*size_of_datatype
......@@ -553,11 +558,23 @@
gpuMemcpyHostToDevice)
check_memcpy_gpu("elpa_invert_trm: tmat2 to tmat2_dev", successGPU)
endif
endif
#else
#error "not yet implemented"
#endif
tmat2_mpi_dev = transfer(tmat2_dev, tmat2_mpi_dev)
call c_f_pointer(tmat2_mpi_dev, tmat2_mpi_fortran_ptr, [nblk,l_cols])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
if (l_cols-l_col1+1 > 0) &
call MPI_Bcast(tmat2_mpi_fortran_ptr(1,l_col1), int((l_cols-l_col1+1)*nblk,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION, int(prow(n, nblk, np_rows),kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
endif
#endif
#endif /* WITH_MPI */
if (useGPU) then
......
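The tmat2 broadcast added here sends a block of columns of a column-major device array in one call: with leading dimension nblk, columns l_col1..l_cols start at element offset (l_col1-1)*nblk and span (l_cols-l_col1+1)*nblk contiguous elements, and the l_cols-l_col1+1 > 0 guard skips processes with nothing to send. An illustrative C sketch under the same assumptions (hypothetical function name, CUDA-aware MPI assumed):

```cuda
// Illustrative sketch (not ELPA code): tmat2_dev is an nblk x l_cols
// column-major device array, so columns l_col1..l_cols form one contiguous
// block of (l_cols-l_col1+1)*nblk elements starting at offset (l_col1-1)*nblk.
#include <mpi.h>
#include <cuda_runtime.h>

void bcast_tmat2_columns(double *tmat2_dev, int nblk, int l_col1, int l_cols,
                         int root, MPI_Comm comm) {
  int ncols = l_cols - l_col1 + 1;
  if (ncols <= 0) return;            // nothing to broadcast on this process
  cudaDeviceSynchronize();           // writes to tmat2_dev must have completed
  MPI_Bcast(tmat2_dev + (size_t)(l_col1 - 1) * nblk, ncols * nblk,
            MPI_DOUBLE, root, comm);
}
```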
......@@ -492,6 +492,5 @@ module invert_trm_cuda
#endif
end subroutine
end module