Commit c7746cbe authored by Andreas Marek

Disable cuda aware MPI in invert_trm and cholesky

parent ba80938c
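
This change comments out the `#ifndef WITH_CUDA_AWARE_MPI` guards and their CUDA-aware `#else` branches, so the broadcasts in the Cholesky and invert_trm kernels always take the host-staging path: copy the device buffer to the host, `MPI_Bcast` the host copy, copy the result back to the device. The sketch below illustrates the two patterns; it is not ELPA code and assumes CUDA Fortran (nvfortran) with an MPI library, using illustrative names such as `buf_dev`/`buf_host`.

```fortran
! Minimal sketch of the two broadcast paths toggled here (not ELPA code).
! Assumes CUDA Fortran (nvfortran) and any MPI; buffer names are illustrative.
program bcast_staging_sketch
  use mpi
  use cudafor
  implicit none
  integer, parameter :: n = 1024
  integer :: ierr, myrank
  real(8), allocatable         :: buf_host(:)
  real(8), allocatable, device :: buf_dev(:)

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, myrank, ierr)

  allocate(buf_host(n))
  allocate(buf_dev(n))
  buf_host = 0.0d0
  if (myrank == 0) buf_host = 1.0d0
  buf_dev = buf_host                  ! place the rank-0 data on the GPU

  ! Host-staged broadcast: the path this commit now forces unconditionally.
  buf_host = buf_dev                  ! 1) device -> host (gpuMemcpyDeviceToHost in ELPA)
  call MPI_Bcast(buf_host, n, &       ! 2) broadcast the host copy
                 MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
  buf_dev = buf_host                  ! 3) host -> device (gpuMemcpyHostToDevice in ELPA)

  ! CUDA-aware variant (the #else branches commented out by this commit):
  ! synchronize the device, then hand the device buffer to MPI directly.
  ! ierr = cudaDeviceSynchronize()
  ! call MPI_Bcast(buf_dev, n, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)

  deallocate(buf_host)
  deallocate(buf_dev)
  call MPI_Finalize(ierr)
end program bcast_staging_sketch
```

In ELPA itself the same staging is expressed with `gpu_memcpy` calls around the existing host-buffer `MPI_Bcast`, while the now-disabled branch obtained a Fortran pointer to the device buffer via `transfer`/`c_f_pointer` and passed it to `MPI_Bcast` after `gpu_devicesynchronize()`.
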
@@ -471,7 +471,7 @@
endif ! (my_pcol==pcol(n, nblk, np_cols))
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), tmp1_dev, num, &
@@ -479,9 +479,9 @@
check_memcpy_gpu("elpa_cholesky: tmp1_dev to tmp1", successGPU)
endif
#endif
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(tmp1, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
@@ -494,29 +494,29 @@
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("cholesky: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
#endif
!#else
! tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk,nblk])
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("cholesky: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
! call obj%timer%start("mpi_cuda_communication")
!
! call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nblk*(nblk+1)/2,kind=MPI_KIND), &
!#if REALCASE == 1
! MPI_REAL_PRECISION, &
!#endif
!#if COMPLEXCASE == 1
! MPI_COMPLEX_PRECISION, &
!#endif
! int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
!
! call obj%timer%stop("mpi_cuda_communication")
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(tmp1_dev, int(loc(tmp1),kind=c_intptr_t), num, &
@@ -524,7 +524,7 @@
check_memcpy_gpu("elpa_cholesky: tmp1 to tmp1_dev", successGPU)
endif
#endif
!#endif
#endif /* WITH_MPI */
@@ -577,7 +577,7 @@
endif ! useGPU
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
if (l_cols-l_colx+1 > 0) then
num = l_cols*nblk*size_of_datatype
@@ -586,12 +586,12 @@
check_memcpy_gpu("elpa_cholesky: tmatc_dev to tmatc", successGPU)
endif
endif
#endif
!#endif
#endif /* WITH_MPI */
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
do i=1,nblk
call obj%timer%start("mpi_communication")
if (l_cols-l_colx+1>0) &
@@ -600,30 +600,30 @@
call obj%timer%stop("mpi_communication")
enddo
#else
tmatc_mpi_dev = transfer(tmatc_dev, tmatc_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmatc_mpi_dev, tmatc_mpi_fortran_ptr, [l_cols,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("cholesky: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
do i=1,nblk
call obj%timer%start("mpi_cuda_communication")
if (l_cols-l_colx+1>0) &
call MPI_Bcast(tmatc_mpi_fortran_ptr(l_colx,i), int(l_cols-l_colx+1,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION, &
int(prow(n, nblk, np_rows),kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
enddo
#endif
!#else
! tmatc_mpi_dev = transfer(tmatc_dev, tmatc_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmatc_mpi_dev, tmatc_mpi_fortran_ptr, [l_cols,nblk])
!
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("cholesky: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
!
! do i=1,nblk
! call obj%timer%start("mpi_cuda_communication")
! if (l_cols-l_colx+1>0) &
! call MPI_Bcast(tmatc_mpi_fortran_ptr(l_colx,i), int(l_cols-l_colx+1,kind=MPI_KIND), &
! MPI_MATH_DATATYPE_PRECISION, &
! int(prow(n, nblk, np_rows),kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
!
! call obj%timer%stop("mpi_cuda_communication")
! enddo
!#endif
#endif /* WITH_MPI */
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
!if (l_cols-l_colx+1 > 0) then
num = l_cols*nblk*size_of_datatype
@@ -632,7 +632,7 @@
check_memcpy_gpu("elpa_cholesky: tmatc to tmatc_dev", successGPU)
!endif
endif
#endif
!#endif
#endif /* WITH_MPI */
if (useGPU) then
@@ -640,7 +640,8 @@
! a gpu version of elpa_transpose_vectors is needed
#if !defined(WITH_MPI) || (defined(WITH_MPI) && defined(WITH_CUDA_AWARE_MPI))
!#if !defined(WITH_MPI) || (defined(WITH_MPI) && defined(WITH_CUDA_AWARE_MPI))
#if !defined(WITH_MPI)
! this memcopy is only needed if
! - not mpi case
! - or mpi and cuda_aware_mpi
@@ -383,7 +383,7 @@
endif ! my_pcol==pcol(n, nblk, np_cols)
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), tmp1_dev, num, &
@@ -391,31 +391,31 @@
check_memcpy_gpu("elpa_invert_trm: tmp1_dev to tmp1", successGPU)
endif
#endif
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
call obj%timer%start("mpi_communication")
call MPI_Bcast(tmp1, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_communication")
#else
tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
if (wantDebug) call obj%timer%start("cuda_mpi_communication")
call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
if (wantDebug) call obj%timer%stop("cuda_mpi_communication")
#endif
!#else
! tmp1_mpi_dev = transfer(tmp1_dev, tmp1_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmp1_mpi_dev, tmp1_mpi_fortran_ptr, [nblk*nblk])
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
!
!
!
! if (wantDebug) call obj%timer%start("cuda_mpi_communication")
! call MPI_Bcast(tmp1_mpi_fortran_ptr, int(nb*(nb+1)/2,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
! int(pcol(n, nblk, np_cols),kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
! if (wantDebug) call obj%timer%stop("cuda_mpi_communication")
!#endif
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if ((useGPU)) then
num = nblk*nblk*size_of_datatype
successGPU = gpu_memcpy(tmp1_dev, int(loc(tmp1),kind=c_intptr_t), num, &
@@ -423,7 +423,7 @@
check_memcpy_gpu("elpa_invert_trm: tmp1 to tmp1_dev", successGPU)
endif
#endif
!#endif
#endif /* WITH_MPI */
if (useGPU) then
@@ -479,18 +479,18 @@
endif
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
num = l_rows*nblk*size_of_datatype
successGPU = gpu_memcpy(int(loc(tmat1),kind=c_intptr_t), tmat1_dev, num, &
gpuMemcpyDeviceToHost)
check_memcpy_gpu("elpa_invert_trm: tmat1_dev to tmat1", successGPU)
endif
#endif
!#endif
#endif /* WITH_MPI */
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
do i=1,nb
call obj%timer%start("mpi_communication")
call MPI_Bcast(tmat1(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
@@ -499,27 +499,27 @@
call obj%timer%stop("mpi_communication")
enddo
#else
tmat1_mpi_dev = transfer(tmat1_dev, tmat1_mpi_dev)
! and associate a fortran pointer
call c_f_pointer(tmat1_mpi_dev, tmat1_mpi_fortran_ptr, [l_rows,nblk])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
do i=1,nb
call MPI_Bcast(tmat1_mpi_fortran_ptr(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
int(pcol(n, nblk, np_cols),kind=MPI_KIND), &
int(mpi_comm_cols,kind=MPI_KIND), mpierr)
enddo
call obj%timer%stop("mpi_cuda_communication")
#endif
!#else
! tmat1_mpi_dev = transfer(tmat1_dev, tmat1_mpi_dev)
! ! and associate a fortran pointer
! call c_f_pointer(tmat1_mpi_dev, tmat1_mpi_fortran_ptr, [l_rows,nblk])
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
! call obj%timer%start("mpi_cuda_communication")
! do i=1,nb
! call MPI_Bcast(tmat1_mpi_fortran_ptr(1,i), int(l_row1-1,kind=MPI_KIND), MPI_MATH_DATATYPE_PRECISION, &
! int(pcol(n, nblk, np_cols),kind=MPI_KIND), &
! int(mpi_comm_cols,kind=MPI_KIND), mpierr)
!
! enddo
! call obj%timer%stop("mpi_cuda_communication")
!#endif
#endif /* WITH_MPI */
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
! cuda aware MPI here
num = l_rows*nblk*size_of_datatype
@@ -528,12 +528,12 @@
check_memcpy_gpu("elpa_invert_trm: tmat1 to tmat1_dev", successGPU)
endif
#endif
!#endif
#endif /* WITH_MPI */
endif ! (l_row1>1)
#ifdef WITH_MPI
#ifndef WITH_CUDA_AWARE_MPI
!#ifndef WITH_CUDA_AWARE_MPI
if (useGPU) then
if (l_cols-l_col1+1 > 0) then
@@ -559,22 +559,22 @@
check_memcpy_gpu("elpa_invert_trm: tmat2 to tmat2_dev", successGPU)
endif
endif
#else
tmat2_mpi_dev = transfer(tmat2_dev, tmat2_mpi_dev)
call c_f_pointer(tmat2_mpi_dev, tmat2_mpi_fortran_ptr, [nblk,l_cols])
if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
successGPU = gpu_devicesynchronize()
check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
call obj%timer%start("mpi_cuda_communication")
if (l_cols-l_col1+1 > 0) &
call MPI_Bcast(tmat2_mpi_fortran_ptr(1,l_col1), int((l_cols-l_col1+1)*nblk,kind=MPI_KIND), &
MPI_MATH_DATATYPE_PRECISION, int(prow(n, nblk, np_rows),kind=MPI_KIND), &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
call obj%timer%stop("mpi_cuda_communication")
#endif
!#else
! tmat2_mpi_dev = transfer(tmat2_dev, tmat2_mpi_dev)
! call c_f_pointer(tmat2_mpi_dev, tmat2_mpi_fortran_ptr, [nblk,l_cols])
!
! if (wantDebug) call obj%timer%start("cuda_aware_device_synchronize")
! successGPU = gpu_devicesynchronize()
! check_memcpy_gpu("invert_trm: device_synchronize", successGPU)
! if (wantDebug) call obj%timer%stop("cuda_aware_device_synchronize")
! call obj%timer%start("mpi_cuda_communication")
! if (l_cols-l_col1+1 > 0) &
! call MPI_Bcast(tmat2_mpi_fortran_ptr(1,l_col1), int((l_cols-l_col1+1)*nblk,kind=MPI_KIND), &
! MPI_MATH_DATATYPE_PRECISION, int(prow(n, nblk, np_rows),kind=MPI_KIND), &
! int(mpi_comm_rows,kind=MPI_KIND), mpierr)
! call obj%timer%stop("mpi_cuda_communication")
!
!#endif
#endif /* WITH_MPI */
if (useGPU) then