...
 
......@@ -387,6 +387,30 @@ extern "C" {
m, n, k, &alpha_casted, A_casted, lda, B_casted, ldb, &beta_casted, C_casted, ldc);
}
// TODO so far only real
void cublasSsyrk_elpa_wrapper (intptr_t handle, char uplo, char trans, int n, int k,
float alpha, const float *A, int lda,
float beta, float *C, int ldc) {
cublasSsyrk(*((cublasHandle_t*)handle), fill_mode_new_api(uplo), operation_new_api(trans),
n, k, &alpha, A, lda, &beta, C, ldc);
}
void cublasDsyrk_elpa_wrapper (intptr_t handle, char uplo, char trans, int n, int k,
double alpha, const double *A, int lda,
double beta, double *C, int ldc) {
cublasDsyrk(*((cublasHandle_t*)handle), fill_mode_new_api(uplo), operation_new_api(trans),
n, k, &alpha, A, lda, &beta, C, ldc);
}
// TODO so far only real
void cublasSscal_elpa_wrapper (intptr_t handle, int n, float alpha, float *x, int incx) {
cublasSscal(*((cublasHandle_t*)handle), n, &alpha, x, incx);
}
void cublasDscal_elpa_wrapper (intptr_t handle, int n, double alpha, double *x, int incx) {
cublasDscal(*((cublasHandle_t*)handle), n, &alpha, x, incx);
}
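/* Editorial note: the wrappers above pass alpha/beta by host address, which
   assumes the cuBLAS handle is in CUBLAS_POINTER_MODE_HOST (the default).
   A minimal sketch of a handle setup consistent with that assumption; the
   function name is hypothetical and not part of this patch, and <cublas_v2.h>
   plus <stdint.h> are assumed to be available, as the wrappers above imply. */
intptr_t elpa_example_create_cublas_handle(void) {
  cublasHandle_t h;
  if (cublasCreate(&h) != CUBLAS_STATUS_SUCCESS) return 0;
  /* make the host pointer mode explicit: alpha/beta are read from host memory */
  cublasSetPointerMode(h, CUBLAS_POINTER_MODE_HOST);
  return (intptr_t) h;  /* stored as intptr_t, matching the wrapper signatures */
}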
// todo: the new cuBLAS API diverged from the standard BLAS API for these functions
// todo: it provides an out-of-place (and apparently more efficient) implementation
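/* Editorial sketch (hedged): the out-of-place routine referred to here is most
   likely cublas<T>trmm, whose v2 API writes the result into a separate matrix C
   instead of overwriting B. The in-place BLAS behaviour can be recovered by
   passing B for both B and C (and ldb for ldc); the function below is only an
   illustration and not part of this patch. */
static void example_dtrmm_inplace(cublasHandle_t handle, int m, int n, double alpha,
                                  const double *A, int lda, double *B, int ldb) {
  /* lower triangle, no transpose, non-unit diagonal, multiply from the left */
  cublasDtrmm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N,
              CUBLAS_DIAG_NON_UNIT, m, n, &alpha, A, lda, B, ldb, B, ldb);
}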
......
......@@ -328,6 +328,71 @@ module cuda_functions
end subroutine cublas_strmm_c
end interface
!TODO so far only real
interface
subroutine cublas_dsyrk_c(handle, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) &
bind(C,name='cublasDsyrk_elpa_wrapper')
use iso_c_binding
implicit none
character(1, C_CHAR), value :: uplo, trans
integer(kind=C_INT), value :: n, k
integer(kind=C_INT), intent(in), value :: lda, ldc
real(kind=C_DOUBLE), value :: alpha, beta
integer(kind=C_intptr_T), value :: a, c
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_dsyrk_c
end interface
interface
subroutine cublas_ssyrk_c(handle, uplo, trans, n, k, alpha, a, lda, beta, c, ldc) &
bind(C,name='cublasSsyrk_elpa_wrapper')
use iso_c_binding
implicit none
character(1, C_CHAR), value :: uplo, trans
integer(kind=C_INT), value :: n, k
integer(kind=C_INT), intent(in), value :: lda, ldc
real(kind=C_FLOAT), value :: alpha, beta
integer(kind=C_intptr_T), value :: a, c
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_ssyrk_c
end interface
!TODO so far only real
interface
subroutine cublas_sscal_c(handle, n, alpha, x, incx) &
bind(C,name='cublasSscal_elpa_wrapper')
use iso_c_binding
implicit none
integer(kind=C_INT), value :: n
integer(kind=C_INT), intent(in), value :: incx
real(kind=C_FLOAT), value :: alpha
integer(kind=C_intptr_T), value :: x
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_sscal_c
end interface
interface
subroutine cublas_dscal_c(handle, n, alpha, x, incx) &
bind(C,name='cublasDscal_elpa_wrapper')
use iso_c_binding
implicit none
integer(kind=C_INT), value :: n
integer(kind=C_INT), intent(in), value :: incx
real(kind=C_DOUBLE), value :: alpha
integer(kind=C_intptr_T), value :: x
integer(kind=C_intptr_T), value :: handle
end subroutine cublas_dscal_c
end interface
interface
subroutine cublas_zgemm_c(handle, cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc) &
bind(C,name='cublasZgemm_elpa_wrapper')
......@@ -759,6 +824,67 @@ module cuda_functions
#endif
end subroutine cublas_strmm
!TODO so far only real
subroutine cublas_ssyrk(uplo, trans, n, k, alpha, a, lda, beta, c, ldc)
use iso_c_binding
implicit none
character(1, C_CHAR), value :: uplo, trans
integer(kind=C_INT) :: n, k
integer(kind=C_INT), intent(in) :: lda, ldc
real(kind=C_FLOAT) :: alpha, beta
integer(kind=C_intptr_T) :: a, c
#ifdef WITH_GPU_VERSION
call cublas_ssyrk_c(cublasHandle, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)
#endif
end subroutine cublas_ssyrk
subroutine cublas_dsyrk(uplo, trans, n, k, alpha, a, lda, beta, c, ldc)
use iso_c_binding
implicit none
character(1, C_CHAR), value :: uplo, trans
integer(kind=C_INT) :: n, k
integer(kind=C_INT), intent(in) :: lda, ldc
real(kind=C_DOUBLE) :: alpha, beta
integer(kind=C_intptr_T) :: a, c
#ifdef WITH_GPU_VERSION
call cublas_dsyrk_c(cublasHandle, uplo, trans, n, k, alpha, a, lda, beta, c, ldc)
#endif
end subroutine cublas_dsyrk
!TODO so far only real
subroutine cublas_dscal(n, alpha, x, incx)
use iso_c_binding
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx
real(kind=C_DOUBLE) :: alpha
integer(kind=C_intptr_T) :: x
#ifdef WITH_GPU_VERSION
call cublas_dscal_c(cublasHandle, n, alpha, x, incx)
#endif
end subroutine cublas_dscal
subroutine cublas_sscal(n, alpha, x, incx)
use iso_c_binding
implicit none
integer(kind=C_INT) :: n
integer(kind=C_INT), intent(in) :: incx
real(kind=C_FLOAT) :: alpha
integer(kind=C_intptr_T) :: x
#ifdef WITH_GPU_VERSION
call cublas_sscal_c(cublasHandle, n, alpha, x, incx)
#endif
end subroutine cublas_sscal
subroutine cublas_zgemm(cta, ctb, m, n, k, alpha, a, lda, b, ldb, beta, c,ldc)
use iso_c_binding
......
......@@ -65,7 +65,7 @@
#else
last_stripe_width, &
#endif
kernel)
kernel, h_dev, s_dev, q_dev, w_dev)
use precision
use elpa_abstract_impl
......@@ -164,7 +164,11 @@
integer(kind=ik), intent(in) :: kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
! for the BLAS kernel
integer(kind=c_intptr_t) :: h_dev, s_dev, q_dev, w_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
#if REALCASE == 1
integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
#endif
......@@ -1490,6 +1494,7 @@
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
......@@ -1497,14 +1502,14 @@
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
& (useGPU, a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw, h_dev, s_dev, q_dev, w_dev)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
& (useGPU, a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw, h_dev, s_dev, q_dev, w_dev)
#endif
#else
......@@ -1514,14 +1519,14 @@
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
& (useGPU, a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw, h_dev, s_dev, q_dev, w_dev)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
& (useGPU, a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw, h_dev, s_dev, q_dev, w_dev)
#endif
#endif
......
......@@ -128,16 +128,21 @@
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_REAL_GPU
#define GENERIC_KERNEL ELPA_2STAGE_REAL_GENERIC
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "real_kernel"
#endif
#if COMPLEXCASE == 1
#undef GPU_KERNEL
#undef GENERIC_KERNEL
#undef KERNEL_STRING
#undef BLAS_KERNEL
#define GPU_KERNEL ELPA_2STAGE_COMPLEX_GPU
#define GENERIC_KERNEL ELPA_2STAGE_COMPLEX_GENERIC
! TODO there is no BLAS complex kernel yet; the real constant is used as a placeholder
#define BLAS_KERNEL ELPA_2STAGE_REAL_BLAS_BLOCK4
#define KERNEL_STRING "complex_kernel"
#endif
......@@ -319,30 +324,34 @@
! check consistency between request for GPUs and defined kernel
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
kernel = GENERIC_KERNEL
endif
if (kernel .ne. BLAS_KERNEL) then
if (kernel .ne. GPU_KERNEL) then
write(error_unit,*) "ELPA: Warning, GPU usage has been requested but compute kernel is defined as non-GPU!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
else if (nblk .ne. 128) then
write(error_unit,*) "ELPA: Warning, GPU kernel can run only with scalapack block size 128!"
write(error_unit,*) "The compute kernel will be executed on CPUs!"
do_useGPU_trans_ev_tridi_to_band = .false.
kernel = GENERIC_KERNEL
endif
endif ! BLAS kernel; TODO improve this logic
endif
! check again, now kernel and do_useGPU_trans_ev_tridi_to_band should be
! finally consistent
if (do_useGPU_trans_ev_tridi_to_band) then
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
if (kernel .ne. BLAS_KERNEL) then ! for the BLAS kernel both GPU and CPU are possible; TODO maybe there should be a BLAS_GPU_KERNEL?
if (kernel .ne. GPU_KERNEL) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel! Aborting..."
stop
endif
if (nblk .ne. 128) then
! this should never happen, checking as an assert
write(error_unit,*) "ELPA: INTERNAL ERROR setting GPU kernel and blocksize! Aborting..."
stop
endif
endif
else
if (kernel .eq. GPU_KERNEL) then
......
......@@ -104,6 +104,17 @@
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU
! at the moment, there are two completely different implementations for
! GPU usage
! * the LEGACY one, which uses the original NVIDIA kernels and does not
! fully work
! * the experimental BLAS kernel approach, which does not use all the
! legacy machinery but rather only offloads data inside the
! kernel
! TODO remove the LEGACY one if the BLAS approach turns out to be better
logical :: useGPU_LEGACY, useGPU_BLAS
integer(kind=ik), intent(in) :: kernel
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
......@@ -116,6 +127,9 @@
MATH_DATATYPE(kind=rck), intent(in) :: hh_trans(:,:)
integer(kind=c_intptr_t) :: q_dev
! for the BLAS kernel
integer(kind=c_intptr_t) :: h_dev, s_dev, q2_dev, w_dev
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
......@@ -178,6 +192,10 @@
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
! at the moment, at most 4 eigenvectors at a time
! TODO increase it
integer(kind=ik), parameter :: max_block_blas = 4
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
......@@ -206,9 +224,19 @@
&MATH_DATATYPE
if(useGPU) then
gpuString = "_gpu"
if (kernel .eq. ELPA_2STAGE_REAL_BLAS_BLOCK4) then ! .or. &
gpuString = "_gpu_blas"
useGPU_BLAS = .true.
useGPU_LEGACY = .false.
else
gpuString = "_gpu_legacy"
useGPU_BLAS = .false.
useGPU_LEGACY = .true.
endif
else
gpuString = ""
useGPU_BLAS = .false.
useGPU_LEGACY = .false.
endif
call obj%timer%start("trans_ev_tridi_to_band_&
......@@ -218,7 +246,7 @@
gpuString)
n_times = 0
if (useGPU) then
if (useGPU_LEGACY) then
unpack_idx = 0
row_group_size = 0
endif
......@@ -270,10 +298,10 @@
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
if (useGPU) then
if (useGPU_LEGACY) then
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
else ! useGPU
else ! useGPU_LEGACY
! openmp only in non-GPU case
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
......@@ -370,7 +398,7 @@
! last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif
endif ! useGPU
endif ! useGPU_LEGACY
#else /* WITH_OPENMP */
......@@ -378,11 +406,11 @@
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
if (useGPU) then
if (useGPU_LEGACY) then
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
else ! useGPU
else ! useGPU_LEGACY
#if REALCASE == 1
call obj%get("stripewidth_real",stripe_width, error)
......@@ -467,7 +495,7 @@
endif
#endif
#endif /* COMPLEXCASE */
endif ! useGPU
endif ! useGPU_LEGACY
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
......@@ -489,7 +517,7 @@
a_dim2 = max_blk_size + nbw
if (useGPU) then
if (useGPU_LEGACY) then
num = (stripe_width*a_dim2*stripe_count)* size_of_datatype
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* size_of_datatype)
if (.not.(successCUDA)) then
......@@ -585,7 +613,50 @@
aIntern(:,:,:) = 0.0_rck
#endif /* WITH_OPENMP */
endif !useGPU
endif !useGPU_LEGACY
if(useGPU_BLAS) then
! prepare device memory for the BLAS GPU kernel
! dimension correspondence:
! nbw = ldh = nb
! stripe_width = ldq = nq = nl
! max_block_blas = 4 (currently)
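! editorial note (inferred from the blas_block4 kernel further below): the
! device buffers allocated here mirror the host arrays of that kernel:
!   h_dev  <->  h_mat(1:max_block_blas, 1:nbw+3)   (packed Householder vectors)
!   s_dev  <->  s_mat(1:max_block_blas, 1:max_block_blas)
!   q2_dev <->  q(1:stripe_width, 1:nbw+3)         (stripe of eigenvectors)
!   w_dev  <->  w_comb(1:stripe_width, 1:max_block_blas)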
successCUDA = cuda_malloc( h_dev, max_block_blas * (nbw+3) * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( s_dev, max_block_blas * max_block_blas * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( q2_dev, stripe_width * (nbw+3) * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
successCUDA = cuda_malloc( w_dev, stripe_width * max_block_blas * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop 1
endif
endif !useGPU_BLAS
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -657,7 +728,7 @@
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#else /* WITH_OPENMP */
if (useGPU) then
if (useGPU_LEGACY) then
! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_&
&MATH_DATATYPE&
......@@ -678,7 +749,7 @@
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif /* WITH_MPI */
else ! useGPU
else ! useGPU_LEGACY
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
......@@ -696,14 +767,14 @@
&_cpu_&
&PRECISION &
(obj,aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
endif ! useGPU_LEGACY
#endif /* WITH_OPENMP */
elseif (src == my_prow) then
src_offset = src_offset+1
if (useGPU) then
if (useGPU_LEGACY) then
#ifndef WITH_OPENMP
! An unpacking of the current row group may occur before queuing the next row
......@@ -753,7 +824,7 @@
#else /* WITH_OPENMP */
if (useGPU) then
if (useGPU_LEGACY) then
else
call unpack_row_&
......@@ -834,7 +905,7 @@
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#else /* WITH_OPENMP */
if (useGPU) then
if (useGPU_LEGACY) then
! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_&
&MATH_DATATYPE&
......@@ -856,7 +927,7 @@
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#endif /* WITH_MPI */
else ! useGPU
else ! useGPU_LEGACY
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_MATH_DATATYPE_PRECISION_EXPL, &
......@@ -872,7 +943,7 @@
&_cpu_&
&PRECISION &
(obj,aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif ! useGPU
endif ! useGPU_LEGACY
#endif /* WITH_OPENMP */
......@@ -881,7 +952,7 @@
endif
enddo
if (useGPU) then
if (useGPU_LEGACY) then
! Force an unpacking of all remaining rows that haven't been unpacked yet
call unpack_and_prepare_row_group_&
&MATH_DATATYPE&
......@@ -1088,7 +1159,7 @@
endif
bcast_buffer = 0.0_rck
if (useGPU) then
if (useGPU_LEGACY) then
num = ( nbw * max_blk_size) * size_of_datatype
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
......@@ -1139,7 +1210,7 @@
&: error in cudaMemset"
stop 1
endif
endif ! useGPU
endif ! useGPU_LEGACY
current_tv_off = 0 ! Offset of next row to be broadcast
......@@ -1184,7 +1255,7 @@
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -1233,7 +1304,7 @@
#endif /* WITH_MPI */
if (useGPU) then
if (useGPU_LEGACY) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
nbw * current_local_n * &
size_of_datatype, &
......@@ -1262,13 +1333,13 @@
&PRECISION &
(bcast_buffer_dev, hh_dot_dev, nbw, &
current_local_n)
endif ! useGPU
endif ! useGPU_LEGACY
else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH Vector is 0 and not stored in hh_trans_real/complex
bcast_buffer(:,1) = 0.0_rck
if (useGPU) then
if (useGPU_LEGACY) then
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
......@@ -1284,7 +1355,7 @@
&( &
bcast_buffer_dev, hh_tau_dev, &
nbw, 1, .true.)
endif ! useGPU
endif ! useGPU_LEGACY
endif ! (current_local_n > 1) then
if (l_nev == 0) cycle
......@@ -1293,7 +1364,7 @@
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -1311,7 +1382,7 @@
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -1343,7 +1414,7 @@
#endif
n_off = current_local_n+a_off
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw* size_of_datatype, &
......@@ -1365,7 +1436,7 @@
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -1406,7 +1477,7 @@
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: not yet implemented"
......@@ -1427,7 +1498,7 @@
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( aIntern_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
......@@ -1439,9 +1510,9 @@
&: error in cudaMemcpy"
stop 1
endif
else ! useGPU
else ! useGPU_LEGACY
aIntern(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif ! useGPU
endif ! useGPU_LEGACY
#endif /* WITH_OPENMP */
endif ! top_msg_length
......@@ -1469,7 +1540,7 @@
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
i, my_thread, thread_width, kernel)
i, my_thread, thread_width, kernel, h_dev, s_dev, q2_dev, w_dev)
enddo
!$omp end parallel do
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
......@@ -1486,7 +1557,7 @@
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
last_stripe_width, kernel)
last_stripe_width, kernel, h_dev, s_dev, q2_dev, w_dev)
#endif /* WITH_OPENMP */
!send_b 1
......@@ -1521,7 +1592,7 @@
#else /* WITH_OPENMP */
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_datatype, &
......@@ -1555,7 +1626,7 @@
!compute
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -1574,7 +1645,7 @@
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, kernel)
bottom_msg_length, i, my_thread, thread_width, kernel, h_dev, s_dev, q2_dev, w_dev)
enddo
!$omp end parallel do
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
......@@ -1618,7 +1689,7 @@
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, kernel)
last_stripe_width, kernel, h_dev, s_dev, q2_dev, w_dev)
......@@ -1632,7 +1703,7 @@
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), aIntern_dev + dev_offset, &
stripe_width*bottom_msg_length* size_of_datatype, &
......@@ -1689,7 +1760,7 @@
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
kernel)
kernel, h_dev, s_dev, q2_dev, w_dev)
enddo
!$omp end parallel do
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
......@@ -1707,7 +1778,7 @@
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, kernel)
last_stripe_width, kernel, h_dev, s_dev, q2_dev, w_dev)
#endif /* WITH_OPENMP */
......@@ -1728,7 +1799,7 @@
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( aIntern_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length * size_of_datatype, &
......@@ -1768,7 +1839,7 @@
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
thread_width, kernel)
thread_width, kernel, h_dev, s_dev, q2_dev, w_dev)
enddo
!$omp end parallel do
call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
......@@ -1785,7 +1856,7 @@
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, &
last_stripe_width, kernel)
last_stripe_width, kernel, h_dev, s_dev, q2_dev, w_dev)
#endif /* WITH_OPENMP */
endif
......@@ -1856,7 +1927,7 @@
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif
if (useGPU) then
if (useGPU_LEGACY) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
stripe_width*nbw * size_of_datatype, &
......@@ -1945,7 +2016,7 @@
dst = mod(num_blk, np_rows)
if (dst == 0) then
if (useGPU) then
if (useGPU_LEGACY) then
row_group_size = min(na - num_blk*nblk, nblk)
call pack_row_group_&
&MATH_DATATYPE&
......@@ -1957,7 +2028,7 @@
do i = 1, row_group_size
q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
enddo
else ! useGPU
else ! useGPU_LEGACY
do i = 1, min(na - num_blk*nblk, nblk)
#ifdef WITH_OPENMP
......@@ -1976,11 +2047,11 @@
#endif /* WITH_OPENMP */
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif ! useGPU
endif ! useGPU_LEGACY
else ! (dst == 0)
if (useGPU) then
if (useGPU_LEGACY) then
call pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
......@@ -1989,7 +2060,7 @@
last_stripe_width, a_dim2, l_nev, &
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
else ! useGPU_LEGACY
do i = 1, nblk
#if WITH_OPENMP
call pack_row_&
......@@ -2006,7 +2077,7 @@
&(obj, aIntern, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif /* WITH_OPENMP */
enddo
endif ! useGPU
endif ! useGPU_LEGACY
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_MATH_DATATYPE_PRECISION_EXPL, &
......@@ -2101,7 +2172,7 @@
a_off = a_off + offset
if (a_off + next_local_n + nbw >= a_dim2) then
#ifdef WITH_OPENMP
if (useGPU) then
if (useGPU_LEGACY) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop 1
endif
......@@ -2121,7 +2192,7 @@
#else /* WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
if (useGPU_LEGACY) then
chunk = min(next_local_n - 1, a_off)
do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
top = min(j + chunk, top_msg_length + next_local_n)
......@@ -2140,7 +2211,7 @@
stop 1
endif
enddo
else ! not useGPU
else ! not useGPU_LEGACY
do j = top_msg_length+1, top_msg_length+next_local_n
aIntern(:,j,i) = aIntern(:,j+a_off,i)
enddo
......@@ -2194,6 +2265,10 @@
if (my_prow==0 .and. my_pcol==0 .and.print_flops == 1) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
!if (useGPU_LEGACY) then
! TODO at the moment, create q on device for both GPU implementations
! it is assumed to be there by the following routines
! TODO revise this
if (useGPU) then
! copy q to q_dev needed in trans_ev_band_to_full
successCUDA = cuda_malloc(q_dev, ldq*matrixCols* size_of_datatype)
......@@ -2218,7 +2293,7 @@
! deallocate all working space
if (.not.(useGPU)) then
if (.not.(useGPU_LEGACY)) then
nullify(aIntern)
call free(aIntern_ptr)
endif
......@@ -2335,7 +2410,7 @@
stop 1
endif
if (useGPU) then
if (useGPU_LEGACY) then
#if COMPLEXCASE == 1
! should this not be done always?
successCUDA = cuda_free(aIntern_dev)
......@@ -2391,8 +2466,43 @@
&: error in cudaFree "//errorMessage
stop 1
endif
endif ! useGPU
endif ! useGPU_LEGACY
if(useGPU_BLAS) then
successCUDA = cuda_free( h_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( s_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( q2_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
successCUDA = cuda_free( w_dev )
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaFree "//errorMessage
stop 1
endif
endif ! useGPU_BLAS
call obj%timer%stop("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
......
......@@ -64,14 +64,15 @@
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
& (useGPU, q, hh, nb, nq, ldq, ldh, h_dev, s_dev, q_dev, w_dev)
use precision
use elpa_abstract_impl
use iso_c_binding
use cuda_functions
implicit none
#include "../../general/precision_kinds.F90"
!class(elpa_abstract_impl_t), intent(inout) :: obj
logical :: useGPU
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef USE_ASSUMED_SIZE
......@@ -82,11 +83,25 @@
real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
#endif
real(kind=C_DATATYPE_KIND) :: w_comb(nq, 4)
real(kind=C_DATATYPE_KIND) :: w_comb(ldq, 4)
real(kind=C_DATATYPE_KIND) :: h_mat(4, nb+3)
real(kind=C_DATATYPE_KIND) :: s_mat(4, 4)
integer(kind=ik) :: i, j, k
!TODO remove
!real(kind=C_DATATYPE_KIND) :: q_extra(1:ldq,1:nb+3)
integer(kind=c_intptr_t) :: h_dev, s_dev, q_dev, w_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
integer(kind=ik) :: i, j, k
integer(kind=ik), parameter :: max_block_blas = 4
! Calculate dot product of the two Householder vectors
......@@ -103,34 +118,138 @@
h_mat(3,3:nb+1) = -hh(2:nb, 3)
h_mat(4,2:nb) = -hh(2:nb, 4)
if(useGPU) then
! nb == nbw
successCUDA = cuda_memcpy(h_dev, loc(h_mat(1,1)), &
max_block_blas * (nb+3) * size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"blas_block4_kernel: error in cudaMemcpy, h_dev host to device"
stop 1
endif
! nq == stripe_width
successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), &
ldq * (nb+3) * size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"blas_block4_kernel: error in cudaMemcpy, q_dev host to device"
stop 1
endif
endif
! TODO we do not need the diagonal, but how to do it with BLAS?
!s_mat = - matmul(h_mat, transpose(h_mat))
call PRECISION_SYRK('L', 'N', 4, nb+3, &
-ONE, h_mat, 4, &
ZERO, s_mat, 4)
if(useGPU) then
call cublas_PRECISION_SYRK('L', 'N', 4, nb+3, &
-ONE, h_dev, 4, &
ZERO, s_dev, 4)
else
call PRECISION_SYRK('L', 'N', 4, nb+3, &
-ONE, h_mat, 4, &
ZERO, s_mat, 4)
endif
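! note (editorial): the GEMV loop below reads only the strictly lower triangle
! s_mat(i,1:i-1) (and the corresponding strided part of s_dev), so the diagonal
! produced by SYRK is never used; it does not need to be avoided, only ignored.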
! w_comb = - matmul(q(1:nq, 1:nb+3), transpose(h_mat))
call PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
-ONE, q, ldq, &
h_mat, 4, &
ZERO, w_comb, nq)
!w_comb = - matmul(q(1:nq, 1:nb+3), transpose(h_mat))
if(useGPU) then
call cublas_PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
-ONE, q_dev, ldq, &
h_dev, 4, &
ZERO, w_dev, ldq)
else
call PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
-ONE, q, ldq, &
h_mat, 4, &
ZERO, w_comb, ldq)
endif
! Rank-1 update
!w_comb(1:nq,1) = hh(1,1) * w_comb(1:nq, 1)
call PRECISION_SCAL(nq, hh(1,1), w_comb(1:nq, 1), 1)
if(useGPU) then
call cublas_PRECISION_SCAL(nq, hh(1,1), w_dev, 1)
else
call PRECISION_SCAL(nq, hh(1,1), w_comb(1, 1), 1)
endif
do i = 2, 4
! w_comb(1:nq,i) = matmul(w_comb(1:nq,1:i-1), hh(1,i) * s_mat(i,1:i-1)) + hh(1,i) * w_comb(1:nq, i)
call PRECISION_GEMV('N', nq, i-1, &
hh(1,i), w_comb(1:nq, 1:i-1), nq, &
s_mat(i,1:i-1), 1, &
hh(1,i), w_comb(1:nq,i), 1)
if(useGPU) then
call cublas_PRECISION_GEMV('N', nq, i-1, &
hh(1,i), w_dev, ldq, &
s_dev + (i - 1) * size_of_datatype, 4, &
hh(1,i), w_dev + (i-1) * ldq * size_of_datatype, 1)
else
call PRECISION_GEMV('N', nq, i-1, &
hh(1,i), w_comb(1, 1), ldq, &
s_mat(i,1), 4, &
hh(1,i), w_comb(1,i), 1)
endif
enddo
! ---------------------
if(useGPU) then
! successCUDA = cuda_memcpy(loc(s_mat(1,1)), s_dev, &
! 4 * 4 * size_of_datatype, &
! cudaMemcpyDeviceToHost)
! if (.not.(successCUDA)) then
! print *,"blas_block4_kernel: error in cudaMemcpy, q_dev device to host"
! stop 1
! endif
successCUDA = cuda_memcpy(loc(w_comb(1,1)), w_dev, &
nq * 4 * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"blas_block4_kernel: error in cudaMemcpy, w_dev device to host"
stop 1
endif
successCUDA = cuda_memcpy(loc(h_mat(1,1)), h_dev, &
max_block_blas * (nb+3) * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"blas_block4_kernel: error in cudaMemcpy, w_dev device to host"
stop 1
endif
endif
useGPU = .false.
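! editorial note (hedged): useGPU is switched off above, so the final GEMM and
! the update of q below take the CPU branch and work on the w_comb and h_mat
! data just copied back from the device; this looks like an intermediate
! debugging state of the experimental kernel (see also the commented q_extra
! check at the end of the routine).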
! ---------------------
!q(1:nq, 1:nb+3) = matmul(w_comb, h_mat) + q(1:nq, 1:nb+3)
call PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
ONE, w_comb, nq, &
h_mat, 4, &
ONE, q, ldq)
if(useGPU) then
call cublas_PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
ONE, w_dev, ldq, &
h_dev, 4, &
ONE, q_dev, ldq)
else
call PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
ONE, w_comb, ldq, &
h_mat, 4, &
ONE, q, ldq)
endif
if(useGPU) then
!successCUDA = cuda_memcpy(loc(q_extra(1,1)), q_dev, &
successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, &
ldq * (nb+3) * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"blas_block4_kernel: error in cudaMemcpy, q_dev device to host"
stop 1
endif
endif
! print *, "difference ", norm2(q(1:ldq,1:nb+3)-q_extra(1:ldq,1:nb+3)), ", ldq ", ldq, ", nq ", nq, ", nb ", nb
! print *, q(1:ldq,1:nb+3)
! stop 1
end subroutine
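! editorial summary (hedged reading of the in-code comments above): with
! H = h_mat (4 x nb+3) holding the four Householder vectors and tau_i = hh(1,i),
! the kernel performs the blocked 4-vector update
!   S   = -H * Q^T-free product, i.e. S = -H * H^T          (SYRK, lower triangle)
!   W~  = -Q * H^T                                           (GEMM)
!   w_1 = tau_1 * w~_1                                       (SCAL)
!   w_i = tau_i * ( w~_i + sum_{j<i} S(i,j) * w_j ), i=2..4  (GEMV loop)
!   Q  <-  Q + W * H                                         (final GEMM)
! where w_j denotes the already updated columns of W.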
......
......@@ -46,6 +46,8 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef cublas_PRECISION_SYRK
#undef cublas_PRECISION_SCAL
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_NRM2
#undef scal_PRECISION_LASET
......@@ -106,6 +108,8 @@
#define cublas_PRECISION_TRMM cublas_DTRMM
#define cublas_PRECISION_GEMV cublas_DGEMV
#define cublas_PRECISION_SYMV cublas_DSYMV
#define cublas_PRECISION_SYRK cublas_DSYRK
#define cublas_PRECISION_SCAL cublas_DSCAL
#define scal_PRECISION_GEMM PDGEMM
#define scal_PRECISION_NRM2 PDNRM2
#define scal_PRECISION_LASET PDLASET
......@@ -167,6 +171,8 @@
#define cublas_PRECISION_TRMM cublas_STRMM
#define cublas_PRECISION_GEMV cublas_SGEMV
#define cublas_PRECISION_SYMV cublas_SSYMV
#define cublas_PRECISION_SYRK cublas_SSYRK
#define cublas_PRECISION_SCAL cublas_SSCAL
#define scal_PRECISION_GEMM PSGEMM
#define scal_PRECISION_NRM2 PSNRM2
#define scal_PRECISION_LASET PSLASET
......@@ -236,6 +242,8 @@
#undef cublas_PRECISION_TRMM
#undef cublas_PRECISION_GEMV
#undef cublas_PRECISION_SYMV
#undef cublas_PRECISION_SYRK
#undef cublas_PRECISION_SCAL
#undef scal_PRECISION_GEMM
#undef scal_PRECISION_DOTC
#undef scal_PRECISION_LASET
......@@ -307,6 +315,8 @@
#define cublas_PRECISION_TRMM cublas_ZTRMM
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define cublas_PRECISION_SYMV cublas_ZSYMV
#define cublas_PRECISION_SYRK cublas_ZSYRK
#define cublas_PRECISION_SCAL cublas_ZSCAL
#define scal_PRECISION_GEMM PZGEMM
#define scal_PRECISION_DOTC PZDOTC
#define scal_PRECISION_LASET PZLASET
......@@ -372,6 +382,8 @@
#define cublas_PRECISION_TRMM cublas_CTRMM
#define cublas_PRECISION_GEMV cublas_CGEMV
#define cublas_PRECISION_SYMV cublas_CSYMV
#define cublas_PRECISION_SYRK cublas_CSYRK
#define cublas_PRECISION_SCAL cublas_CSCAL
#define scal_PRECISION_GEMM PCGEMM
#define scal_PRECISION_DOTC PCDOTC
#define scal_PRECISION_LASET PCLASET
......