Commit 14fed421 authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify pack_unpack GPU version

parent ca9dc94b
...@@ -40,7 +40,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \ ...@@ -40,7 +40,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/check_for_gpu.F90 \ src/check_for_gpu.F90 \
src/mod_cuda.F90 \ src/mod_cuda.F90 \
src/interface_c_kernel.F90 \ src/interface_c_kernel.F90 \
src/mod_pack_unpack_real_gpu.F90 \ src/mod_pack_unpack_gpu.F90 \
src/elpa_qr/qr_utils.F90 \ src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.F90 \ src/elpa_qr/elpa_qrkernels.F90 \
src/elpa_qr/elpa_pdlarfb.F90 \ src/elpa_qr/elpa_pdlarfb.F90 \
...@@ -63,6 +63,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -63,6 +63,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \ src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \ src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/redist_band.X90 \ src/redist_band.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
...@@ -982,6 +984,8 @@ EXTRA_DIST = \ ...@@ -982,6 +984,8 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \ src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \ src/redist_band.X90 \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \ src/elpa_invert_trm.X90 \
......
! Apply a block of Householder transformations to one stripe of the matrix on
! the GPU: compute byte offsets into the device buffers, launch the CUDA
! kernel, and accumulate flop/count/time statistics.
! This is a template body; &PRECISION& / MATH_DATATYPE are expanded by the
! preprocessor (precision_macros.h).
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(a_dev, bcast_buffer_dev, hh_tau_dev, off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
use iso_c_binding
use cuda_c_kernel
use cuda_functions
use precision
implicit none
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
! Running flop counter, updated on every call
integer(kind=lik) :: kernel_flops
! Raw device addresses (bytes), advanced below by the computed offsets
integer(kind=c_size_t) :: a_dev, bcast_buffer_dev, hh_tau_dev
integer(kind=ik), intent(in) :: last_stripe_width
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl, a_dim2, n_times, nbw, stripe_count, stripe_width
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
! Byte offsets into a_dev, bcast_buffer_dev and hh_tau_dev respectively
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
! Size in bytes of one matrix element for the instantiated precision/type
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
! Nothing to do for an empty column range
if (ncols < 1) return
! NOTE(review): mpi_wtime is presumably available via the including template's
! MPI module — confirm in the host file
ttt = mpi_wtime()
! The last stripe may be narrower than the others
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
! Byte offset of element (a_off+off, 1, istripe) in the stripe-major layout
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
! Offsets into the broadcast buffer (nbw elements per HH vector) and tau array
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
! 4 flops per complex multiply-add over nl x ncols x nbw elements
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
...@@ -71,150 +71,3 @@ ...@@ -71,150 +71,3 @@
#include "elpa2_tridiag_band_template.X90" #include "elpa2_tridiag_band_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90" #include "elpa2_trans_ev_tridi_to_band_template.X90"
! Launch the CUDA kernel that computes dot products between consecutive
! Householder vectors, needed to chain the transformations on the GPU.
! NOTE(review): bcast_buffer_dev and hh_dot_dev are presumably host-associated
! device pointers from the enclosing template routine — confirm in host file.
subroutine compute_hh_dot_products_complex_gpu_&
&PRECISION&
&(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n ! bandwidth and number of HH vectors
! With fewer than two vectors there are no pairwise dot products
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_complex_&
&PRECISION&
&( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
! Pack a group of rows on the GPU (gather them from the striped device layout
! into a contiguous device buffer) and copy the result back to the host array.
! NOTE(review): stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev,
! aIntern_dev and row_group_dev appear to be host-associated from the
! enclosing routine — confirm in the including template.
subroutine pack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
! Starting row offset and number of rows to pack
integer(kind=ik), intent(in) :: n_offset, row_count
! Host destination; filled from row_group_dev below
complex(kind=COMPLEX_DATATYPE) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
! Size in bytes of one element for the instantiated precision/type
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
! Total number of valid rows across all stripes (last stripe may be narrower)
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Gather the rows into the contiguous device buffer row_group_dev
call launch_my_pack_c_kernel_complex_&
&PRECISION&
&(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
! Copy the packed rows device -> host
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
end subroutine
! Unpack a group of rows on the GPU: copy the host rows into a contiguous
! device buffer, then scatter them into the striped device layout via a kernel.
! NOTE(review): stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev,
! aIntern_dev and row_group_dev appear to be host-associated from the
! enclosing routine — confirm in the including template.
subroutine unpack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
! Starting row offset and number of rows to unpack
integer(kind=ik), intent(in) :: n_offset, row_count
! Host source rows to transfer to the device
complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
! Size in bytes of one element for the instantiated precision/type
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
! Total number of valid rows across all stripes (last stripe may be narrower)
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Copy the rows host -> device into the contiguous staging buffer
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_datatype , &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
! Scatter the staged rows into the striped layout of aIntern_dev
call launch_my_unpack_c_kernel_complex_&
&PRECISION&
&( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
end subroutine
! Bookkeeping for the row-group staging buffer: flush (unpack to the GPU) the
! currently accumulated group when it is full, forced, or non-contiguous with
! the next row, and prepare the group counters for the upcoming row.
! NOTE(review): row_group_size, row_group, unpack_idx and nblk appear to be
! host-associated state of the enclosing routine — confirm in the template.
subroutine unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(next_unpack_idx, force)
use precision
implicit none
! Global index of the row about to be added to the group
integer(kind=ik), intent(in) :: next_unpack_idx
! When .true., flush the current group unconditionally
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
! Flush when forced, when the group is full (nblk rows), or when the next
! row would not be contiguous with the rows already collected
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_complex_gpu_&
&PRECISION&
&(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
! Older (contained) variant of the GPU Householder-transform driver: computes
! device byte offsets and launches the CUDA kernel for one stripe.
! NOTE(review): aIntern_dev, bcast_buffer_dev, hh_tau_dev, stripe_width,
! last_stripe_width, stripe_count, a_dim2, nbw, kernel_flops, kernel_time and
! n_times appear to be host-associated from the enclosing routine — confirm.
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
! Byte offsets into the device buffers, computed below
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
! Size in bytes of one element for the instantiated precision/type
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
! Nothing to do for an empty column range
if (ncols < 1) return
ttt = mpi_wtime()
! The last stripe may be narrower than the others
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
! Byte offset of element (a_off+off, 1, istripe) in the stripe-major layout
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
! Offsets into the broadcast buffer (nbw elements per HH vector) and tau array
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
! 4 flops per complex multiply-add over nl x ncols x nbw elements
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
end subroutine
...@@ -46,8 +46,8 @@ ...@@ -46,8 +46,8 @@
#endif #endif
use elpa2_workload use elpa2_workload
use pack_unpack_cpu use pack_unpack_cpu
use pack_unpack_gpu
#if REALCASE == 1 #if REALCASE == 1
use pack_unpack_real_gpu
use compute_hh_trafo_real use compute_hh_trafo_real
#endif #endif
...@@ -730,11 +730,9 @@ ...@@ -730,11 +730,9 @@
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
( & ( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, & row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,& stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, & row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.) i - limits(ip), .false.)
#ifdef WITH_MPI #ifdef WITH_MPI
call timer%start("mpi_communication") call timer%start("mpi_communication")
...@@ -792,11 +790,9 @@ ...@@ -792,11 +790,9 @@
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
( & ( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, & row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,& stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, & row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.) i - limits(ip), .false.)
#if REALCASE == 1 #if REALCASE == 1
...@@ -810,9 +806,12 @@ ...@@ -810,9 +806,12 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
! why is a CUDA call in the OpenMP region?
call unpack_and_prepare_row_group_complex_gpu_& call unpack_and_prepare_row_group_complex_gpu_&
&PRECISION& &PRECISION&
&(i - limits(ip),.false.) &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev) row_group(:, row_group_size) = q(src_offset, 1:l_nev)
#endif #endif
...@@ -945,11 +944,9 @@ ...@@ -945,11 +944,9 @@
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&( & &( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, & row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, & stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, & row_group_size, nblk, unpack_idx, &
#endif
i - limits(my_prow), .false.) i - limits(my_prow), .false.)
#ifdef WITH_MPI #ifdef WITH_MPI
...@@ -1016,11 +1013,9 @@ ...@@ -1016,11 +1013,9 @@
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&( & &( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, & row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, & stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, & a_dim2, l_nev, row_group_size, nblk, unpack_idx, &
#endif
-1, .true.) -1, .true.)
successCUDA = cuda_devicesynchronize() successCUDA = cuda_devicesynchronize()
...@@ -1427,23 +1422,18 @@ ...@@ -1427,23 +1422,18 @@
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
#if REALCASE == 1 !#if REALCASE == 1
(bcast_buffer_dev, hh_tau_dev, nbw, & (bcast_buffer_dev, hh_tau_dev, nbw, &
#endif !#endif
#if COMPLEXCASE == 1 !#if COMPLEXCASE == 1
( nbw, & ! ( nbw, &
#endif !#endif
current_local_n, .false.) current_local_n, .false.)
call compute_hh_dot_products_& call compute_hh_dot_products_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_gpu_& &_gpu_&
&PRECISION & &PRECISION &
#if REALCASE == 1
(bcast_buffer_dev, hh_dot_dev, nbw, & (bcast_buffer_dev, hh_dot_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
current_local_n) current_local_n)
endif ! useGPU endif ! useGPU
...@@ -1470,9 +1460,7 @@ ...@@ -1470,9 +1460,7 @@
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&( & &( &
#if REALCASE == 1
bcast_buffer_dev, hh_tau_dev, & bcast_buffer_dev, hh_tau_dev, &
#endif
nbw, 1, .true.) nbw, 1, .true.)
endif ! useGPU endif ! useGPU
endif ! (current_local_n > 1) then endif ! (current_local_n > 1) then
...@@ -1720,7 +1708,9 @@ ...@@ -1720,7 +1708,9 @@
if (useGPU) then if (useGPU) then
call compute_hh_trafo_complex_gpu_& call compute_hh_trafo_complex_gpu_&
&PRECISION& &PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2) &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else else
call compute_hh_trafo_complex_cpu_& call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
...@@ -1892,8 +1882,10 @@ ...@@ -1892,8 +1882,10 @@
if (useGPU) then if (useGPU) then
call compute_hh_trafo_complex_gpu_& call compute_hh_trafo_complex_gpu_&
&PRECISION& &PRECISION&
&(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, & &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2) dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else else
call compute_hh_trafo_complex_cpu_& call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
...@@ -2008,8 +2000,10 @@ ...@@ -2008,8 +2000,10 @@
if (useGPU) then if (useGPU) then
call compute_hh_trafo_complex_gpu_& call compute_hh_trafo_complex_gpu_&
&PRECISION& &PRECISION&
&(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, & &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2) i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else else
call compute_hh_trafo_complex_cpu_& call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
...@@ -2106,7 +2100,9 @@ ...@@ -2106,7 +2100,9 @@
if (useGPU) then if (useGPU) then
call compute_hh_trafo_complex_gpu_& call compute_hh_trafo_complex_gpu_&
&PRECISION& &PRECISION&
&(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2) &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else else
call compute_hh_trafo_complex_cpu_& call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
...@@ -2308,9 +2304,7 @@ ...@@ -2308,9 +2304,7 @@
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&( & &( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
#endif
row_group(:, :), j * nblk + a_off, row_group_size) row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size do i = 1, row_group_size
...@@ -2351,10 +2345,8 @@ ...@@ -2351,10 +2345,8 @@
&_gpu_& &_gpu_&
&PRECISION& &PRECISION&
&( & &( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, & row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, & last_stripe_width, a_dim2, l_nev, &
#endif
result_buffer(:, :, nbuf), j * nblk + a_off, nblk) result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU else ! useGPU
...@@ -2605,6 +2597,7 @@ ...@@ -2605,6 +2597,7 @@
&: error in cudaMalloc" &: error in cudaMalloc"
stop 1 stop 1
endif endif
! endif
!#endif !#endif
endif !use GPU endif !use GPU
...@@ -2797,33 +2790,30 @@ ...@@ -2797,33 +2790,30 @@
) )
return return
#if COMPLEXCASE == 1 !#if COMPLEXCASE == 1
contains ! contains
! The host wrapper for extracting "tau" from the HH reflectors (see the ! ! The host wrapper for extracting "tau" from the HH reflectors (see the
! kernel below) ! ! kernel below)
subroutine extract_hh_tau_complex_gpu_& ! subroutine extract_hh_tau_complex_gpu_&
&PRECISION& ! &PRECISION&
&(nbw, n, is_zero) ! &(nbw, n, is_zero)
use cuda_c_kernel ! use cuda_c_kernel
use precision ! use pack_unpack_gpu
implicit none ! use precision
integer(kind=ik), value :: nbw, n ! implicit none
logical, value :: is_zero ! integer(kind=ik), value :: nbw, n
integer(kind=ik) :: val_is_zero ! logical, value :: is_zero
! integer(kind=ik) :: val_is_zero
!
! if (is_zero) then
! val_is_zero = 1
! else
! val_is_zero = 0
! endif
! call launch_extract_hh_tau_c_kernel_complex_&
! &PRECISION&
! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
! end subroutine
!#endif /* COMPLEXCASE */
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_complex_&
&PRECISION&
&(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
end subroutine end subroutine
#endif /* COMPLEXCASE */
#if REALCASE == 1
end subroutine trans_ev_tridi_to_band_real_&
&PRECISION
#endif
...@@ -46,12 +46,13 @@ module compute_hh_trafo_complex ...@@ -46,12 +46,13 @@ module compute_hh_trafo_complex
use elpa_mpi use elpa_mpi
implicit none implicit none
private
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
public compute_hh_trafo_complex_cpu_openmp_double public compute_hh_trafo_complex_cpu_openmp_double
#else #else
public compute_hh_trafo_complex_cpu_double public compute_hh_trafo_complex_cpu_double
#endif #endif
public compute_hh_trafo_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_COMPLEX #ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
...@@ -59,7 +60,7 @@ module compute_hh_trafo_complex ...@@ -59,7 +60,7 @@ module compute_hh_trafo_complex
#else #else
public compute_hh_trafo_complex_cpu_single public compute_hh_trafo_complex_cpu_single
#endif #endif
public compute_hh_trafo_complex_gpu_single
#endif #endif
contains contains
...@@ -787,4 +788,25 @@ module compute_hh_trafo_complex ...@@ -787,4 +788,25 @@ module compute_hh_trafo_complex
#endif /* WANT_SINGLE_PRECISION_COMPLEX */ #endif /* WANT_SINGLE_PRECISION_COMPLEX */
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
end module end module
! This file is part of ELPA.
!