Commit 14fed421 authored by Andreas Marek's avatar Andreas Marek

Unify pack_unpack GPU version

parent ca9dc94b
......@@ -40,7 +40,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/check_for_gpu.F90 \
src/mod_cuda.F90 \
src/interface_c_kernel.F90 \
src/mod_pack_unpack_real_gpu.F90 \
src/mod_pack_unpack_gpu.F90 \
src/elpa_qr/qr_utils.F90 \
src/elpa_qr/elpa_qrkernels.F90 \
src/elpa_qr/elpa_pdlarfb.F90 \
......@@ -63,6 +63,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
......@@ -982,6 +984,8 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
......
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(a_dev, bcast_buffer_dev, hh_tau_dev, off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
use iso_c_binding
use cuda_c_kernel
use cuda_functions
use precision
implicit none
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=c_size_t) :: a_dev, bcast_buffer_dev, hh_tau_dev
integer(kind=ik), intent(in) :: last_stripe_width
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl, a_dim2, n_times, nbw, stripe_count, stripe_width
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
......@@ -71,150 +71,3 @@
#include "elpa2_tridiag_band_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90"
subroutine compute_hh_dot_products_complex_gpu_&
&PRECISION&
&(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_complex_&
&PRECISION&
&( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
subroutine pack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
call launch_my_pack_c_kernel_complex_&
&PRECISION&
&(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
end subroutine
subroutine unpack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_datatype , &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
call launch_my_unpack_c_kernel_complex_&
&PRECISION&
&( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
end subroutine
subroutine unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(next_unpack_idx, force)
use precision
implicit none
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must performed
call unpack_row_group_complex_gpu_&
&PRECISION&
&(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
end subroutine
......@@ -46,8 +46,8 @@
#endif
use elpa2_workload
use pack_unpack_cpu
use pack_unpack_gpu
#if REALCASE == 1
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
......@@ -730,11 +730,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
......@@ -792,11 +790,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#if REALCASE == 1
......@@ -810,9 +806,12 @@
#else /* WITH_OPENMP */
#if COMPLEXCASE == 1
! why is an cuda call in the openmp region?
call unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(i - limits(ip),.false.)
&(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
#endif
......@@ -945,11 +944,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, &
#endif
i - limits(my_prow), .false.)
#ifdef WITH_MPI
......@@ -1016,11 +1013,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, &
#endif
-1, .true.)
successCUDA = cuda_devicesynchronize()
......@@ -1427,23 +1422,18 @@
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
!#if REALCASE == 1
(bcast_buffer_dev, hh_tau_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
!#endif
!#if COMPLEXCASE == 1
! ( nbw, &
!#endif
current_local_n, .false.)
call compute_hh_dot_products_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
(bcast_buffer_dev, hh_dot_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
current_local_n)
endif ! useGPU
......@@ -1470,9 +1460,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
bcast_buffer_dev, hh_tau_dev, &
#endif
nbw, 1, .true.)
endif ! useGPU
endif ! (current_local_n > 1) then
......@@ -1720,7 +1708,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
......@@ -1892,8 +1882,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
......@@ -2008,8 +2000,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
......@@ -2106,7 +2100,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
......@@ -2308,9 +2304,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
#endif
row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size
......@@ -2351,10 +2345,8 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
#endif
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
......@@ -2596,15 +2588,16 @@
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! endif
!#endif
endif !use GPU
......@@ -2797,33 +2790,30 @@
)
return
#if COMPLEXCASE == 1
contains
! The host wrapper for extracting "tau" from the HH reflectors (see the
! kernel below)
subroutine extract_hh_tau_complex_gpu_&
&PRECISION&
&(nbw, n, is_zero)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_complex_&
&PRECISION&
&(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
end subroutine
#endif /* COMPLEXCASE */
#if REALCASE == 1
end subroutine trans_ev_tridi_to_band_real_&
&PRECISION
#endif
!#if COMPLEXCASE == 1
! contains
! ! The host wrapper for extracting "tau" from the HH reflectors (see the
! ! kernel below)
! subroutine extract_hh_tau_complex_gpu_&
! &PRECISION&
! &(nbw, n, is_zero)
! use cuda_c_kernel
! use pack_unpack_gpu
! use precision
! implicit none
! integer(kind=ik), value :: nbw, n
! logical, value :: is_zero
! integer(kind=ik) :: val_is_zero
!
! if (is_zero) then
! val_is_zero = 1
! else
! val_is_zero = 0
! endif
! call launch_extract_hh_tau_c_kernel_complex_&
! &PRECISION&
! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
! end subroutine
!#endif /* COMPLEXCASE */
end subroutine
......@@ -46,12 +46,13 @@ module compute_hh_trafo_complex
use elpa_mpi
implicit none
private
#ifdef WITH_OPENMP
public compute_hh_trafo_complex_cpu_openmp_double
#else
public compute_hh_trafo_complex_cpu_double
#endif
public compute_hh_trafo_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
......@@ -59,7 +60,7 @@ module compute_hh_trafo_complex
#else
public compute_hh_trafo_complex_cpu_single
#endif
public compute_hh_trafo_complex_gpu_single
#endif
contains
......@@ -787,4 +788,25 @@ module compute_hh_trafo_complex
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
end module
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
module pack_unpack_gpu
#include "config-f90.h"
implicit none
private
public pack_row_group_real_gpu_double, unpack_row_group_real_gpu_double, &
unpack_and_prepare_row_group_real_gpu_double, compute_hh_dot_products_real_gpu_double, &
extract_hh_tau_real_gpu_double
public pack_row_group_complex_gpu_double, unpack_row_group_complex_gpu_double, &
unpack_and_prepare_row_group_complex_gpu_double, compute_hh_dot_products_complex_gpu_double, &
extract_hh_tau_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_REAL
public pack_row_group_real_gpu_single, unpack_row_group_real_gpu_single, &
unpack_and_prepare_row_group_real_gpu_single, compute_hh_dot_products_real_gpu_single, &
extract_hh_tau_real_gpu_single
#endif
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public pack_row_group_complex_gpu_single, unpack_row_group_complex_gpu_single, &
unpack_and_prepare_row_group_complex_gpu_single, compute_hh_dot_products_complex_gpu_single, &
extract_hh_tau_complex_gpu_single
#endif
contains
!real double precision
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
end module
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment