Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
14fed421
Commit
14fed421
authored
Mar 21, 2017
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Unify pack_unpack GPU version
parent
ca9dc94b
Changes
7
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
552 additions
and
221 deletions
+552
-221
Makefile.am
Makefile.am
+5
-1
src/compute_hh_trafo_complex_gpu.X90
src/compute_hh_trafo_complex_gpu.X90
+50
-0
src/elpa2_compute_complex_template.X90
src/elpa2_compute_complex_template.X90
+0
-147
src/elpa2_trans_ev_tridi_to_band_template.X90
src/elpa2_trans_ev_tridi_to_band_template.X90
+61
-71
src/mod_compute_hh_trafo_complex.F90
src/mod_compute_hh_trafo_complex.F90
+24
-2
src/mod_pack_unpack_gpu.F90
src/mod_pack_unpack_gpu.F90
+110
-0
src/pack_unpack_gpu.X90
src/pack_unpack_gpu.X90
+302
-0
No files found.
Makefile.am
View file @
14fed421
...
...
@@ -40,7 +40,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/check_for_gpu.F90
\
src/mod_cuda.F90
\
src/interface_c_kernel.F90
\
src/mod_pack_unpack_
real_
gpu.F90
\
src/mod_pack_unpack_gpu.F90
\
src/elpa_qr/qr_utils.F90
\
src/elpa_qr/elpa_qrkernels.F90
\
src/elpa_qr/elpa_pdlarfb.F90
\
...
...
@@ -63,6 +63,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90
\
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/pack_unpack_cpu.X90
\
src/pack_unpack_gpu.X90
\
src/compute_hh_trafo_complex_gpu.X90
\
src/redist_band.X90
\
src/sanity.X90
\
src/elpa_cholesky_template.X90
\
...
...
@@ -982,6 +984,8 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_simple_template.X90
\
src/redist_band.X90
\
src/pack_unpack_cpu.X90
\
src/pack_unpack_gpu.X90
\
src/compute_hh_trafo_complex_gpu.X90
\
src/sanity.X90
\
src/elpa_cholesky_template.X90
\
src/elpa_invert_trm.X90
\
...
...
src/compute_hh_trafo_complex_gpu.X90
0 → 100644
View file @
14fed421
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(a_dev, bcast_buffer_dev, hh_tau_dev, off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
use iso_c_binding
use cuda_c_kernel
use cuda_functions
use precision
implicit none
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=c_size_t) :: a_dev, bcast_buffer_dev, hh_tau_dev
integer(kind=ik), intent(in) :: last_stripe_width
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl, a_dim2, n_times, nbw, stripe_count, stripe_width
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
src/elpa2_compute_complex_template.X90
View file @
14fed421
...
...
@@ -71,150 +71,3 @@
#include "elpa2_tridiag_band_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90"
subroutine compute_hh_dot_products_complex_gpu_&
&PRECISION&
&(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_complex_&
&PRECISION&
&( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
subroutine pack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
call launch_my_pack_c_kernel_complex_&
&PRECISION&
&(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
end subroutine
subroutine unpack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_datatype , &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
call launch_my_unpack_c_kernel_complex_&
&PRECISION&
&( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
end subroutine
subroutine unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(next_unpack_idx, force)
use precision
implicit none
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must performed
call unpack_row_group_complex_gpu_&
&PRECISION&
&(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
end subroutine
src/elpa2_trans_ev_tridi_to_band_template.X90
View file @
14fed421
...
...
@@ -46,8 +46,8 @@
#endif
use elpa2_workload
use pack_unpack_cpu
use pack_unpack_gpu
#if REALCASE == 1
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
...
...
@@ -730,11 +730,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
...
...
@@ -792,11 +790,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#if REALCASE == 1
...
...
@@ -810,9 +806,12 @@
#else /* WITH_OPENMP */
#if COMPLEXCASE == 1
! why is an cuda call in the openmp region?
call unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(i - limits(ip),.false.)
&(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
#endif
...
...
@@ -945,11 +944,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, &
#endif
i - limits(my_prow), .false.)
#ifdef WITH_MPI
...
...
@@ -1016,11 +1013,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, &
#endif
-1, .true.)
successCUDA = cuda_devicesynchronize()
...
...
@@ -1427,23 +1422,18 @@
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
!
#if REALCASE == 1
(bcast_buffer_dev, hh_tau_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
!
#endif
!
#if COMPLEXCASE == 1
!
( nbw, &
!
#endif
current_local_n, .false.)
call compute_hh_dot_products_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
(bcast_buffer_dev, hh_dot_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
current_local_n)
endif ! useGPU
...
...
@@ -1470,9 +1460,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
bcast_buffer_dev, hh_tau_dev, &
#endif
nbw, 1, .true.)
endif ! useGPU
endif ! (current_local_n > 1) then
...
...
@@ -1720,7 +1708,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
...
...
@@ -1892,8 +1882,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
...
...
@@ -2008,8 +2000,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
...
...
@@ -2106,7 +2100,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
...
...
@@ -2308,9 +2304,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
#endif
row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size
...
...
@@ -2351,10 +2345,8 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
#endif
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
...
...
@@ -2596,15 +2588,16 @@
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! endif
!#endif
endif !use GPU
...
...
@@ -2797,33 +2790,30 @@
)
return
#if COMPLEXCASE == 1
contains
! The host wrapper for extracting "tau" from the HH reflectors (see the
! kernel below)
subroutine extract_hh_tau_complex_gpu_&
&PRECISION&
&(nbw, n, is_zero)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
logical, value :: is_zero
integer(kind=ik) :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_complex_&
&PRECISION&
&(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
end subroutine
#endif /* COMPLEXCASE */
#if REALCASE == 1
end subroutine trans_ev_tridi_to_band_real_&
&PRECISION
#endif
!#if COMPLEXCASE == 1
! contains
! ! The host wrapper for extracting "tau" from the HH reflectors (see the
! ! kernel below)
! subroutine extract_hh_tau_complex_gpu_&
! &PRECISION&
! &(nbw, n, is_zero)
! use cuda_c_kernel
! use pack_unpack_gpu
! use precision
! implicit none
! integer(kind=ik), value :: nbw, n
! logical, value :: is_zero
! integer(kind=ik) :: val_is_zero
!
! if (is_zero) then
! val_is_zero = 1
! else
! val_is_zero = 0
! endif
! call launch_extract_hh_tau_c_kernel_complex_&
! &PRECISION&
! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
! end subroutine
!#endif /* COMPLEXCASE */
end subroutine
src/mod_compute_hh_trafo_complex.F90
View file @
14fed421
...
...
@@ -46,12 +46,13 @@ module compute_hh_trafo_complex
use
elpa_mpi
implicit
none
private
#ifdef WITH_OPENMP
public
compute_hh_trafo_complex_cpu_openmp_double
#else
public
compute_hh_trafo_complex_cpu_double
#endif
public
compute_hh_trafo_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
...
...
@@ -59,7 +60,7 @@ module compute_hh_trafo_complex
#else
public
compute_hh_trafo_complex_cpu_single
#endif
public
compute_hh_trafo_complex_gpu_single
#endif
contains
...
...
@@ -787,4 +788,25 @@ module compute_hh_trafo_complex
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
end
module
src/mod_pack_unpack_gpu.F90
0 → 100644
View file @
14fed421
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
module
pack_unpack_gpu
#include "config-f90.h"
implicit
none
private
public
pack_row_group_real_gpu_double
,
unpack_row_group_real_gpu_double
,
&
unpack_and_prepare_row_group_real_gpu_double
,
compute_hh_dot_products_real_gpu_double
,
&
extract_hh_tau_real_gpu_double
public
pack_row_group_complex_gpu_double
,
unpack_row_group_complex_gpu_double
,
&
unpack_and_prepare_row_group_complex_gpu_double
,
compute_hh_dot_products_complex_gpu_double
,
&
extract_hh_tau_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_REAL
public
pack_row_group_real_gpu_single
,
unpack_row_group_real_gpu_single
,
&
unpack_and_prepare_row_group_real_gpu_single
,
compute_hh_dot_products_real_gpu_single
,
&
extract_hh_tau_real_gpu_single
#endif
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public
pack_row_group_complex_gpu_single
,
unpack_row_group_complex_gpu_single
,
&
unpack_and_prepare_row_group_complex_gpu_single
,
compute_hh_dot_products_complex_gpu_single
,
&
extract_hh_tau_complex_gpu_single
#endif
contains
!real double precision
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef REALCASE
#undef DOUBLE_PRECISION
! real single precision
#if defined(WANT_SINGLE_PRECISION_REAL)
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "pack_unpack_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
end
module
src/
mod_pack_unpack_real_gpu.F
90
→
src/
pack_unpack_gpu.X
90
View file @
14fed421
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
...
...
@@ -39,26 +40,15 @@
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
module
pack_unpack_real_gpu
#include "config-f90.h"
implicit
none
public
pack_row_group_real_gpu_double
,
unpack_row_group_real_gpu_double
,
&
unpack_and_prepare_row_group_real_gpu_double
,
compute_hh_dot_products_real_gpu_double
,
&
extract_hh_tau_real_gpu_double
#ifdef WANT_SINGLE_PRECISION_REAL