elpa / elpa

Commit 14fed421, authored Mar 21, 2017 by Andreas Marek
Unify pack_unpack GPU version
parent ca9dc94b

Changes 7 | Pipelines 1
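What "unify" means here, as far as the hunks below show: the real-only module src/mod_pack_unpack_real_gpu.F90 becomes the shared src/mod_pack_unpack_gpu.F90, the templates src/pack_unpack_gpu.X90 and src/compute_hh_trafo_complex_gpu.X90 enter the build, and the complex GPU helpers stop picking up their state by host association and instead receive it as explicit arguments, so the real and complex cases can share one templated routine. A rough before/after sketch of one call site, assembled from the hunk around old line 810 below (illustrative only; the exact argument lists are in the diff):

   ! before: the complex variant relied on host-associated state
   call unpack_and_prepare_row_group_complex_gpu_&
        &PRECISION&
        &(i - limits(ip), .false.)

   ! after: everything the routine needs is passed explicitly
   call unpack_and_prepare_row_group_complex_gpu_&
        &PRECISION&
        &(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
          last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
          unpack_idx, i - limits(ip), .false.)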
Makefile.am
@@ -40,7 +40,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
  src/check_for_gpu.F90 \
  src/mod_cuda.F90 \
  src/interface_c_kernel.F90 \
  src/mod_pack_unpack_real_gpu.F90 \
  src/mod_pack_unpack_gpu.F90 \
  src/elpa_qr/qr_utils.F90 \
  src/elpa_qr/elpa_qrkernels.F90 \
  src/elpa_qr/elpa_pdlarfb.F90 \
@@ -63,6 +63,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
  src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
  src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
  src/pack_unpack_cpu.X90 \
  src/pack_unpack_gpu.X90 \
  src/compute_hh_trafo_complex_gpu.X90 \
  src/redist_band.X90 \
  src/sanity.X90 \
  src/elpa_cholesky_template.X90 \
@@ -982,6 +984,8 @@ EXTRA_DIST = \
  src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
  src/redist_band.X90 \
  src/pack_unpack_cpu.X90 \
  src/pack_unpack_gpu.X90 \
  src/compute_hh_trafo_complex_gpu.X90 \
  src/sanity.X90 \
  src/elpa_cholesky_template.X90 \
  src/elpa_invert_trm.X90 \
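A note on the two added .X90 entries: in this code base the .X90 files are preprocessor-included template sources rather than separately compiled units, which is presumably why they are listed here as dependencies and distribution files instead of under the _SOURCES variable. The double-precision complex instantiation of the new template, copied from src/mod_compute_hh_trafo_complex.F90 further down in this commit, looks like this:

   !complex double precision
   #define COMPLEXCASE 1
   #define DOUBLE_PRECISION 1
   #include "precision_macros.h"
   #include "compute_hh_trafo_complex_gpu.X90"
   #undef COMPLEXCASE
   #undef DOUBLE_PRECISION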
src/compute_hh_trafo_complex_gpu.X90
0 → 100644
subroutine compute_hh_trafo_complex_gpu_&
           &PRECISION&
           &(a_dev, bcast_buffer_dev, hh_tau_dev, off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2, a_dim2, &
            kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
  use iso_c_binding
  use cuda_c_kernel
  use cuda_functions
  use precision
  implicit none

  real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
  integer(kind=lik)                  :: kernel_flops
  integer(kind=c_size_t)             :: a_dev, bcast_buffer_dev, hh_tau_dev
  integer(kind=ik), intent(in)       :: last_stripe_width
  integer(kind=ik), intent(in)       :: off, ncols, istripe
  integer(kind=ik)                   :: nl, a_dim2, n_times, nbw, stripe_count, stripe_width
  real(kind=c_double)                :: ttt ! MPI_WTIME always needs double
  integer(kind=ik)                   :: a_off
  integer(kind=c_size_t)             :: dev_offset, dev_offset_1, dev_offset_2
  integer(kind=c_size_t), parameter  :: size_of_datatype = size_of_&
                                        &PRECISION&
                                        &_&
                                        &MATH_DATATYPE

  if (ncols < 1) return

  ttt = mpi_wtime()
  nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)

  dev_offset   = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
                 size_of_datatype
  dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_datatype
  dev_offset_2 = ( off-1 ) * size_of_datatype

  ! t1_compute_kernel = MPI_Wtime()
  call launch_compute_hh_trafo_c_kernel_complex_&
       &PRECISION&
       &(a_dev + dev_offset, bcast_buffer_dev + dev_offset_1, &
         hh_tau_dev + dev_offset_2, nl, nbw, stripe_width, off, ncols)

  ! time0 = time0 + time1
  ! t2_compute_kernel = MPI_Wtime()
  ! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel - t1_compute_kernel
  kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw, 8)
  kernel_time  = kernel_time + mpi_wtime() - ttt
  n_times      = n_times + 1
end subroutine
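For orientation (an illustrative reading, not part of the commit): the three offsets computed above are byte offsets into flat device buffers. Assuming a_dev holds a column-major array of shape (stripe_width, a_dim2, stripe_count), bcast_buffer_dev one of shape (nbw, :), and hh_tau_dev a vector, the arithmetic addresses single elements:

   ! hypothetical equivalences, under the layout assumptions above:
   !   dev_offset   ~ byte offset of a(1, a_off+off, istripe)
   !   dev_offset_1 ~ byte offset of bcast_buffer(1, off)
   !   dev_offset_2 ~ byte offset of hh_tau(off)

kernel_flops and kernel_time accumulate performance counters across calls; mpi_wtime returns a double, which is why kernel_time and ttt are declared real(kind=c_double).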
src/elpa2_compute_complex_template.X90
@@ -71,150 +71,3 @@
#include "elpa2_tridiag_band_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90"
subroutine compute_hh_dot_products_complex_gpu_&
&PRECISION&
&(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_complex_&
&PRECISION&
&( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
subroutine pack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE) :: rows(:,:)
integer(kind=ik) :: max_idx
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
call launch_my_pack_c_kernel_complex_&
&PRECISION&
&(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
end subroutine
subroutine unpack_row_group_complex_gpu_&
&PRECISION&
&(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: n_offset, row_count
complex(kind=COMPLEX_DATATYPE), intent(in) :: rows(:, :)
integer(kind=ik) :: max_idx
integer(kind=ik) :: i
logical :: successCUDA
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_datatype , &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop 1
endif
call launch_my_unpack_c_kernel_complex_&
&PRECISION&
&( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
end subroutine
subroutine unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(next_unpack_idx, force)
use precision
implicit none
integer(kind=ik), intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must performed
call unpack_row_group_complex_gpu_&
&PRECISION&
&(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
subroutine compute_hh_trafo_complex_gpu_&
&PRECISION&
&(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), intent(in) :: off, ncols, istripe
integer(kind=ik) :: nl
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer(kind=ik) :: a_off
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
if (ncols < 1) return
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_datatype
dev_offset_2 =( off-1 )*size_of_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_&
&PRECISION&
&(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8)
kernel_time = kernel_time + mpi_wtime() - ttt
n_times =n_times +1
end subroutine
end subroutine
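The block above is the set of complex GPU helpers that this commit removes from the template; their replacements take the formerly host-associated state as explicit dummy arguments. For the compute_hh_trafo routine, both interfaces are visible in this commit (shown schematically here, ignoring the &PRECISION& continuation splicing):

   ! removed above, state via host association:
   !   compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, &
   !                                          dev_offset, dev_offset_1, dev_offset_2)

   ! replacement in src/compute_hh_trafo_complex_gpu.X90, everything explicit:
   !   compute_hh_trafo_complex_gpu_PRECISION(a_dev, bcast_buffer_dev, hh_tau_dev, off, ncols, istripe, a_off, &
   !                                          dev_offset, dev_offset_1, dev_offset_2, a_dim2, &
   !                                          kernel_flops, kernel_time, last_stripe_width, n_times, nbw, &
   !                                          stripe_count, stripe_width)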
src/elpa2_trans_ev_tridi_to_band_template.X90
@@ -46,8 +46,8 @@
#endif
use elpa2_workload
use pack_unpack_cpu
use pack_unpack_gpu
#if REALCASE == 1
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
@@ -730,11 +730,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
@@ -792,11 +790,9 @@
&_gpu_&
&PRECISION &
( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev,&
row_group_size, nblk, unpack_idx, &
#endif
i - limits(ip), .false.)
#if REALCASE == 1
@@ -810,9 +806,12 @@
#else /* WITH_OPENMP */
#if COMPLEXCASE == 1
! why is an cuda call in the openmp region?
call unpack_and_prepare_row_group_complex_gpu_&
&PRECISION&
&(i - limits(ip),.false.)
&(row_group, row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
#endif
@@ -945,11 +944,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, &
#endif
i - limits(my_prow), .false.)
#ifdef WITH_MPI
@@ -1016,11 +1013,9 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, &
#endif
-1, .true.)
successCUDA = cuda_devicesynchronize()
@@ -1427,23 +1422,18 @@
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
!
#if REALCASE == 1
(bcast_buffer_dev, hh_tau_dev, nbw, &
#endif
!
#endif
#if COMPLEXCASE == 1
!
#if COMPLEXCASE == 1
( nbw, &
!
( nbw, &
#endif
!
#endif
current_local_n, .false.)
call compute_hh_dot_products_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION &
#if REALCASE == 1
(bcast_buffer_dev, hh_dot_dev, nbw, &
#endif
#if COMPLEXCASE == 1
( nbw, &
#endif
current_local_n)
endif ! useGPU
@@ -1470,9 +1460,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
bcast_buffer_dev, hh_tau_dev, &
#endif
nbw, 1, .true.)
endif ! useGPU
endif ! (current_local_n > 1) then
@@ -1720,7 +1708,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
@@ -1892,8 +1882,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
@@ -2008,8 +2000,10 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2)
i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
@@ -2106,7 +2100,9 @@
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION&
@@ -2308,9 +2304,7 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
#endif
row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size
@@ -2351,10 +2345,8 @@
&_gpu_&
&PRECISION&
&( &
#if REALCASE == 1
row_group_dev, aIntern_dev, stripe_count, stripe_width, &
last_stripe_width, a_dim2, l_nev, &
#endif
result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else ! useGPU
@@ -2596,15 +2588,16 @@
stop 1
endif
! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)* size_of_datatype, &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop 1
endif
! endif
!#endif
endif !use GPU
@@ -2797,33 +2790,30 @@
)
return
#if COMPLEXCASE == 1
!#if COMPLEXCASE == 1
contains
! contains
! The host wrapper for extracting "tau" from the HH reflectors (see the
! ! The host wrapper for extracting "tau" from the HH reflectors (see the
! kernel below)
! ! kernel below)
subroutine extract_hh_tau_complex_gpu_&
! subroutine extract_hh_tau_complex_gpu_&
&PRECISION&
! &PRECISION&
&(nbw, n, is_zero)
! &(nbw, n, is_zero)
use cuda_c_kernel
! use cuda_c_kernel
use precision
! use pack_unpack_gpu
implicit none
! use precision
integer(kind=ik), value :: nbw, n
! implicit none
logical, value :: is_zero
! integer(kind=ik), value :: nbw, n
integer(kind=ik) :: val_is_zero
! logical, value :: is_zero
! integer(kind=ik) :: val_is_zero
if (is_zero) then
!
val_is_zero = 1
! if (is_zero) then
else
! val_is_zero = 1
val_is_zero = 0
! else
endif
! val_is_zero = 0
call launch_extract_hh_tau_c_kernel_complex_&
! endif
&PRECISION&
! call launch_extract_hh_tau_c_kernel_complex_&
&(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
! &PRECISION&
end subroutine
! &(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero)
#endif /* COMPLEXCASE */
! end subroutine
!#endif /* COMPLEXCASE */
#if REALCASE == 1
end subroutine trans_ev_tridi_to_band_real_&
&PRECISION
#endif
end subroutine
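The commented-out block above is the old complex-only host wrapper for extract_hh_tau. After this commit the call sites (see the hunks around old lines 1427 and 1470 earlier in this file) pass bcast_buffer_dev and hh_tau_dev for both the real and the complex case, so the shared GPU version can be used; pieced together from those hunks, the unified call presumably reads roughly like this (a reading of the diff, not text from the commit):

   call extract_hh_tau_&
        &MATH_DATATYPE&
        &_gpu_&
        &PRECISION&
        &(bcast_buffer_dev, hh_tau_dev, &
          nbw, 1, .true.)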
src/mod_compute_hh_trafo_complex.F90
@@ -46,12 +46,13 @@ module compute_hh_trafo_complex
use elpa_mpi
implicit none
private
#ifdef WITH_OPENMP
public compute_hh_trafo_complex_cpu_openmp_double
#else
public compute_hh_trafo_complex_cpu_double
#endif
public compute_hh_trafo_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
@@ -59,7 +60,7 @@ module compute_hh_trafo_complex
#else
public compute_hh_trafo_complex_cpu_single
#endif
public compute_hh_trafo_complex_gpu_single
#endif
contains
@@ -787,4 +788,25 @@ module compute_hh_trafo_complex
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION

! complex single precision