Commit f07dc1f9 authored by Andreas Marek
Browse files

Unify GPU/CPU compute_hh_trafo

parent 1d9439c6
......@@ -64,7 +64,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
......@@ -983,7 +982,6 @@ EXTRA_DIST = \
src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
......
......@@ -46,25 +46,20 @@
subroutine compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_cpu_openmp_&
&_openmp_&
#else
&_cpu_&
&_&
#endif
&PRECISION &
(a, &
#if REALCASE == 1
a_dev, &
#endif
stripe_width, a_dim2, stripe_count, &
(a, a_dev, stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP
max_threads, l_nev, &
#endif
a_off, nbw, max_blk_size, bcast_buffer, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
hh_dot_dev, &
#endif
kernel_flops, kernel_time, &
off, ncols, istripe, &
hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
#ifdef WITH_OPENMP
my_thread, thread_width, &
#else
......@@ -112,10 +107,8 @@
#endif /* COMPLEXCASE */
#if REALCASE == 1
use cuda_c_kernel
use cuda_functions
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
......@@ -171,19 +164,20 @@
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
#endif
#if REALCASE == 1
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset
#if REALCASE == 1
integer(kind=c_intptr_t) :: hh_dot_dev ! why is this not needed in the complex case?
#endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
! Private variables in OMP regions (e.g. my_thread) are better passed via the argument list!
integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff
#endif
integer(kind=ik) :: j, nl, jj, jjj
integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6)
#endif
......@@ -198,13 +192,19 @@
if (ncols < 1) return
endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#endif
call timer%start("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_cpu_openmp" // &
&_openmp" // &
#else
&_cpu" // &
&" // &
#endif
&PRECISION_SUFFIX &
)
......@@ -230,6 +230,14 @@
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
if (istripe<stripe_count) then
nl = stripe_width
......@@ -240,9 +248,9 @@
call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_cpu_openmp" // &
&_openmp" // &
#else
&_cpu" // &
&" // &
#endif
&PRECISION_SUFFIX &
)
......@@ -253,6 +261,7 @@
#endif /* not WITH_OPENMP */
#if REALCASE == 1
! GPU kernel real
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION&
......@@ -263,8 +272,37 @@
&_&
&PRECISION&
& (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
else ! not CUDA kernel
#endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_2 =( off-1 )* size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_c_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */
else ! not CUDA kernel
#if REALCASE == 1
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
......@@ -1486,16 +1524,14 @@
!no avx512 block6 complex kernel
#endif /* COMPLEXCASE */
#if REALCASE == 1
endif ! GPU_KERNEL
#endif
#ifdef WITH_OPENMP
if (my_thread==1) then
#endif
kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt
n_times = n_times + 1
#ifdef WITH_OPENMP
endif
#endif
......@@ -1503,9 +1539,9 @@
call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_cpu_openmp" // &
&_openmp" // &
#else
&_cpu" // &
&" // &
#endif
&PRECISION_SUFFIX &
)
......
......@@ -156,9 +156,7 @@
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
#if COMPLEXCASE == 1
integer(kind=ik) :: n_times
#endif
integer(kind=ik) :: top, chunk, this_chunk
#if REALCASE == 1
......@@ -1667,19 +1665,15 @@
call compute_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&_openmp_&
&PRECISION &
(aIntern, &
#if REALCASE == 1
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, &
max_blk_size, bcast_buffer, &
(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
hh_dot_dev, &
#endif
kernel_flops, kernel_time, 0, current_local_n, i, my_thread, thread_width, &
THIS_&
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
i, my_thread, thread_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
enddo
......@@ -1688,34 +1682,37 @@
#else /* WITH_OPENMP */
#if REALCASE == 1
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_&
call compute_hh_trafo_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&_&
&PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, last_stripe_width, &
THIS_&
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
endif
#endif /* COMPLEXCASE */
!#if COMPLEXCASE == 1
! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
! dev_offset_2, a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! 0, current_local_n, i, last_stripe_width, &
! THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */
!send_b 1
......@@ -1807,18 +1804,15 @@
call compute_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&_openmp_&
&PRECISION&
&(aIntern, &
#if REALCASE == 1
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
hh_dot_dev, &
#endif
kernel_flops, kernel_time, current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_&
hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
enddo
......@@ -1859,40 +1853,43 @@
#else /* WITH_OPENMP */
#if REALCASE == 1
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_&
call compute_hh_trafo_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
! the complex case and real case diverged here
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&_&
&PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
endif
#endif
!#if COMPLEXCASE == 1
!! the complex case and real case diverged here
! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
! dev_offset, dev_offset_1, dev_offset_2, &
! a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! current_local_n - bottom_msg_length, bottom_msg_length, i, &
! last_stripe_width, THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
!
! endif
!
!#endif
!send_b
#ifdef WITH_MPI
......@@ -1958,17 +1955,14 @@
do my_thread = 1, max_threads
call compute_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&_openmp_&
&PRECISION&
&(aIntern, &
& (aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
aIntern_dev, &
hh_dot_dev, &
#endif
stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
#endif
kernel_flops, kernel_time, top_msg_length,&
hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
THIS_&
&MATH_DATATYPE&
......@@ -1979,36 +1973,39 @@
#else /* WITH_OPENMP */
#if REALCASE == 1
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_&
call compute_hh_trafo_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&_&
&PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
endif
#endif /* COMPLEXCASE */
!#if COMPLEXCASE == 1
! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
! i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
! a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
! last_stripe_width, THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */
!wait_t
......@@ -2060,18 +2057,15 @@
endif
call compute_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&_openmp_&
&PRECISION&
&(aIntern, &
#if REALCASE == 1
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
hh_dot_dev, &
#endif
kernel_flops, kernel_time, 0, top_msg_length, i, my_thread, thread_width, &
THIS_&
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
thread_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
enddo
......@@ -2080,34 +2074,37 @@
#else /* WITH_OPENMP */
#if REALCASE == 1
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_&
call compute_hh_trafo_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&_&
&PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, last_stripe_width, &
THIS_&
& (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
endif
#endif
!#if COMPLEXCASE == 1
! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
! a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! 0, top_msg_length, i, last_stripe_width, &
! THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif
#endif /* WITH_OPENMP */
endif
......
......@@ -47,36 +47,32 @@ module compute_hh_trafo
implicit none
#ifdef WITH_OPENMP
public compute_hh_trafo_real_cpu_openmp_double
public compute_hh_trafo_real_openmp_double
#else
public compute_hh_trafo_real_cpu_double
public compute_hh_trafo_real_double
#endif
#ifdef WITH_OPENMP
public compute_hh_trafo_complex_cpu_openmp_double
public compute_hh_trafo_complex_openmp_double
#else
public compute_hh_trafo_complex_cpu_double
public compute_hh_trafo_complex_double
#endif
public compute_hh_trafo_complex_gpu_double
#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WITH_OPENMP
public compute_hh_trafo_real_cpu_openmp_single
public compute_hh_trafo_real_openmp_single
#else
public compute_hh_trafo_real_cpu_single
public compute_hh_trafo_real_single
#endif
#endif
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#ifdef WITH_OPENMP
public compute_hh_trafo_complex_cpu_openmp_single
public compute_hh_trafo_complex_openmp_single
#else
public compute_hh_trafo_complex_cpu_single
public compute_hh_trafo_complex_single
#endif
public compute_hh_trafo_complex_gpu_single
#endif
contains
......@@ -116,23 +112,23 @@ module compute_hh_trafo
#undef SINGLE_PRECISION
#endif
!complex double precision
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
! complex single precision
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif
!
! !complex double precision
!#define COMPLEXCASE 1
!#define DOUBLE_PRECISION 1
!#include "precision_macros.h"
!#include "compute_hh_trafo_complex_gpu.X90"