Commit f07dc1f9 authored by Andreas Marek
Browse files

Unify GPU/CPU compute_hh_trafo

parent 1d9439c6
...@@ -64,7 +64,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -64,7 +64,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \ src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \ src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/redist_band.X90 \ src/redist_band.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
...@@ -983,7 +982,6 @@ EXTRA_DIST = \ ...@@ -983,7 +982,6 @@ EXTRA_DIST = \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \ src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \ src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \ src/elpa_invert_trm.X90 \
......
...@@ -46,25 +46,20 @@ ...@@ -46,25 +46,20 @@
subroutine compute_hh_trafo_& subroutine compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp_& &_openmp_&
#else #else
&_cpu_& &_&
#endif #endif
&PRECISION & &PRECISION &
(a, & (a, a_dev, stripe_width, a_dim2, stripe_count, &
#if REALCASE == 1
a_dev, &
#endif
stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
max_threads, l_nev, & max_threads, l_nev, &
#endif #endif
a_off, nbw, max_blk_size, bcast_buffer, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, & hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
off, ncols, istripe, &
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
my_thread, thread_width, & my_thread, thread_width, &
#else #else
...@@ -112,10 +107,8 @@ ...@@ -112,10 +107,8 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
...@@ -171,19 +164,20 @@ ...@@ -171,19 +164,20 @@
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
#endif #endif
#if REALCASE == 1
integer(kind=c_intptr_t) :: a_dev integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_intptr_t) :: hh_dot_dev #if REALCASE == 1
integer(kind=c_intptr_t) :: hh_tau_dev integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
integer(kind=c_intptr_t) :: dev_offset
#endif #endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
! Private variables in OMP regions (my_thread) should better be in the argument list! ! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff integer(kind=ik) :: my_thread, noff
#endif #endif
integer(kind=ik) :: j, nl, jj, jjj integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1 #if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6) real(kind=C_DATATYPE_KIND) :: w(nbw,6)
#endif #endif
...@@ -198,13 +192,19 @@ ...@@ -198,13 +192,19 @@
if (ncols < 1) return if (ncols < 1) return
endif endif
#endif #endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#endif
call timer%start("compute_hh_trafo_& call timer%start("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
...@@ -230,6 +230,14 @@ ...@@ -230,6 +230,14 @@
&_GPU OPENMP: not yet implemented" &_GPU OPENMP: not yet implemented"
stop 1 stop 1
endif endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif #endif
if (istripe<stripe_count) then if (istripe<stripe_count) then
nl = stripe_width nl = stripe_width
...@@ -240,9 +248,9 @@ ...@@ -240,9 +248,9 @@
call timer%stop("compute_hh_trafo_& call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
...@@ -253,6 +261,7 @@ ...@@ -253,6 +261,7 @@
#endif /* not WITH_OPENMP */ #endif /* not WITH_OPENMP */
#if REALCASE == 1 #if REALCASE == 1
! GPU kernel real
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_& dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION& &PRECISION&
...@@ -263,8 +272,37 @@ ...@@ -263,8 +272,37 @@
&_& &_&
&PRECISION& &PRECISION&
& (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols) & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
else ! not CUDA kernel
#endif /* REALCASE */ #endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_2 =( off-1 )* size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_c_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */
else ! not CUDA kernel
#if REALCASE == 1 #if REALCASE == 1
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
...@@ -1486,16 +1524,14 @@ ...@@ -1486,16 +1524,14 @@
!no avx512 block6 complex kernel !no avx512 block6 complex kernel
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
endif ! GPU_KERNEL endif ! GPU_KERNEL
#endif
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
if (my_thread==1) then if (my_thread==1) then
#endif #endif
kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8) kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
n_times = n_times + 1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
endif endif
#endif #endif
...@@ -1503,9 +1539,9 @@ ...@@ -1503,9 +1539,9 @@
call timer%stop("compute_hh_trafo_& call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
......
...@@ -156,9 +156,7 @@ ...@@ -156,9 +156,7 @@
integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx integer(kind=ik) :: row_group_size, unpack_idx
#if COMPLEXCASE == 1
integer(kind=ik) :: n_times integer(kind=ik) :: n_times
#endif
integer(kind=ik) :: top, chunk, this_chunk integer(kind=ik) :: top, chunk, this_chunk
#if REALCASE == 1 #if REALCASE == 1
...@@ -1667,19 +1665,15 @@ ...@@ -1667,19 +1665,15 @@
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION & &PRECISION &
(aIntern, & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
#if REALCASE == 1 l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, &
max_blk_size, bcast_buffer, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, 0, current_local_n, i, my_thread, thread_width, & hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
THIS_& i, my_thread, thread_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
enddo enddo
...@@ -1688,34 +1682,37 @@ ...@@ -1688,34 +1682,37 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_&
#endif
#if COMPLEXCASE == 1
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
dev_offset_2, a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
0, current_local_n, i, last_stripe_width, & #if REALCASE == 1
THIS_& hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
endif !#if COMPLEXCASE == 1
#endif /* COMPLEXCASE */ ! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
! dev_offset_2, a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! 0, current_local_n, i, last_stripe_width, &
! THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!send_b 1 !send_b 1
...@@ -1807,18 +1804,15 @@ ...@@ -1807,18 +1804,15 @@
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
#if REALCASE == 1 nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
thread_width, THIS_& bottom_msg_length, i, my_thread, thread_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
enddo enddo
...@@ -1859,40 +1853,43 @@ ...@@ -1859,40 +1853,43 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_&
#endif
#if COMPLEXCASE == 1
! the complex case and real case diverged here
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, &
current_local_n - bottom_msg_length, bottom_msg_length, i, & current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_& last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
endif !#if COMPLEXCASE == 1
!! the complex case and real case diverged here
#endif ! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
! dev_offset, dev_offset_1, dev_offset_2, &
! a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! current_local_n - bottom_msg_length, bottom_msg_length, i, &
! last_stripe_width, THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
!
! endif
!
!#endif
!send_b !send_b
#ifdef WITH_MPI #ifdef WITH_MPI
...@@ -1958,17 +1955,14 @@ ...@@ -1958,17 +1955,14 @@
do my_thread = 1, max_threads do my_thread = 1, max_threads
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
aIntern_dev, & hh_dot_dev, &
#endif #endif
stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
#if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
#endif
kernel_flops, kernel_time, top_msg_length,&
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
THIS_& THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -1979,36 +1973,39 @@ ...@@ -1979,36 +1973,39 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_&
&PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_&
#endif
#if COMPLEXCASE == 1
if (useGPU) then
call compute_hh_trafo_complex_gpu_&
&PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else
call compute_hh_trafo_complex_cpu_&
&PRECISION& &PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & #if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_& last_stripe_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
endif !#if COMPLEXCASE == 1
#endif /* COMPLEXCASE */ ! if (useGPU) then
! call compute_hh_trafo_complex_gpu_&
! &PRECISION&
! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
! i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
! a_dim2, &
! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
! else
! call compute_hh_trafo_complex_&
! &PRECISION&
! &(aIntern, stripe_width, a_dim2, stripe_count, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
! last_stripe_width, THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!wait_t !wait_t
...@@ -2060,18 +2057,15 @@ ...@@ -2060,18 +2057,15 @@
endif endif
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
#if REALCASE == 1 nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, 0, top_msg_length, i, my_thread, thread_width, & hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
THIS_& thread_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
enddo enddo
...@@ -2080,34 +2074,37 @@ ...@@ -2080,34 +2074,37 @@
#else /* WITH_OPENMP */