Commit f07dc1f9 authored by Andreas Marek
Browse files

Unify GPU/CPU compute_hh_trafo

parent 1d9439c6
...@@ -64,7 +64,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \ ...@@ -64,7 +64,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \ src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \ src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/redist_band.X90 \ src/redist_band.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
...@@ -983,7 +982,6 @@ EXTRA_DIST = \ ...@@ -983,7 +982,6 @@ EXTRA_DIST = \
src/pack_unpack_cpu.X90 \ src/pack_unpack_cpu.X90 \
src/pack_unpack_gpu.X90 \ src/pack_unpack_gpu.X90 \
src/compute_hh_trafo.X90 \ src/compute_hh_trafo.X90 \
src/compute_hh_trafo_complex_gpu.X90 \
src/sanity.X90 \ src/sanity.X90 \
src/elpa_cholesky_template.X90 \ src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \ src/elpa_invert_trm.X90 \
......
...@@ -46,25 +46,20 @@ ...@@ -46,25 +46,20 @@
subroutine compute_hh_trafo_& subroutine compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp_& &_openmp_&
#else #else
&_cpu_& &_&
#endif #endif
&PRECISION & &PRECISION &
(a, & (a, a_dev, stripe_width, a_dim2, stripe_count, &
#if REALCASE == 1
a_dev, &
#endif
stripe_width, a_dim2, stripe_count, &
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
max_threads, l_nev, & max_threads, l_nev, &
#endif #endif
a_off, nbw, max_blk_size, bcast_buffer, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, & hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
off, ncols, istripe, &
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
my_thread, thread_width, & my_thread, thread_width, &
#else #else
...@@ -112,10 +107,8 @@ ...@@ -112,10 +107,8 @@
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
use cuda_c_kernel use cuda_c_kernel
use cuda_functions use cuda_functions
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
...@@ -171,19 +164,20 @@ ...@@ -171,19 +164,20 @@
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
#endif #endif
#if REALCASE == 1
integer(kind=c_intptr_t) :: a_dev integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_intptr_t) :: hh_dot_dev #if REALCASE == 1
integer(kind=c_intptr_t) :: hh_tau_dev integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
integer(kind=c_intptr_t) :: dev_offset
#endif #endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
! Private variables in OMP regions (my_thread) should better be in the argument list! ! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff integer(kind=ik) :: my_thread, noff
#endif #endif
integer(kind=ik) :: j, nl, jj, jjj integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1 #if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6) real(kind=C_DATATYPE_KIND) :: w(nbw,6)
#endif #endif
...@@ -198,13 +192,19 @@ ...@@ -198,13 +192,19 @@
if (ncols < 1) return if (ncols < 1) return
endif endif
#endif #endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#endif
call timer%start("compute_hh_trafo_& call timer%start("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
...@@ -230,6 +230,14 @@ ...@@ -230,6 +230,14 @@
&_GPU OPENMP: not yet implemented" &_GPU OPENMP: not yet implemented"
stop 1 stop 1
endif endif
#endif
#if COMPLEXCASE == 1
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif #endif
if (istripe<stripe_count) then if (istripe<stripe_count) then
nl = stripe_width nl = stripe_width
...@@ -240,9 +248,9 @@ ...@@ -240,9 +248,9 @@
call timer%stop("compute_hh_trafo_& call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
...@@ -253,6 +261,7 @@ ...@@ -253,6 +261,7 @@
#endif /* not WITH_OPENMP */ #endif /* not WITH_OPENMP */
#if REALCASE == 1 #if REALCASE == 1
! GPU kernel real
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_& dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
&PRECISION& &PRECISION&
...@@ -263,8 +272,37 @@ ...@@ -263,8 +272,37 @@
&_& &_&
&PRECISION& &PRECISION&
& (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols) & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
else ! not CUDA kernel
#endif /* REALCASE */ #endif /* REALCASE */
#if COMPLEXCASE == 1
! GPU kernel complex
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_1 = (0 + ( off-1 )* nbw) * size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
dev_offset_2 =( off-1 )* size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call launch_compute_hh_trafo_c_kernel_&
&MATH_DATATYPE&
&_&
&PRECISION&
&(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif /* COMPLEXCASE */
else ! not CUDA kernel
#if REALCASE == 1 #if REALCASE == 1
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) #if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
...@@ -1486,16 +1524,14 @@ ...@@ -1486,16 +1524,14 @@
!no avx512 block6 complex kernel !no avx512 block6 complex kernel
#endif /* COMPLEXCASE */ #endif /* COMPLEXCASE */
#if REALCASE == 1
endif ! GPU_KERNEL endif ! GPU_KERNEL
#endif
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
if (my_thread==1) then if (my_thread==1) then
#endif #endif
kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8) kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
kernel_time = kernel_time + mpi_wtime()-ttt kernel_time = kernel_time + mpi_wtime()-ttt
n_times = n_times + 1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
endif endif
#endif #endif
...@@ -1503,9 +1539,9 @@ ...@@ -1503,9 +1539,9 @@
call timer%stop("compute_hh_trafo_& call timer%stop("compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
&_cpu_openmp" // & &_openmp" // &
#else #else
&_cpu" // & &" // &
#endif #endif
&PRECISION_SUFFIX & &PRECISION_SUFFIX &
) )
......
...@@ -156,9 +156,7 @@ ...@@ -156,9 +156,7 @@
integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx integer(kind=ik) :: row_group_size, unpack_idx
#if COMPLEXCASE == 1
integer(kind=ik) :: n_times integer(kind=ik) :: n_times
#endif
integer(kind=ik) :: top, chunk, this_chunk integer(kind=ik) :: top, chunk, this_chunk
#if REALCASE == 1 #if REALCASE == 1
...@@ -1667,19 +1665,15 @@ ...@@ -1667,19 +1665,15 @@
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION & &PRECISION &
(aIntern, & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, &
#if REALCASE == 1 l_nev, a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
aIntern_dev, &
#endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, &
max_blk_size, bcast_buffer, &
#if REALCASE == 1 #if REALCASE == 1
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & hh_dot_dev, &
#endif #endif
kernel_flops, kernel_time, 0, current_local_n, i, my_thread, thread_width, & hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, &
THIS_& i, my_thread, thread_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
enddo enddo
...@@ -1688,34 +1682,37 @@ ...@@ -1688,34 +1682,37 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_& &MATH_DATATYPE&
&_&
&PRECISION& &PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, & #if REALCASE == 1
last_stripe_width, THIS_& hh_dot_dev, &
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif #endif
#if COMPLEXCASE == 1 hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
if (useGPU) then last_stripe_width, THIS_&
call compute_hh_trafo_complex_gpu_& &MATH_DATATYPE&
&PRECISION& &_ELPA_KERNEL)
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, & !#if COMPLEXCASE == 1
dev_offset_2, a_dim2, & ! if (useGPU) then
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! call compute_hh_trafo_complex_gpu_&
else ! &PRECISION&
call compute_hh_trafo_complex_cpu_& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, 0, current_local_n, i, a_off, dev_offset, dev_offset_1, &
&PRECISION& ! dev_offset_2, a_dim2, &
&(aIntern, stripe_width, a_dim2, stripe_count, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! else
0, current_local_n, i, last_stripe_width, & ! call compute_hh_trafo_complex_&
THIS_& ! &PRECISION&
&MATH_DATATYPE& ! &(aIntern, stripe_width, a_dim2, stripe_count, &
&_ELPA_KERNEL) ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
endif ! 0, current_local_n, i, last_stripe_width, &
#endif /* COMPLEXCASE */ ! THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!send_b 1 !send_b 1
...@@ -1807,18 +1804,15 @@ ...@@ -1807,18 +1804,15 @@
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
aIntern_dev, & hh_dot_dev, &
#endif #endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & hh_tau_dev, kernel_flops, kernel_time, n_times, current_local_n - bottom_msg_length, &
#if REALCASE == 1 bottom_msg_length, i, my_thread, thread_width, THIS_&
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, &
#endif
kernel_flops, kernel_time, current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_ELPA_KERNEL) &_ELPA_KERNEL)
enddo enddo
...@@ -1859,40 +1853,43 @@ ...@@ -1859,40 +1853,43 @@
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_& &MATH_DATATYPE&
&_&
&PRECISION& &PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, & #if REALCASE == 1
current_local_n - bottom_msg_length, bottom_msg_length, i, & hh_dot_dev, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif #endif
hh_tau_dev, kernel_flops, kernel_time, n_times, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#if COMPLEXCASE == 1 !#if COMPLEXCASE == 1
! the complex case and real case diverged here !! the complex case and real case diverged here
if (useGPU) then ! if (useGPU) then
call compute_hh_trafo_complex_gpu_& ! call compute_hh_trafo_complex_gpu_&
&PRECISION& ! &PRECISION&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, & ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2, & ! dev_offset, dev_offset_1, dev_offset_2, &
a_dim2, & ! a_dim2, &
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
else ! else
call compute_hh_trafo_complex_cpu_& ! call compute_hh_trafo_complex_&
&PRECISION& ! &PRECISION&
&(aIntern, stripe_width, a_dim2, stripe_count, & ! &(aIntern, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, & ! current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_& ! last_stripe_width, THIS_&
&MATH_DATATYPE& ! &MATH_DATATYPE&
&_ELPA_KERNEL) ! &_ELPA_KERNEL)
!
endif ! endif
!
#endif !#endif
!send_b !send_b
#ifdef WITH_MPI #ifdef WITH_MPI
...@@ -1958,57 +1955,57 @@ ...@@ -1958,57 +1955,57 @@
do my_thread = 1, max_threads do my_thread = 1, max_threads
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
aIntern_dev, & hh_dot_dev, &
#endif #endif
stripe_width ,a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length,&
#if REALCASE == 1 current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & THIS_&
#endif &MATH_DATATYPE&
kernel_flops, kernel_time, top_msg_length,& &_ELPA_KERNEL)
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, &
THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX) call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
#if REALCASE == 1 call compute_hh_trafo_&
call compute_hh_trafo_real_cpu_& &MATH_DATATYPE&
&_&
&PRECISION& &PRECISION&
&(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, & #if REALCASE == 1
current_local_n-top_msg_length-bottom_msg_length, i, & hh_dot_dev, &
last_stripe_width, THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
#endif #endif
#if COMPLEXCASE == 1 hh_tau_dev, kernel_flops, kernel_time, n_times, top_msg_length, &
if (useGPU) then current_local_n-top_msg_length-bottom_msg_length, i, &
call compute_hh_trafo_complex_gpu_& last_stripe_width, THIS_&
&PRECISION& &MATH_DATATYPE&
&(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, & &_ELPA_KERNEL)
i, a_off, dev_offset, dev_offset_1, dev_offset_2, & !#if COMPLEXCASE == 1
a_dim2, & ! if (useGPU) then
kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width) ! call compute_hh_trafo_complex_gpu_&
else ! &PRECISION&
call compute_hh_trafo_complex_cpu_& ! &(aIntern_dev, bcast_buffer_dev, hh_tau_dev, top_msg_length,current_local_n-top_msg_length-bottom_msg_length, &
&PRECISION& ! i, a_off, dev_offset, dev_offset_1, dev_offset_2, &
&(aIntern, stripe_width, a_dim2, stripe_count, & ! a_dim2, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & ! kernel_flops, kernel_time, last_stripe_width, n_times, nbw, stripe_count, stripe_width)
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & ! else
last_stripe_width, THIS_& ! call compute_hh_trafo_complex_&
&MATH_DATATYPE& ! &PRECISION&
&_ELPA_KERNEL) ! &(aIntern, stripe_width, a_dim2, stripe_count, &
endif ! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
#endif /* COMPLEXCASE */ ! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
! last_stripe_width, THIS_&
! &MATH_DATATYPE&
! &_ELPA_KERNEL)
! endif
!#endif /* COMPLEXCASE */
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!wait_t !wait_t
...@@ -2060,54 +2057,54 @@ ...@@ -2060,54 +2057,54 @@
endif endif
call compute_hh_trafo_& call compute_hh_trafo_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_cpu_openmp_& &_openmp_&
&PRECISION& &PRECISION&
&(aIntern, & & (aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, &
nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1 #if REALCASE == 1
aIntern_dev, & hh_dot_dev, &
#endif #endif
stripe_width, a_dim2, stripe_count, max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, & hh_tau_dev, kernel_flops, kernel_time, n_times, 0, top_msg_length, i, my_thread, &
#if REALCASE == 1 thread_width, THIS_&
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, & &MATH_DATATYPE&
#endif &_ELPA_KERNEL)
kernel_flops, kernel_time, 0, top_msg_length, i, my_thread, thread_width, &
THIS_&
&MATH_DATATYPE&
&_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX) call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)