#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif
subroutine compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_openmp_&
#else
&_&
#endif
&PRECISION &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
#ifdef WITH_OPENMP
l_nev, &
#endif
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
hh_dot_dev, &
#endif
hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
#ifdef WITH_OPENMP
my_thread, thread_width, &
#else
last_stripe_width, &
#endif
kernel)
use precision
use elpa_abstract_impl
use iso_c_binding
#if REALCASE == 1
use single_hh_trafo_real
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
! use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
!#endif
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
! use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
#if defined(WITH_REAL_BGP_KERNEL)
use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif
#if defined(WITH_REAL_BGQ_KERNEL)
use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif
#endif /* COMPLEXCASE */
use cuda_c_kernel
use cuda_functions
use elpa_generated_fortran_interfaces
implicit none
class(elpa_abstract_impl_t), intent(inout) :: obj
logical, intent(in) :: useGPU, wantDebug
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size)
#endif
integer(kind=ik), intent(in) :: a_off
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
integer(kind=ik), intent(in) :: max_threads
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:)
#endif
#else /* WITH_OPENMP */
integer(kind=ik), intent(in) :: l_nev, thread_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:)
#endif
#endif /* WITH_OPENMP */
integer(kind=ik), intent(in) :: kernel
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
#if REALCASE == 1
integer(kind=c_intptr_t) :: hh_dot_dev ! why not needed in complex case
#endif
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff
#endif
integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: w(nbw,2)
#endif
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
j = -99
if (wantDebug) then
if (useGPU .and. &
#if REALCASE == 1
( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
stop
endif
endif
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
#endif
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) then
if (wantDebug) then
print *, "Returning early from compute_hh_trafo"
endif
return
endif
endif
if (wantDebug) call obj%timer%start("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP
&_openmp" // &
#else
&" // &
#endif
&PRECISION_SUFFIX &
)
#ifdef WITH_OPENMP
if (my_thread==1) then
#endif
ttt = mpi_wtime()
#ifdef WITH_OPENMP
endif
#endif
#ifdef WITH_OPENMP
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#endif /* WITH_OPENMP */
#ifndef WITH_OPENMP
nl = merge(stripe_width, last_stripe_width, istripe