! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
module compute_hh_trafo_real
#include "config-f90.h"
use elpa_mpi
implicit none
#ifdef WITH_OPENMP
public compute_hh_trafo_real_cpu_openmp_double
#else
public compute_hh_trafo_real_cpu_double
#endif
#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WITH_OPENMP
public compute_hh_trafo_real_cpu_openmp_single
#else
public compute_hh_trafo_real_cpu_single
#endif
#endif
contains
#ifdef WITH_OPENMP
subroutine compute_hh_trafo_real_cpu_openmp_double(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe, &
my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#else
subroutine compute_hh_trafo_real_cpu_double (a, a_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif
use precision
use iso_c_binding
use elpa2_utilities
use single_hh_trafo_real
use cuda_c_kernel
use cuda_functions
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
#if defined(WITH_REAL_BGP_KERNEL)
use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif
#if defined(WITH_REAL_BGQ_KERNEL)
use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
use kernel_interfaces
#endif
implicit none
real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
real(kind=rk8) :: bcast_buffer(nbw,max_blk_size)
integer(kind=ik), intent(in) :: a_off
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count)
real(kind=rk8), pointer :: a(:,:,:)
#else
integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=rk8), pointer :: a(:,:,:,:)
#endif
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: dev_offset
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=c_intptr_t) :: hh_tau_dev
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff
#endif
integer(kind=ik) :: j, nl, jj, jjj
real(kind=rk8) :: w(nbw,6)
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call timer%start("compute_hh_trafo_real_cpu_openmp_double")
#else
call timer%start("compute_hh_trafo_real_cpu_double")
#endif
#endif
ttt = mpi_wtime()
#ifndef WITH_OPENMP
nl = merge(stripe_width, last_stripe_width, istripe