! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen, ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Planck-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see <http://www.gnu.org/licenses/> ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! This file was written by A. 
Marek, MPCDF module compute_hh_trafo_real #include "config-f90.h" use elpa_mpi implicit none #ifdef WITH_OPENMP public compute_hh_trafo_real_cpu_openmp_double #else public compute_hh_trafo_real_cpu_double #endif #ifdef WANT_SINGLE_PRECISION_REAL #ifdef WITH_OPENMP public compute_hh_trafo_real_cpu_openmp_single #else public compute_hh_trafo_real_cpu_single #endif #endif contains #ifdef WITH_OPENMP subroutine compute_hh_trafo_real_cpu_openmp_double(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe, & my_thread, thread_width, THIS_REAL_ELPA_KERNEL) #else subroutine compute_hh_trafo_real_cpu_double (a, a_dev, stripe_width, a_dim2, stripe_count, & a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe, & last_stripe_width, THIS_REAL_ELPA_KERNEL) #endif use precision use iso_c_binding use elpa2_utilities use single_hh_trafo_real use cuda_c_kernel use cuda_functions #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple #endif #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE)) use real_generic_kernel !, only : double_hh_trafo_generic #endif #if defined(WITH_REAL_BGP_KERNEL) use real_bgp_kernel !, only : double_hh_trafo_bgp #endif #if defined(WITH_REAL_BGQ_KERNEL) use real_bgq_kernel !, only : double_hh_trafo_bgq #endif #ifdef HAVE_DETAILED_TIMINGS use timings #endif #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512) use kernel_interfaces #endif implicit none real(kind=c_double), intent(inout) :: kernel_time ! 
MPI_WTIME always needs double integer(kind=lik) :: kernel_flops integer(kind=ik), intent(in) :: nbw, max_blk_size real(kind=rk8) :: bcast_buffer(nbw,max_blk_size) integer(kind=ik), intent(in) :: a_off integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count #ifndef WITH_OPENMP integer(kind=ik), intent(in) :: last_stripe_width ! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count) real(kind=rk8), pointer :: a(:,:,:) #else integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width ! real(kind=rk8) :: a(stripe_width,a_dim2,stripe_count,max_threads) real(kind=rk8), pointer :: a(:,:,:,:) #endif integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL integer(kind=c_intptr_t) :: a_dev integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_size_t) :: dev_offset integer(kind=c_intptr_t) :: hh_dot_dev integer(kind=c_intptr_t) :: hh_tau_dev ! Private variables in OMP regions (my_thread) should better be in the argument list! integer(kind=ik) :: off, ncols, istripe #ifdef WITH_OPENMP integer(kind=ik) :: my_thread, noff #endif integer(kind=ik) :: j, nl, jj, jjj real(kind=rk8) :: w(nbw,6) real(kind=c_double) :: ttt ! MPI_WTIME always needs double if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available if (ncols < 1) return endif #ifdef HAVE_DETAILED_TIMINGS #ifdef WITH_OPENMP call timer%start("compute_hh_trafo_real_cpu_openmp_double") #else call timer%start("compute_hh_trafo_real_cpu_double") #endif #endif ttt = mpi_wtime() #ifndef WITH_OPENMP nl = merge(stripe_width, last_stripe_width, istripe