#if 0 ! This file is part of ELPA. ! ! The ELPA library was originally created by the ELPA consortium, ! consisting of the following organizations: ! ! - Max Planck Computing and Data Facility (MPCDF), formerly known as ! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), ! - Bergische Universität Wuppertal, Lehrstuhl für angewandte ! Informatik, ! - Technische Universität München, Lehrstuhl für Informatik mit ! Schwerpunkt Wissenschaftliches Rechnen , ! - Fritz-Haber-Institut, Berlin, Abt. Theorie, ! - Max-Plack-Institut für Mathematik in den Naturwissenschaften, ! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, ! and ! - IBM Deutschland GmbH ! ! ! More information can be found here: ! http://elpa.mpcdf.mpg.de/ ! ! ELPA is free software: you can redistribute it and/or modify ! it under the terms of the version 3 of the license of the ! GNU Lesser General Public License as published by the Free ! Software Foundation. ! ! ELPA is distributed in the hope that it will be useful, ! but WITHOUT ANY WARRANTY; without even the implied warranty of ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ! GNU Lesser General Public License for more details. ! ! You should have received a copy of the GNU Lesser General Public License ! along with ELPA. If not, see ! ! ELPA reflects a substantial effort on the part of the original ! ELPA consortium, and we ask you to respect the spirit of the ! license that we chose: i.e., please contribute any changes you ! may have back to the original ELPA library distribution, and keep ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! ! This file was written by A. Marek, MPCDF #endif subroutine compute_hh_trafo_& &MATH_DATATYPE& #ifdef WITH_OPENMP_TRADITIONAL &_openmp_& #else &_& #endif &PRECISION & (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, & #ifdef WITH_OPENMP_TRADITIONAL l_nev, & #endif a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, & hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, & #ifdef WITH_OPENMP_TRADITIONAL my_thread, thread_width, & #else last_stripe_width, & #endif kernel) use precision use elpa_abstract_impl use iso_c_binding #if REALCASE == 1 use single_hh_trafo_real #if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE)) use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple #endif #if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE)) use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple #endif !#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE)) ! use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple !#endif #if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE)) use real_generic_kernel !, only : double_hh_trafo_generic #endif #if defined(WITH_REAL_BGP_KERNEL) use real_bgp_kernel !, only : double_hh_trafo_bgp #endif #if defined(WITH_REAL_BGQ_KERNEL) use real_bgq_kernel !, only : double_hh_trafo_bgq #endif #endif /* REALCASE */ #if COMPLEXCASE == 1 #if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE)) use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple #endif #if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE)) use complex_generic_kernel !, only : single_hh_trafo_complex_generic #endif #endif /* COMPLEXCASE */ use cuda_c_kernel use cuda_functions use elpa_generated_fortran_interfaces implicit none class(elpa_abstract_impl_t), intent(inout) :: obj logical, intent(in) :: useGPU, wantDebug real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double integer(kind=lik) :: kernel_flops integer(kind=ik), intent(in) :: nbw, max_blk_size #if REALCASE == 1 real(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size) #endif #if COMPLEXCASE == 1 complex(kind=C_DATATYPE_KIND) :: bcast_buffer(nbw,max_blk_size) #endif integer(kind=ik), intent(in) :: a_off integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count integer(kind=ik), intent(in) :: max_threads #ifndef WITH_OPENMP_TRADITIONAL integer(kind=ik), intent(in) :: last_stripe_width #if REALCASE == 1 ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:) #endif #if COMPLEXCASE == 1 ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count) complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:) #endif #else /* WITH_OPENMP_TRADITIONAL */ integer(kind=ik), intent(in) :: l_nev, thread_width #if REALCASE == 1 ! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:) #endif #if COMPLEXCASE == 1 ! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads) complex(kind=C_DATATYPE_KIND),pointer :: a(:,:,:,:) #endif #endif /* WITH_OPENMP_TRADITIONAL */ integer(kind=ik), intent(in) :: kernel integer(kind=c_intptr_t) :: a_dev integer(kind=c_intptr_t) :: bcast_buffer_dev integer(kind=c_intptr_t) :: hh_tau_dev integer(kind=c_intptr_t) :: dev_offset, dev_offset_1, dev_offset_2 ! Private variables in OMP regions (my_thread) should better be in the argument list! integer(kind=ik) :: off, ncols, istripe #ifdef WITH_OPENMP_TRADITIONAL integer(kind=ik) :: my_thread, noff #endif integer(kind=ik) :: j, nl, jj, jjj, n_times #if REALCASE == 1 real(kind=C_DATATYPE_KIND) :: w(nbw,6) #endif #if COMPLEXCASE == 1 complex(kind=C_DATATYPE_KIND) :: w(nbw,2) #endif real(kind=c_double) :: ttt ! MPI_WTIME always needs double integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_& &PRECISION& &_& &MATH_DATATYPE j = -99 if (wantDebug) then if (useGPU .and. & #if REALCASE == 1 ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then #endif #if COMPLEXCASE == 1 ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then #endif print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!" stop endif endif #if REALCASE == 1 if (kernel .eq. ELPA_2STAGE_REAL_GPU) then #endif #if COMPLEXCASE == 1 if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then #endif ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available if (ncols < 1) then if (wantDebug) then print *, "Returning early from compute_hh_trafo" endif return endif endif if (wantDebug) call obj%timer%start("compute_hh_trafo_& &MATH_DATATYPE& #ifdef WITH_OPENMP_TRADITIONAL &_openmp" // & #else &" // & #endif &PRECISION_SUFFIX & ) #ifdef WITH_OPENMP_TRADITIONAL if (my_thread==1) then #endif ttt = mpi_wtime() #ifdef WITH_OPENMP_TRADITIONAL endif #endif #ifdef WITH_OPENMP_TRADITIONAL #if REALCASE == 1 if (kernel .eq. ELPA_2STAGE_REAL_GPU) then print *,"compute_hh_trafo_& &MATH_DATATYPE& &_GPU OPENMP: not yet implemented" stop 1 endif #endif #if COMPLEXCASE == 1 if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then print *,"compute_hh_trafo_& &MATH_DATATYPE& &_GPU OPENMP: not yet implemented" stop 1 endif #endif #endif /* WITH_OPENMP_TRADITIONAL */ #ifndef WITH_OPENMP_TRADITIONAL nl = merge(stripe_width, last_stripe_width, istripe