Commit a0934d4e authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 98a4db33 33a94bfc
......@@ -17,6 +17,11 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/check_for_gpu.F90 \
src/mod_cuda.F90 \
src/interface_c_kernel.F90 \
src/mod_pack_unpack_real.F90 \
src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/elpa2_compute.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
......@@ -314,6 +319,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
include doxygen.am
CLEANFILES = \
......
......@@ -298,6 +298,8 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
! was
! real a(lda,*), q(ldq,*)
integer(kind=ik) :: my_prow, my_pcol, mpierr
real(kind=rk), allocatable :: e(:), tau(:)
......@@ -397,6 +399,8 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols)
! was
! complex a(lda,*), q(ldq,*)
real(kind=rk) :: ev(na)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......
......@@ -138,6 +138,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), d(na), e(na), tau(na)
! was
! real a(lda,*)
integer(kind=ik), parameter :: max_stored_rows = 32
......@@ -479,6 +481,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! real a(lda,*), q(ldq,*)
integer(kind=ik) :: max_stored_rows
......@@ -911,6 +915,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), tau(na)
! was
! complex a(lda,*)
real(kind=rk) :: d(na), e(na)
integer(kind=ik), parameter :: max_stored_rows = 32
......@@ -1278,6 +1284,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! complex a(lda,*), q(ldq,*)
integer(kind=ik) :: max_stored_rows
......@@ -1678,6 +1686,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -1902,6 +1912,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik), parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used
......@@ -2175,6 +2187,8 @@ module ELPA1_compute
mpi_comm_cols, npc_0, npc_n
integer(kind=ik) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=rk) :: d(na), e, q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik), parameter :: max_strip=128
......@@ -3309,6 +3323,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols)
! was
! real a(lda, *)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
......@@ -3490,6 +3506,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols)
! was
! real a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
......@@ -3626,6 +3644,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols)
!was
! complex a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
......@@ -3803,6 +3823,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols)
! was
! complex a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
......
......@@ -156,6 +156,8 @@ contains
mpi_comm_cols, mpi_comm_all
integer(kind=ik), intent(in) :: nblk
real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
! was
! real a(lda,*), q(ldq,*)
real(kind=rk), allocatable :: hh_trans_real(:,:)
integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
......@@ -429,6 +431,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
! was
! complex a(lda,*), q(ldq,*)
real(kind=rk), intent(inout) :: ev(na)
complex(kind=ck), allocatable :: hh_trans_complex(:,:)
......
This diff is collapsed.
module single_hh_trafo_real
  implicit none
#include "config-f90.h"

#ifdef WITH_OPENMP
  public single_hh_trafo_real_cpu_openmp
#else
  public single_hh_trafo_real_cpu
#endif

contains

  ! Apply one real Householder transformation to the first nq rows of q.
  !
  ! Arguments:
  !   q   (inout) matrix of shape (ldq, nb); rows 1..nq of every column are updated
  !   hh  (in)    vector of length nb; hh(1) is used as the scaling factor tau,
  !               hh(2:nb) are the remaining entries of the Householder vector
  !               (the first entry is treated as an implicit 1)
  !   nb  (in)    number of columns of q / length of hh
  !   nq  (in)    number of rows of q that take part in the update
  !   ldq (in)    leading dimension of q
  !
  ! The routine is written in plain Fortran because it is not performance critical.
#ifdef WITH_OPENMP
  subroutine single_hh_trafo_real_cpu_openmp(q, hh, nb, nq, ldq)
#else
  subroutine single_hh_trafo_real_cpu(q, hh, nb, nq, ldq)
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    use precision
    implicit none

    integer(kind=ik), intent(in)    :: nb, nq, ldq
    real(kind=rk),    intent(inout) :: q(1:ldq, 1:nb)
    real(kind=rk),    intent(in)    :: hh(1:nb)

    integer(kind=ik)                :: col
    real(kind=rk)                   :: proj(nq)   ! work vector: tau * (q * hh)

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("single_hh_trafo_real_cpu_openmp")
#else
    call timer%start("single_hh_trafo_real_cpu")
#endif
#endif

    ! proj = q * hh, where column 1 enters with an implicit factor of one
    proj = q(1:nq, 1)
    do col = 2, nb
      proj = proj + q(1:nq, col) * hh(col)
    end do

    ! scale by tau, which is stored in hh(1)
    proj = proj * hh(1)

    ! q = q - proj * hh**T; the column updates are independent of each other
    do col = 2, nb
      q(1:nq, col) = q(1:nq, col) - proj * hh(col)
    end do
    q(1:nq, 1) = q(1:nq, 1) - proj

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("single_hh_trafo_real_cpu_openmp")
#else
    call timer%stop("single_hh_trafo_real_cpu")
#endif
#endif

  end subroutine
end module
module compute_hh_trafo_complex
#include "config-f90.h"
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_complex_cpu_openmp
#else
  public compute_hh_trafo_complex_cpu
#endif

  include 'mpif.h'

contains

  ! Apply ncols complex Householder transformations to one stripe of "a".
  !
  ! The transformation vectors are read from bcast_buffer(:, off+1 : off+ncols)
  ! and are applied, from column ncols down to column 1, to
  ! a(:, . + off + a_off, istripe [, my_thread]) by one of the *_hh_trafo_complex_*
  ! kernels. Which kernels are compiled in is decided by the WITH_COMPLEX_*_KERNEL
  ! macros; if WITH_NO_SPECIFIC_COMPLEX_KERNEL is defined, the kernel is chosen at
  ! run time from THIS_COMPLEX_ELPA_KERNEL.
  !
  ! Arguments:
  !   a                  work array, (stripe_width, a_dim2, stripe_count) or, in the
  !                      OpenMP build, (stripe_width, a_dim2, stripe_count, max_threads)
  !   stripe_width, a_dim2, stripe_count, max_threads
  !                      extents of "a" (max_threads only in the OpenMP build)
  !   a_off              column offset into "a"
  !   nbw, max_blk_size  extents of bcast_buffer
  !   bcast_buffer       transformation vectors, one per column
  !   kernel_flops       running flop counter, incremented here
  !   kernel_time        running kernel wall time (mpi_wtime based), incremented here
  !   off, ncols         column offset and number of transformations to apply
  !   istripe            index of the stripe to work on
  !   last_stripe_width  number of rows used in the last stripe (non-OpenMP build)
  !   my_thread          thread index (OpenMP build); only thread 1 updates the
  !                      flop/time counters
  !   THIS_COMPLEX_ELPA_KERNEL
  !                      run-time kernel selector
  !
  ! NOTE(review): in the OpenMP build thread_width and l_nev are not declared in
  ! this routine; presumably they come from module elpa2_utilities - confirm.
#ifdef WITH_OPENMP
  subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, &
                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time,          &
                        off, ncols, istripe,                                                        &
                        my_thread, THIS_COMPLEX_ELPA_KERNEL)
#else
  subroutine compute_hh_trafo_complex_cpu       (a, stripe_width, a_dim2, stripe_count,              &
                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time,          &
                        off, ncols, istripe, last_stripe_width,                                     &
                        THIS_COMPLEX_ELPA_KERNEL)
#endif
    use precision
    use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
    use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
    use complex_generic_kernel, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif
    implicit none

    real(kind=rk), intent(inout) :: kernel_time
    integer(kind=lik)            :: kernel_flops
    integer(kind=ik), intent(in) :: nbw, max_blk_size
    complex(kind=ck)             :: bcast_buffer(nbw,max_blk_size)
    integer(kind=ik), intent(in) :: a_off
    integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
    integer(kind=ik), intent(in) :: last_stripe_width
    complex(kind=ck)             :: a(stripe_width,a_dim2,stripe_count)
#else
    integer(kind=ik), intent(in) :: max_threads
    complex(kind=ck)             :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
    integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL

    ! Private variables in OMP regions (my_thread) should better be in the argument list!
    integer(kind=ik)             :: off, ncols, istripe, j, nl
#ifdef WITH_OPENMP
    integer(kind=ik)             :: my_thread, noff
#endif
    real(kind=rk)                :: ttt

    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    !        Currently (on Sandy Bridge), single is faster than double
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    complex(kind=ck)             :: w(nbw,2)

    ! Start (not stop) the timer on entry; it is stopped on every exit path below.
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%start("compute_hh_trafo_complex_cpu")
#endif
#endif

    ! Give ttt a defined value even if no kernel branch below is taken; every
    ! kernel branch resets it immediately before its own loop.
    ttt = mpi_wtime()

    ! nl = number of rows of the stripe that are actually processed
#ifdef WITH_OPENMP
    if (istripe<stripe_count) then
      nl = stripe_width
    else
      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
      nl = min(my_thread*thread_width-noff, l_nev-noff)
      if (nl<=0) then
        ! Nothing to do for this thread. The timer object only exists when
        ! detailed timings are compiled in, so the call must be guarded.
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#endif
        return
      endif
    endif
#else
    nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif

    ! AVX kernel working on two transformation vectors at once
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ttt = mpi_wtime()
      do j = ncols, 2, -2
        w(:,1) = bcast_buffer(1:nbw,j+off)
        w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
        call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
                                                 w, nbw, nl, stripe_width, nbw)
#else
        call double_hh_trafo_complex_sse_avx_2hv(a(1,j+off+a_off-1,istripe), &
                                                 w, nbw, nl, stripe_width, nbw)
#endif
      enddo
      ! j==1 after the loop means ncols is odd: one vector is left over
#ifdef WITH_OPENMP
      if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe,my_thread), &
                                                         bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
      if (j==1) call single_hh_trafo_complex_sse_avx_1hv(a(1,1+off+a_off,istripe), &
                                                         bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL */

    ! Generic "simple" Fortran kernel
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ttt = mpi_wtime()
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe,my_thread), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */

    ! Generic Fortran kernel (also selected for the BGP and BGQ kernel ids)
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ttt = mpi_wtime()
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

    ! SSE kernel
#if defined(WITH_COMPLEX_SSE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ttt = mpi_wtime()
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex(a(1,j+off+a_off,istripe,my_thread), &
                                     bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex(a(1,j+off+a_off,istripe), &
                                     bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_KERNEL */

!#if defined(WITH_AVX_SANDYBRIDGE)
!              call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

!#if defined(WITH_AMD_BULLDOZER)
!              call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

    ! AVX kernel working on one transformation vector at a time
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      ttt = mpi_wtime()
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe,my_thread), &
                                                 bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_sse_avx_1hv(a(1,j+off+a_off,istripe), &
                                                 bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */

    ! Book-keeping: in the OpenMP build only thread 1 updates the counters
#ifdef WITH_OPENMP
    if (my_thread==1) then
#endif
      kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
      kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
    endif
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%stop("compute_hh_trafo_complex_cpu")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine compute_hh_trafo_complex_cpu_openmp
#else
  end subroutine compute_hh_trafo_complex_cpu
#endif

end module compute_hh_trafo_complex
<
module compute_hh_trafo_real
#include "config-f90.h"
implicit none
#ifdef WITH_OPENMP
public compute_hh_trafo_real_cpu_openmp
#else
public compute_hh_trafo_real_cpu
#endif
include 'mpif.h'
contains
#ifdef WITH_OPENMP
subroutine compute_hh_trafo_real_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
off, ncols, istripe, &
my_thread, THIS_REAL_ELPA_KERNEL)
#else
subroutine compute_hh_trafo_real_cpu (a, stripe_width,a_dim2,stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
off, ncols, istripe, last_stripe_width, &
THIS_REAL_ELPA_KERNEL)
#endif
use precision
use elpa2_utilities
use single_hh_trafo_real
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
use real_generic_simple_kernel, only : double_hh_trafo_generic_simple
#endif
!#if defined(WITH_REAL_GENERIC_KERNEL)
! use real_generic_kernel, only : double_hh_trafo_generic
!#endif
#if defined(WITH_REAL_BGP_KERNEL)
use real_bgp_kernel, only : double_hh_trafo_bgp
#endif
#if defined(WITH_REAL_BGQ_KERNEL)
use real_bgq_kernel, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
include "mpif.h"
real(kind=rk), intent(inout) :: kernel_time
integer(kind=lik) :: kernel_flops
integer(kind=ik), intent(in) :: nbw, max_blk_size
real(kind=rk) :: bcast_buffer(nbw,max_blk_size)
integer(kind=ik), intent(in) :: a_off
integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count
#ifndef WITH_OPENMP
integer(kind=ik), intent(in) :: last_stripe_width
real(kind=rk) :: a(stripe_width,a_dim2,stripe_count)
#else
integer(kind=ik), intent(in) :: max_threads
real(kind=rk) :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer(kind=ik) :: off, ncols, istripe
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, noff
#endif
integer(kind=ik) :: j, nl, jj, jjj
real(kind=rk) :: w(nbw,6), ttt
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call timer%start("compute_hh_trafo_real_cpu_openmp")
#else
call timer%start("compute_hh_trafo_real_cpu")
#endif
#endif
ttt = mpi_wtime()
#ifndef WITH_OPENMP
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
if (istripe<stripe_count) then
nl = stripe_width
else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif
return
endif
endif
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or. &
THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
!FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe,my_thread), w, &