Commit 53f2f2c6 authored by Andreas Marek's avatar Andreas Marek
Browse files

ELPA_2014.06 prepare release

Now it is possible
- to choose the kernel (real and complex independently) at run-time
  via environment variables, or
- to specify the kernel (real and complex independently) at runtime
  by specifying the kernel in the call to ELPA

This has a few implications
1) The ELPA 2014.06 release has a change in the API and is thus not
   binary compatible with previous versions
2) if no kernel is specified, a default kernel is chosen
3) if a wrong kernel is specified, a default kernel is chosen

For the sake of simplicity it is still possible to build ELPA with
support for only one kernel, as in previous versions. However, such a
build is still not binary compatible with previous versions
parent c090a89f
......@@ -23,123 +23,120 @@ else
libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90
endif
if WITH_GENERIC_SIMPLE
if WITH_REAL_GENERIC_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif
endif
if WITH_GENERIC
if WITH_COMPLEX_GENERIC_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
endif
endif
if WITH_BGP
if WITH_REAL_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
endif
endif
if WITH_BGQ
if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
endif
endif
if WITH_SSE_AS
if WITH_REAL_BGP_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
endif
endif
if WITH_AVX_SANDYBRIDGE
if WITH_REAL_BGQ_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
endif
endif
if WITH_AMD_BULLDOZER
if WITH_REAL_SSE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
else
if WITH_COMPLEX_SSE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
endif
endif
if WITH_AVX_COMPLEX_BLOCK1
if WITH_REAL_AVX_BLOCK2_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif
endif
if WITH_AVX_COMPLEX_BLOCK2
if WITH_REAL_AVX_BLOCK4_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
endif
endif
if WITH_AVX_REAL_BLOCK2
if WITH_REAL_AVX_BLOCK6_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
endif
endif
if WITH_AVX_REAL_BLOCK4
if WITH_COMPLEX_AVX_BLOCK1_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif
if WITH_AVX_REAL_BLOCK6
if WITH_COMPLEX_AVX_BLOCK2_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif
endif
#if WITH_AVX_SANDYBRIDGE
#if WITH_OPENMP
# libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#else
# libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
#endif
if WITH_OPENMP
libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
else
......@@ -156,12 +153,24 @@ files_DATA = \
test/read_real.F90 \
test/read_real_gen.F90 \
test/test_complex2.F90 \
test/test_complex2_default_kernel.F90 \
test/test_complex2_choose_kernel_with_api.F90 \
test/test_complex.F90 \
test/test_complex_gen.F90 \
test/test_real2.F90 \
test/test_real2_default_kernel.F90 \
test/test_real2_choose_kernel_with_api.F90 \
test/print_available_elpa2_kernels.F90 \
test/test_real.F90 \
test/test_real_gen.F90
# test programs
if WITH_OPENMP
build_lib = libelpa_mt.la
else
build_lib = libelpa.la
endif
# pkg-config stuff
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = elpa.pc
......@@ -174,7 +183,8 @@ build_lib = libelpa.la
endif
#bindir = $(abs_top_builddir)
bin_PROGRAMS = test_real test_real2 test_complex test_complex2
bin_PROGRAMS = test_real test_real2 test_real2_default_kernel test_complex test_complex2 test_complex2_default_kernel test_real2_choose_kernel_with_api test_complex2_choose_kernel_with_api print_available_elpa2_kernels
test_real_SOURCES = test/test_real.F90
test_real_LDADD = $(build_lib)
......@@ -182,14 +192,28 @@ test_real_LDADD = $(build_lib)
test_real2_SOURCES = test/test_real2.F90
test_real2_LDADD = $(build_lib)
test_real2_default_kernel_SOURCES = test/test_real2_default_kernel.F90
test_real2_default_kernel_LDADD = $(build_lib)
test_real2_choose_kernel_with_api_SOURCES = test/test_real2_choose_kernel_with_api.F90
test_real2_choose_kernel_with_api_LDADD = $(build_lib)
test_complex_SOURCES = test/test_complex.F90
test_complex_LDADD = $(build_lib)
test_complex2_SOURCES = test/test_complex2.F90
test_complex2_LDADD = $(build_lib)
test_complex2_default_kernel_SOURCES = test/test_complex2_default_kernel.F90
test_complex2_default_kernel_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh
test_complex2_choose_kernel_with_api_SOURCES = test/test_complex2_choose_kernel_with_api.F90
test_complex2_choose_kernel_with_api_LDADD = $(build_lib)
print_available_elpa2_kernels_SOURCES = test/print_available_elpa2_kernels.F90
print_available_elpa2_kernels_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels.sh
TESTS = $(check_SCRIPTS)
test_real.sh:
......@@ -200,6 +224,14 @@ test_real2.sh:
echo "mpiexec -n 2 ./test_real2 > /dev/null 2>&1" > test_real2.sh
chmod +x test_real2.sh
test_real2_default_kernel.sh:
echo "mpiexec -n 2 ./test_real2_default_kernel > /dev/null 2>&1" > test_real2_default_kernel.sh
chmod +x test_real2_default_kernel.sh
test_real2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_real2_choose_kernel_with_api > /dev/null 2>&1" > test_real2_choose_kernel_with_api.sh
chmod +x test_real2_choose_kernel_with_api.sh
test_complex.sh:
echo "mpiexec -n 2 ./test_complex > /dev/null 2>&1" > test_complex.sh
chmod +x test_complex.sh
......@@ -207,8 +239,25 @@ test_complex.sh:
test_complex2.sh:
echo "mpiexec -n 2 ./test_complex2 > /dev/null 2>&1" > test_complex2.sh
chmod +x test_complex2.sh
test_complex2_default_kernel.sh:
echo "mpiexec -n 2 ./test_complex2_default_kernel > /dev/null 2>&1" > test_complex2_default_kernel.sh
chmod +x test_complex2_default_kernel.sh
test_complex2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_complex2_choose_kernel_with_api > /dev/null 2>&1" > test_complex2_choose_kernel_with_api.sh
chmod +x test_complex2_choose_kernel_with_api.sh
print_available_elpa2_kernels.sh:
echo "./print_available_elpa2_kernels" > print_available_elpa2_kernels.sh
chmod +x print_available_elpa2_kernels.sh
elpa2.i: $(top_srcdir)/src/elpa2.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = test_real.sh test_real2.sh test_complex.sh test_complex2.sh
# Generated test-wrapper scripts (the same files listed in check_SCRIPTS); removed by `make clean`.
CLEANFILES = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels.sh
@FORTRAN_MODULE_DEPS@
This diff is collapsed.
......@@ -3,6 +3,9 @@
/* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H
/* Fortran can query environment variables */
#undef HAVE_ENVIRONMENT_CHECKING
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
......@@ -67,45 +70,47 @@
/* Version number of package */
#undef VERSION
/* use kernel tuned for AVX on AMD Bulldozer (written in gcc assembler) */
#undef WITH_AMD_BULLDOZER
/* can use complex_avx_block1 kernel */
#undef WITH_COMPLEX_AVX_BLOCK1_KERNEL
/* use AVX optimized complex kernel with blocking 1 (written in gcc assembler)
*/
#undef WITH_AVX_COMPLEX_BLOCK1
/* can use complex_avx_block2 kernel */
#undef WITH_COMPLEX_AVX_BLOCK2_KERNEL
/* use AVX optimized complex kernel with blocking 2 (written in gcc assembler)
*/
#undef WITH_AVX_COMPLEX_BLOCK2
/* can use complex generic kernel */
#undef WITH_COMPLEX_GENERIC_KERNEL
/* use AVX optimized real kernel with blocking 2 (written in gcc assembler) */
#undef WITH_AVX_REAL_BLOCK2
/* can use complex generic-simple kernel */
#undef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
/* use AVX optimized real kernel with blocking 4 (written in gcc assembler) */
#undef WITH_AVX_REAL_BLOCK4
/* can use complex SSE kernel */
#undef WITH_COMPLEX_SSE_KERNEL
/* use AVX optimized real kernel with blocking 6 (written in gcc assembler) */
#undef WITH_AVX_REAL_BLOCK6
/* use OpenMP threading */
#undef WITH_OPENMP
/* use kernel tuned for AVX on Intel Sandybridge (written in gcc assembler) */
#undef WITH_AVX_SANDYBRIDGE
/* can use real_avx_block2 kernel */
#undef WITH_REAL_AVX_BLOCK2_KERNEL
/* use optimized kernel for IBM BG/P */
#undef WITH_BGP
/* can use real_avx_block4 kernel */
#undef WITH_REAL_AVX_BLOCK4_KERNEL
/* use optimized kernel for IBM BG/Q */
#undef WITH_BGQ
/* can use real_avx_block6 kernel */
#undef WITH_REAL_AVX_BLOCK6_KERNEL
/* use generic kernel for all architectures (with some hand-coded
optimizations) */
#undef WITH_GENERIC
/* can use real BGP kernel */
#undef WITH_REAL_BGP_KERNEL
/* use generic simple kernel for all architectures (without any hand-coded
optimizations) */
#undef WITH_GENERIC_SIMPLE
/* can use real BGQ kernel */
#undef WITH_REAL_BGQ_KERNEL
/* use OpenMP threading */
#undef WITH_OPENMP
/* can use real generic kernel */
#undef WITH_REAL_GENERIC_KERNEL
/* can use real generic-simple kernel */
#undef WITH_REAL_GENERIC_SIMPLE_KERNEL
/* can use real SSE kernel */
#undef WITH_REAL_SSE_KERNEL
/* use kernel tuned for SSE (written in gcc assembler) */
#undef WITH_SSE_AS
/* use specific complex kernel */
#undef WITH_SPECIFIC_COMPLEX_KERNEL
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -52,8 +52,12 @@
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
module complex_generic_kernel
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
private
public single_hh_trafo_complex_generic
contains
subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq)
implicit none
......@@ -83,11 +87,11 @@ subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif
end
end subroutine single_hh_trafo_complex_generic
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)
subroutine double_hh_trafo_complex_generic(q, hh, nb, nq, ldq, ldh)
implicit none
......@@ -128,11 +132,11 @@ subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)
!else if(nq-i+1 > 0) then
! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
!endif
end
end subroutine double_hh_trafo_complex_generic
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none
......@@ -158,7 +162,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
xb = q(11,1)
xc = q(12,1)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
......@@ -204,7 +208,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
......@@ -221,11 +225,11 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
q(12,i) = q(12,i) + xc*h1
enddo
end
end subroutine hh_trafo_complex_kernel_12
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none
......@@ -247,7 +251,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
x7 = q(7,1)
x8 = q(8,1)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
......@@ -281,7 +285,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
......@@ -294,11 +298,11 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
q(8,i) = q(8,i) + x8*h1
enddo
end
end subroutine hh_trafo_complex_kernel_8
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
implicit none
......@@ -316,7 +320,7 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
x3 = q(3,1)
x4 = q(4,1)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1
......@@ -338,7 +342,7 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
q(3,1) = q(3,1) + x3
q(4,1) = q(4,1) + x4
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=2,nb
h1 = hh(i)
q(1,i) = q(1,i) + x1*h1
......@@ -347,11 +351,11 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
q(4,i) = q(4,i) + x4*h1
enddo
end
end subroutine hh_trafo_complex_kernel_4
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
implicit none
......@@ -374,7 +378,7 @@ subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
y3 = q(3,1) + q(3,2)*conjg(hh(2,2))
y4 = q(4,1) + q(4,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
......@@ -418,7 +422,7 @@ subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
q(3,2) = q(3,2) + x3 + y3*hh(2,2)
q(4,2) = q(4,2) + x4 + y4*hh(2,2)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
......@@ -433,11 +437,11 @@ subroutine hh_trafo_complex_kernel_4_2hv(q, hh, nb, ldq, ldh, s)
q(3,nb+1) = q(3,nb+1) + x3*hh(nb,1)
q(4,nb+1) = q(4,nb+1) + x4*hh(nb,1)
end
end subroutine hh_trafo_complex_kernel_4_2hv
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
implicit none
......@@ -468,7 +472,7 @@ subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
y7 = q(7,1) + q(7,2)*conjg(hh(2,2))
y8 = q(8,1) + q(8,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
......@@ -541,7 +545,7 @@ subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
q(7,2) = q(7,2) + x7 + y7*hh(2,2)
q(8,2) = q(8,2) + x8 + y8*hh(2,2)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
......@@ -564,11 +568,11 @@ subroutine hh_trafo_complex_kernel_8_2hv(q, hh, nb, ldq, ldh, s)
q(7,nb+1) = q(7,nb+1) + x7*hh(nb,1)
q(8,nb+1) = q(8,nb+1) + x8*hh(nb,1)
end
end subroutine hh_trafo_complex_kernel_8_2hv
! --------------------------------------------------------------------------------------------------
! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
implicit none
......@@ -607,7 +611,7 @@ subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
y11 = q(11,1) + q(11,2)*conjg(hh(2,2))
y12 = q(12,1) + q(12,2)*conjg(hh(2,2))
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
......@@ -707,7 +711,7 @@ subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
q(11,2) = q(11,2) + x11 + y11*hh(2,2)
q(12,2) = q(12,2) + x12 + y12*hh(2,2)
!DEC$ VECTOR ALIGNED
!DEC$ VECTOR ALIGNED
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
......@@ -738,6 +742,6 @@ subroutine hh_trafo_complex_kernel_12_2hv(q, hh, nb, ldq, ldh, s)
q(11,nb+1) = q(11,nb+1) + x11*hh(nb,1)
q(12,nb+1) = q(12,nb+1) + x12*hh(nb,1)
end
end subroutine hh_trafo_complex_kernel_12_2hv
end module complex_generic_kernel
! --------------------------------------------------------------------------------------------------