Commit 53f2f2c6 authored by Andreas Marek's avatar Andreas Marek
Browse files

ELPA_2014.06 prepare release

Now it is possible
- to choose the kernel (real and complex independently) at run-time
  via environment variables, or
- to specify the kernel (real and complex independently) at runtime
  via specifying the kernel in the call to ELPA

This has a few implications
1) The ELPA 2014.06 release has a change in the API and is thus not
   binary compatible with previous versions
2) if no kernels are specified, a default kernel is chosen
3) if a wrong kernel is specified, a default kernel is chosen

For the sake of simplicity, it is still possible to build ELPA with
support for only one kernel, as in previous versions. However, such a
build is still not binary compatible with previous versions
parent c090a89f
...@@ -23,123 +23,120 @@ else ...@@ -23,123 +23,120 @@ else
libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90 libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90
endif endif
if WITH_GENERIC_SIMPLE if WITH_REAL_GENERIC_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
src/elpa2_kernels/elpa2_kernels_real_simple.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
src/elpa2_kernels/elpa2_kernels_real_simple.f90
endif endif
endif endif
if WITH_GENERIC if WITH_COMPLEX_GENERIC_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
src/elpa2_kernels/elpa2_kernels_real.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90
src/elpa2_kernels/elpa2_kernels_real.f90
endif endif
endif endif
if WITH_BGP if WITH_REAL_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
endif endif
endif endif
if WITH_BGQ if WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90
src/elpa2_kernels/elpa2_kernels_complex.f90
endif endif
endif endif
if WITH_SSE_AS if WITH_REAL_BGP_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90
endif endif
endif endif
if WITH_AVX_SANDYBRIDGE if WITH_REAL_BGQ_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif endif
endif endif
if WITH_AMD_BULLDOZER if WITH_REAL_SSE_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ endif
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp else
if WITH_COMPLEX_SSE_KERNEL
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
endif endif
endif endif
if WITH_AVX_COMPLEX_BLOCK1
if WITH_REAL_AVX_BLOCK2_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
if WITH_AVX_COMPLEX_BLOCK2 if WITH_REAL_AVX_BLOCK4_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif endif
endif endif
if WITH_AVX_REAL_BLOCK2 if WITH_REAL_AVX_BLOCK6_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
endif endif
endif endif
if WITH_AVX_REAL_BLOCK4 if WITH_COMPLEX_AVX_BLOCK1_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
if WITH_AVX_REAL_BLOCK6 if WITH_COMPLEX_AVX_BLOCK2_KERNEL
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \ libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
#if WITH_AVX_SANDYBRIDGE
#if WITH_OPENMP
# libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#else
# libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
#endif
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
else else
...@@ -156,12 +153,24 @@ files_DATA = \ ...@@ -156,12 +153,24 @@ files_DATA = \
test/read_real.F90 \ test/read_real.F90 \
test/read_real_gen.F90 \ test/read_real_gen.F90 \
test/test_complex2.F90 \ test/test_complex2.F90 \
test/test_complex2_default_kernel.F90 \
test/test_complex2_choose_kernel_with_api.F90 \
test/test_complex.F90 \ test/test_complex.F90 \
test/test_complex_gen.F90 \ test/test_complex_gen.F90 \
test/test_real2.F90 \ test/test_real2.F90 \
test/test_real2_default_kernel.F90 \
test/test_real2_choose_kernel_with_api.F90 \
test/print_available_elpa2_kernels.F90 \
test/test_real.F90 \ test/test_real.F90 \
test/test_real_gen.F90 test/test_real_gen.F90
# test programs
if WITH_OPENMP
build_lib = libelpa_mt.la
else
build_lib = libelpa.la
endif
# pkg-config stuff # pkg-config stuff
pkgconfigdir = $(libdir)/pkgconfig pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = elpa.pc pkgconfig_DATA = elpa.pc
...@@ -174,7 +183,8 @@ build_lib = libelpa.la ...@@ -174,7 +183,8 @@ build_lib = libelpa.la
endif endif
#bindir = $(abs_top_builddir) #bindir = $(abs_top_builddir)
bin_PROGRAMS = test_real test_real2 test_complex test_complex2 bin_PROGRAMS = test_real test_real2 test_real2_default_kernel test_complex test_complex2 test_complex2_default_kernel test_real2_choose_kernel_with_api test_complex2_choose_kernel_with_api print_available_elpa2_kernels
test_real_SOURCES = test/test_real.F90 test_real_SOURCES = test/test_real.F90
test_real_LDADD = $(build_lib) test_real_LDADD = $(build_lib)
...@@ -182,14 +192,28 @@ test_real_LDADD = $(build_lib) ...@@ -182,14 +192,28 @@ test_real_LDADD = $(build_lib)
test_real2_SOURCES = test/test_real2.F90 test_real2_SOURCES = test/test_real2.F90
test_real2_LDADD = $(build_lib) test_real2_LDADD = $(build_lib)
test_real2_default_kernel_SOURCES = test/test_real2_default_kernel.F90
test_real2_default_kernel_LDADD = $(build_lib)
test_real2_choose_kernel_with_api_SOURCES = test/test_real2_choose_kernel_with_api.F90
test_real2_choose_kernel_with_api_LDADD = $(build_lib)
test_complex_SOURCES = test/test_complex.F90 test_complex_SOURCES = test/test_complex.F90
test_complex_LDADD = $(build_lib) test_complex_LDADD = $(build_lib)
test_complex2_SOURCES = test/test_complex2.F90 test_complex2_SOURCES = test/test_complex2.F90
test_complex2_LDADD = $(build_lib) test_complex2_LDADD = $(build_lib)
test_complex2_default_kernel_SOURCES = test/test_complex2_default_kernel.F90
test_complex2_default_kernel_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh test_complex2_choose_kernel_with_api_SOURCES = test/test_complex2_choose_kernel_with_api.F90
test_complex2_choose_kernel_with_api_LDADD = $(build_lib)
print_available_elpa2_kernels_SOURCES = test/print_available_elpa2_kernels.F90
print_available_elpa2_kernels_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels.sh
TESTS = $(check_SCRIPTS) TESTS = $(check_SCRIPTS)
test_real.sh: test_real.sh:
...@@ -200,6 +224,14 @@ test_real2.sh: ...@@ -200,6 +224,14 @@ test_real2.sh:
echo "mpiexec -n 2 ./test_real2 > /dev/null 2>&1" > test_real2.sh echo "mpiexec -n 2 ./test_real2 > /dev/null 2>&1" > test_real2.sh
chmod +x test_real2.sh chmod +x test_real2.sh
test_real2_default_kernel.sh:
echo "mpiexec -n 2 ./test_real2_default_kernel > /dev/null 2>&1" > test_real2_default_kernel.sh
chmod +x test_real2_default_kernel.sh
test_real2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_real2_choose_kernel_with_api > /dev/null 2>&1" > test_real2_choose_kernel_with_api.sh
chmod +x test_real2_choose_kernel_with_api.sh
test_complex.sh: test_complex.sh:
echo "mpiexec -n 2 ./test_complex > /dev/null 2>&1" > test_complex.sh echo "mpiexec -n 2 ./test_complex > /dev/null 2>&1" > test_complex.sh
chmod +x test_complex.sh chmod +x test_complex.sh
...@@ -207,8 +239,25 @@ test_complex.sh: ...@@ -207,8 +239,25 @@ test_complex.sh:
test_complex2.sh: test_complex2.sh:
echo "mpiexec -n 2 ./test_complex2 > /dev/null 2>&1" > test_complex2.sh echo "mpiexec -n 2 ./test_complex2 > /dev/null 2>&1" > test_complex2.sh
chmod +x test_complex2.sh chmod +x test_complex2.sh
test_complex2_default_kernel.sh:
echo "mpiexec -n 2 ./test_complex2_default_kernel > /dev/null 2>&1" > test_complex2_default_kernel.sh
chmod +x test_complex2_default_kernel.sh
test_complex2_choose_kernel_with_api.sh:
echo "mpiexec -n 2 ./test_complex2_choose_kernel_with_api > /dev/null 2>&1" > test_complex2_choose_kernel_with_api.sh
chmod +x test_complex2_choose_kernel_with_api.sh
print_available_elpa2_kernels.sh:
echo "./print_available_elpa2_kernels" > print_available_elpa2_kernels.sh
chmod +x print_available_elpa2_kernels.sh
elpa2.i: $(top_srcdir)/src/elpa2.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa2.F90 -o $@
elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = test_real.sh test_real2.sh test_complex.sh test_complex2.sh CLEANFILES = test_real.sh test_real2.sh test_real2_default_kernel.sh test_complex.sh test_complex2.sh test_complex2_default_kernel.sh test_real2_choose_kernel_with_api.sh test_complex2_choose_kernel_with_api.sh print_available_elpa2_kernels
@FORTRAN_MODULE_DEPS@ @FORTRAN_MODULE_DEPS@
This diff is collapsed.
...@@ -3,6 +3,9 @@ ...@@ -3,6 +3,9 @@
/* Define to 1 if you have the <dlfcn.h> header file. */ /* Define to 1 if you have the <dlfcn.h> header file. */
#undef HAVE_DLFCN_H #undef HAVE_DLFCN_H
/* Fortran can query environment variables */
#undef HAVE_ENVIRONMENT_CHECKING
/* Define to 1 if you have the <inttypes.h> header file. */ /* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H #undef HAVE_INTTYPES_H
...@@ -67,45 +70,47 @@ ...@@ -67,45 +70,47 @@
/* Version number of package */ /* Version number of package */
#undef VERSION #undef VERSION
/* use kernel tuned for AVX on AMD Bulldozer (written in gcc assembler) */ /* can use complex_avx_block1 kernel */
#undef WITH_AMD_BULLDOZER #undef WITH_COMPLEX_AVX_BLOCK1_KERNEL
/* use AVX optimized complex kernel with blocking 1 (written in gcc assembler) /* can use complex_avx_block2 kernel */
*/ #undef WITH_COMPLEX_AVX_BLOCK2_KERNEL
#undef WITH_AVX_COMPLEX_BLOCK1
/* use AVX optimized complex kernel with blocking 2 (written in gcc assembler) /* can use complex generic kernel */
*/ #undef WITH_COMPLEX_GENERIC_KERNEL
#undef WITH_AVX_COMPLEX_BLOCK2
/* use AVX optimized real kernel with blocking 2 (written in gcc assembler) */ /* can use complex generic-simple kernel */
#undef WITH_AVX_REAL_BLOCK2 #undef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
/* use AVX optimized real kernel with blocking 4 (written in gcc assembler) */ /* can use complex SSE kernel */
#undef WITH_AVX_REAL_BLOCK4 #undef WITH_COMPLEX_SSE_KERNEL
/* use AVX optimized real kernel with blocking 6 (written in gcc assembler) */ /* use OpenMP threading */
#undef WITH_AVX_REAL_BLOCK6 #undef WITH_OPENMP
/* use kernel tuned for AVX on Intel Sandybridge (written in gcc assembler) */ /* can use real_avx_block2 kernel */
#undef WITH_AVX_SANDYBRIDGE #undef WITH_REAL_AVX_BLOCK2_KERNEL
/* use optimized kernel for IBM BG/P */ /* can use real_avx_block4 kernel */
#undef WITH_BGP #undef WITH_REAL_AVX_BLOCK4_KERNEL
/* use optimized kernel for IBM BG/Q */ /* can use real_avx_block6 kernel */
#undef WITH_BGQ #undef WITH_REAL_AVX_BLOCK6_KERNEL
/* use generic kernel for all architectures (with some hand-coded /* can use real BGP kernel */
optimizations) */ #undef WITH_REAL_BGP_KERNEL
#undef WITH_GENERIC
/* use generic simple kernel for all architectures (without any hand-coded /* can use real BGQ kernel */
optimizations) */ #undef WITH_REAL_BGQ_KERNEL
#undef WITH_GENERIC_SIMPLE
/* use OpenMP threading */ /* can use real generic kernel */
#undef WITH_OPENMP #undef WITH_REAL_GENERIC_KERNEL
/* can use real generic-simple kernel */
#undef WITH_REAL_GENERIC_SIMPLE_KERNEL
/* can use real SSE kernel */
#undef WITH_REAL_SSE_KERNEL
/* use kernel tuned for SSE (written in gcc assembler) */ /* use specific complex kernel */
#undef WITH_SSE_AS #undef WITH_SPECIFIC_COMPLEX_KERNEL
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -52,8 +52,12 @@ ...@@ -52,8 +52,12 @@
! distributed along with the original code in the file "COPYING". ! distributed along with the original code in the file "COPYING".
! !
! -------------------------------------------------------------------------------------------------- ! --------------------------------------------------------------------------------------------------
module complex_generic_kernel
subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) private
public single_hh_trafo_complex_generic
contains
subroutine single_hh_trafo_complex_generic(q, hh, nb, nq, ldq)
implicit none implicit none
...@@ -83,11 +87,11 @@ subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq) ...@@ -83,11 +87,11 @@ subroutine single_hh_trafo_complex(q, hh, nb, nq, ldq)
call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq) call hh_trafo_complex_kernel_4(q(i,1),hh, nb, ldq)
endif endif
end end subroutine single_hh_trafo_complex_generic
! -------------------------------------------------------------------------------------------------- ! --------------------------------------------------------------------------------------------------
subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh) subroutine double_hh_trafo_complex_generic(q, hh, nb, nq, ldq, ldh)
implicit none implicit none
...@@ -128,11 +132,11 @@ subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh) ...@@ -128,11 +132,11 @@ subroutine double_hh_trafo_complex(q, hh, nb, nq, ldq, ldh)
!else if(nq-i+1 > 0) then !else if(nq-i+1 > 0) then
! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s) ! call hh_trafo_complex_kernel_4_2hv(q(i,1),hh, nb, ldq, ldh, s)
!endif !endif
end end subroutine double_hh_trafo_complex_generic
! -------------------------------------------------------------------------------------------------- ! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
implicit none implicit none
...@@ -158,7 +162,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) ...@@ -158,7 +162,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
xb = q(11,1) xb = q(11,1)
xc = q(12,1) xc = q(12,1)
!DEC$ VECTOR ALIGNED !DEC$ VECTOR ALIGNED
do i=2,nb do i=2,nb
h1 = conjg(hh(i)) h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1 x1 = x1 + q(1,i)*h1
...@@ -204,7 +208,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) ...@@ -204,7 +208,7 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
q(11,1) = q(11,1) + xb q(11,1) = q(11,1) + xb
q(12,1) = q(12,1) + xc q(12,1) = q(12,1) + xc
!DEC$ VECTOR ALIGNED !DEC$ VECTOR ALIGNED
do i=2,nb do i=2,nb
h1 = hh(i) h1 = hh(i)
q(1,i) = q(1,i) + x1*h1 q(1,i) = q(1,i) + x1*h1
...@@ -221,11 +225,11 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq) ...@@ -221,11 +225,11 @@ subroutine hh_trafo_complex_kernel_12(q, hh, nb, ldq)
q(12,i) = q(12,i) + xc*h1 q(12,i) = q(12,i) + xc*h1
enddo enddo
end end subroutine hh_trafo_complex_kernel_12
! -------------------------------------------------------------------------------------------------- ! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
implicit none implicit none
...@@ -247,7 +251,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) ...@@ -247,7 +251,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
x7 = q(7,1) x7 = q(7,1)
x8 = q(8,1) x8 = q(8,1)
!DEC$ VECTOR ALIGNED !DEC$ VECTOR ALIGNED
do i=2,nb do i=2,nb
h1 = conjg(hh(i)) h1 = conjg(hh(i))
x1 = x1 + q(1,i)*h1 x1 = x1 + q(1,i)*h1
...@@ -281,7 +285,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) ...@@ -281,7 +285,7 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
q(7,1) = q(7,1) + x7 q(7,1) = q(7,1) + x7
q(8,1) = q(8,1) + x8 q(8,1) = q(8,1) + x8
!DEC$ VECTOR ALIGNED !DEC$ VECTOR ALIGNED
do i=2,nb do i=2,nb
h1 = hh(i) h1 = hh(i)
q(1,i) = q(1,i) + x1*h1 q(1,i) = q(1,i) + x1*h1
...@@ -294,11 +298,11 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq) ...@@ -294,11 +298,11 @@ subroutine hh_trafo_complex_kernel_8(q, hh, nb, ldq)
q(8,i) = q(8,i) + x8*h1 q(8,i) = q(8,i) + x8*h1
enddo enddo
end end subroutine hh_trafo_complex_kernel_8
! -------------------------------------------------------------------------------------------------- ! --------------------------------------------------------------------------------------------------
subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq) subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
implicit none implicit none
...@@ -316,7 +320,7 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq) ...@@ -316,7 +320,7 @@ subroutine hh_trafo_complex_kernel_4(q, hh, nb, ldq)
x3 = q(3,1) x3 = q(3,1)