Commit f0c7cb0d authored by Andreas Marek

ELPA_2013.11.006 bugfix for kernels real blocking 6 and 4

Due to an error in a preprocessor statement, the results for
real matrices were wrong if the kernels "avx-real-block6" or
"avx-real-block4" were chosen. No other kernels are affected.

The test programs always correctly reported that the results for
these kernels were wrong.
parent 191ad3a5
......@@ -120,17 +120,23 @@ endif
if WITH_AVX_REAL_BLOCK4
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif
endif
if WITH_AVX_REAL_BLOCK6
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif
endif
......
......@@ -131,10 +131,20 @@ host_triplet = @host@
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__append_20 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__append_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__append_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__append_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__append_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__append_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__append_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
bin_PROGRAMS = test_real$(EXEEXT) test_real2$(EXEEXT) \
test_complex$(EXEEXT) test_complex2$(EXEEXT)
subdir = .
......@@ -146,9 +156,9 @@ DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
ltmain.sh
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_gnu_make.m4 \
$(top_srcdir)/m4/libtool.m4 $(top_srcdir)/m4/ltoptions.m4 \
$(top_srcdir)/m4/ltsugar.m4 $(top_srcdir)/m4/ltversion.m4 \
$(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/m4/ax_elpa_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/fdep/fortran_dependencies.m4 \
$(top_srcdir)/m4/ax_prog_fc_mpi.m4 $(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
......@@ -223,8 +233,11 @@ am__dirstamp = $(am__leading_dot)dirstamp
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_OPENMP_FALSE@am_libelpa_la_OBJECTS = src/elpa1.lo src/elpa2.lo \
@WITH_OPENMP_FALSE@ $(am__objects_1) $(am__objects_2) \
@WITH_OPENMP_FALSE@ $(am__objects_3) $(am__objects_4) \
......@@ -273,8 +286,11 @@ am__libelpa_mt_la_SOURCES_DIST = src/elpa1.F90 src/elpa2.F90 \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__objects_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__objects_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__objects_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__objects_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_OPENMP_TRUE@am_libelpa_mt_la_OBJECTS = src/elpa1.lo src/elpa2.lo \
@WITH_OPENMP_TRUE@ $(am__objects_13) $(am__objects_14) \
@WITH_OPENMP_TRUE@ $(am__objects_15) $(am__objects_16) \
......@@ -679,6 +695,7 @@ NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OPENMP_FCFLAGS = @OPENMP_FCFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
......
......@@ -1149,6 +1149,7 @@ AC_SUBST([am__untar])
]) # _AM_PROG_TAR
m4_include([m4/ax_check_gnu_make.m4])
m4_include([m4/ax_elpa_openmp.m4])
m4_include([m4/libtool.m4])
m4_include([m4/ltoptions.m4])
m4_include([m4/ltsugar.m4])
......
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for elpa 2013.11.005.
# Generated by GNU Autoconf 2.69 for elpa 2013.11.006.
#
# Report bugs to <elpa-library@rzg.mpg.de>.
#
......@@ -590,8 +590,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='elpa'
PACKAGE_TARNAME='elpa'
PACKAGE_VERSION='2013.11.005'
PACKAGE_STRING='elpa 2013.11.005'
PACKAGE_VERSION='2013.11.006'
PACKAGE_STRING='elpa 2013.11.006'
PACKAGE_BUGREPORT='elpa-library@rzg.mpg.de'
PACKAGE_URL=''
......@@ -672,6 +672,7 @@ build_vendor
build_cpu
build
LIBTOOL
OPENMP_FCFLAGS
WITH_OPENMP_FALSE
WITH_OPENMP_TRUE
FC_MODOUT
......@@ -823,6 +824,7 @@ with_avx_real_block4
with_avx_real_block6
with_avx_optimization
with_openmp
enable_openmp
enable_shared
enable_static
with_pic
......@@ -1388,7 +1390,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures elpa 2013.11.005 to adapt to many kinds of systems.
\`configure' configures elpa 2013.11.006 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
......@@ -1458,7 +1460,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of elpa 2013.11.005:";;
short | recursive ) echo "Configuration of elpa 2013.11.006:";;
esac
cat <<\_ACEOF
......@@ -1472,6 +1474,7 @@ Optional Features:
do not reject slow dependency extractors
--disable-dependency-tracking
speeds up one-time build
--disable-openmp do not use OpenMP
--enable-shared[=PKGS] build shared libraries [default=yes]
--enable-static[=PKGS] build static libraries [default=yes]
--enable-fast-install[=PKGS]
......@@ -1596,7 +1599,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
elpa configure 2013.11.005
elpa configure 2013.11.006
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
......@@ -2079,7 +2082,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by elpa $as_me 2013.11.005, which was
It was created by elpa $as_me 2013.11.006, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
......@@ -2944,7 +2947,7 @@ fi
# Define the identity of the package.
PACKAGE='elpa'
VERSION='2013.11.005'
VERSION='2013.11.006'
cat >>confdefs.h <<_ACEOF
......@@ -5911,7 +5914,7 @@ fi
ELPA_LIB_VERSION=2013.11.005
ELPA_LIB_VERSION=2013.11.006
# this is the version of the API, should be changed in the major revision
# if and only if the actual API changes
......@@ -6144,7 +6147,61 @@ fi
$as_echo "#define WITH_OPENMP 1" >>confdefs.h
AX_ELPA_OPENMP
# ---------------------------------------------------------------------------
# Detect the Fortran compiler flag that enables OpenMP.
#
# Result: OPENMP_FCFLAGS is left empty when OpenMP needs no flag, when no
# working flag was found, or when the user passed --disable-openmp; otherwise
# it holds the first candidate flag with which the test program links.
# The outcome is cached in ac_cv_prog_fc_openmp.
#
# NOTE(review): this is generated configure code; the same wording fix
# ($CC -> $FC in the "checking" message) presumably has to be made in the
# macro source (m4/ax_elpa_openmp.m4) so that it survives regeneration.
# ---------------------------------------------------------------------------
OPENMP_FCFLAGS=
# Check whether --enable-openmp was given.
if test "${enable_openmp+set}" = set; then :
  enableval=$enable_openmp;
fi
if test "$enable_openmp" != no; then
  # The probe below links with the Fortran compiler, so report $FC (the
  # message previously named $CC, which is not the compiler being tested).
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $FC option to support OpenMP" >&5
$as_echo_n "checking for $FC option to support OpenMP... " >&6; }
  if ${ac_cv_prog_fc_openmp+:} false; then :
    # A cached answer exists; skip the link tests.
    $as_echo_n "(cached) " >&6
  else
    # First attempt: link an OpenMP call without any extra flag.
    cat > conftest.$ac_ext <<_ACEOF
program main
call omp_get_num_threads
end
_ACEOF
    if ac_fn_fc_try_link "$LINENO"; then :
      ac_cv_prog_fc_openmp='none needed'
    else
      # Otherwise try each candidate flag in turn and keep the first that links.
      ac_cv_prog_fc_openmp='unsupported'
      for ac_option in -openmp -fopenmp -xopenmp -mp -omp -qsmp=omp; do
        ac_save_FCFLAGS=$FCFLAGS
        FCFLAGS="$FCFLAGS $ac_option"
        cat > conftest.$ac_ext <<_ACEOF
program main
call omp_get_num_threads
end
_ACEOF
        if ac_fn_fc_try_link "$LINENO"; then :
          ac_cv_prog_fc_openmp=$ac_option
        fi
        rm -f core conftest.err conftest.$ac_objext \
          conftest$ac_exeext conftest.$ac_ext
        # Restore FCFLAGS; the chosen flag is exported via OPENMP_FCFLAGS only.
        FCFLAGS=$ac_save_FCFLAGS
        if test "$ac_cv_prog_fc_openmp" != unsupported; then
          break
        fi
      done
    fi
    rm -f core conftest.err conftest.$ac_objext \
      conftest$ac_exeext conftest.$ac_ext
  fi
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5
$as_echo "$ac_cv_prog_fc_openmp" >&6; }
  # Only a real flag is propagated; the two sentinel values leave
  # OPENMP_FCFLAGS empty.
  case $ac_cv_prog_fc_openmp in #(
    "none needed" | unsupported)
      ;; #(
    *)
      OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;;
  esac
fi
fi
FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
......@@ -20726,7 +20783,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by elpa $as_me 2013.11.005, which was
This file was extended by elpa $as_me 2013.11.006, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
......@@ -20792,7 +20849,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
elpa config.status 2013.11.005
elpa config.status 2013.11.006
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
......
AC_PREREQ([2.69])
AC_INIT([elpa],[2013.11.005], elpa-library@rzg.mpg.de)
AC_INIT([elpa],[2013.11.006], elpa-library@rzg.mpg.de)
AC_CONFIG_SRCDIR([src/elpa1.F90])
AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
......@@ -196,7 +196,7 @@ AX_PROG_FC_MPI([],[have_mpi=yes],[have_mpi=no
fi])
AC_SUBST([ELPA_LIB_VERSION], [2013.11.005])
AC_SUBST([ELPA_LIB_VERSION], [2013.11.006])
# this is the version of the API, should be changed in the major revision
# if and only if the actual API changes
AC_SUBST([ELPA_SO_VERSION], [0:0:0])
......
......@@ -2431,7 +2431,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
#endif
#endif
#if (defined(WITH_AVX_REAL_BLOCK4) && defined(WITH_AVX_REAL_BLOCK2)) || defined(WITH_AMD_BULLDOZER)
#if defined(WITH_AVX_REAL_BLOCK4) || defined(WITH_AMD_BULLDOZER)
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4
......@@ -2470,7 +2470,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
#endif
#if (defined(WITH_AVX_REAL_BLOCK6) && defined(WITH_AVX_REAL_BLOCK4) && defined(WITH_AVX_REAL_BLOCK2))
#if defined(WITH_AVX_REAL_BLOCK6)
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
......
......@@ -112,12 +112,16 @@ Several
CFLAGS and CXXFLAGS automatically.
On Intel Sandybridge architectures the
configure option "--with-intel-sandybride"
use the best combination.
configure option "--with-avx-sandybridge"
uses the best combination, which is a
combination of block2 for real matrices
and block1 for complex matrices.
On AMD Bulldozer architectures the
configure option "--with-amd-bulldozer"
use the best combination.
uses the best combination, which is a
combination of block4 for real matrices
and block1 for complex matrices.
Otherwise, you can try out your own
combinations with the configure options
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment