Commit be241bd8 authored by Andreas Marek

ELPA_development_version_qr: bugfix for kernels real blocking 6 and 4

Due to an error in a preprocessor statement, the results for
real matrices were wrong if the kernels "avx-real-block6" or
"avx-real-block4" were chosen. No other kernels are affected.

The test programs always correctly reported that the results for
these kernels were wrong.
parent f0c7cb0d
...@@ -131,17 +131,23 @@ endif ...@@ -131,17 +131,23 @@ endif
if WITH_AVX_REAL_BLOCK4 if WITH_AVX_REAL_BLOCK4
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
if WITH_AVX_REAL_BLOCK6 if WITH_AVX_REAL_BLOCK6
if WITH_OPENMP if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif endif
......
...@@ -134,10 +134,20 @@ host_triplet = @host@ ...@@ -134,10 +134,20 @@ host_triplet = @host@
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c @WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__append_20 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c @WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__append_20 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__append_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__append_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__append_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__append_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__append_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__append_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
bin_PROGRAMS = test_real$(EXEEXT) test_real2$(EXEEXT) \ bin_PROGRAMS = test_real$(EXEEXT) test_real2$(EXEEXT) \
test_complex$(EXEEXT) test_complex2$(EXEEXT) test_complex$(EXEEXT) test_complex2$(EXEEXT)
subdir = . subdir = .
...@@ -228,8 +238,11 @@ am__dirstamp = $(am__leading_dot)dirstamp ...@@ -228,8 +238,11 @@ am__dirstamp = $(am__leading_dot)dirstamp
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \ @WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo @WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo @WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_OPENMP_FALSE@@WITH_QR_FALSE@am_libelpa_la_OBJECTS = \ @WITH_OPENMP_FALSE@@WITH_QR_FALSE@am_libelpa_la_OBJECTS = \
@WITH_OPENMP_FALSE@@WITH_QR_FALSE@ src/elpa1.lo src/elpa2.lo \ @WITH_OPENMP_FALSE@@WITH_QR_FALSE@ src/elpa1.lo src/elpa2.lo \
@WITH_OPENMP_FALSE@@WITH_QR_FALSE@ $(am__objects_1) \ @WITH_OPENMP_FALSE@@WITH_QR_FALSE@ $(am__objects_1) \
...@@ -305,8 +318,11 @@ am__libelpa_mt_la_SOURCES_DIST = src/elpa1.F90 src/elpa2.F90 \ ...@@ -305,8 +318,11 @@ am__libelpa_mt_la_SOURCES_DIST = src/elpa1.F90 src/elpa2.F90 \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \ @WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \
@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo @WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo
@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo @WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__objects_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__objects_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__objects_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo @WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__objects_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \
@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo
@WITH_OPENMP_TRUE@@WITH_QR_FALSE@am_libelpa_mt_la_OBJECTS = \ @WITH_OPENMP_TRUE@@WITH_QR_FALSE@am_libelpa_mt_la_OBJECTS = \
@WITH_OPENMP_TRUE@@WITH_QR_FALSE@ src/elpa1.lo src/elpa2.lo \ @WITH_OPENMP_TRUE@@WITH_QR_FALSE@ src/elpa1.lo src/elpa2.lo \
@WITH_OPENMP_TRUE@@WITH_QR_FALSE@ $(am__objects_13) \ @WITH_OPENMP_TRUE@@WITH_QR_FALSE@ $(am__objects_13) \
......
...@@ -2559,7 +2559,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows ...@@ -2559,7 +2559,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
#endif #endif
#endif #endif
#if (defined(WITH_AVX_REAL_BLOCK4) && defined(WITH_AVX_REAL_BLOCK2)) || defined(WITH_AMD_BULLDOZER) #if defined(WITH_AVX_REAL_BLOCK4) || defined(WITH_AMD_BULLDOZER)
! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
do j = ncols, 4, -4 do j = ncols, 4, -4
...@@ -2598,7 +2598,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows ...@@ -2598,7 +2598,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows
#endif #endif
#if (defined(WITH_AVX_REAL_BLOCK6) && defined(WITH_AVX_REAL_BLOCK4) && defined(WITH_AVX_REAL_BLOCK2)) #if defined(WITH_AVX_REAL_BLOCK6)
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6 do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off) w(:,1) = bcast_buffer(1:nbw,j+off)
......
...@@ -112,12 +112,16 @@ Several ...@@ -112,12 +112,16 @@ Several
CFLAGS and CXXFLAGS automatically. CFLAGS and CXXFLAGS automatically.
On Intel Sandybridge architectures the On Intel Sandybridge architectures the
configure option "--with-intel-sandybride" configure option "--with-avx-sandybridge"
use the best combination. uses the best combination, which is a
combination of block2 for real matrices
and block1 for complex matrices.
On AMD Bulldozer architectures the On AMD Bulldozer architectures the
configure option "--with-amd-bulldozer" configure option "--with-amd-bulldozer"
use the best combination. uses the best combination, which is a
combination of block4 for real matrices
and block1 for complex matrices.
Otherwise, you can try out your own Otherwise, you can try out your own
combinations with the configure options combinations with the configure options
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment