From 48e712efa0a757135110afc4d44f08155bc14b46 Mon Sep 17 00:00:00 2001 From: Andreas Marek Date: Mon, 28 Oct 2013 16:07:34 +0100 Subject: [PATCH] Introducing OpenMP functionality in ELPA_development_version_OpenMP This commit introduces OpenMP functionality in the ELPA_development_version_OpenMP branch. It contains several bugfixes to the OpenMP functionality in the branch "ELPA_development_version"; the latter will soon be deleted since the new branch is the new reference implementation. The current branch contains the following features/bugfixes: - building of the OpenMP version of ELPA via configure and the "--with-openmp" flag. The built library contains an "_mt" (multi-threaded) suffix in its name. The configure procedure should (hopefully) determine the necessary OpenMP flags for each compiler. If the "--with-openmp" flag is omitted, exactly the same code as in the ELPA 2013.08.001 release is used and built in the same way - The example test cases print which kernels have been used and how many OpenMP threads are used at runtime - correct handling of OpenMP stack arrays: the previous implementation caused compiler-dependent segmentation faults - OpenMP capability with all available kernels: the correctness of the computations has been checked for all kernels except the BlueGene (P/Q) versions --- ELPA_development_version_OpenMP/Makefile.am | 118 +- ELPA_development_version_OpenMP/Makefile.in | 244 ++- ELPA_development_version_OpenMP/config.h.in | 3 + ELPA_development_version_OpenMP/configure | 110 +- ELPA_development_version_OpenMP/configure.ac | 20 +- .../src/{elpa1.f90 => elpa1.F90} | 171 +- ELPA_development_version_OpenMP/src/elpa2.F90 | 1915 ++++++++++++++--- .../elpa2_kernels_real_sse-avx_2hv.o | Bin 7184 -> 0 bytes .../{test_complex.f90 => test_complex.F90} | 0 .../{test_complex2.f90 => test_complex2.F90} | 0 .../test/{test_real.f90 => test_real.F90} | 0 .../test/{test_real2.f90 => test_real2.F90} | 0 12 files changed, 2138 insertions(+), 443 deletions(-) rename ELPA_development_version_OpenMP/src/{elpa1.f90 => elpa1.F90} (96%) delete mode 100644 ELPA_development_version_OpenMP/src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.o rename ELPA_development_version_OpenMP/test/{test_complex.f90 => test_complex.F90} (100%) rename ELPA_development_version_OpenMP/test/{test_complex2.f90 => test_complex2.F90} (100%) rename ELPA_development_version_OpenMP/test/{test_real.f90 => test_real.F90} (100%) rename ELPA_development_version_OpenMP/test/{test_real2.f90 => test_real2.F90} (100%) diff --git a/ELPA_development_version_OpenMP/Makefile.am b/ELPA_development_version_OpenMP/Makefile.am index 77803dc5..f5ff9b2f 100644 --- a/ELPA_development_version_OpenMP/Makefile.am +++ b/ELPA_development_version_OpenMP/Makefile.am @@ -7,73 +7,138 @@ AM_LDFLAGS = @AM_LDFLAGS@ @BLACS_LDFLAGS@ BLACS_LDFLAGS = @BLACS_LDFLAGS@ # libelpa +if WITH_OPENMP +lib_LTLIBRARIES = libelpa_mt.la +else lib_LTLIBRARIES = libelpa.la +endif ##rule to produce fortran config file: #config_f90.h: ./config.h # grep "^#define" ./config.h > $@ - -libelpa_la_SOURCES = src/elpa1.f90 src/elpa2.F90 - +if WITH_OPENMP +libelpa_mt_la_SOURCES = src/elpa1.F90 src/elpa2.F90 +else +libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90 +endif if WITH_GENERIC_SIMPLE +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ + src/elpa2_kernels/elpa2_kernels_real_simple.f90 +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ - src/elpa2_kernels/elpa2_kernels_real_simple.f90 +
src/elpa2_kernels/elpa2_kernels_real_simple.f90 +endif endif if WITH_GENERIC +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ + src/elpa2_kernels/elpa2_kernels_real.f90 +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ - src/elpa2_kernels/elpa2_kernels_real.f90 + src/elpa2_kernels/elpa2_kernels_real.f90 +endif endif if WITH_BGP +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ + src/elpa2_kernels/elpa2_kernels_complex.f90 +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ - src/elpa2_kernels/elpa2_kernels_complex.f90 + src/elpa2_kernels/elpa2_kernels_complex.f90 +endif endif if WITH_BGQ +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ + src/elpa2_kernels/elpa2_kernels_complex.f90 +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ - src/elpa2_kernels/elpa2_kernels_complex.f90 + src/elpa2_kernels/elpa2_kernels_complex.f90 +endif endif if WITH_SSE_AS +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s endif +endif if WITH_AVX_SANDYBRIDGE +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +endif endif if WITH_AMD_BULLDOZER +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ + src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ - src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ - src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +endif endif if WITH_AVX_COMPLEX_BLOCK1 +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp endif +endif if WITH_AVX_COMPLEX_BLOCK2 +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ - src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +endif endif if WITH_AVX_REAL_BLOCK2 +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c endif +endif if WITH_AVX_REAL_BLOCK4 +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c endif +endif if WITH_AVX_REAL_BLOCK6 +if WITH_OPENMP + libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c +else libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c endif +endif - +if WITH_OPENMP libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) +else +libelpa_mt_la_LDFLAGS = -version-info 
$(ELPA_SO_VERSION) +endif # install any .mod files in the include/ dir elpa_includedir = $(includedir)/elpa @@ -84,11 +149,11 @@ filesdir = $(datarootdir) files_DATA = \ test/read_real.f90 \ test/read_real_gen.f90 \ - test/test_complex2.f90 \ - test/test_complex.f90 \ + test/test_complex2.F90 \ + test/test_complex.F90 \ test/test_complex_gen.f90 \ - test/test_real2.f90 \ - test/test_real.f90 \ + test/test_real2.F90 \ + test/test_real.F90 \ test/test_real_gen.f90 # pkg-config stuff @@ -96,20 +161,25 @@ pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = elpa.pc # test programs +if WITH_OPENMP +build_lib = libelpa_mt.la +else +build_lib = libelpa.la +endif noinst_bindir = $(abs_top_builddir) noinst_bin_PROGRAMS = test_real test_real2 test_complex test_complex2 -test_real_SOURCES = test/test_real.f90 -test_real_LDADD = libelpa.la +test_real_SOURCES = test/test_real.F90 +test_real_LDADD = $(build_lib) -test_real2_SOURCES = test/test_real2.f90 -test_real2_LDADD = libelpa.la +test_real2_SOURCES = test/test_real2.F90 +test_real2_LDADD = $(build_lib) -test_complex_SOURCES = test/test_complex.f90 -test_complex_LDADD = libelpa.la +test_complex_SOURCES = test/test_complex.F90 +test_complex_LDADD = $(build_lib) -test_complex2_SOURCES = test/test_complex2.f90 -test_complex2_LDADD = libelpa.la +test_complex2_SOURCES = test/test_complex2.F90 +test_complex2_LDADD = $(build_lib) check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh diff --git a/ELPA_development_version_OpenMP/Makefile.in b/ELPA_development_version_OpenMP/Makefile.in index f4f74bf6..8d1cfc9f 100644 --- a/ELPA_development_version_OpenMP/Makefile.in +++ b/ELPA_development_version_OpenMP/Makefile.in @@ -53,33 +53,60 @@ PRE_UNINSTALL = : POST_UNINSTALL = : build_triplet = @build@ host_triplet = @host@ -@WITH_GENERIC_SIMPLE_TRUE@am__append_1 = src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ -@WITH_GENERIC_SIMPLE_TRUE@ src/elpa2_kernels/elpa2_kernels_real_simple.f90 +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_TRUE@am__append_1 = src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_simple.f90 -@WITH_GENERIC_TRUE@am__append_2 = src/elpa2_kernels/elpa2_kernels_complex.f90 \ -@WITH_GENERIC_TRUE@ src/elpa2_kernels/elpa2_kernels_real.f90 +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_FALSE@am__append_2 = src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_simple.f90 -@WITH_BGP_TRUE@am__append_3 = src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ -@WITH_BGP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.f90 +@WITH_GENERIC_TRUE@@WITH_OPENMP_TRUE@am__append_3 = src/elpa2_kernels/elpa2_kernels_complex.f90 \ +@WITH_GENERIC_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real.f90 -@WITH_BGQ_TRUE@am__append_4 = src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ -@WITH_BGQ_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.f90 +@WITH_GENERIC_TRUE@@WITH_OPENMP_FALSE@am__append_4 = src/elpa2_kernels/elpa2_kernels_complex.f90 \ +@WITH_GENERIC_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real.f90 -@WITH_SSE_AS_TRUE@am__append_5 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s -@WITH_AVX_SANDYBRIDGE_TRUE@am__append_6 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ -@WITH_AVX_SANDYBRIDGE_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +@WITH_BGP_TRUE@@WITH_OPENMP_TRUE@am__append_5 = src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ 
+@WITH_BGP_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.f90 -@WITH_AMD_BULLDOZER_TRUE@am__append_7 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ -@WITH_AMD_BULLDOZER_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ -@WITH_AMD_BULLDOZER_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +@WITH_BGP_TRUE@@WITH_OPENMP_FALSE@am__append_6 = src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ +@WITH_BGP_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex.f90 -@WITH_AVX_COMPLEX_BLOCK1_TRUE@am__append_8 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp -@WITH_AVX_COMPLEX_BLOCK2_TRUE@am__append_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ -@WITH_AVX_COMPLEX_BLOCK2_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +@WITH_BGQ_TRUE@@WITH_OPENMP_TRUE@am__append_7 = src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ +@WITH_BGQ_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.f90 -@WITH_AVX_REAL_BLOCK2_TRUE@am__append_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c -@WITH_AVX_REAL_BLOCK4_TRUE@am__append_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c -@WITH_AVX_REAL_BLOCK6_TRUE@am__append_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c +@WITH_BGQ_TRUE@@WITH_OPENMP_FALSE@am__append_8 = src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ +@WITH_BGQ_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex.f90 + +@WITH_OPENMP_TRUE@@WITH_SSE_AS_TRUE@am__append_9 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s +@WITH_OPENMP_FALSE@@WITH_SSE_AS_TRUE@am__append_10 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.s +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_TRUE@am__append_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_FALSE@am__append_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@am__append_13 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@am__append_14 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AVX_COMPLEX_BLOCK1_TRUE@@WITH_OPENMP_TRUE@am__append_15 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +@WITH_AVX_COMPLEX_BLOCK1_TRUE@@WITH_OPENMP_FALSE@am__append_16 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__append_17 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__append_18 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp + +@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__append_19 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c 
+@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__append_20 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c +@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__append_21 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c +@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__append_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c +@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__append_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c +@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__append_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c noinst_bin_PROGRAMS = test_real$(EXEEXT) test_real2$(EXEEXT) \ test_complex$(EXEEXT) test_complex2$(EXEEXT) subdir = . @@ -134,7 +161,7 @@ am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(noinst_bindir)" \ "$(DESTDIR)$(elpa_includedir)" LTLIBRARIES = $(lib_LTLIBRARIES) libelpa_la_LIBADD = -am__libelpa_la_SOURCES_DIST = src/elpa1.f90 src/elpa2.F90 \ +am__libelpa_la_SOURCES_DIST = src/elpa1.F90 src/elpa2.F90 \ src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ src/elpa2_kernels/elpa2_kernels_real_simple.f90 \ src/elpa2_kernels/elpa2_kernels_complex.f90 \ @@ -148,51 +175,97 @@ am__libelpa_la_SOURCES_DIST = src/elpa1.f90 src/elpa2.F90 \ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c am__dirstamp = $(am__leading_dot)dirstamp -@WITH_GENERIC_SIMPLE_TRUE@am__objects_1 = src/elpa2_kernels/elpa2_kernels_complex_simple.lo \ -@WITH_GENERIC_SIMPLE_TRUE@ src/elpa2_kernels/elpa2_kernels_real_simple.lo -@WITH_GENERIC_TRUE@am__objects_2 = \ -@WITH_GENERIC_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.lo \ -@WITH_GENERIC_TRUE@ src/elpa2_kernels/elpa2_kernels_real.lo -@WITH_BGP_TRUE@am__objects_3 = \ -@WITH_BGP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_bgp.lo \ -@WITH_BGP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.lo -@WITH_BGQ_TRUE@am__objects_4 = \ -@WITH_BGQ_TRUE@ src/elpa2_kernels/elpa2_kernels_real_bgq.lo \ -@WITH_BGQ_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.lo -@WITH_SSE_AS_TRUE@am__objects_5 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo -@WITH_AVX_SANDYBRIDGE_TRUE@am__objects_6 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ -@WITH_AVX_SANDYBRIDGE_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo -@WITH_AMD_BULLDOZER_TRUE@am__objects_7 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \ -@WITH_AMD_BULLDOZER_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ -@WITH_AMD_BULLDOZER_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo -@WITH_AVX_COMPLEX_BLOCK1_TRUE@am__objects_8 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo -@WITH_AVX_COMPLEX_BLOCK2_TRUE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \ -@WITH_AVX_COMPLEX_BLOCK2_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo -@WITH_AVX_REAL_BLOCK2_TRUE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo -@WITH_AVX_REAL_BLOCK4_TRUE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo -@WITH_AVX_REAL_BLOCK6_TRUE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo -am_libelpa_la_OBJECTS = src/elpa1.lo src/elpa2.lo $(am__objects_1) \ - $(am__objects_2) $(am__objects_3) $(am__objects_4) \ - $(am__objects_5) $(am__objects_6) $(am__objects_7) \ - $(am__objects_8) $(am__objects_9) $(am__objects_10) \ - $(am__objects_11) $(am__objects_12) +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_FALSE@am__objects_1 = src/elpa2_kernels/elpa2_kernels_complex_simple.lo \ 
+@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_simple.lo +@WITH_GENERIC_TRUE@@WITH_OPENMP_FALSE@am__objects_2 = src/elpa2_kernels/elpa2_kernels_complex.lo \ +@WITH_GENERIC_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real.lo +@WITH_BGP_TRUE@@WITH_OPENMP_FALSE@am__objects_3 = src/elpa2_kernels/elpa2_kernels_real_bgp.lo \ +@WITH_BGP_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex.lo +@WITH_BGQ_TRUE@@WITH_OPENMP_FALSE@am__objects_4 = src/elpa2_kernels/elpa2_kernels_real_bgq.lo \ +@WITH_BGQ_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex.lo +@WITH_OPENMP_FALSE@@WITH_SSE_AS_TRUE@am__objects_5 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_FALSE@am__objects_6 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@am__objects_7 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_COMPLEX_BLOCK1_TRUE@@WITH_OPENMP_FALSE@am__objects_8 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_9 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \ +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_FALSE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_FALSE@am__objects_10 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo +@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_FALSE@am__objects_11 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo +@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_FALSE@am__objects_12 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo +@WITH_OPENMP_FALSE@am_libelpa_la_OBJECTS = src/elpa1.lo src/elpa2.lo \ +@WITH_OPENMP_FALSE@ $(am__objects_1) $(am__objects_2) \ +@WITH_OPENMP_FALSE@ $(am__objects_3) $(am__objects_4) \ +@WITH_OPENMP_FALSE@ $(am__objects_5) $(am__objects_6) \ +@WITH_OPENMP_FALSE@ $(am__objects_7) $(am__objects_8) \ +@WITH_OPENMP_FALSE@ $(am__objects_9) $(am__objects_10) \ +@WITH_OPENMP_FALSE@ $(am__objects_11) $(am__objects_12) libelpa_la_OBJECTS = $(am_libelpa_la_OBJECTS) libelpa_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ $(CXXFLAGS) $(libelpa_la_LDFLAGS) $(LDFLAGS) -o $@ +@WITH_OPENMP_FALSE@am_libelpa_la_rpath = -rpath $(libdir) +libelpa_mt_la_LIBADD = +am__libelpa_mt_la_SOURCES_DIST = src/elpa1.F90 src/elpa2.F90 \ + src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ + src/elpa2_kernels/elpa2_kernels_real_simple.f90 \ + src/elpa2_kernels/elpa2_kernels_complex.f90 \ + src/elpa2_kernels/elpa2_kernels_real.f90 \ + src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ + src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ + src/elpa2_kernels/elpa2_kernels_asm_x86_64.s \ + src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp \ + src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ + src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ + src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_TRUE@am__objects_13 = src/elpa2_kernels/elpa2_kernels_complex_simple.lo \ +@WITH_GENERIC_SIMPLE_TRUE@@WITH_OPENMP_TRUE@ 
src/elpa2_kernels/elpa2_kernels_real_simple.lo +@WITH_GENERIC_TRUE@@WITH_OPENMP_TRUE@am__objects_14 = src/elpa2_kernels/elpa2_kernels_complex.lo \ +@WITH_GENERIC_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real.lo +@WITH_BGP_TRUE@@WITH_OPENMP_TRUE@am__objects_15 = src/elpa2_kernels/elpa2_kernels_real_bgp.lo \ +@WITH_BGP_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.lo +@WITH_BGQ_TRUE@@WITH_OPENMP_TRUE@am__objects_16 = src/elpa2_kernels/elpa2_kernels_real_bgq.lo \ +@WITH_BGQ_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex.lo +@WITH_OPENMP_TRUE@@WITH_SSE_AS_TRUE@am__objects_17 = src/elpa2_kernels/elpa2_kernels_asm_x86_64.lo +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_TRUE@am__objects_18 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ +@WITH_AVX_SANDYBRIDGE_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@am__objects_19 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo \ +@WITH_AMD_BULLDOZER_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_COMPLEX_BLOCK1_TRUE@@WITH_OPENMP_TRUE@am__objects_20 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_21 = src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.lo \ +@WITH_AVX_COMPLEX_BLOCK2_TRUE@@WITH_OPENMP_TRUE@ src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.lo +@WITH_AVX_REAL_BLOCK2_TRUE@@WITH_OPENMP_TRUE@am__objects_22 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.lo +@WITH_AVX_REAL_BLOCK4_TRUE@@WITH_OPENMP_TRUE@am__objects_23 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.lo +@WITH_AVX_REAL_BLOCK6_TRUE@@WITH_OPENMP_TRUE@am__objects_24 = src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo +@WITH_OPENMP_TRUE@am_libelpa_mt_la_OBJECTS = src/elpa1.lo src/elpa2.lo \ +@WITH_OPENMP_TRUE@ $(am__objects_13) $(am__objects_14) \ +@WITH_OPENMP_TRUE@ $(am__objects_15) $(am__objects_16) \ +@WITH_OPENMP_TRUE@ $(am__objects_17) $(am__objects_18) \ +@WITH_OPENMP_TRUE@ $(am__objects_19) $(am__objects_20) \ +@WITH_OPENMP_TRUE@ $(am__objects_21) $(am__objects_22) \ +@WITH_OPENMP_TRUE@ $(am__objects_23) $(am__objects_24) +libelpa_mt_la_OBJECTS = $(am_libelpa_mt_la_OBJECTS) +libelpa_mt_la_LINK = $(LIBTOOL) --tag=CXX $(AM_LIBTOOLFLAGS) \ + $(LIBTOOLFLAGS) --mode=link $(CXXLD) $(AM_CXXFLAGS) \ + $(CXXFLAGS) $(libelpa_mt_la_LDFLAGS) $(LDFLAGS) -o $@ +@WITH_OPENMP_TRUE@am_libelpa_mt_la_rpath = -rpath $(libdir) PROGRAMS = $(noinst_bin_PROGRAMS) am_test_complex_OBJECTS = test/test_complex.$(OBJEXT) test_complex_OBJECTS = $(am_test_complex_OBJECTS) -test_complex_DEPENDENCIES = libelpa.la +test_complex_DEPENDENCIES = $(build_lib) am_test_complex2_OBJECTS = test/test_complex2.$(OBJEXT) test_complex2_OBJECTS = $(am_test_complex2_OBJECTS) -test_complex2_DEPENDENCIES = libelpa.la +test_complex2_DEPENDENCIES = $(build_lib) am_test_real_OBJECTS = test/test_real.$(OBJEXT) test_real_OBJECTS = $(am_test_real_OBJECTS) -test_real_DEPENDENCIES = libelpa.la +test_real_DEPENDENCIES = $(build_lib) am_test_real2_OBJECTS = test/test_real2.$(OBJEXT) test_real2_OBJECTS = $(am_test_real2_OBJECTS) -test_real2_DEPENDENCIES = libelpa.la +test_real2_DEPENDENCIES = $(build_lib) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles @@ -230,10 +303,11 @@ LTFCCOMPILE = $(LIBTOOL) --tag=FC $(AM_LIBTOOLFLAGS) 
$(LIBTOOLFLAGS) \ CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \ --mode=compile $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS) -SOURCES = $(libelpa_la_SOURCES) $(test_complex_SOURCES) \ - $(test_complex2_SOURCES) $(test_real_SOURCES) \ - $(test_real2_SOURCES) -DIST_SOURCES = $(am__libelpa_la_SOURCES_DIST) $(test_complex_SOURCES) \ +SOURCES = $(libelpa_la_SOURCES) $(libelpa_mt_la_SOURCES) \ + $(test_complex_SOURCES) $(test_complex2_SOURCES) \ + $(test_real_SOURCES) $(test_real2_SOURCES) +DIST_SOURCES = $(am__libelpa_la_SOURCES_DIST) \ + $(am__libelpa_mt_la_SOURCES_DIST) $(test_complex_SOURCES) \ $(test_complex2_SOURCES) $(test_real_SOURCES) \ $(test_real2_SOURCES) am__can_run_installinfo = \ @@ -330,6 +404,7 @@ NM = @NM@ NMEDIT = @NMEDIT@ OBJDUMP = @OBJDUMP@ OBJEXT = @OBJEXT@ +OPENMP_FCFLAGS = @OPENMP_FCFLAGS@ OTOOL = @OTOOL@ OTOOL64 = @OTOOL64@ PACKAGE = @PACKAGE@ @@ -405,18 +480,29 @@ top_builddir = @top_builddir@ top_srcdir = @top_srcdir@ with_amd_bulldozer_kernel = @with_amd_bulldozer_kernel@ ACLOCAL_AMFLAGS = ${ACLOCAL_FLAGS} -I m4 +@WITH_OPENMP_FALSE@lib_LTLIBRARIES = libelpa.la # libelpa -lib_LTLIBRARIES = libelpa.la +@WITH_OPENMP_TRUE@lib_LTLIBRARIES = libelpa_mt.la #config_f90.h: ./config.h # grep "^#define" ./config.h > $@ -libelpa_la_SOURCES = src/elpa1.f90 src/elpa2.F90 $(am__append_1) \ - $(am__append_2) $(am__append_3) $(am__append_4) \ - $(am__append_5) $(am__append_6) $(am__append_7) \ - $(am__append_8) $(am__append_9) $(am__append_10) \ - $(am__append_11) $(am__append_12) -libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) +@WITH_OPENMP_TRUE@libelpa_mt_la_SOURCES = src/elpa1.F90 src/elpa2.F90 \ +@WITH_OPENMP_TRUE@ $(am__append_1) $(am__append_3) \ +@WITH_OPENMP_TRUE@ $(am__append_5) $(am__append_7) \ +@WITH_OPENMP_TRUE@ $(am__append_9) $(am__append_11) \ +@WITH_OPENMP_TRUE@ $(am__append_13) $(am__append_15) \ +@WITH_OPENMP_TRUE@ $(am__append_17) $(am__append_19) \ +@WITH_OPENMP_TRUE@ $(am__append_21) $(am__append_23) +@WITH_OPENMP_FALSE@libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90 \ +@WITH_OPENMP_FALSE@ $(am__append_2) $(am__append_4) \ +@WITH_OPENMP_FALSE@ $(am__append_6) $(am__append_8) \ +@WITH_OPENMP_FALSE@ $(am__append_10) $(am__append_12) \ +@WITH_OPENMP_FALSE@ $(am__append_14) $(am__append_16) \ +@WITH_OPENMP_FALSE@ $(am__append_18) $(am__append_20) \ +@WITH_OPENMP_FALSE@ $(am__append_22) $(am__append_24) +@WITH_OPENMP_TRUE@libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) +@WITH_OPENMP_FALSE@libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) # install any .mod files in the include/ dir elpa_includedir = $(includedir)/elpa @@ -427,28 +513,30 @@ filesdir = $(datarootdir) files_DATA = \ test/read_real.f90 \ test/read_real_gen.f90 \ - test/test_complex2.f90 \ - test/test_complex.f90 \ + test/test_complex2.F90 \ + test/test_complex.F90 \ test/test_complex_gen.f90 \ - test/test_real2.f90 \ - test/test_real.f90 \ + test/test_real2.F90 \ + test/test_real.F90 \ test/test_real_gen.f90 # pkg-config stuff pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = elpa.pc +@WITH_OPENMP_FALSE@build_lib = libelpa.la # test programs +@WITH_OPENMP_TRUE@build_lib = libelpa_mt.la noinst_bindir = $(abs_top_builddir) -test_real_SOURCES = test/test_real.f90 -test_real_LDADD = libelpa.la -test_real2_SOURCES = test/test_real2.f90 -test_real2_LDADD = libelpa.la -test_complex_SOURCES = test/test_complex.f90 -test_complex_LDADD = libelpa.la -test_complex2_SOURCES = test/test_complex2.f90 -test_complex2_LDADD = libelpa.la 
+test_real_SOURCES = test/test_real.F90 +test_real_LDADD = $(build_lib) +test_real2_SOURCES = test/test_real2.F90 +test_real2_LDADD = $(build_lib) +test_complex_SOURCES = test/test_complex.F90 +test_complex_LDADD = $(build_lib) +test_complex2_SOURCES = test/test_complex2.F90 +test_complex2_LDADD = $(build_lib) check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh TESTS = $(check_SCRIPTS) CLEANFILES = test_real.sh test_real2.sh test_complex.sh test_complex2.sh @@ -593,7 +681,9 @@ src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.lo: \ src/elpa2_kernels/$(am__dirstamp) \ src/elpa2_kernels/$(DEPDIR)/$(am__dirstamp) libelpa.la: $(libelpa_la_OBJECTS) $(libelpa_la_DEPENDENCIES) $(EXTRA_libelpa_la_DEPENDENCIES) - $(libelpa_la_LINK) -rpath $(libdir) $(libelpa_la_OBJECTS) $(libelpa_la_LIBADD) $(LIBS) + $(libelpa_la_LINK) $(am_libelpa_la_rpath) $(libelpa_la_OBJECTS) $(libelpa_la_LIBADD) $(LIBS) +libelpa_mt.la: $(libelpa_mt_la_OBJECTS) $(libelpa_mt_la_DEPENDENCIES) $(EXTRA_libelpa_mt_la_DEPENDENCIES) + $(libelpa_mt_la_LINK) $(am_libelpa_mt_la_rpath) $(libelpa_mt_la_OBJECTS) $(libelpa_mt_la_LIBADD) $(LIBS) install-noinst_binPROGRAMS: $(noinst_bin_PROGRAMS) @$(NORMAL_INSTALL) @list='$(noinst_bin_PROGRAMS)'; test -n "$(noinst_bindir)" || list=; \ diff --git a/ELPA_development_version_OpenMP/config.h.in b/ELPA_development_version_OpenMP/config.h.in index 58fc18d8..475be276 100644 --- a/ELPA_development_version_OpenMP/config.h.in +++ b/ELPA_development_version_OpenMP/config.h.in @@ -101,5 +101,8 @@ optimizations) */ #undef WITH_GENERIC_SIMPLE +/* use OpenMP threading */ +#undef WITH_OPENMP + /* use kernel tuned for SSE (written in gcc assembler) */ #undef WITH_SSE_AS diff --git a/ELPA_development_version_OpenMP/configure b/ELPA_development_version_OpenMP/configure index 6fc3148d..3a4fe314 100755 --- a/ELPA_development_version_OpenMP/configure +++ b/ELPA_development_version_OpenMP/configure @@ -595,7 +595,7 @@ PACKAGE_STRING='elpa 2013.08.001' PACKAGE_BUGREPORT='elpa-library@rzg.mpg.de' PACKAGE_URL='' -ac_unique_file="src/elpa1.f90" +ac_unique_file="src/elpa1.F90" # Factoring default headers for most tests. ac_includes_default="\ #include @@ -672,6 +672,9 @@ build_vendor build_cpu build LIBTOOL +OPENMP_FCFLAGS +WITH_OPENMP_FALSE +WITH_OPENMP_TRUE WITH_AVX_REAL_BLOCK6_FALSE WITH_AVX_REAL_BLOCK6_TRUE WITH_AVX_REAL_BLOCK4_FALSE @@ -811,6 +814,8 @@ with_avx_complex_block2 with_avx_real_block2 with_avx_real_block4 with_avx_real_block6 +with_openmp +enable_openmp enable_shared enable_static with_pic @@ -1458,6 +1463,7 @@ Optional Features: do not reject slow dependency extractors --disable-dependency-tracking speeds up one-time build + --disable-openmp do not use OpenMP --enable-shared[=PKGS] build shared libraries [default=yes] --enable-static[=PKGS] build static libraries [default=yes] --enable-fast-install[=PKGS] @@ -1491,6 +1497,7 @@ Optional Packages: (written in gcc assembler), default no. --with-avx-real-block6 use AVX optimized real kernel with blocking 6 (written in gcc assembler), default no. + --with-openmp use OpenMP threading, default no. --with-pic[=PKGS] try to use only PIC/non-PIC objects [default=use both] --with-gnu-ld assume the C compiler uses GNU ld [default=no] @@ -5794,6 +5801,101 @@ $as_echo "#define WITH_AVX_REAL_BLOCK6 1" >>confdefs.h +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP usage is specified" >&5 +$as_echo_n "checking whether OpenMP usage is specified... " >&6; } + +# Check whether --with-openmp was given. 
+if test "${with_openmp+set}" = set; then : + withval=$with_openmp; with_openmp=yes +else + with_openmp=no +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_openmp}" >&5 +$as_echo "${with_openmp}" >&6; } + if test x"$with_openmp" = x"yes"; then + WITH_OPENMP_TRUE= + WITH_OPENMP_FALSE='#' +else + WITH_OPENMP_TRUE='#' + WITH_OPENMP_FALSE= +fi + + if test "x${with_openmp}" = xyes; then + +$as_echo "#define WITH_OPENMP 1" >>confdefs.h + + + OPENMP_FCFLAGS= + # Check whether --enable-openmp was given. +if test "${enable_openmp+set}" = set; then : + enableval=$enable_openmp; +fi + + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $FC option to support OpenMP" >&5 +$as_echo_n "checking for $FC option to support OpenMP... " >&6; } +if ${ac_cv_prog_fc_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + + program main + implicit none +!$ integer tid + tid = 42 + call omp_set_num_threads(2) + end + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_prog_fc_openmp='none needed' +else + ac_cv_prog_fc_openmp='unsupported' + for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \ + -Popenmp --openmp; do + ac_save_FCFLAGS=$FCFLAGS + FCFLAGS="$FCFLAGS $ac_option" + cat > conftest.$ac_ext <<_ACEOF + + program main + implicit none +!$ integer tid + tid = 42 + call omp_set_num_threads(2) + end + +_ACEOF +if ac_fn_fc_try_link "$LINENO"; then : + ac_cv_prog_fc_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + FCFLAGS=$ac_save_FCFLAGS + if test "$ac_cv_prog_fc_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5 +$as_echo "$ac_cv_prog_fc_openmp" >&6; } + case $ac_cv_prog_fc_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;; + esac + fi + + + fi + +FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS" +#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS" + save_FCFLAGS=$FCFLAGS save_LDFLAGS=$LDFLAGS @@ -19759,6 +19861,8 @@ ac_compiler_gnu=$ac_cv_fc_compiler_gnu +#AC_SUBST(OPT_FCFLAGS) + mkdir modules #gl_VISIBILITY @@ -19967,6 +20071,10 @@ if test -z "${WITH_AVX_REAL_BLOCK6_TRUE}" && test -z "${WITH_AVX_REAL_BLOCK6_FAL as_fn_error $? "conditional \"WITH_AVX_REAL_BLOCK6\" was never defined. Usually this means the macro was only invoked conditionally." "$LINENO" 5 fi +if test -z "${WITH_OPENMP_TRUE}" && test -z "${WITH_OPENMP_FALSE}"; then + as_fn_error $? "conditional \"WITH_OPENMP\" was never defined. +Usually this means the macro was only invoked conditionally." 
"$LINENO" 5 +fi : "${CONFIG_STATUS=./config.status}" ac_write_fail=0 diff --git a/ELPA_development_version_OpenMP/configure.ac b/ELPA_development_version_OpenMP/configure.ac index b2ddaa80..caf9e505 100644 --- a/ELPA_development_version_OpenMP/configure.ac +++ b/ELPA_development_version_OpenMP/configure.ac @@ -1,6 +1,6 @@ AC_PREREQ([2.69]) AC_INIT([elpa],[2013.08.001], elpa-library@rzg.mpg.de) -AC_CONFIG_SRCDIR([src/elpa1.f90]) +AC_CONFIG_SRCDIR([src/elpa1.F90]) AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) AC_CONFIG_MACRO_DIR([m4]) @@ -102,6 +102,22 @@ DEFINE_OPTION([WITH_AVX_REAL_BLOCK6], [avx-real-block6], [no],[]) +AC_MSG_CHECKING(whether OpenMP usage is specified) +AC_ARG_WITH([openmp], + AS_HELP_STRING([--with-openmp], + [use OpenMP threading, default no.]), + [with_openmp=yes], + [with_openmp=no]) + AC_MSG_RESULT([${with_openmp}]) + AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"]) + if test "x${with_openmp}" = xyes; then + AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading]) + AC_OPENMP + fi + +FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS" +#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS" + save_FCFLAGS=$FCFLAGS save_LDFLAGS=$LDFLAGS @@ -231,6 +247,8 @@ AC_SUBST([FC_MODOUT]) AC_SUBST(BLACS_LDFLAGS) AC_SUBST(BLACS_FCFLAGS) +#AC_SUBST(OPT_FCFLAGS) + mkdir modules #gl_VISIBILITY diff --git a/ELPA_development_version_OpenMP/src/elpa1.f90 b/ELPA_development_version_OpenMP/src/elpa1.F90 similarity index 96% rename from ELPA_development_version_OpenMP/src/elpa1.f90 rename to ELPA_development_version_OpenMP/src/elpa1.F90 index 0b4c2c84..f1cfc6e4 100644 --- a/ELPA_development_version_OpenMP/src/elpa1.f90 +++ b/ELPA_development_version_OpenMP/src/elpa1.F90 @@ -46,6 +46,8 @@ ! with their original authors, but shall adhere to the licensing terms ! distributed along with the original code in the file "COPYING". +#include "config-f90.h" + module ELPA1 ! Version 1.1.2, 2011-02-21 @@ -351,10 +353,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta integer istep, i, j, lcs, lce, lrs, lre integer tile_size, l_rows_tile, l_cols_tile +#ifdef WITH_OPENMP + integer my_thread, n_threads, max_threads, n_iter + integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads +#endif + real*8 vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf real*8, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) - +#ifdef WITH_OPENMP + real*8, allocatable:: ur_p(:,:), uc_p(:,:) +#endif integer pcol, prow pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number @@ -387,6 +396,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta allocate(vc(max_local_cols)) allocate(uc(max_local_cols)) +#ifdef WITH_OPENMP + max_threads = omp_get_max_threads() + + allocate(ur_p(max_local_rows,0:max_threads-1)) + allocate(uc_p(max_local_cols,0:max_threads-1)) +#endif + tmp = 0 vr = 0 ur = 0 @@ -478,6 +494,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ur(1:l_rows) = 0 if(l_rows>0 .and. l_cols>0) then +#ifdef WITH_OPENMP +!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) + + my_thread = omp_get_thread_num() + n_threads = omp_get_num_threads() + + n_iter = 0 + + uc_p(1:l_cols,my_thread) = 0. + ur_p(1:l_rows,my_thread) = 0. 
+#endif do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 lce = min(l_cols,(i+1)*l_cols_tile) @@ -486,11 +513,27 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta lrs = j*l_rows_tile+1 lre = min(l_rows,(j+1)*l_rows_tile) if(lre0) then call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1) call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,1),aux,1,1.d0,uc,1) @@ -1027,10 +1070,18 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, integer istep, i, j, lcs, lce, lrs, lre integer tile_size, l_rows_tile, l_cols_tile +#ifdef WITH_OPENMP + integer my_thread, n_threads, max_threads, n_iter + integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads +#endif + real*8 vnorm2 complex*16 vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf complex*16, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) +#ifdef WITH_OPENMP + complex*16, allocatable:: ur_p(:,:), uc_p(:,:) +#endif real*8, allocatable:: tmpr(:) integer pcol, prow @@ -1065,6 +1116,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, allocate(vc(max_local_cols)) allocate(uc(max_local_cols)) +#ifdef WITH_OPENMP + max_threads = omp_get_max_threads() + + allocate(ur_p(max_local_rows,0:max_threads-1)) + allocate(uc_p(max_local_cols,0:max_threads-1)) +#endif + tmp = 0 vr = 0 ur = 0 @@ -1155,6 +1213,17 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, uc(1:l_cols) = 0 ur(1:l_rows) = 0 if(l_rows>0 .and. l_cols>0) then +#ifdef WITH_OPENMP +!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) + + my_thread = omp_get_thread_num() + n_threads = omp_get_num_threads() + + n_iter = 0 + + uc_p(1:l_cols,my_thread) = 0. + ur_p(1:l_rows,my_thread) = 0. +#endif do i=0,(istep-2)/tile_size lcs = i*l_cols_tile+1 @@ -1164,10 +1233,26 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, lrs = j*l_rows_tile+1 lre = min(l_rows,(j+1)*l_rows_tile) if(lre0) then call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1) @@ -2124,6 +2209,9 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_ real*8 z(na), d1(na), d2(na), z1(na), delta(na), dbase(na), ddiff(na), ev_scale(na), tmp(na) real*8 d1u(na), zu(na), d1l(na), zl(na) real*8, allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:) +#ifdef WITH_OPENMP + real*8, allocatable :: z_p(:,:) +#endif integer i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, l_rqm, ns, info integer l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, l_cols_qreorg, np, l_idx, nqcols1, nqcols2 @@ -2132,6 +2220,14 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_ integer idx(na), idx1(na), idx2(na) integer coltyp(na), idxq1(na), idxq2(na) +#ifdef WITH_OPENMP + integer max_threads, my_thread + integer omp_get_max_threads, omp_get_thread_num + + max_threads = omp_get_max_threads() + + allocate(z_p(na,0:max_threads-1)) +#endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) @@ -2405,11 +2501,18 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_ ! Solve secular equation z(1:na1) = 1 +#ifdef WITH_OPENMP + z_p(1:na1,:) = 1 +#endif dbase(1:na1) = 0 ddiff(1:na1) = 0 info = 0 - +#ifdef WITH_OPENMP +!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) + my_thread = omp_get_thread_num() +!$OMP DO +#endif DO i = my_proc+1, na1, n_procs ! 
work distributed over all processors call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! @@ -2423,11 +2526,17 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_ ! Compute updated z +#ifdef WITH_OPENMP + do j=1,na1 + if(i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) + enddo + z_p(i,my_thread) = z_p(i,my_thread)*delta(i) +#else do j=1,na1 if(i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) enddo z(i) = z(i)*delta(i) - +#endif ! store dbase/ddiff if(ina_s) then ! Receive Householder vector from previous task, from PE owning subdiagonal +#ifdef WITH_OPENMP + call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS,mpierr) +#else call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) +#endif tau = hv(1) hv(1) = 1. endif @@ -1047,33 +1093,166 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm n_off = n_off + nb endif - do iblk=1,nblocks - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block +#ifdef WITH_OPENMP + if(max_threads > 1) then - if(ns+n_off>na) exit + ! Codepath for OpenMP - ! Store Householder vector for back transformation + ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! + ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. + ! This simulates the behaviour of the MPI tasks which also work after each other. + ! The code would be considerably easier, if the MPI communication would be made within + ! the parallel region - this is avoided here since this would require + ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - hh_cnt(iblk) = hh_cnt(iblk) + 1 + hv_t(:,1) = hv + tau_t(1) = tau - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + do iter = 1, 2 - if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif + ! iter=1 : work on first block + ! iter=2 : work on remaining blocks + ! This is done in 2 iterations so that we have a barrier in between: + ! After the first iteration, it is guaranteed that the last row of the last block + ! is completed by the next thread. + ! After the first iteration it is also the place to exchange the last row + ! with MPI calls + +!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & +!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) + do my_thread = 1, max_threads + + if(iter == 1) then + my_block_s = omp_block_limits(my_thread-1) + 1 + my_block_e = my_block_s + else + my_block_s = omp_block_limits(my_thread-1) + 2 + my_block_e = omp_block_limits(my_thread) + endif + + do iblk = my_block_s, my_block_e + + ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block + ne = ns+nb-1 ! last column in block + + if(istepna) exit + + hv = hv_t(:,my_thread) + tau = tau_t(my_thread) + + ! 
Store Householder vector for back transformation + + hh_cnt(iblk) = hh_cnt(iblk) + 1 + + hh_gath(1 ,hh_cnt(iblk),iblk) = tau + hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + + nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) + ! Note that nr>=0 implies that diagonal block is full (nc==nb)! + + ! Transform diagonal block + + call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) + + x = dot_product(hv(1:nc),hd(1:nc))*tau + hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) + + call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) + + hv_t(:,my_thread) = 0 + tau_t(my_thread) = 0 + + if(nr<=0) cycle ! No subdiagonal block present any more + + ! Transform subdiagonal block + + call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) + + if(nr>1) then + + ! complete (old) Householder transformation for first column + + ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 + + ! calculate new Householder transformation for first column + ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) + + vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) + call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) + hv_t(1 ,my_thread) = 1. + hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf + ab(nb+2:,ns) = 0 + + ! update subdiagonal block for old and new Householder transformation + ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster + + call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1) + x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) + h(2:nb) = h(2:nb) - x*hv(2:nb) + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") + do i=2,nb + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) + enddo + + else + + ! No new Householder transformation for nr=1, just complete the old one + ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 + do i=2,nb + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) + enddo + ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) + hv_t(1,my_thread) = 1. + + endif + + enddo + + enddo ! my_thread +!$omp end parallel do + + if (iter==1) then + ! We are at the end of the first block + + ! Send our first column to previous PE + if(my_pe>0 .and. na_s <= na) then + call mpi_wait(ireq_ab,mpi_status,mpierr) + ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) + call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) + endif + + ! Request last column from next PE + ne = na_s + nblocks*nb - (max_threads-1) - 1 + if(istep>=max_threads .and. ne <= na) then + call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr) + endif + + else + ! We are at the end of all blocks + + ! Send last HH vector and TAU to next PE if it has been calculated above + ne = na_s + nblocks*nb - (max_threads-1) - 1 + if(istep>=max_threads .and. ne < na) then + call mpi_wait(ireq_hv,mpi_status,mpierr) + hv_s(1) = tau_t(max_threads) + hv_s(2:) = hv_t(2:,max_threads) + call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) + endif + + ! "Send" HH vector and TAU to next OpenMP thread + do my_thread = max_threads, 2, -1 + hv_t(:,my_thread) = hv_t(:,my_thread-1) + tau_t(my_thread) = tau_t(my_thread-1) + enddo + + endif + enddo ! iter + + else + + ! Codepath for 1 thread without OpenMP ! The following code is structured in a way to keep waiting times for ! 
other PEs at a minimum, especially if there is only one block. @@ -1081,129 +1260,219 @@ subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_comm ! and sends the Householder vector and the first column as early ! as possible. - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) +#endif /* WITH_OPENMP */ + + do iblk=1,nblocks + + ns = na_s + (iblk-1)*nb - n_off ! first column in block + ne = ns+nb-1 ! last column in block + + if(ns+n_off>na) exit + + ! Store Householder vector for back transformation + + hh_cnt(iblk) = hh_cnt(iblk) + 1 + + hh_gath(1 ,hh_cnt(iblk),iblk) = tau + hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + +#ifndef WITH_OPENMP + if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then + ! Wait for last transfer to finish + + call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) + + ! Copy vectors into send buffer + hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) + ! Send to destination + call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & + global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & + 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) + ! Reset counter and increase destination row + hh_cnt(iblk) = 0 + hh_dst(iblk) = hh_dst(iblk)+1 + endif + + ! The following code is structured in a way to keep waiting times for + ! other PEs at a minimum, especially if there is only one block. + ! For this reason, it requests the last column as late as possible + ! and sends the Householder vector and the first column as early + ! as possible. +#endif + nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - ! Multiply diagonal block and subdiagonal block with Householder vector + ! Multiply diagonal block and subdiagonal block with Householder vector - if(iblk==nblocks .and. nc==nb) then + if(iblk==nblocks .and. nc==nb) then ! We need the last column from the next PE. ! First do the matrix multiplications without last column ... ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) + ab(1,ne) = 0 + call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) ! Subdiagonal block - if(nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) + if(nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) ! ... then request last column ... - call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) +#ifdef WITH_OPENMP + call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS,mpierr) +#else + call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) - ! ... and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else +#endif - ! Normal matrix multiply - call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) - if(nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) + ! ... and complete the result + hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) + hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - endif + else - ! Calculate first column of subdiagonal block and calculate new - ! Householder transformation for this column + ! 
Normal matrix multiply + call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1) + if(nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1) + + endif - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 + ! Calculate first column of subdiagonal block and calculate new + ! Householder transformation for this column - if(nr>0) then + hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb + tau_new = 0 - ! complete (old) Householder transformation for first column + if(nr>0) then - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 + ! complete (old) Householder transformation for first column + + ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 ! calculate new Householder transformation ... - if(nr>1) then - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif + if(nr>1) then + vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) + call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) + hv_new(1) = 1. + hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf + ab(nb+2:,ns) = 0 + endif - ! ... and send it away immediatly if this is the last block + ! ... and send it away immediatly if this is the last block - if(iblk==nblocks) then - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) - call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) + if(iblk==nblocks) then +#ifdef WITH_OPENMP + call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) +#else + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) +#endif + hv_s(1) = tau_new + hv_s(2:) = hv_new(2:) + call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr) + endif + endif - endif + ! Transform diagonal block + x = dot_product(hv(1:nc),hd(1:nc))*tau + hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) + if(my_pe>0 .and. iblk==1) then - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) - - if(my_pe>0 .and. iblk==1) then + ! The first column of the diagonal block has to be send to the previous PE + ! Calculate first column only ... - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... + ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) - - ! ... send it away ... - - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - ab_s(1:nb+1) = ab(1:nb+1,ns) - call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) - - ! ... and calculate remaining columns with rank-2 update - if(nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) - else - ! No need to send, just a rank-2 update - call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) - endif + ! ... send it away ... +#ifdef WITH_OPENMP + call mpi_wait(ireq_ab,MPI_STATUS,mpierr) +#else + call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) +#endif + ab_s(1:nb+1) = ab(1:nb+1,ns) + call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr) + + ! ... and calculate remaining columns with rank-2 update + if(nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1) + else + ! No need to send, just a rank-2 update + call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1) + endif + ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... 
nb - if(nr>0) then - if(nr>1) then - call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1) - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately the is no BLAS routine like DGER2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb + if(nr>0) then + if(nr>1) then + call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1) + x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new + h(2:nb) = h(2:nb) - x*hv(2:nb) + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update + do i=2,nb + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) + enddo + else + ! No double Householder transformation for nr=1, just complete the row + do i=2,nb ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) enddo endif endif - + ! Use new HH vector for the next block hv(:) = hv_new(:) tau = tau_new enddo +#ifdef WITH_OPENMP + endif + + + do iblk = 1, nblocks + + + + if(hh_dst(iblk) >= np_rows) exit + if(snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit + + if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then + ! Wait for last transfer to finish + call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) + ! Copy vectors into send buffer + hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) + ! Send to destination + call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & + global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & + 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) + ! Reset counter and increase destination row + hh_cnt(iblk) = 0 + hh_dst(iblk) = hh_dst(iblk)+1 + endif + enddo +#endif +enddo + + ! 
Finish the last outstanding requests +#ifdef WITH_OPENMP + call mpi_wait(ireq_ab,MPI_STATUS,mpierr) + call mpi_wait(ireq_hv,MPI_STATUS,mpierr) + + allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) + call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr) + call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr) + deallocate(mpi_statuses) +#else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) +#endif call mpi_barrier(mpi_comm,mpierr) @@ -1259,14 +1528,30 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows integer next_n, next_local_n, next_n_start, next_n_end integer bottom_msg_length, top_msg_length, next_top_msg_length integer stripe_width, last_stripe_width, stripe_count +#ifdef WITH_OPENMP + integer thread_width, csw, b_off, b_len +#endif integer num_result_blocks, num_result_buffers, num_bufs_recvd integer a_off, current_tv_off, max_blk_size integer mpierr, src, src_offset, dst, offset, nfact, num_blk +#ifdef WITH_OPENMP + integer mpi_status(MPI_STATUS_SIZE) +#endif logical flag +#ifdef WITH_OPENMP + real*8, allocatable :: a(:,:,:,:), row(:) +#else real*8, allocatable :: a(:,:,:), row(:) +#endif + +#ifdef WITH_OPENMP + real*8, allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) + real*8, allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) +#else real*8, allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) real*8, allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) +#endif real*8, allocatable :: result_buffer(:,:,:) real*8, allocatable :: bcast_buffer(:,:) @@ -1274,7 +1559,9 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows integer, allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer, allocatable :: top_send_request(:), bottom_send_request(:) integer, allocatable :: top_recv_request(:), bottom_recv_request(:) - +#ifdef WITH_OPENMP + integer, allocatable :: mpi_statuses(:,:) +#endif ! MPI send/recv tags, arbitrary integer, parameter :: bottom_recv_tag = 111 @@ -1285,10 +1572,18 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows real*8 kernel_time integer*8 kernel_flops +#ifdef WITH_OPENMP + integer max_threads, my_thread + integer omp_get_max_threads +#endif kernel_time = 1.d-100 kernel_flops = 0 +#ifdef WITH_OPENMP + max_threads = 1 + max_threads = omp_get_max_threads() +#endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) @@ -1310,16 +1605,30 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if(l_nev==0) then +#ifdef WITH_OPENMP + thread_width = 0 +#endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into ! every primary cache +#ifdef WITH_OPENMP + thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread +#endif stripe_width = 48 ! Must be a multiple of 4 +#ifdef WITH_OPENMP + stripe_count = (thread_width-1)/stripe_width + 1 +#else stripe_count = (l_nev-1)/stripe_width + 1 +#endif ! 
Adapt stripe width so that last one doesn't get too small +#ifdef WITH_OPENMP + stripe_width = (thread_width-1)/stripe_count + 1 +#else stripe_width = (l_nev-1)/stripe_count + 1 +#endif stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! last_stripe_width = l_nev - (stripe_count-1)*stripe_width endif @@ -1335,8 +1644,13 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows a_dim2 = max_blk_size + nbw !DEC$ ATTRIBUTES ALIGN: 64:: a +#ifdef WITH_OPENMP + allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) + ! a(:,:,:,:) should be set to 0 in a parallel region, not here! +#else allocate(a(stripe_width,a_dim2,stripe_count)) a(:,:,:) = 0 +#endif allocate(row(l_nev)) row(:) = 0 @@ -1347,6 +1661,17 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below +#ifdef WITH_OPENMP + ! Please note about the OMP usage below: + ! This is not for speed, but because we want the matrix a in the memory and + ! in the cache of the correct thread (if possible) + +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! + enddo +#endif + do ip = np_rows-1, 0, -1 if(my_prow == ip) then ! Receive my rows which have not yet been received @@ -1354,12 +1679,27 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if(src < my_prow) then +#ifdef WITH_OPENMP + call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(ip),my_thread) + enddo +#else call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call unpack_row(row,i-limits(ip)) +#endif elseif(src==my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(ip),my_thread) + enddo +#else call unpack_row(row,i-limits(ip)) +#endif endif enddo ! 
Send all rows which have not yet been send @@ -1388,8 +1728,16 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if(src == ip) then +#ifdef WITH_OPENMP + call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(my_prow),my_thread) + enddo +#else call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call unpack_row(row,i-limits(my_prow)) +#endif endif enddo endif @@ -1431,6 +1779,19 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL +#ifdef WITH_OPENMP + allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) + + top_border_send_buffer(:,:) = 0 + top_border_recv_buffer(:,:) = 0 + bottom_border_send_buffer(:,:) = 0 + bottom_border_recv_buffer(:,:) = 0 + + ! Initialize broadcast buffer +#else allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) @@ -1440,8 +1801,7 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows top_border_recv_buffer(:,:,:) = 0 bottom_border_send_buffer(:,:,:) = 0 bottom_border_recv_buffer(:,:,:) = 0 - - ! Initialize broadcast buffer +#endif allocate(bcast_buffer(nbw, max_blk_size)) bcast_buffer = 0 @@ -1484,8 +1844,15 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows if(sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & +#ifdef WITH_OPENMP + csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" + b_len = csw*nbw*max_threads + call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, & mpi_comm_rows, bottom_recv_request(i), mpierr) +#else + call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#endif enddo endif @@ -1505,15 +1872,42 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows if(current_local_n > 0) then do i = 1, stripe_count +#ifdef WITH_OPENMP + ! Get real stripe width for strip i; + ! The last OpenMP tasks may have an even smaller stripe with, + ! but we don't care about this, i.e. we send/recv a bit too much in this case. + ! 
csw: current_stripe_width + csw = min(stripe_width, thread_width-(i-1)*stripe_width) +#endif !wait_b if(current_n_end < current_n) then +#ifdef WITH_OPENMP + call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr) +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + n_off = current_local_n+a_off + b_len = csw*nbw + b_off = (my_thread-1)*b_len + a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & + reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) + enddo +#else call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) n_off = current_local_n+a_off a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) + +#endif if(next_n_end < next_n) then +#ifdef WITH_OPENMP + call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & + MPI_REAL8, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#endif endif endif @@ -1521,320 +1915,562 @@ subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, mpi_comm_rows !wait_t if(top_msg_length>0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr) +#else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) +#endif endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if(top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + call compute_hh_trafo(0, current_local_n, i, my_thread) + enddo +#else call compute_hh_trafo(0, current_local_n, i) - +#endif !send_b +#ifdef WITH_OPENMP + call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) + if(bottom_msg_length>0) then + n_off = current_local_n+nbw-bottom_msg_length+a_off + b_len = csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) + call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) + endif +#else call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) if(bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & + + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) endif - +#endif else !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + call compute_hh_trafo(current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread) + enddo + + !send_b + call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) + if(bottom_msg_length > 0) then + n_off = current_local_n+nbw-bottom_msg_length+a_off + b_len = csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) + call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & + 
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) + endif +#else call compute_hh_trafo(current_local_n - bottom_msg_length, bottom_msg_length, i) + + + !send_b call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) if(bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & + + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) endif +#endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call compute_hh_trafo(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread) + enddo +#else call compute_hh_trafo(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i) +#endif !wait_t if(top_msg_length>0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i), mpi_status, mpierr) +#else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) +#endif endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if(top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + call compute_hh_trafo(0, top_msg_length, i, my_thread) + enddo +#else call compute_hh_trafo(0, top_msg_length, i) +#endif endif if(next_top_msg_length > 0) then !request top_border data +#ifdef WITH_OPENMP + b_len = csw*next_top_msg_length*max_threads + call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, & + top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) +#else call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, & + top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) +#endif endif !send_t if(my_prow > 0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_send_request(i), mpi_status, mpierr) + b_len = csw*nbw*max_threads + top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) + call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, & + my_prow-1, bottom_recv_tag, & + mpi_comm_rows, top_send_request(i), mpierr) +#else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) + +#endif endif ! Care that there are not too many outstanding top_recv_request's if(stripe_count > 1) then if(i>1) then - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) - else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) - endif - endif +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr) +#else + call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) +#endif + else +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr) +#else + call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) +#endif + endif + endif - enddo + enddo - top_msg_length = next_top_msg_length - - else - ! 
wait for last top_send_request - do i = 1, stripe_count - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) - enddo - endif - - ! Care about the result + top_msg_length = next_top_msg_length - if(my_prow == 0) then + else + ! wait for last top_send_request + do i = 1, stripe_count +#ifdef WITH_OPENMP + call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr) +#else + call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) +#endif + enddo + endif - ! topmost process sends nbw rows to destination processes + ! Care about the result - do j=0,nfact-1 + if(my_prow == 0) then - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if(num_blk*nblk >= na) exit + ! topmost process sends nbw rows to destination processes - nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block + do j=0,nfact-1 - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) + num_blk = sweep*nfact+j ! global number of destination block, 0 based + if(num_blk*nblk >= na) exit - dst = mod(num_blk, np_rows) + nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block - if(dst == 0) then - do i = 1, min(na - num_blk*nblk, nblk) - call pack_row(row, j*nblk+i+a_off) - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - else - do i = 1, nblk - call pack_row(result_buffer(:,i,nbuf),j*nblk+i+a_off) - enddo - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) - endif - enddo +#ifdef WITH_OPENMP + call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr) +#else + call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) +#endif + dst = mod(num_blk, np_rows) + + if(dst == 0) then + do i = 1, min(na - num_blk*nblk, nblk) + call pack_row(row, j*nblk+i+a_off) + q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) + enddo + else + do i = 1, nblk + call pack_row(result_buffer(:,i,nbuf),j*nblk+i+a_off) + enddo + call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & + result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) + endif + enddo - else + else - ! receive and store final result + ! receive and store final result - do j = num_bufs_recvd, num_result_blocks-1 + do j = num_bufs_recvd, num_result_blocks-1 - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block + nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! outstanding requests + ! If there is still work to do, just test for the next result request + ! and leave the loop if it is not ready, otherwise wait for all + ! outstanding requests - if(next_local_n > 0) then - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) - if(.not.flag) exit - else - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) - endif + if(next_local_n > 0) then +#ifdef WITH_OPENMP + call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr) +#else + call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! 
Queue result buffer again if there are outstanding blocks left - if(j+num_result_buffers < num_result_blocks) & - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) +#endif + if(.not.flag) exit + else +#ifdef WITH_OPENMP + call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr) +#else + call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) +#endif + endif - enddo - num_bufs_recvd = j + ! Fill result buffer into q + num_blk = j*np_rows + my_prow ! global number of current block, 0 based + do i = 1, min(na - num_blk*nblk, nblk) + q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) + enddo - endif + ! Queue result buffer again if there are outstanding blocks left + if(j+num_result_buffers < num_result_blocks) & + call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & + mpi_comm_rows, result_recv_request(nbuf), mpierr) - ! Shift the remaining rows to the front of A (if necessary) + enddo + num_bufs_recvd = j - offset = nbw - top_msg_length - if(offset<0) then - print *,'internal error, offset for shifting = ',offset - call MPI_Abort(MPI_COMM_WORLD, 1, mpierr) - endif - a_off = a_off + offset - if(a_off + next_local_n + nbw > a_dim2) then - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) - enddo - enddo - a_off = 0 - endif + endif - enddo + ! Shift the remaining rows to the front of A (if necessary) - ! Just for safety: - if(ANY(top_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR top_send_request ***',my_prow,my_pcol - if(ANY(bottom_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR bottom_send_request ***',my_prow,my_pcol - if(ANY(top_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR top_recv_request ***',my_prow,my_pcol - if(ANY(bottom_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR bottom_recv_request ***',my_prow,my_pcol - - if(my_prow == 0) then - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) - endif + offset = nbw - top_msg_length + if(offset<0) then + print *,'internal error, offset for shifting = ',offset + call MPI_Abort(MPI_COMM_WORLD, 1, mpierr) + endif + a_off = a_off + offset + if(a_off + next_local_n + nbw > a_dim2) then +#ifdef WITH_OPENMP + !$omp parallel do private(my_thread, i, j), schedule(static, 1) + do my_thread = 1, max_threads + do i = 1, stripe_count + do j = top_msg_length+1, top_msg_length+next_local_n + A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) + enddo +#else + do i = 1, stripe_count + do j = top_msg_length+1, top_msg_length+next_local_n + A(:,j,i) = A(:,j+a_off,i) +#endif + enddo + enddo + a_off = 0 + endif - if(ANY(result_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR result_send_request ***',my_prow,my_pcol - if(ANY(result_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR result_recv_request ***',my_prow,my_pcol + enddo + + ! 
Just for safety: + if(ANY(top_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR top_send_request ***',my_prow,my_pcol + if(ANY(bottom_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR bottom_send_request ***',my_prow,my_pcol + if(ANY(top_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR top_recv_request ***',my_prow,my_pcol + if(ANY(bottom_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR bottom_recv_request ***',my_prow,my_pcol + + if(my_prow == 0) then +#ifdef WITH_OPENMP + allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) + call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) + deallocate(mpi_statuses) +#else + call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) +#endif + endif + + if(ANY(result_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR result_send_request ***',my_prow,my_pcol + if(ANY(result_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR result_recv_request ***',my_prow,my_pcol + + if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + print '(" Kernel time:",f10.3," MFlops: ",f10.3)', kernel_time, kernel_flops/kernel_time*1.d-6 + + ! deallocate all working space + + deallocate(a) + deallocate(row) + deallocate(limits) + deallocate(result_send_request) + deallocate(result_recv_request) + deallocate(top_border_send_buffer) + deallocate(top_border_recv_buffer) + deallocate(bottom_border_send_buffer) + deallocate(bottom_border_recv_buffer) + deallocate(result_buffer) + deallocate(bcast_buffer) + deallocate(top_send_request) + deallocate(top_recv_request) + deallocate(bottom_send_request) + deallocate(bottom_recv_request) + + contains + + subroutine pack_row(row, n) + real*8 row(:) + integer n, i, noff, nl +#ifdef WITH_OPENMP + integer nt +#endif - if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - print '(" Kernel time:",f10.3," MFlops: ",f10.3)', kernel_time, kernel_flops/kernel_time*1.d-6 +#ifdef WITH_OPENMP + do nt = 1, max_threads + do i = 1, stripe_count + noff = (nt-1)*thread_width + (i-1)*stripe_width + nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) + if(nl<=0) exit + row(noff+1:noff+nl) = a(1:nl,n,i,nt) + enddo + enddo +#else + do i=1,stripe_count + nl = merge(stripe_width, last_stripe_width, ina_s) then ! Receive Householder vector from previous task, from PE owning subdiagonal +#ifdef WITH_OPENMP + call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,mpi_status,mpierr) +#else call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr) +#endif tau = hv(1) hv(1) = 1. endif @@ -2582,6 +3254,173 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 n_off = n_off + nb endif +#ifdef WITH_OPENMP + if(max_threads > 1) then + + ! Codepath for OpenMP + + ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! + ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. + ! This simulates the behaviour of the MPI tasks which also work after each other. + ! The code would be considerably easier, if the MPI communication would be made within + ! the parallel region - this is avoided here since this would require + ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. + + hv_t(:,1) = hv + tau_t(1) = tau + + do iter = 1, 2 + + ! iter=1 : work on first block + ! iter=2 : work on remaining blocks + ! This is done in 2 iterations so that we have a barrier in between: + ! 
After the first iteration, it is guaranteed that the last row of the last block + ! is completed by the next thread. + ! After the first iteration it is also the place to exchange the last row + ! with MPI calls + +!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & +!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) + do my_thread = 1, max_threads + + if(iter == 1) then + my_block_s = omp_block_limits(my_thread-1) + 1 + my_block_e = my_block_s + else + my_block_s = omp_block_limits(my_thread-1) + 2 + my_block_e = omp_block_limits(my_thread) + endif + + do iblk = my_block_s, my_block_e + + ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block + ne = ns+nb-1 ! last column in block + + if(istepna) exit + + hv = hv_t(:,my_thread) + tau = tau_t(my_thread) + + ! Store Householder vector for back transformation + + hh_cnt(iblk) = hh_cnt(iblk) + 1 + + hh_gath(1 ,hh_cnt(iblk),iblk) = tau + hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + + nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block + nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) + ! Note that nr>=0 implies that diagonal block is full (nc==nb)! + + ! Transform diagonal block + + call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1) + + x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) + hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) + + call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1) + + hv_t(:,my_thread) = 0 + tau_t(my_thread) = 0 + + if(nr<=0) cycle ! No subdiagonal block present any more + + ! Transform subdiagonal block + + call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) + + if(nr>1) then + + ! complete (old) Householder transformation for first column + + ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 + + ! calculate new Householder transformation for first column + ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) + + vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) + call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) + hv_t(1 ,my_thread) = 1. + hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf + ab(nb+2:,ns) = 0 + + ! update subdiagonal block for old and new Householder transformation + ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster + + call ZGEMV('C',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,(0.d0,0.d0),h(2),1) + x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) + h(2:nb) = h(2:nb) - x*hv(2:nb) + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") + do i=2,nb + ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) + enddo + + else + + ! No new Householder transformation for nr=1, just complete the old one + ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 + do i=2,nb + ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) + enddo + ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) + hv_t(1,my_thread) = 1. + + endif + + enddo + + enddo ! my_thread +!$omp end parallel do + + if (iter==1) then + ! We are at the end of the first block + + ! Send our first column to previous PE + if(my_pe>0 .and. na_s <= na) then + call mpi_wait(ireq_ab,mpi_status,mpierr) + ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) + call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) + endif + + ! 
Request last column from next PE + ne = na_s + nblocks*nb - (max_threads-1) - 1 + if(istep>=max_threads .and. ne <= na) then + call mpi_recv(ab(1,ne-n_off),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) + endif + + else + ! We are at the end of all blocks + + ! Send last HH vector and TAU to next PE if it has been calculated above + ne = na_s + nblocks*nb - (max_threads-1) - 1 + if(istep>=max_threads .and. ne < na) then + call mpi_wait(ireq_hv,mpi_status,mpierr) + hv_s(1) = tau_t(max_threads) + hv_s(2:) = hv_t(2:,max_threads) + call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) + endif + + ! "Send" HH vector and TAU to next OpenMP thread + do my_thread = max_threads, 2, -1 + hv_t(:,my_thread) = hv_t(:,my_thread-1) + tau_t(my_thread) = tau_t(my_thread-1) + enddo + + endif + enddo ! iter + + else + + ! Codepath for 1 thread without OpenMP + + ! The following code is structured in a way to keep waiting times for + ! other PEs at a minimum, especially if there is only one block. + ! For this reason, it requests the last column as late as possible + ! and sends the Householder vector and the first column as early + ! as possible. + +#endif do iblk=1,nblocks @@ -2597,6 +3436,8 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c hh_gath(1 ,hh_cnt(iblk),iblk) = tau hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) + +#ifndef WITH_OPENMP if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then ! Wait for last transfer to finish call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) @@ -2611,16 +3452,19 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c hh_dst(iblk) = hh_dst(iblk)+1 endif + ! The following code is structured in a way to keep waiting times for ! other PEs at a minimum, especially if there is only one block. ! For this reason, it requests the last column as late as possible ! and sends the Householder vector and the first column as early ! as possible. +#endif nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) ! Note that nr>=0 implies that diagonal block is full (nc==nb)! + ! Multiply diagonal block and subdiagonal block with Householder vector if(iblk==nblocks .and. nc==nb) then @@ -2636,8 +3480,12 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c if(nr>0) call ZGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1) ! ... then request last column ... - call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) +#ifdef WITH_OPENMP + call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr) +#else + call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr) +#endif ! ... and complete the result hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau @@ -2674,7 +3522,11 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c ! ... 
and send it away immediatly if this is the last block if(iblk==nblocks) then +#ifdef WITH_OPENMP + call mpi_wait(ireq_hv,mpi_status,mpierr) +#else call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) +#endif hv_s(1) = tau_new hv_s(2:) = hv_new(2:) call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr) @@ -2695,8 +3547,11 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1)) ! ... send it away ... - +#ifdef WITH_OPENMP + call mpi_wait(ireq_ab,mpi_status,mpierr) +#else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) +#endif ab_s(1:nb+1) = ab(1:nb+1,ns) call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr) @@ -2714,7 +3569,7 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c call ZGEMV('C',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,(0.d0,0.d0),h(2),1) x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately the is no BLAS routine like DGER2 for a nonsymmetric rank 2 update + ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update do i=2,nb ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) enddo @@ -2731,16 +3586,55 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c tau = tau_new enddo +#ifdef WITH_OPENMP + endif +#endif + +#ifdef WITH_OPENMP + do iblk = 1, nblocks + + + if(hh_dst(iblk) >= np_rows) exit + if(snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit + + if(hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then + ! Wait for last transfer to finish + call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) + ! Copy vectors into send buffer + hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) + ! Send to destination + call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, & + global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), & + 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) + ! Reset counter and increase destination row + hh_cnt(iblk) = 0 + hh_dst(iblk) = hh_dst(iblk)+1 + endif + + enddo +#endif enddo + + ! 
Finish the last outstanding requests +#ifdef WITH_OPENMP + call mpi_wait(ireq_ab,mpi_status,mpierr) + call mpi_wait(ireq_hv,mpi_status,mpierr) + + allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks))) + call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr) + call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr) + deallocate(mpi_statuses) +#else call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) +#endif call mpi_barrier(mpi_comm,mpierr) deallocate(ab) @@ -2751,7 +3645,7 @@ subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, mpi_comm_rows, mpi_c deallocate(block_limits) deallocate(global_id) -end subroutine + end subroutine tridiag_band_complex !--------------------------------------------------------------------------------------------------- @@ -2795,14 +3689,26 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r integer next_n, next_local_n, next_n_start, next_n_end integer bottom_msg_length, top_msg_length, next_top_msg_length integer stripe_width, last_stripe_width, stripe_count +#ifdef WITH_OPENMP + integer thread_width, csw, b_off, b_len +#endif integer num_result_blocks, num_result_buffers, num_bufs_recvd integer a_off, current_tv_off, max_blk_size integer mpierr, src, src_offset, dst, offset, nfact, num_blk logical flag +#ifdef WITH_OPENMP + complex*16, allocatable :: a(:,:,:,:), row(:) +#else complex*16, allocatable :: a(:,:,:), row(:) +#endif +#ifdef WITH_OPENMP + complex*16, allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) + complex*16, allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) +#else complex*16, allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) complex*16, allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) +#endif complex*16, allocatable :: result_buffer(:,:,:) complex*16, allocatable :: bcast_buffer(:,:) @@ -2810,6 +3716,10 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r integer, allocatable :: result_send_request(:), result_recv_request(:), limits(:) integer, allocatable :: top_send_request(:), bottom_send_request(:) integer, allocatable :: top_recv_request(:), bottom_recv_request(:) +#ifdef WITH_OPENMP + integer, allocatable :: mpi_statuses(:,:) + integer mpi_status(MPI_STATUS_SIZE) +#endif ! MPI send/recv tags, arbitrary @@ -2817,6 +3727,11 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r integer, parameter :: top_recv_tag = 222 integer, parameter :: result_recv_tag = 333 +#ifdef WITH_OPENMP + integer :: max_threads, my_thread + integer :: omp_get_max_threads +#endif + ! Just for measuring the kernel performance real*8 kernel_time integer*8 kernel_flops @@ -2825,6 +3740,10 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r kernel_time = 1.d-100 kernel_flops = 0 +#ifdef WITH_OPENMP + max_threads = 1 + max_threads = omp_get_max_threads() +#endif call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) @@ -2846,17 +3765,34 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) if(l_nev==0) then +#ifdef WITH_OPENMP + thread_width = 0 +#endif stripe_width = 0 stripe_count = 0 last_stripe_width = 0 else ! 
Suggested stripe width is 48 - should this be reduced for the complex case ??? +#ifdef WITH_OPENMP + thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread +#endif + stripe_width = 48 ! Must be a multiple of 4 +#ifdef WITH_OPENMP + stripe_count = (thread_width-1)/stripe_width + 1 +#else stripe_count = (l_nev-1)/stripe_width + 1 +#endif ! Adapt stripe width so that last one doesn't get too small +#ifdef WITH_OPENMP + stripe_width = (thread_width-1)/stripe_count + 1 +#else stripe_width = (l_nev-1)/stripe_count + 1 +#endif stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! +#ifndef WITH_OPENMP last_stripe_width = l_nev - (stripe_count-1)*stripe_width +#endif endif ! Determine the matrix distribution at the beginning @@ -2870,8 +3806,13 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r a_dim2 = max_blk_size + nbw !DEC$ ATTRIBUTES ALIGN: 64:: a +#ifdef WITH_OPENMP + allocate(a(stripe_width,a_dim2,stripe_count,max_threads)) + ! a(:,:,:,:) should be set to 0 in a parallel region, not here! +#else allocate(a(stripe_width,a_dim2,stripe_count)) a(:,:,:) = 0 +#endif allocate(row(l_nev)) row(:) = 0 @@ -2882,6 +3823,16 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r ! The peculiar way it is done below is due to the fact that the last row should be ! ready first since it is the first one to start below +#ifdef WITH_OPENMP + ! Please note about the OMP usage below: + ! This is not for speed, but because we want the matrix a in the memory and + ! in the cache of the correct thread (if possible) + +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation! + enddo +#endif do ip = np_rows-1, 0, -1 if(my_prow == ip) then ! Receive my rows which have not yet been received @@ -2889,12 +3840,32 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r do i=limits(ip)+1,limits(ip+1) src = mod((i-1)/nblk, np_rows) if(src < my_prow) then +#ifdef WITH_OPENMP + call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) + +#else call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) +#endif + +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(ip),my_thread) + enddo +#else call unpack_row(row,i-limits(ip)) - elseif(src==my_prow) then +#endif + elseif(src==my_prow) then src_offset = src_offset+1 row(:) = q(src_offset, 1:l_nev) +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(ip),my_thread) + enddo +#else call unpack_row(row,i-limits(ip)) +#endif endif enddo ! 
Send all rows which have not yet been send @@ -2923,8 +3894,20 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r do i=limits(my_prow)+1,limits(my_prow+1) src = mod((i-1)/nblk, np_rows) if(src == ip) then +#ifdef WITH_OPENMP + call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr) +#else call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) +#endif + +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call unpack_row(row,i-limits(my_prow),my_thread) + enddo +#else call unpack_row(row,i-limits(my_prow)) +#endif endif enddo endif @@ -2966,6 +3949,17 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r bottom_send_request(:) = MPI_REQUEST_NULL bottom_recv_request(:) = MPI_REQUEST_NULL +#ifdef WITH_OPENMP + allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count)) + allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count)) + + top_border_send_buffer(:,:) = 0 + top_border_recv_buffer(:,:) = 0 + bottom_border_send_buffer(:,:) = 0 + bottom_border_recv_buffer(:,:) = 0 +#else allocate(top_border_send_buffer(stripe_width, nbw, stripe_count)) allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count)) allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count)) @@ -2975,6 +3969,7 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r top_border_recv_buffer(:,:,:) = 0 bottom_border_send_buffer(:,:,:) = 0 bottom_border_recv_buffer(:,:,:) = 0 +#endif ! Initialize broadcast buffer @@ -3019,8 +4014,17 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r if(sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then do i = 1, stripe_count +#ifdef WITH_OPENMP + csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" + b_len = csw*nbw*max_threads + call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & + + mpi_comm_rows, bottom_recv_request(i), mpierr) +#endif enddo endif @@ -3041,14 +4045,45 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r do i = 1, stripe_count +#ifdef WITH_OPENMP + ! Get real stripe width for strip i; + ! The last OpenMP tasks may have an even smaller stripe with, + ! but we don't care about this, i.e. we send/recv a bit too much in this case. + ! 
csw: current_stripe_width + + csw = min(stripe_width, thread_width-(i-1)*stripe_width) +#endif + !wait_b if(current_n_end < current_n) then +#ifdef WITH_OPENMP + call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr) +#else call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) +#endif +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + n_off = current_local_n+a_off + b_len = csw*nbw + b_off = (my_thread-1)*b_len + a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & + reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) + enddo +#else n_off = current_local_n+a_off a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) +#endif if(next_n_end < next_n) then +#ifdef WITH_OPENMP + call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & + MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#else call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, & + mpi_comm_rows, bottom_recv_request(i), mpierr) +#endif endif endif @@ -3056,69 +4091,180 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r !wait_t if(top_msg_length>0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i), mpi_status, mpierr) +#else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) +#endif +#ifndef WITH_OPENMP a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) +#endif endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if(top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + call compute_hh_trafo(0, current_local_n, i, my_thread) + enddo +#else call compute_hh_trafo(0, current_local_n, i) - +#endif !send_b +#ifdef WITH_OPENMP + call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) +#else call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) +#endif if(bottom_msg_length>0) then n_off = current_local_n+nbw-bottom_msg_length+a_off +#ifdef WITH_OPENMP + b_len = csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) + call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) +#else bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & + + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) +#endif endif else !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + call compute_hh_trafo(current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread) + enddo +#else call compute_hh_trafo(current_local_n - bottom_msg_length, bottom_msg_length, i) + +#endif !send_b +#ifdef WITH_OPENMP + call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) +#else + call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) +#endif if(bottom_msg_length > 0) then n_off = current_local_n+nbw-bottom_msg_length+a_off +#ifdef WITH_OPENMP + b_len = 
csw*bottom_msg_length*max_threads + bottom_border_send_buffer(1:b_len,i) = & + reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) + call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, & + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) +#else bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, & + + top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) +#endif endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread), schedule(static, 1) + do my_thread = 1, max_threads + call compute_hh_trafo(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread) + enddo +#else call compute_hh_trafo(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i) +#endif !wait_t if(top_msg_length>0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i), mpi_status, mpierr) +#else call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) +#endif +#ifndef WITH_OPENMP a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) + +#endif endif !compute +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) + do my_thread = 1, max_threads + if(top_msg_length>0) then + b_len = csw*top_msg_length + b_off = (my_thread-1)*b_len + a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & + reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) + endif + call compute_hh_trafo(0, top_msg_length, i, my_thread) + enddo +#else call compute_hh_trafo(0, top_msg_length, i) +#endif endif if(next_top_msg_length > 0) then !request top_border data +#ifdef WITH_OPENMP + b_len = csw*next_top_msg_length*max_threads + call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & + top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) +#else call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & + top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) +#endif endif !send_t if(my_prow > 0) then +#ifdef WITH_OPENMP + call MPI_Wait(top_send_request(i), mpi_status, mpierr) +#else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) +#endif + +#ifdef WITH_OPENMP + b_len = csw*nbw*max_threads + top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) + call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & + my_prow-1, bottom_recv_tag, & + mpi_comm_rows, top_send_request(i), mpierr) +#else top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & mpi_comm_rows, top_send_request(i), mpierr) + +#endif endif ! Care that there are not too many outstanding top_recv_request's if(stripe_count > 1) then if(i>1) then +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) +#else call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) +#endif else +#ifdef WITH_OPENMP + call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) +#else call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) +#endif endif endif @@ -3129,7 +4275,11 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r else ! 
wait for last top_send_request do i = 1, stripe_count +#ifdef WITH_OPENMP + call MPI_Wait(top_send_request(i), mpi_status, mpierr) +#else call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) +#endif enddo endif @@ -3146,7 +4296,11 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block +#ifdef WITH_OPENMP + call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) +#else call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) +#endif dst = mod(num_blk, np_rows) @@ -3177,10 +4331,21 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r ! outstanding requests if(next_local_n > 0) then +#ifdef WITH_OPENMP + call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) + +#else call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) +#endif if(.not.flag) exit else +#ifdef WITH_OPENMP + call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) + +#else + call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) +#endif endif ! Fill result buffer into q @@ -3208,9 +4373,18 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r endif a_off = a_off + offset if(a_off + next_local_n + nbw > a_dim2) then +#ifdef WITH_OPENMP +!$omp parallel do private(my_thread, i, j), schedule(static, 1) + do my_thread = 1, max_threads + do i = 1, stripe_count + do j = top_msg_length+1, top_msg_length+next_local_n + A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) + enddo +#else do i = 1, stripe_count do j = top_msg_length+1, top_msg_length+next_local_n A(:,j,i) = A(:,j+a_off,i) +#endif enddo enddo a_off = 0 @@ -3225,7 +4399,13 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r if(ANY(bottom_recv_request /= MPI_REQUEST_NULL)) print *,'*** ERROR bottom_recv_request ***',my_prow,my_pcol if(my_prow == 0) then +#ifdef WITH_OPENMP + allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers)) + call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) + deallocate(mpi_statuses) +#else call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) +#endif endif if(ANY(result_send_request /= MPI_REQUEST_NULL)) print *,'*** ERROR result_send_request ***',my_prow,my_pcol @@ -3254,6 +4434,22 @@ subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, mpi_comm_r contains +#ifdef WITH_OPENMP + subroutine pack_row(row, n) + complex*16 row(:) + integer n, i, noff, nl, nt + + do nt = 1, max_threads + do i = 1, stripe_count + noff = (nt-1)*thread_width + (i-1)*stripe_width + nl = min(stripe_width, nt*thread_width-noff, l_nev-noff) + if(nl<=0) exit + row(noff+1:noff+nl) = a(1:nl,n,i,nt) + enddo + enddo + + end subroutine pack_row +#else subroutine pack_row(row, n) complex*16 row(:) integer n, i, noff, nl @@ -3262,25 +4458,57 @@ contains nl = merge(stripe_width, last_stripe_width, i0^$RnETw$}+b|p{| z*b>+r*z_|ga(PIKT;EYvyFA!Fv$$Q5+M2|}fynKwQZ0DP9&8JWN7x+~$U$+G-N_4z zA2<)(kL@IFi@jJ|clU!VEG@O2ja*$Q#lEsi-(HAZwFZ2NfOB0U5J}t@aBfKK9}lnI zUgTW3k8=@!``0e{hCLFsXwqHwjhZV(PS|~)uscfZg-e)e?6U7DMhDwYw%zvL3AFVG zV%M@ZXa)OeiQ@eO;b%%U$1OHi5tTLhj9t>2dR)tUmv=cA3T({fm=lRv;+God%5z+n zc&t%%sh+k^X?z9jU`$IT=^YBWkwi{xDAZeSfhs6*{d;*qkpyEG^v zwQmv+%bG3QGA?7oKS;IX(D-dP4Ri;zmy$9D9Fn!*U97j)zMbm}`U^vUU~ZH(dsgN~ zn#Ai_&{SCT#2;$*US~{s$Q7M@ zC|AM84Wfop)LF_`;Oh%e54I9Fmd-L?2X^ejFUqbc=fLfdbAM=je4IFi7q{3+ADnL& zORD)CByPvm 
[GIT binary patch data for the deleted object file src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.o omitted]
diff --git a/ELPA_development_version_OpenMP/test/test_complex.f90 b/ELPA_development_version_OpenMP/test/test_complex.F90
similarity index 100%
rename from ELPA_development_version_OpenMP/test/test_complex.f90
rename to ELPA_development_version_OpenMP/test/test_complex.F90
diff --git a/ELPA_development_version_OpenMP/test/test_complex2.f90 b/ELPA_development_version_OpenMP/test/test_complex2.F90
similarity index 100%
rename from ELPA_development_version_OpenMP/test/test_complex2.f90
rename to ELPA_development_version_OpenMP/test/test_complex2.F90
diff --git a/ELPA_development_version_OpenMP/test/test_real.f90 b/ELPA_development_version_OpenMP/test/test_real.F90
similarity index 100%
rename from ELPA_development_version_OpenMP/test/test_real.f90
rename to ELPA_development_version_OpenMP/test/test_real.F90
diff --git a/ELPA_development_version_OpenMP/test/test_real2.f90 b/ELPA_development_version_OpenMP/test/test_real2.F90
similarity index 100%
rename from ELPA_development_version_OpenMP/test/test_real2.f90
rename to ELPA_development_version_OpenMP/test/test_real2.F90
--
GitLab
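
The OpenMP code path in trans_ev_tridi_to_band_* partitions the local eigenvectors before allocating the work array a: thread_width eigenvectors per OpenMP thread, split into stripe_count stripes whose width is rounded to a multiple of 4. The standalone sketch below reproduces exactly that integer arithmetic (the formulas are copied from the patch) so the resulting sizes can be inspected in isolation; the values chosen for l_nev and max_threads are examples only and do not come from a real run.

! Sketch of the stripe/thread partitioning used in trans_ev_tridi_to_band_*
! when compiled WITH_OPENMP.  Only the arithmetic is reproduced here;
! l_nev and max_threads are illustrative values.
program stripe_partitioning
   implicit none
   integer :: l_nev, max_threads
   integer :: thread_width, stripe_width, stripe_count

   l_nev       = 1000   ! local number of eigenvectors (example)
   max_threads = 4      ! number of OpenMP threads (example)

   ! number of eigenvectors per OpenMP thread (rounded up)
   thread_width = (l_nev-1)/max_threads + 1

   ! suggested stripe width is 48; must be a multiple of 4
   stripe_width = 48
   stripe_count = (thread_width-1)/stripe_width + 1

   ! adapt stripe width so that the last one does not get too small,
   ! then round up to the next multiple of 4 again
   stripe_width = (thread_width-1)/stripe_count + 1
   stripe_width = ((stripe_width+3)/4)*4

   print '(a,i0)', 'thread_width = ', thread_width
   print '(a,i0)', 'stripe_count = ', stripe_count
   print '(a,i0)', 'stripe_width = ', stripe_width
end program stripe_partitioning

For the example values this prints thread_width = 250, stripe_count = 6 and stripe_width = 44, i.e. every thread owns 6 stripes of at most 44 eigenvectors each.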
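
The comment "a(:,:,:,:) should be set to 0 in a parallel region, not here!" and the schedule(static, 1) initialisation loop implement first-touch placement: each thread zeroes the slice a(:,:,:,my_thread) it will later work on, so that the corresponding memory pages are mapped close to that thread. A minimal sketch of the same idiom, with made-up dimensions, is:

! First-touch initialisation of a per-thread work array: allocate once,
! then let every thread zero its own slice inside a parallel region.
! The dimensions below are examples only.
program first_touch_sketch
!$ use omp_lib
   implicit none
   integer :: stripe_width, a_dim2, stripe_count, max_threads, my_thread
   real*8, allocatable :: a(:,:,:,:)

   stripe_width = 48
   a_dim2       = 64
   stripe_count = 2
   max_threads  = 1
!$ max_threads  = omp_get_max_threads()

   allocate(a(stripe_width,a_dim2,stripe_count,max_threads))
   ! do NOT initialise a here - each thread touches its own slice instead:

!$omp parallel do private(my_thread), schedule(static, 1)
   do my_thread = 1, max_threads
      a(:,:,:,my_thread) = 0   ! first touch by the owning thread
   enddo
!$omp end parallel do

   print *, 'initialised', size(a), 'elements on', max_threads, 'threads'
   deallocate(a)
end program first_touch_sketch

Compiled with OpenMP enabled (e.g. -fopenmp) the slices are touched by different threads; without OpenMP the !$ lines are ignored and the program still runs serially.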
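
With OpenMP the border send/receive buffers become two-dimensional (one flat column per stripe), and each thread's csw x nbw block is packed into, and recovered from, a contiguous buffer with reshape around the MPI transfers. The sketch below shows only that pack/unpack round trip on a simplified per-thread array (three dimensions instead of four, and no MPI); all sizes are invented for illustration.

! Reshape-based packing of per-thread blocks into one contiguous buffer,
! as done for bottom/top_border_send_buffer in the OpenMP code path.
! Sizes are illustrative; the MPI calls themselves are omitted.
program border_pack_sketch
   implicit none
   integer, parameter :: csw = 4, nbw = 3, max_threads = 2
   integer :: my_thread, b_len, b_off
   real*8  :: a(csw, nbw, max_threads)
   real*8  :: send_buffer(csw*nbw*max_threads)
   real*8  :: a_copy(csw, nbw, max_threads)

   call random_number(a)

   ! pack: all threads' blocks end up back to back in one buffer
   b_len = csw*nbw*max_threads
   send_buffer(1:b_len) = reshape(a, (/ b_len /))

   ! unpack: every thread restores its own block from its offset
   do my_thread = 1, max_threads
      b_off = (my_thread-1)*csw*nbw
      a_copy(:,:,my_thread) = reshape(send_buffer(b_off+1:b_off+csw*nbw), (/ csw, nbw /))
   enddo

   print *, 'max pack/unpack error:', maxval(abs(a - a_copy))
end program border_pack_sketch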
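
Where the serial code passes MPI_STATUS_IGNORE or MPI_STATUSES_IGNORE, the OpenMP code path declares an explicit mpi_status(MPI_STATUS_SIZE) array, or allocates mpi_statuses(MPI_STATUS_SIZE, n) for MPI_Waitall and releases it afterwards. The self-contained sketch below shows only that declaration and usage pattern with a single self-directed message; it is an illustration and not part of ELPA.

! Explicit status arrays for MPI_Wait / MPI_Waitall, mirroring the
! declarations used in the WITH_OPENMP branches of this patch.
! The message is sent from a rank to itself purely so that the
! wait calls have something to complete.
program status_array_sketch
   implicit none
   include 'mpif.h'
   integer :: mpierr, my_rank
   integer :: ireq(2)
   integer :: mpi_status(MPI_STATUS_SIZE)
   integer, allocatable :: mpi_statuses(:,:)
   real*8  :: sbuf(4), rbuf(4)

   call MPI_Init(mpierr)
   call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, mpierr)

   sbuf = 1.d0
   call MPI_Irecv(rbuf, 4, MPI_REAL8, my_rank, 1, MPI_COMM_WORLD, ireq(1), mpierr)
   call MPI_Isend(sbuf, 4, MPI_REAL8, my_rank, 1, MPI_COMM_WORLD, ireq(2), mpierr)

   ! single request: explicit status array instead of MPI_STATUS_IGNORE
   call MPI_Wait(ireq(1), mpi_status, mpierr)

   ! several requests: allocate MPI_STATUS_SIZE x nrequests, as in the patch
   allocate(mpi_statuses(MPI_STATUS_SIZE, 1))
   call MPI_Waitall(1, ireq(2:2), mpi_statuses, mpierr)
   deallocate(mpi_statuses)

   call MPI_Finalize(mpierr)
end program status_array_sketch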