diff --git a/Doxyfile.in b/Doxyfile.in
index 69a562fcaaf6d7a18c3140846d7beab9d2e208c9..a86a7207fdada0c72106ffe6484f7673d8888b64 100644
--- a/Doxyfile.in
+++ b/Doxyfile.in
@@ -765,7 +765,7 @@ WARN_LOGFILE =
 # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
-INPUT = @top_srcdir@/src @top_srcdir@/test @builddir@/elpa
+INPUT = @top_srcdir@/src @top_srcdir@/test @builddir@/elpa @builddir@/config-f90.h
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
@@ -2014,7 +2014,7 @@ ENABLE_PREPROCESSING = YES
 # The default value is: NO.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-MACRO_EXPANSION = NO
+MACRO_EXPANSION = YES
 # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
 # the macro expansion is limited to the macros specified with the PREDEFINED and
@@ -2036,7 +2036,7 @@ SEARCH_INCLUDES = YES
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
-INCLUDE_PATH =
+INCLUDE_PATH = @builddir@ @builddir@/elpa
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
diff --git a/Makefile.am b/Makefile.am
index a43928463297a1bb9a429a944f414bc4d7d4a7ea..f130e01ccc3012f5a45a5fe49edd032961760b13 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -29,7 +29,7 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.F90 \
   src/elpa2.F90 \
   src/elpa_c_interface.F90 \
   src/elpa_qr/qr_utils.F90 \
-  src/elpa_qr/elpa_qrkernels.f90 \
+  src/elpa_qr/elpa_qrkernels.F90 \
   src/elpa_qr/elpa_pdlarfb.F90 \
   src/elpa_qr/elpa_pdgeqrf.F90
 if HAVE_DETAILED_TIMINGS
@@ -117,15 +117,28 @@ nobase_elpa_include_HEADERS = $(wildcard modules/*)
 nobase_elpa_include_HEADERS += elpa/elpa.h elpa/elpa_kernel_constants.h elpa/elpa_generated.h
 man_MANS = man/solve_evp_real.3 \
-           man/solve_evp_real_1stage.3 \
+           man/solve_evp_real_1stage_double.3 \
            man/solve_evp_complex.3 \
-           man/solve_evp_complex_1stage.3 \
-           man/solve_evp_real_2stage.3 \
-           man/solve_evp_complex_2stage.3 \
+           man/solve_evp_complex_1stage_double.3 \
+           man/solve_evp_real_2stage_double.3 \
+           man/solve_evp_complex_2stage_double.3 \
            man/get_elpa_row_col_comms.3 \
            man/get_elpa_communicators.3 \
            man/print_available_elpa2_kernels.1
+if WANT_SINGLE_PRECISION_REAL
+
+man_MANS += man/solve_evp_real_1stage_single.3 \
+            man/solve_evp_real_2stage_single.3
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+
+man_MANS += man/solve_evp_complex_1stage_single.3 \
+            man/solve_evp_complex_2stage_single.3
+endif
+
+
 # other files to distribute
 filesdir = $(docdir)/examples
 dist_files_DATA = \
@@ -142,6 +155,34 @@ dist_files_DATA = \
   test/fortran_test_programs/test_real_with_c.F90 \
   src/print_available_elpa2_kernels.F90
+if WANT_SINGLE_PRECISION_COMPLEX
+dist_files_DATA += test/fortran_test_programs/test_complex2_single_precision.F90 \
+  test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 \
+  test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 \
+  test/fortran_test_programs/test_complex_single_precision.F90
+endif
+
+if WANT_SINGLE_PRECISION_REAL
+dist_files_DATA += test/fortran_test_programs/test_real2_single_precision.F90 \
+  test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 \
+  test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 \
+  test/fortran_test_programs/test_real2_choose_kernel_with_api_single_precision.F90 \
+  test/fortran_test_programs/test_real_single_precision.F90
+endif
+
+if WITH_GPU_VERSION
+dist_files_DATA += test/fortran_test_programs/test_real2_gpu_version.F90 \
+  test/fortran_test_programs/test_complex2_gpu_version.F90
+
+if WANT_SINGLE_PRECISION_REAL
+dist_files_DATA += test/fortran_test_programs/test_real2_gpu_version_single_precision.F90
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+dist_files_DATA += test/fortran_test_programs/test_complex2_gpu_version_single_precision.F90
+endif
+
+endif
 dist_doc_DATA = README COPYING/COPYING COPYING/gpl.txt COPYING/lgpl.txt
 # pkg-config stuff
@@ -156,6 +197,18 @@ bin_PROGRAMS = \
   elpa2_test_complex@SUFFIX@ \
   elpa2_print_kernels@SUFFIX@
+if WANT_SINGLE_PRECISION_COMPLEX
+bin_PROGRAMS += \
+  elpa1_test_complex_single_precision@SUFFIX@ \
+  elpa2_test_complex_single_precision@SUFFIX@
+endif
+
+if WANT_SINGLE_PRECISION_REAL
+bin_PROGRAMS += \
+  elpa1_test_real_single_precision@SUFFIX@ \
+  elpa2_test_real_single_precision@SUFFIX@
+endif
+
 noinst_PROGRAMS = \
   elpa2_test_real_default_kernel@SUFFIX@ \
   elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@ \
@@ -168,6 +221,36 @@ noinst_PROGRAMS = \
   elpa2_test_complex_c_version@SUFFIX@ \
   elpa1_test_real_with_c@SUFFIX@
+if WANT_SINGLE_PRECISION_COMPLEX
+noinst_PROGRAMS += \
+  elpa2_test_complex_default_kernel_single_precision@SUFFIX@ \
+  elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@
+endif
+
+if WANT_SINGLE_PRECISION_REAL
+noinst_PROGRAMS += \
+  elpa2_test_real_default_kernel_single_precision@SUFFIX@ \
+  elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@ \
+  elpa2_test_real_choose_kernel_with_api_single_precision@SUFFIX@
+endif
+
+if WITH_GPU_VERSION
+noinst_PROGRAMS += \
+  elpa2_test_complex_gpu_version@SUFFIX@ \
+  elpa2_test_real_gpu_version@SUFFIX@
+
+if WANT_SINGLE_PRECISION_REAL
+noinst_PROGRAMS += \
+  elpa2_test_real_gpu_version_single_precision@SUFFIX@
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+noinst_PROGRAMS += \
+  elpa2_test_complex_gpu_version_single_precision@SUFFIX@
+endif
+
+endif
+
 build_lib = libelpa@SUFFIX@.la
@@ -212,42 +295,84 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
 #elpa1_test_complex_with_c@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_real@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_real@SUFFIX@_LDFLAGS = -static
 elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_real_default_kernel@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_real_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_real_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib)
 elpa1_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex.F90 $(shared_sources) $(redirect_sources)
-
 elpa1_test_complex@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_complex@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_complex@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_complex_default_kernel@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_complex_default_kernel@SUFFIX@_LDADD = $(build_lib)
 elpa2_test_complex_choose_kernel_with_api@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_test_complex_choose_kernel_with_api@SUFFIX@_LDADD = $(build_lib)
 elpa2_print_kernels@SUFFIX@_SOURCES = src/print_available_elpa2_kernels.F90 $(shared_sources) $(redirect_sources)
-
 elpa2_print_kernels@SUFFIX@_LDADD = $(build_lib)
+if WANT_SINGLE_PRECISION_REAL
+elpa1_test_real_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa1_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_real_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_single_precision@SUFFIX@_LDFLAGS = -static
+elpa2_test_real_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_real_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_default_kernel_qr_decomposition_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_real_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib)
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa1_test_complex_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa1_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_complex_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_complex_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_complex_default_kernel_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_default_kernel_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_complex_default_kernel_single_precision@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_choose_kernel_with_api_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@_LDADD = $(build_lib)
+endif
+
+if WITH_GPU_VERSION
+elpa2_test_real_gpu_version@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_gpu_version.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_gpu_version@SUFFIX@_LDADD = $(build_lib)
+
+elpa2_test_complex_gpu_version@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_gpu_version.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_complex_gpu_version@SUFFIX@_LDADD = $(build_lib)
+
+if WANT_SINGLE_PRECISION_REAL
+elpa2_test_real_gpu_version_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_real2_gpu_version_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_real_gpu_version_single_precision@SUFFIX@_LDADD = $(build_lib)
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa2_test_complex_gpu_version_single_precision@SUFFIX@_SOURCES = test/fortran_test_programs/test_complex2_gpu_version_single_precision.F90 $(shared_sources) $(redirect_sources)
+elpa2_test_complex_gpu_version_single_precision@SUFFIX@_LDADD = $(build_lib)
+endif
+
+endif
+
 check_SCRIPTS = \
   elpa1_test_real.sh \
   elpa1_test_real_with_c.sh \
@@ -265,7 +390,39 @@ check_SCRIPTS = \
   elpa2_test_complex_choose_kernel_with_api.sh \
   elpa2_print_kernels@SUFFIX@
+if WANT_SINGLE_PRECISION_REAL
+check_SCRIPTS += \
+  elpa1_test_real_single_precision.sh \
+  elpa2_test_real_single_precision.sh \
+  elpa2_test_real_default_kernel_single_precision.sh \
+  elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh \
+  elpa2_test_real_choose_kernel_with_api_single_precision.sh
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+check_SCRIPTS += \
+  elpa1_test_complex_single_precision.sh \
+  elpa2_test_complex_single_precision.sh \
+  elpa2_test_complex_default_kernel_single_precision.sh
+endif
+
+if WITH_GPU_VERSION
+check_SCRIPTS += \
+  elpa2_test_real_gpu_version.sh \
+  elpa2_test_complex_gpu_version.sh
+if WANT_SINGLE_PRECISION_REAL
+check_SCRIPTS += \
+  elpa2_test_real_gpu_version_single_precision.sh
+endif
+if WANT_SINGLE_PRECISION_COMPLEX
+check_SCRIPTS += \
+  elpa2_test_complex_gpu_version_single_precision.sh
+endif
+
+endif
+
 TESTS = $(check_SCRIPTS)
+if WITH_MPI
 elpa1_test_real.sh:
 	echo 'mpiexec -n 2 ./elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real.sh
 	chmod +x elpa1_test_real.sh
@@ -289,6 +446,7 @@ elpa1_test_real_c_version.sh:
 elpa1_test_complex_c_version.sh:
 	echo 'mpiexec -n 2 ./elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version.sh
 	chmod +x elpa1_test_complex_c_version.sh
+
 elpa2_test_real.sh:
 	echo 'mpiexec -n 2 ./elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real.sh
 	chmod +x elpa2_test_real.sh
@@ -320,6 +478,197 @@ elpa2_test_complex_default_kernel.sh:
 elpa2_test_complex_choose_kernel_with_api.sh:
 	echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
 	chmod +x elpa2_test_complex_choose_kernel_with_api.sh
+
+if WANT_SINGLE_PRECISION_REAL
+elpa1_test_real_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa1_test_real_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_single_precision.sh
+	chmod +x elpa1_test_real_single_precision.sh
+
+elpa2_test_real_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_single_precision.sh
+	chmod +x elpa2_test_real_single_precision.sh
+
+elpa2_test_real_default_kernel_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel_single_precision.sh
+	chmod +x elpa2_test_real_default_kernel_single_precision.sh
+
+elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh
+	chmod +x elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh
+
+elpa2_test_real_choose_kernel_with_api_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_choose_kernel_with_api_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api_single_precision.sh
+	chmod +x elpa2_test_real_choose_kernel_with_api_single_precision.sh
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa1_test_complex_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa1_test_complex_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_single_precision.sh
+	chmod +x elpa1_test_complex_single_precision.sh
+
+elpa2_test_complex_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_complex_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_single_precision.sh
+	chmod +x elpa2_test_complex_single_precision.sh
+
+elpa2_test_complex_default_kernel_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_complex_default_kernel_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel_single_precision.sh
+	chmod +x elpa2_test_complex_default_kernel_single_precision.sh
+
+elpa2_test_complex_choose_kernel_with_api_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api_single_precision.sh
+	chmod +x elpa2_test_complex_choose_kernel_with_api_single_precision.sh
+endif
+
+if WITH_GPU_VERSION
+elpa2_test_real_gpu_version.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_gpu_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_gpu_version.sh
+	chmod +x elpa2_test_real_gpu_version.sh
+
+elpa2_test_complex_gpu_version.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_complex_gpu_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_gpu_version.sh
+	chmod +x elpa2_test_complex_gpu_version.sh
+
+if WANT_SINGLE_PRECISION_REAL
+elpa2_test_real_gpu_version_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_real_gpu_version_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_gpu_version_single_precision.sh
+	chmod +x elpa2_test_real_gpu_version_single_precision.sh
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa2_test_complex_gpu_version_single_precision.sh:
+	echo 'mpiexec -n 2 ./elpa2_test_complex_gpu_version_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_gpu_version_single_precision.sh
+	chmod +x elpa2_test_complex_gpu_version_single_precision.sh
+endif
+# GPU_VERSION
+endif
+
+else
+# build tests without mpi support
+
+elpa1_test_real.sh:
+	echo './elpa1_test_real@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real.sh
+	chmod +x elpa1_test_real.sh
+
+elpa1_test_real_with_c.sh:
+	echo './elpa1_test_real_with_c@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_with_c.sh
+	chmod +x elpa1_test_real_with_c.sh
+
+elpa2_test_real_c_version.sh:
+	echo './elpa2_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_c_version.sh
+	chmod +x elpa2_test_real_c_version.sh
+
+elpa2_test_complex_c_version.sh:
+	echo './elpa2_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_c_version.sh
+	chmod +x elpa2_test_complex_c_version.sh
+
+elpa1_test_real_c_version.sh:
+	echo './elpa1_test_real_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_c_version.sh
+	chmod +x elpa1_test_real_c_version.sh
+
+elpa1_test_complex_c_version.sh:
+	echo './elpa1_test_complex_c_version@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_c_version.sh
+	chmod +x elpa1_test_complex_c_version.sh
+
+elpa2_test_real.sh:
+	echo './elpa2_test_real@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real.sh
+	chmod +x elpa2_test_real.sh
+
+elpa2_test_real_default_kernel.sh:
+	echo './elpa2_test_real_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel.sh
+	chmod +x elpa2_test_real_default_kernel.sh
+
+elpa2_test_real_default_kernel_qr_decomposition.sh:
+	echo './elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition.sh
+	chmod +x elpa2_test_real_default_kernel_qr_decomposition.sh
+
+elpa2_test_real_choose_kernel_with_api.sh:
+	echo './elpa2_test_real_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api.sh
+	chmod +x elpa2_test_real_choose_kernel_with_api.sh
+
+elpa1_test_complex.sh:
+	echo './elpa1_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex.sh
+	chmod +x elpa1_test_complex.sh
+
+elpa2_test_complex.sh:
+	echo './elpa2_test_complex@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex.sh
+	chmod +x elpa2_test_complex.sh
+
+elpa2_test_complex_default_kernel.sh:
+	echo './elpa2_test_complex_default_kernel@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel.sh
+	chmod +x elpa2_test_complex_default_kernel.sh
+
+elpa2_test_complex_choose_kernel_with_api.sh:
+	echo './elpa2_test_complex_choose_kernel_with_api@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api.sh
+	chmod +x elpa2_test_complex_choose_kernel_with_api.sh
+
+if WANT_SINGLE_PRECISION_REAL
+elpa1_test_real_single_precision.sh:
+	echo './elpa1_test_real_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa1_test_real_single_precision.sh
+	chmod +x elpa1_test_real_single_precision.sh
+
+elpa2_test_real_single_precision.sh:
+	echo './elpa2_test_real_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_single_precision.sh
+	chmod +x elpa2_test_real_single_precision.sh
+
+elpa2_test_real_default_kernel_single_precision.sh:
+	echo './elpa2_test_real_default_kernel_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_default_kernel_single_precision.sh
+	chmod +x elpa2_test_real_default_kernel_single_precision.sh
+
+elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh:
+	echo './elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh
+	chmod +x elpa2_test_real_default_kernel_qr_decomposition_single_precision.sh
+
+elpa2_test_real_choose_kernel_with_api_single_precision.sh:
+	echo './elpa2_test_real_choose_kernel_with_api_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_choose_kernel_with_api_single_precision.sh
+	chmod +x elpa2_test_real_choose_kernel_with_api_single_precision.sh
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa1_test_complex_single_precision.sh:
+	echo './elpa1_test_complex_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa1_test_complex_single_precision.sh
+	chmod +x elpa1_test_complex_single_precision.sh
+
+elpa2_test_complex_single_precision.sh:
+	echo './elpa2_test_complex_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_single_precision.sh
+	chmod +x elpa2_test_complex_single_precision.sh
+
+elpa2_test_complex_default_kernel_single_precision.sh:
+	echo './elpa2_test_complex_default_kernel_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_default_kernel_single_precision.sh
+	chmod +x elpa2_test_complex_default_kernel_single_precision.sh
+
+elpa2_test_complex_choose_kernel_with_api_single_precision.sh:
+	echo './elpa2_test_complex_choose_kernel_with_api_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_choose_kernel_with_api_single_precision.sh
+	chmod +x elpa2_test_complex_choose_kernel_with_api_single_precision.sh
+endif
+
+if WITH_GPU_VERSION
+elpa2_test_real_gpu_version.sh:
+	echo './elpa2_test_real_gpu_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_gpu_version.sh
+	chmod +x elpa2_test_real_gpu_version.sh
+
+elpa2_test_complex_gpu_version.sh:
+	echo './elpa2_test_complex_gpu_version@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_gpu_version.sh
+	chmod +x elpa2_test_complex_gpu_version.sh
+
+if WANT_SINGLE_PRECISION_REAL
+elpa2_test_real_gpu_version_single_precision.sh:
+	echo './elpa2_test_real_gpu_version_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_real_gpu_version_single_precision.sh
+	chmod +x elpa2_test_real_gpu_version_single_precision.sh
+endif
+
+if WANT_SINGLE_PRECISION_COMPLEX
+elpa2_test_complex_gpu_version_single_precision.sh:
+	echo './elpa2_test_complex_gpu_version_single_precision@SUFFIX@ $$TEST_FLAGS' > elpa2_test_complex_gpu_version_single_precision.sh
+	chmod +x elpa2_test_complex_gpu_version_single_precision.sh
+endif
+# GPU_VERSION
+endif
+
+
+
+# use mpi
+endif
+
 mod_precision.i: $(top_srcdir)/src/mod_precision.F90
 	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_precision.F90 -o $@
@@ -341,6 +690,27 @@ elpa2_kernels_real.i: $(top_srcdir)/src/elpa2_kernels/elpa2_kernels_real.F90
 mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
 	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
+test_real.i: $(top_srcdir)/test/fortran_test_programs/test_real.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/fortran_test_programs/test_real.F90 -o $@
+
+blacs_infrastructure.i: $(top_srcdir)/test/shared_sources/blacs_infrastructure.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/shared_sources/blacs_infrastructure.F90 -o $@
+
+check_correctnes.i: $(top_srcdir)/test/shared_sources/check_correctnes.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/shared_sources/check_correctnes.F90 -o $@
+
+prepare_matrix.i: $(top_srcdir)/test/shared_sources/prepare_matrix.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/shared_sources/prepare_matrix.F90 -o $@
+
+read_input_parameters.i: $(top_srcdir)/test/shared_sources/read_input_parameters.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/shared_sources/read_input_parameters.F90 -o $@
+
+setup_mpi.i: $(top_srcdir)/test/shared_sources/setup_mpi.F90
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/test/shared_sources/setup_mpi.F90 -o $@
+
+cuUtils.i: $(top_srcdir)/src/cuUtils.cu
+	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/cuUtils.cu -o $@
+
 include doxygen.am
 CLEANFILES = \
@@ -383,7 +753,16 @@ EXTRA_DIST = \
   test/fortran_test_programs/elpa_test_programs_print_headers.X90 \
   src/elpa_reduce_add_vectors.X90 \
   src/elpa_transpose_vectors.X90 \
+  src/elpa1_compute_template_real.X90 \
+  src/elpa1_compute_template_complex.X90 \
+  src/elpa2_compute_template_real.X90 \
+  src/elpa2_compute_template_complex.X90 \
   src/redist_band.X90 \
+  src/elpa_qr/elpa_qrkernels.X90 \
+  src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
+  src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
+  src/cuUtils_complex_template.Xcu \
+  src/cuUtils_real_template.Xcu \
   nvcc_wrap
 # Rules to re-generated the headers
diff --git a/configure.ac b/configure.ac
index f688974a8f21c22dba6d8ea10790b7fa319c5c45..c051e6ad80701ae46207e302c989f01ebb788162 100644
--- a/configure.ac
+++ b/configure.ac
@@ -523,7 +523,7 @@ if test x"${want_gpu}" = x"yes" ; then
 fi
 dnl check whether single precision is requested
-AC_MSG_CHECKING(whether single precision calculations are requested)
+AC_MSG_CHECKING(whether ELPA library should contain also single precision functions)
 AC_ARG_ENABLE(single-precision,[AS_HELP_STRING([--enable-single-precision],
               [build with single precision])],
               want_single_precision="yes", want_single_precision="no")
@@ -747,10 +747,12 @@ if test x"${DESPERATELY_WANT_ASSUMED_SIZE}" = x"yes" ; then
   AC_DEFINE([DESPERATELY_WANT_ASSUMED_SIZE],[1],[use assumed size arrays, even if not debuggable])
 fi
-if test x"${want_single_precision}" = x"no" ; then
-  AC_DEFINE([DOUBLE_PRECISION_REAL],[1],[use double precision for real calculation])
-  AC_DEFINE([DOUBLE_PRECISION_COMPLEX],[1],[use double precision for complex calculation])
+if test x"${want_single_precision}" = x"yes" ; then
+  AC_DEFINE([WANT_SINGLE_PRECISION_REAL],[1],[build also single-precision for real calculation])
+  AC_DEFINE([WANT_SINGLE_PRECISION_COMPLEX],[1],[build also single-precision for complex calculation])
 fi
+AM_CONDITIONAL([WANT_SINGLE_PRECISION_REAL],[test x"$want_single_precision" = x"yes"])
+AM_CONDITIONAL([WANT_SINGLE_PRECISION_COMPLEX],[test x"$want_single_precision" = x"yes"])
 AC_SUBST([WITH_MKL])
 AC_SUBST([WITH_BLACS])
diff --git a/man/solve_evp_complex.3 b/man/solve_evp_complex.3
index 3984ca0b16dbeacfc0a2ea0b49a6f16208500c14..8b446a715232c657ce4bbdd748b01b1c7cb87e0f 100644
--- a/man/solve_evp_complex.3
+++ b/man/solve_evp_complex.3
@@ -1,9 +1,9 @@
-.TH "solve_evp_complex" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*-
+.TH "solve_evp_complex" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
 .ad l
 .nh
 .SH NAME
-solve_evp_complex \- solve the complex eigenvalue problem with the 1-stage ELPA solver.
-This interface is old and deprecated. It is recommended to use \fBsolve_evp_complex_1stage\fP(3)
+solve_evp_complex \- solve the double-precision complex eigenvalue problem with the 1-stage ELPA solver.
+This interface is old and deprecated. It is recommended to use \fBsolve_evp_complex_1stage_double\fP(3)
 .br
 .SH SYNOPSIS
@@ -48,4 +48,4 @@ use elpa1
 Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
 .br
 .SH "SEE ALSO"
-\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1)
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_complex_1stage_double.3 b/man/solve_evp_complex_1stage_double.3
new file mode 100644
index 0000000000000000000000000000000000000000..b3f8d43ef8bb52f655caf6e23bcdbbae49024502
--- /dev/null
+++ b/man/solve_evp_complex_1stage_double.3
@@ -0,0 +1,88 @@
+.TH "solve_evp_complex_1stage_double" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
+.ad l
+.nh
+.SH NAME
+solve_evp_complex_1stage_double \- solve the double-precision complex eigenvalue problem with the 1-stage ELPA solver
+.br
+
+.SH SYNOPSIS
+.br
+.SS FORTRAN INTERFACE
+use elpa1
+.br
+.br
+.RI "success = \fBsolve_evp_complex_1stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "logical \fBsuccess\fP: return value indicating success or failure"
+.br
+.SS C INTERFACE
+#include "elpa.h"
+.br
+#include <complex.h>
+
+.br
+.RI "success = \fBsolve_evp_complex_1stage_double_precision\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex*\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "int \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
+
+.SH DESCRIPTION
+Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
+.br
+.SH "SEE ALSO"
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_complex_1stage_single.3 b/man/solve_evp_complex_1stage_single.3
new file mode 100644
index 0000000000000000000000000000000000000000..73e88b317c85154ec5541a97317184bca7b66e6c
--- /dev/null
+++ b/man/solve_evp_complex_1stage_single.3
@@ -0,0 +1,88 @@
+.TH "solve_evp_complex_1stage_single" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
+.ad l
+.nh
+.SH NAME
+solve_evp_complex_1stage_single \- solve the single-precision complex eigenvalue problem with the 1-stage ELPA solver
+.br
+
+.SH SYNOPSIS
+.br
+.SS FORTRAN INTERFACE
+use elpa1
+.br
+.br
+.RI "success = \fBsolve_evp_complex_1stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "real*4, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "logical \fBsuccess\fP: return value indicating success or failure"
+.br
+.SS C INTERFACE
+#include "elpa.h"
+.br
+#include <complex.h>
+
+.br
+.RI "success = \fBsolve_evp_complex_1stage_single_precision\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex*\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "int \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
+
+.SH DESCRIPTION
+Solve the complex eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
+.br
+.SH "SEE ALSO"
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_complex_2stage_double.3 b/man/solve_evp_complex_2stage_double.3
new file mode 100644
index 0000000000000000000000000000000000000000..1ffb72976734f30268f6853d4eb4bbc77e6b22fe
--- /dev/null
+++ b/man/solve_evp_complex_2stage_double.3
@@ -0,0 +1,91 @@
+.TH "solve_evp_complex_2stage_double" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
+.ad l
+.nh
+.SH NAME
+solve_evp_complex_2stage_double \- solve the double-precision complex eigenvalue problem with the 2-stage ELPA solver
+.br
+
+.SH SYNOPSIS
+.br
+.SS FORTRAN INTERFACE
+use elpa1
+use elpa2
+.br
+.br
+.RI "success = \fBsolve_evp_complex_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL)"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex*16, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex*16, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
+.br
+.RI "logical \fBsuccess\fP: return value indicating success or failure"
+.br
+.SS C INTERFACE
+#include "elpa.h"
+.br
+#include <complex.h>
+
+.br
+.RI "success = \fBsolve_evp_complex_2stage_double_precision\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL);"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "double complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "double complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "int \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
+.br
+.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
+
+.SH DESCRIPTION
+Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
+.br
+.SH "SEE ALSO"
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_complex_2stage_single.3 b/man/solve_evp_complex_2stage_single.3
new file mode 100644
index 0000000000000000000000000000000000000000..f55d2801c9a4f00b797e0e5cf874e523bfc61201
--- /dev/null
+++ b/man/solve_evp_complex_2stage_single.3
@@ -0,0 +1,91 @@
+.TH "solve_evp_complex_2stage_single" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
+.ad l
+.nh
+.SH NAME
+solve_evp_complex_2stage_single \- solve the single-precision complex eigenvalue problem with the 2-stage ELPA solver
+.br
+
+.SH SYNOPSIS
+.br
+.SS FORTRAN INTERFACE
+use elpa1
+use elpa2
+.br
+.br
+.RI "success = \fBsolve_evp_complex_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL)"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "real*4, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
+.br
+.RI "logical \fBsuccess\fP: return value indicating success or failure"
+.br
+.SS C INTERFACE
+#include "elpa.h"
+.br
+#include <complex.h>
+
+.br
+.RI "success = \fBsolve_evp_complex_2stage_single_precision\fP (\fBint\fP na, \fBint\fP nev, \fBcomplex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL);"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "int \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
+.br
+.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
+
+.SH DESCRIPTION
+Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
+.br
+.SH "SEE ALSO"
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_real.3 b/man/solve_evp_real.3
index a6762cd201a3c0aed06b3e26461c0a409a4aee73..ac9ab4afad8c6de76a15d039e1705e42edc61d80 100644
--- a/man/solve_evp_real.3
+++ b/man/solve_evp_real.3
@@ -1,9 +1,9 @@
-.TH "solve_evp_real" 3 "Wed Dec 2 2015" "ELPA" \" -*- nroff -*-
+.TH "solve_evp_real" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
 .ad l
 .nh
 .SH NAME
-solve_evp_real \- solve the real eigenvalue problem with the 1-stage ELPA solver.
-This is an old and deprecated interface. It is recommendet to use \fBsolve_evp_real_1stage\fP(3)
+solve_evp_real \- solve the double-precision real eigenvalue problem with the 1-stage ELPA solver.
+This is an old and deprecated interface. It is recommended to use \fBsolve_evp_real_1stage_double\fP(3)
 .br
 .SH SYNOPSIS
@@ -48,4 +48,4 @@ use elpa1
 Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
 .br
 .SH "SEE ALSO"
-\fBget_elpa_communicators\fP(3) \fBsolve_evp_complex_1stage\fP(3) \fBsolve_evp_real_2stage\fP(3) \fBsolve_evp_complex_2stage\fP(3) \fBprint_available_elpa2_kernels\fP(1)
+\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1)
diff --git a/man/solve_evp_real_1stage_double.3 b/man/solve_evp_real_1stage_double.3
new file mode 100644
index 0000000000000000000000000000000000000000..dcd08ad71e2c0ca15702ae8efacfea874742836c
--- /dev/null
+++ b/man/solve_evp_real_1stage_double.3
@@ -0,0 +1,86 @@
+.TH "solve_evp_real_1stage_double" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*-
+.ad l
+.nh
+.SH NAME
+solve_evp_real_1stage_double \- solve the double-precision real eigenvalue problem with the 1-stage ELPA solver
+.br
+
+.SH SYNOPSIS
+.br
+.SS FORTRAN INTERFACE
+use elpa1
+.br
+.br
+.RI "success = \fBsolve_evp_real_1stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "logical \fBsuccess\fP: return value indicating success or failure"
+.br
+.SS C INTERFACE
+#include "elpa.h"
+
+.br
+.RI "success = \fBsolve_evp_real_1stage_double_precision\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);"
+.br
+.RI " "
+.br
+.RI "With the definitions of the input and output variables:"
+
+.br
+.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
+.br
+.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated"
+.br
+.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
+.br
+.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
+.br
+.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
+.br
+.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
+.br
+.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
+.br
+.RI "int \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions"
+.br
+.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
+.br
+.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+.RI "int \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)"
+.br
+
+.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
+
+.SH DESCRIPTION
+Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
+.br +.SH "SEE ALSO" +\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real_1stage_single.3 b/man/solve_evp_real_1stage_single.3 new file mode 100644 index 0000000000000000000000000000000000000000..936b8eca53b750f0294bdc9b92d60227df7ef759 --- /dev/null +++ b/man/solve_evp_real_1stage_single.3 @@ -0,0 +1,86 @@ +.TH "solve_evp_real_1stage_single" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +solve_evp_real_1stage_single \- solve the single-precision real eigenvalue problem with the 1-stage ELPA solver +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa1 +.br +.br +.RI "success = \fBsolve_evp_real_1stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)" +.br +.RI " " +.br +.RI "With the definintions of the input and output variables:" + +.br +.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" +.br +.RI "real*4, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "real*4, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" +.br +.RI "real*4, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" +.br +.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" +.br +.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" +.br + +.RI "logical \fBsuccess\fP: return value indicating success or failure" +.br +.SS C INTERFACE +#include "elpa.h" + +.br +.RI "success = \fBsolve_evp_real_1stage_single_precision\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols);" +.br +.RI " " +.br +.RI "With the definintions of the input and output variables:" + +.br +.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" +.br +.RI "float *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" +.br +.RI "float *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" +.br +.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" +.br +.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" +.br + +.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) + +.SH DESCRIPTION +Solve the real eigenvalue problem with the 1-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. +.br +.SH "SEE ALSO" +\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real_2stage_double.3 b/man/solve_evp_real_2stage_double.3 new file mode 100644 index 0000000000000000000000000000000000000000..9ada462ec681f0131a886a65e8c0304cb694943b --- /dev/null +++ b/man/solve_evp_real_2stage_double.3 @@ -0,0 +1,93 @@ +.TH "solve_evp_real_2stage_double" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +solve_evp_real_2stage_double \- solve the double-precision real eigenvalue problem with the 2-stage ELPA solver +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa1 +use elpa2 +.br +.br +.RI "success = \fBsolve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR)" +.br +.RI " " +.br +.RI "With the definintions of the input and output variables:" + +.br +.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" +.br +.RI "real*8, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. 
The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "real*8, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" +.br +.RI "real*8, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" +.br +.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" +.br +.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" +.br +.RI "logical, intent(in), optional: \fBuseQR\fP: optional argument; switches to QR-decomposition if set to .true." + +.RI "logical \fBsuccess\fP: return value indicating success or failure" +.br +.SS C INTERFACE +#include "elpa.h" + +.br +.RI "success = \fBsolve_evp_real_2stage_double_precision\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQr);" +.br +.RI " " +.br +.RI "With the definintions of the input and output variables:" + +.br +.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" +.br +.RI "double *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "double *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" +.br +.RI "double *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" +.br +.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" +.br +.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" +.br +.RI "int \fBuseQR\fP: if set to 1 switch to QR-decomposition" + +.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) + +.SH DESCRIPTION +Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. 
+.br +.SH "SEE ALSO" +\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBsolve_evp_real_2stage_single\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/man/solve_evp_real_2stage_single.3 b/man/solve_evp_real_2stage_single.3 new file mode 100644 index 0000000000000000000000000000000000000000..2c517a7ef7e3c694a8b7af4f01605ee75bab9247 --- /dev/null +++ b/man/solve_evp_real_2stage_single.3 @@ -0,0 +1,93 @@ +.TH "solve_evp_real_2stage_single" 3 "Thu Mar 17 2016" "ELPA" \" -*- nroff -*- +.ad l +.nh +.SH NAME +solve_evp_real_2stage_single \- solve the single-precision real eigenvalue problem with the 2-stage ELPA solver +.br + +.SH SYNOPSIS +.br +.SS FORTRAN INTERFACE +use elpa1 +use elpa2 +.br +.br +.RI "success = \fBsolve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), lda, ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQr=useQR)" +.br +.RI " " +.br +.RI "With the definitions of the input and output variables:" + +.br +.RI "integer, intent(in) \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "integer, intent(in) \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalues are calculated" +.br +.RI "real*4, intent(inout) \fBa\fP: locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "integer, intent(in) \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "real*4, intent(inout) \fBev\fP: on output the first \fBnev\fP computed eigenvalues" +.br +.RI "real*4, intent(inout) \fBq\fP: on output the first \fBnev\fP computed eigenvectors" +.br +.RI "integer, intent(in) \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "integer, intent(in) \fBnblk\fP: blocksize of block cyclic distribution, must be the same in both directions" +.br +.RI "integer, intent(in) \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "integer, intent(in) \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "integer, intent(in) \fBmpi_comm_cols\fP: communicator for communication in columns. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "integer, intent(in) \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" +.br +.RI "logical, intent(in), optional: \fBuseQR\fP: optional argument; switches to QR-decomposition if set to .true."
+ +.RI "logical \fBsuccess\fP: return value indicating success or failure" +.br +.SS C INTERFACE +#include "elpa.h" + +.br +.RI "success = \fBsolve_evp_real_2stage_single_precision\fP (\fBint\fP na, \fBint\fP nev, \fBfloat *\fPa, \fBint\fP lda, \fBfloat *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQr);" +.br +.RI " " +.br +.RI "With the definintions of the input and output variables:" + +.br +.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve" +.br +.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated" +.br +.RI "float *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP" +.br +.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP" +.br +.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues" +.br +.RI "float *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors" +.br +.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors" +.br +.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions" +.br +.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP" +.br +.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBget_elpa_communicators\fP(3)" +.br +.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA" +.br +.RI "int \fBuseQR\fP: if set to 1 switch to QR-decomposition" + +.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0) + +.SH DESCRIPTION +Solve the real eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBget_elpa_communicators\fP(3) function. The distributed quadratic marix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver. +.br +.SH "SEE ALSO" +\fBget_elpa_communicators\fP(3) \fBsolve_evp_real_1stage_double\fP(3) \fBsolve_evp_real_1stage_single\fP(3) \fBsolve_evp_complex_1stage_double\fP(3) \fBsolve_evp_complex_1stage_single\fP(3) \fBsolve_evp_complex_2stage_double\fP(3) \fBsolve_evp_complex_2stage_single\fP(3) \fBsolve_evp_real_2stage_double\fP(3) \fBprint_available_elpa2_kernels\fP(1) diff --git a/src/check_for_gpu.F90 b/src/check_for_gpu.F90 index f4a07452948d51887e0dcfe72cd491b616440220..159d9ef4e1a38591159cc657d924153863696792 100644 --- a/src/check_for_gpu.F90 +++ b/src/check_for_gpu.F90 @@ -38,7 +38,8 @@ ! any derivatives of ELPA under the same license that we chose for ! the original distribution, the GNU Lesser General Public License. ! -! +! This file was written by A. 
Marek, MPCDF + #include "config-f90.h" module mod_check_for_gpu diff --git a/src/cuUtils.cu b/src/cuUtils.cu index e56d9cffa5a69f696f211326117596f9f5cbea6a..09c6ac8f931a1e007c1978c70cfce2be747128bb 100644 --- a/src/cuUtils.cu +++ b/src/cuUtils.cu @@ -1,788 +1,76 @@ -#include -#include -#include -#include -#include "config-f90.h" - -// Reset a reduction block -// Limitation: the thread-block size must be a divider of the reduction block's size -#ifdef DOUBLE_PRECISION_REAL -__device__ void reset_shared_block_c ( double * s_block, int b_size) -#else -__device__ void reset_shared_block_c ( float * s_block, int b_size) -#endif -{ - int i, t_idx, s_chunk ; - t_idx = threadIdx.x; - s_chunk = b_size / blockDim.x; - for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++) - s_block[i] = 0.0 ; - __syncthreads(); -} - -// Reset 2 reduction blocks without an explicit synchronization at the end -// Limitation: : the thread-block size must be a divider of the reduction block's size -#ifdef DOUBLE_PRECISION_REAL -__device__ void reset_shared_block_pair_c( double *s_block_1, double *s_block_2, int b_size) -#else -__device__ void reset_shared_block_pair_c( float *s_block_1, float *s_block_2, int b_size) -#endif -{ - int i, t_idx, s_chunk; - - t_idx = threadIdx.x; - s_chunk = b_size / blockDim.x; - for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++) - { s_block_1[i] = 0.0 ; - s_block_2[i] = 0.0 ; - } -} -// Reset a reduction block -// Limitation: the thread-block size must be a divider of the reduction block's size -#ifdef DOUBLE_PRECISION_COMPLEX -__device__ void reset_shared_block_c_complex ( cuDoubleComplex * s_block, int b_size) -#else -__device__ void reset_shared_block_c_complex ( cuFloatComplex * s_block, int b_size) -#endif -{ - int i, t_idx, s_chunk ; - t_idx = threadIdx.x; - s_chunk = b_size / blockDim.x; - for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++) - { s_block[i].x = 0.0 ; - s_block[i].y = 0.0 ;} - __syncthreads(); -} - -// Reset 2 reduction blocks without an explicit synchronization at the end -// Limitation: : the thread-block size must be a divider of the reduction block's size -#ifdef DOUBLE_PRECISION_COMPLEX -__device__ void reset_shared_block_pair_c_complex( cuDoubleComplex *s_block_1, cuDoubleComplex *s_block_2, int b_size) -#else -__device__ void reset_shared_block_pair_c_complex( cuFloatComplex *s_block_1, cuFloatComplex *s_block_2, int b_size) -#endif -{ - int i, t_idx, s_chunk; - - t_idx = threadIdx.x; - s_chunk = b_size / blockDim.x; - for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++) - { s_block_1[i].x = 0.0 ; - s_block_2[i].x= 0.0 ; - s_block_1[i].y = 0.0 ; - s_block_2[i].y= 0.0 ; - } -} -#ifdef DOUBLE_PRECISION_COMPLEX -__device__ void warp_reduce_complex( cuDoubleComplex *s_block) -#else -__device__ void warp_reduce_complex( cuFloatComplex *s_block) -#endif -{ - int t_idx ; - t_idx = threadIdx.x; - __syncthreads(); - - if (t_idx < 32) - { - - s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) ); - if (t_idx < 8) - { - s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); - - } - if (t_idx < 4) - { - s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; - } - if (t_idx < 1) - { - s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); - } - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void 
my_pack_c_kernel_complex(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) -#else -__global__ void my_pack_c_kernel_complex(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) -#endif -{ - int b_id, t_id ; - int dst_ind ; - b_id = blockIdx.y; - t_id = threadIdx.x; - - dst_ind = b_id * stripe_width + t_id; - if (dst_ind < max_idx) - { - // dimension of dst - lnev, nblk - // dimension of src - stripe_width,a_dim2,stripe_count - dst[dst_ind + (l_nev*blockIdx.x)].x = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].x; - dst[dst_ind + (l_nev*blockIdx.x)].y = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].y; - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void my_unpack_c_kernel_complex( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) -#else -__global__ void my_unpack_c_kernel_complex( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) -#endif -{ - int b_id, t_id ; - int src_ind; - - b_id = blockIdx.y; - t_id = threadIdx.x; - - src_ind = b_id * stripe_width + t_id; - if (src_ind < max_idx) -{ dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].x = src[ src_ind + (blockIdx.x) *l_nev ].x; - dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].y = src[ src_ind + (blockIdx.x) *l_nev ].y; -} -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void extract_hh_tau_c_kernel_complex(cuDoubleComplex* hh, cuDoubleComplex* hh_tau, const int nbw, const int n, int val) -#else -__global__ void extract_hh_tau_c_kernel_complex(cuFloatComplex* hh, cuFloatComplex* hh_tau, const int nbw, const int n, int val) -#endif -{ - int h_idx ; - - h_idx = (blockIdx.x) * blockDim.x + threadIdx.x; - - if (h_idx < n) - { - //dimension of hh - (nbw, max_blk_size) - //dimension of hh_tau - max_blk_size - hh_tau[h_idx] = hh[h_idx * nbw] ; - // Replace the first element in the HH reflector with 1.0 or 0.0 - if( val == 0) - { - hh[(h_idx * nbw)].x = 1.0; - hh[h_idx *nbw].y= 0.0; - } - else - { - hh[(h_idx * nbw)].x = 0.0; - hh[h_idx*nbw].y =0.0; - } - } -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void compute_hh_dotp_c_kernel_complex(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n) -{ - __shared__ cuDoubleComplex hh_s[128] ; -#else -__global__ void compute_hh_dotp_c_kernel_complex(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n) -{ - __shared__ cuFloatComplex hh_s[128] ; -#endif - - int t_idx, v_idx; - - // The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed - v_idx = blockIdx.x ; - - // The thread index indicates the position within the two HH reflectors - t_idx = threadIdx.x ; - - if (t_idx > 0) - { - - hh_s[t_idx] = cuCmul(cuConj(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ; - } - else - { - hh_s[t_idx].x = 0.0 ; - hh_s[t_idx].y = 0.0; - } - - // Compute the dot product using a fast reduction - warp_reduce_complex(hh_s); - __syncthreads(); - - if(t_idx == 0) - { - v_dot[v_idx] = hh_s[0] ; - } - -} - -#ifdef 
DOUBLE_PRECISION_COMPLEX -extern "C" void launch_my_pack_c_kernel_complex(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* a_dev, cuDoubleComplex* row_group_dev) -#else -extern "C" void launch_my_pack_c_kernel_complex(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* a_dev, cuFloatComplex* row_group_dev) -#endif -{ - - dim3 grid_size; - grid_size = dim3(row_count, stripe_count, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err); - my_pack_c_kernel_complex<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) ); - } -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_compute_hh_dotp_c_kernel_complex(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_dot_dev,const int nbw,const int n) -#else -extern "C" void launch_compute_hh_dotp_c_kernel_complex(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_dot_dev,const int nbw,const int n) -#endif -{ - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err); - compute_hh_dotp_c_kernel_complex<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); - - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) ); - } -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_extract_hh_tau_c_kernel_complex(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_tau_dev, const int nbw, const int n , const int is_zero) -#else -extern "C" void launch_extract_hh_tau_c_kernel_complex(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_tau_dev, const int nbw, const int n , const int is_zero) -#endif -{ - int grid_size; - grid_size = 1 + (n - 1) / 256; - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); - extract_hh_tau_c_kernel_complex<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) ); - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_my_unpack_c_kernel_complex( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* row_group_dev, cuDoubleComplex* a_dev) -#else -extern "C" void launch_my_unpack_c_kernel_complex( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* row_group_dev, cuFloatComplex* a_dev) -#endif -{ - - dim3 grid_size; - grid_size = dim3(row_count, stripe_count, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err); - my_unpack_c_kernel_complex<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); - err = cudaGetLastError(); - if ( err != 
cudaSuccess) - { - printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) ); - } -} - -#ifdef DOUBLE_PRECISION_REAL -__device__ void warp_reduce_c( double *s_block) -#else -__device__ void warp_reduce_c( float *s_block) -#endif -{ - int t_idx ; - t_idx = threadIdx.x; - __syncthreads(); - - if (t_idx < 32) - { - s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ; - if (t_idx < 8) - s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24]; - if (t_idx < 4) - s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4]; - if (t_idx < 1) - s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3]; - } -} - -#ifdef DOUBLE_PRECISION_REAL -__global__ void my_pack_c_kernel(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst) -#else -__global__ void my_pack_c_kernel(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst) -#endif -{ - int b_id, t_id ; - int dst_ind ; - b_id = blockIdx.y; - t_id = threadIdx.x; - - dst_ind = b_id * stripe_width + t_id; - if (dst_ind < max_idx) - { - // dimension of dst - lnev, nblk - // dimension of src - stripe_width,a_dim2,stripe_count - *(dst + dst_ind + (l_nev*blockIdx.x) ) = *(src + t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2 )); - } - -} - -#ifdef DOUBLE_PRECISION_REAL -__global__ void my_unpack_c_kernel( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst) -#else -__global__ void my_unpack_c_kernel( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst) -#endif -{ - int b_id, t_id ; - int src_ind; - - b_id = blockIdx.y; - t_id = threadIdx.x; - - src_ind = b_id * stripe_width + t_id; - if (src_ind < max_idx) - *(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 ))) = *(src + src_ind + (blockIdx.x) *l_nev ); - -} -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void compute_kernel_reduce( cuDoubleComplex* a_dev, int lda , int n ,int nbw , cuDoubleComplex *h1_dev ) -#else -__global__ void compute_kernel_reduce( cuFloatComplex* a_dev, int lda , int n ,int nbw , cuFloatComplex *h1_dev ) -#endif -{ - int t_id ; - int st_ind; - - t_id = threadIdx.x; - - st_ind = (t_id*(t_id+1))/2; - if(t_id< n) - { - for(int i =0;i<=t_id;i++) - { - h1_dev[st_ind + i] = a_dev[t_id *lda + i ] ; - } - } - __syncthreads(); - - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void compute_kernel_reduce_1( cuDoubleComplex* a_dev, int lda , int n, cuDoubleComplex *h1_dev ) -#else -__global__ void compute_kernel_reduce_1( cuFloatComplex* a_dev, int lda , int n, cuFloatComplex *h1_dev ) -#endif -{ - int t_id ; - int st_ind; - - t_id = threadIdx.x; - - st_ind = (t_id*(t_id+1))/2; - if(t_id< n) - { - for(int i =0;i<=t_id;i++) - { - a_dev[t_id *lda + i ] = h1_dev[st_ind + i]; - a_dev[ (i-1)*lda + t_id ] = cuConj(a_dev[ t_id *lda + i-1]) ; - } - } - __syncthreads(); - +// This file is part of ELPA. 
+// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen, +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A.
Marek, MPCDF -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void dot_product_c_kernel( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr) -#else -__global__ void dot_product_c_kernel( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex *h_dev, cuFloatComplex *hv_dev, int nr) -#endif -{ - int t_id ; - -#ifdef DOUBLE_PRECISION_COMPLEX - __shared__ cuDoubleComplex x_dev_temp[128]; - __shared__ cuDoubleComplex x_val; -#else - __shared__ cuFloatComplex x_dev_temp[128]; - __shared__ cuFloatComplex x_val; -#endif - //b_id = blockIdx.y; - t_id = threadIdx.x; - - if(t_id0 )&& (t_id < nb)) - { - h_dev[t_id] = cuCsub(h_dev[t_id], cuCmul(x_dev[0],hv_dev[t_id])); - for(i=0;i0 )&& (t_id < nb)) - { - ab_dev[ nb-t_id + (t_id+ns-1)*2*nb ] = cuCsub(ab_dev[ nb-t_id + (t_id+ns-1)*2*nb],cuCmul(hs_dev[0], cuConj(hv_dev[t_id]))); - } - - __syncthreads(); - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -__global__ void double_hh_transform_kernel_2( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev, cuDoubleComplex *hv_dev, int nc, int ns , int nb ) -#else -__global__ void double_hh_transform_kernel_2( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev, cuFloatComplex *hv_dev, int nc, int ns , int nb ) -#endif -{ - int t_id = threadIdx.x; - if(t_id < nc) - { - - ab_dev[ t_id + (ns-1)*2*nb ] = cuCsub(cuCsub(ab_dev[ t_id + (ns-1)*2*nb],cuCmul(hd_dev[ t_id], cuConj(hv_dev[0]))) , cuCmul(hv_dev[ t_id], cuConj(hd_dev[0]))); - - } - - __syncthreads(); - -} - -#ifdef DOUBLE_PRECISION_REAL -__global__ void extract_hh_tau_c_kernel(double* hh, double* hh_tau, const int nbw, const int n, int val) -#else -__global__ void extract_hh_tau_c_kernel(float* hh, float* hh_tau, const int nbw, const int n, int val) -#endif -{ - int h_idx ; - h_idx = (blockIdx.x) * blockDim.x + threadIdx.x; - - if (h_idx < n) - { - //dimension of hh - (nbw, max_blk_size) - //dimension of hh_tau - max_blk_size - *(hh_tau + h_idx ) = *(hh + (h_idx * nbw)) ; - // Replace the first element in the HH reflector with 1.0 or 0.0 - if( val == 0) - *(hh + (h_idx * nbw)) = 1.0; - else - *(hh + (h_idx * nbw)) = 0.0; - } -} - -#ifdef DOUBLE_PRECISION_REAL -__global__ void compute_hh_dotp_c_kernel(double* hh, double* v_dot, const int nbw, const int n) -{ - - __shared__ double hh_s[128] ; -#else -__global__ void compute_hh_dotp_c_kernel(float* hh, float* v_dot, const int nbw, const int n) -{ - - __shared__ float hh_s[128] ; -#endif - int t_idx, v_idx; - - // The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed - v_idx = blockIdx.x ; - - // The thread index indicates the position within the two HH reflectors - t_idx = threadIdx.x ; - -// // The contents of the shared memory must be fully reset -// reset_shared_block_c(hh_s, 128); - - // Initialize the contents of the shared buffer (preparing for reduction) - if (t_idx > 0) - *(hh_s + t_idx) = *(hh + t_idx + v_idx * nbw ) * (*(hh + (t_idx - 1) + (v_idx +1)* nbw)) ; - else - *(hh_s + t_idx) = 0.0 ; - - // Compute the dot product using a fast reduction - warp_reduce_c(hh_s); - - if(t_idx == 0) - *(v_dot + v_idx) = *(hh_s) ; - -} - -#ifdef DOUBLE_PRECISION_REAL -extern "C" void launch_my_pack_c_kernel(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* a_dev, double* row_group_dev) -#else -extern "C" void 
launch_my_pack_c_kernel(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* a_dev, float* row_group_dev) -#endif -{ - - dim3 grid_size; - grid_size = dim3(row_count, stripe_count, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err); - - my_pack_c_kernel<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) ); - } - -} -#ifdef DOUBLE_PRECISION_REAL -extern "C" void launch_compute_hh_dotp_c_kernel(double* bcast_buffer_dev, double* hh_dot_dev,const int nbw,const int n) -#else -extern "C" void launch_compute_hh_dotp_c_kernel(float* bcast_buffer_dev, float* hh_dot_dev,const int nbw,const int n) -#endif -{ - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err); - compute_hh_dotp_c_kernel<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) ); - } - -} -#ifdef DOUBLE_PRECISION_REAL -extern "C" void launch_extract_hh_tau_c_kernel(double* bcast_buffer_dev, double* hh_tau_dev, const int nbw, const int n , const int is_zero) -#else -extern "C" void launch_extract_hh_tau_c_kernel(float* bcast_buffer_dev, float* hh_tau_dev, const int nbw, const int n , const int is_zero) -#endif -{ - int grid_size; - grid_size = 1 + (n - 1) / 256; - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); - extract_hh_tau_c_kernel<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); - err = cudaGetLastError(); - if ( err!= cudaSuccess) - { - printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) ); - } - -} - -#ifdef DOUBLE_PRECISION_REAL -extern "C" void launch_my_unpack_c_kernel( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, double* row_group_dev, double* a_dev) -#else -extern "C" void launch_my_unpack_c_kernel( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, float* row_group_dev, float* a_dev) -#endif -{ - - dim3 grid_size; - grid_size = dim3(row_count, stripe_count, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err); - my_unpack_c_kernel<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) ); - } -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_dot_product_kernel( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex* h_dev ,cuDoubleComplex* hv_dev,int nr ) -#else -extern "C" void launch_dot_product_kernel( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex* h_dev 
,cuFloatComplex* hv_dev,int nr ) -#endif -{ - - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); - dot_product_c_kernel<<>>(hs_dev, hv_new_dev, tau_new_dev, x_dev, h_dev, hv_dev, nr ); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); - - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_dot_product_kernel_1( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_new_dev,cuDoubleComplex* x_dev, cuDoubleComplex* h_dev ,cuDoubleComplex* hv_dev, int nb ,int nr , int ns) -#else -extern "C" void launch_dot_product_kernel_1( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_new_dev,cuFloatComplex* x_dev, cuFloatComplex* h_dev ,cuFloatComplex* hv_dev, int nb ,int nr , int ns) -#endif -{ - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); - dot_product_c_kernel_1<<>>( ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns ); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); - - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_dot_product_kernel_2( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_dev,cuDoubleComplex* hd_dev, int nb ,int nr , int ne) -#else -extern "C" void launch_dot_product_kernel_2( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_dev,cuFloatComplex* hd_dev, int nb ,int nr , int ne) -#endif -{ - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); - - } - -} - -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_double_hh_transform_1( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev,cuDoubleComplex* hv_dev, int nb , int ns) -#else -extern "C" void launch_double_hh_transform_1( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev,cuFloatComplex* hv_dev, int nb , int ns) -#endif -{ - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_double_hh_transform kernel: %s, %d\n",cudaGetErrorString(err), err); - double_hh_transform_kernel<<>>( ab_dev, hs_dev, hv_dev, nb, ns ); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); - - } - -} +#include "config-f90.h" -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_double_hh_transform_2( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev,cuDoubleComplex* hv_dev, int nc , int ns , int nb ) -#else -extern "C" void launch_double_hh_transform_2( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev,cuFloatComplex* hv_dev, int nc , int ns , int nb ) -#endif -{ - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_double_hh_transform 
kernel: %s, %d\n",cudaGetErrorString(err), err); - double_hh_transform_kernel_2<<>>( ab_dev, hd_dev, hv_dev, nc, ns, nb ); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); +// The real part +#define DOUBLE_PRECISION_REAL 1 +#include "cuUtils_real_template.Xcu" +#undef DOUBLE_PRECISION_REAL - } +#if WANT_SINGLE_PRECISION_REAL -} +#undef DOUBLE_PRECISION_REAL +#include "cuUtils_real_template.Xcu" -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_compute_kernel_reduce( cuDoubleComplex* a_dev, int lda, int n,int nbw, cuDoubleComplex* h_dev) -#else -extern "C" void launch_compute_kernel_reduce( cuFloatComplex* a_dev, int lda, int n,int nbw, cuFloatComplex* h_dev) #endif -{ - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); - compute_kernel_reduce<<>>(a_dev, lda, n, nbw,h_dev); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); +// The complex part +#define DOUBLE_PRECISION_COMPLEX 1 +#include "cuUtils_complex_template.Xcu" +#undef DOUBLE_PRECISION_COMPLEX - } +#if WANT_SINGLE_PRECISION_COMPLEX -} +#undef DOUBLE_PRECISION_COMPLEX +#include "cuUtils_complex_template.Xcu" -#ifdef DOUBLE_PRECISION_COMPLEX -extern "C" void launch_compute_kernel_reduce_1( cuDoubleComplex* a_dev, int lda, int n , cuDoubleComplex* h_dev) -#else -extern "C" void launch_compute_kernel_reduce_1( cuFloatComplex* a_dev, int lda, int n , cuFloatComplex* h_dev) #endif -{ - - dim3 grid_size; - grid_size = dim3(1,1, 1); - cudaDeviceSynchronize(); - cudaError_t err = cudaGetLastError(); - if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); - compute_kernel_reduce_1<<>>(a_dev, lda, n, h_dev); - cudaDeviceSynchronize(); - err = cudaGetLastError(); - if ( err != cudaSuccess) - { - printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); - - } - -} - -extern "C" int cuda_MemcpyDeviceToDevice(int val) -{ - val = cudaMemcpyDeviceToDevice; - return val; -} diff --git a/src/cuUtils_complex_template.Xcu b/src/cuUtils_complex_template.Xcu new file mode 100644 index 0000000000000000000000000000000000000000..dbe30573c0b06b570b91b9498edc062a1ebaf042 --- /dev/null +++ b/src/cuUtils_complex_template.Xcu @@ -0,0 +1,698 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen , +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. 
diff --git a/src/cuUtils_complex_template.Xcu b/src/cuUtils_complex_template.Xcu new file mode 100644 index 0000000000000000000000000000000000000000..dbe30573c0b06b570b91b9498edc062a1ebaf042 --- /dev/null +++ b/src/cuUtils_complex_template.Xcu @@ -0,0 +1,698 @@ +// This file is part of ELPA. +// +// The ELPA library was originally created by the ELPA consortium, +// consisting of the following organizations: +// +// - Max Planck Computing and Data Facility (MPCDF), formerly known as +// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +// - Bergische Universität Wuppertal, Lehrstuhl für angewandte +// Informatik, +// - Technische Universität München, Lehrstuhl für Informatik mit +// Schwerpunkt Wissenschaftliches Rechnen, +// - Fritz-Haber-Institut, Berlin, Abt. Theorie, +// - Max-Planck-Institut für Mathematik in den Naturwissenschaften, +// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition, +// and +// - IBM Deutschland GmbH +// +// This particular source code file contains additions, changes and +// enhancements authored by Intel Corporation which is not part of +// the ELPA consortium. +// +// More information can be found here: +// http://elpa.mpcdf.mpg.de/ +// +// ELPA is free software: you can redistribute it and/or modify +// it under the terms of the version 3 of the license of the +// GNU Lesser General Public License as published by the Free +// Software Foundation. +// +// ELPA is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with ELPA. If not, see <http://www.gnu.org/licenses/> +// +// ELPA reflects a substantial effort on the part of the original +// ELPA consortium, and we ask you to respect the spirit of the +// license that we chose: i.e., please contribute any changes you +// may have back to the original ELPA library distribution, and keep +// any derivatives of ELPA under the same license that we chose for +// the original distribution, the GNU Lesser General Public License. +// +// +// -------------------------------------------------------------------------------------------------- +// +// This file was originally written by NVIDIA +// and re-written by A. Marek, MPCDF + +#include <stdio.h> +#include <stdlib.h> +#include <cuda_runtime.h> +#include <cuComplex.h> +#include "config-f90.h" +// Reset a reduction block +// Limitation: the thread-block size must be a divider of the reduction block's size +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void reset_shared_block_c_complex_double ( cuDoubleComplex * s_block, int b_size) +#else +__device__ void reset_shared_block_c_complex_single ( cuFloatComplex * s_block, int b_size) +#endif +{ + int i, t_idx, s_chunk ; + t_idx = threadIdx.x; + s_chunk = b_size / blockDim.x; + for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++) + { s_block[i].x = 0.0 ; + s_block[i].y = 0.0 ;} + __syncthreads(); +} + +// Reset 2 reduction blocks without an explicit synchronization at the end +// Limitation: the thread-block size must be a divider of the reduction block's size +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void reset_shared_block_pair_c_complex_double( cuDoubleComplex *s_block_1, cuDoubleComplex *s_block_2, int b_size) +#else +__device__ void reset_shared_block_pair_c_complex_single( cuFloatComplex *s_block_1, cuFloatComplex *s_block_2, int b_size) +#endif +{ + int i, t_idx, s_chunk; + + t_idx = threadIdx.x; + s_chunk = b_size / blockDim.x; + for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++) + { s_block_1[i].x = 0.0 ; + s_block_2[i].x= 0.0 ; + s_block_1[i].y = 0.0 ; + s_block_2[i].y= 0.0 ; + } +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__device__ void warp_reduce_complex_double( cuDoubleComplex *s_block) +#else +__device__ void warp_reduce_complex_single( cuFloatComplex *s_block) +#endif +{ + int t_idx ; + t_idx = threadIdx.x; + __syncthreads(); + + if (t_idx < 32) + { +#ifdef DOUBLE_PRECISION_COMPLEX + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#else + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 32]) , cuCaddf(
s_block[t_idx + 64], s_block[t_idx + 96]) ); + if (t_idx < 8) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 8] ) , cuCaddf( s_block[t_idx + 16] , s_block[t_idx + 24] ) ); + + } + if (t_idx < 4) + { + s_block[t_idx] = cuCaddf(s_block[t_idx] , s_block[t_idx + 4]) ; + } + if (t_idx < 1) + { + s_block[t_idx] = cuCaddf(cuCaddf(s_block[t_idx],s_block[t_idx + 1] ) , cuCaddf( s_block[t_idx +2] , s_block[t_idx + 3] ) ); + } + } +#endif +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void my_pack_c_kernel_complex_double(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) +#else +__global__ void my_pack_c_kernel_complex_single(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) +#endif +{ + int b_id, t_id ; + int dst_ind ; + b_id = blockIdx.y; + t_id = threadIdx.x; + + dst_ind = b_id * stripe_width + t_id; + if (dst_ind < max_idx) + { + // dimension of dst - lnev, nblk + // dimension of src - stripe_width,a_dim2,stripe_count + dst[dst_ind + (l_nev*blockIdx.x)].x = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].x; + dst[dst_ind + (l_nev*blockIdx.x)].y = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].y; + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void my_unpack_c_kernel_complex_double( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst) +#else +__global__ void my_unpack_c_kernel_complex_single( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* src, cuFloatComplex* dst) +#endif +{ + int b_id, t_id ; + int src_ind; + + b_id = blockIdx.y; + t_id = threadIdx.x; + + src_ind = b_id * stripe_width + t_id; + if (src_ind < max_idx) +{ dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].x = src[ src_ind + (blockIdx.x) *l_nev ].x; + dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].y = src[ src_ind + (blockIdx.x) *l_nev ].y; +} +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void extract_hh_tau_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* hh_tau, const int nbw, const int n, int val) +#else +__global__ void extract_hh_tau_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* hh_tau, const int nbw, const int n, int val) +#endif +{ + int h_idx ; + + h_idx = (blockIdx.x) * blockDim.x + threadIdx.x; + + if (h_idx < n) + { + //dimension of hh - (nbw, max_blk_size) + //dimension of hh_tau - max_blk_size + hh_tau[h_idx] = hh[h_idx * nbw] ; + // Replace the first element in the HH reflector with 1.0 or 0.0 + if( val == 0) + { + hh[(h_idx * nbw)].x = 1.0; + hh[h_idx *nbw].y= 0.0; + } + else + { + hh[(h_idx * nbw)].x = 0.0; + hh[h_idx*nbw].y =0.0; + } + } +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n) +{ + __shared__ cuDoubleComplex hh_s[128] ; +#else +__global__ void compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* hh, cuFloatComplex* v_dot, const int nbw, const int n) +{ + __shared__ cuFloatComplex hh_s[128] ; +#endif + + int t_idx, v_idx; + + // The vector index (v_idx) 
identifies the pair of HH reflectors from which the dot product is computed + v_idx = blockIdx.x ; + + // The thread index indicates the position within the two HH reflectors + t_idx = threadIdx.x ; + + if (t_idx > 0) + { +#ifdef DOUBLE_PRECISION_COMPLEX + hh_s[t_idx] = cuCmul(cuConj(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ; +#else + hh_s[t_idx] = cuCmulf(cuConjf(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ; +#endif + } + else + { + hh_s[t_idx].x = 0.0 ; + hh_s[t_idx].y = 0.0; + } + + // Compute the dot product using a fast reduction +#ifdef DOUBLE_PRECISION_COMPLEX + warp_reduce_complex_double(hh_s); +#else + warp_reduce_complex_single(hh_s); +#endif + __syncthreads(); + + if(t_idx == 0) + { + v_dot[v_idx] = hh_s[0] ; + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_my_pack_c_kernel_complex_double(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* a_dev, cuDoubleComplex* row_group_dev) +#else +extern "C" void launch_my_pack_c_kernel_complex_single(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* a_dev, cuFloatComplex* row_group_dev) +#endif +{ + + dim3 grid_size; + grid_size = dim3(row_count, stripe_count, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + my_pack_c_kernel_complex_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#else + my_pack_c_kernel_complex_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev); +#endif + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) ); + } +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_hh_dotp_c_kernel_complex_double(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_dot_dev,const int nbw,const int n) +#else +extern "C" void launch_compute_hh_dotp_c_kernel_complex_single(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_dot_dev,const int nbw,const int n) +#endif +{ + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + compute_hh_dotp_c_kernel_complex_double<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#else + compute_hh_dotp_c_kernel_complex_single<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n); +#endif + + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) ); + } +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_extract_hh_tau_c_kernel_complex_double(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_tau_dev, const int nbw, const int n , const int is_zero) +#else +extern "C" void launch_extract_hh_tau_c_kernel_complex_single(cuFloatComplex* bcast_buffer_dev, cuFloatComplex* hh_tau_dev, const int nbw, const int n , const int is_zero) +#endif +{ + int grid_size; + grid_size = 1 + (n - 1) / 256; + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef 
DOUBLE_PRECISION_COMPLEX + extract_hh_tau_c_kernel_complex_double<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#else + extract_hh_tau_c_kernel_complex_single<<>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero); +#endif + err = cudaGetLastError(); + if ( err!= cudaSuccess) + { + printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) ); + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_my_unpack_c_kernel_complex_double( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* row_group_dev, cuDoubleComplex* a_dev) +#else +extern "C" void launch_my_unpack_c_kernel_complex_single( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuFloatComplex* row_group_dev, cuFloatComplex* a_dev) +#endif +{ + + dim3 grid_size; + grid_size = dim3(row_count, stripe_count, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + my_unpack_c_kernel_complex_double<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#else + my_unpack_c_kernel_complex_single<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) ); + } +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_kernel_reduce_double( cuDoubleComplex* a_dev, int lda , int n ,int nbw , cuDoubleComplex *h1_dev ) +#else +__global__ void compute_kernel_reduce_single( cuFloatComplex* a_dev, int lda , int n ,int nbw , cuFloatComplex *h1_dev ) +#endif +{ + int t_id ; + int st_ind; + + t_id = threadIdx.x; + + st_ind = (t_id*(t_id+1))/2; + if(t_id< n) + { + for(int i =0;i<=t_id;i++) + { + h1_dev[st_ind + i] = a_dev[t_id *lda + i ] ; + } + } + __syncthreads(); + + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void compute_kernel_reduce_1_double( cuDoubleComplex* a_dev, int lda , int n, cuDoubleComplex *h1_dev ) +#else +__global__ void compute_kernel_reduce_1_single( cuFloatComplex* a_dev, int lda , int n, cuFloatComplex *h1_dev ) +#endif +{ + int t_id ; + int st_ind; + + t_id = threadIdx.x; + + st_ind = (t_id*(t_id+1))/2; + if(t_id< n) + { + for(int i =0;i<=t_id;i++) + { + a_dev[t_id *lda + i ] = h1_dev[st_ind + i]; +#ifdef DOUBLE_PRECISION_COMPLEX + a_dev[ (i-1)*lda + t_id ] = cuConj(a_dev[ t_id *lda + i-1]) ; +#else + a_dev[ (i-1)*lda + t_id ] = cuConjf(a_dev[ t_id *lda + i-1]) ; +#endif + } + } + __syncthreads(); + + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +__global__ void dot_product_c_kernel_double( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr) +#else +__global__ void dot_product_c_kernel_single( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex *h_dev, cuFloatComplex *hv_dev, int nr) +#endif +{ + int t_id ; + +#ifdef DOUBLE_PRECISION_COMPLEX + __shared__ cuDoubleComplex x_dev_temp[128]; + __shared__ cuDoubleComplex x_val; +#else + __shared__ cuFloatComplex x_dev_temp[128]; + __shared__ cuFloatComplex x_val; +#endif + //b_id = blockIdx.y; + t_id = threadIdx.x; + + if(t_id0 )&& (t_id < 
+#ifdef DOUBLE_PRECISION_COMPLEX
+__global__ void dot_product_c_kernel_double( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr)
+#else
+__global__ void dot_product_c_kernel_single( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex *h_dev, cuFloatComplex *hv_dev, int nr)
+#endif
+{
+    int t_id;
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+    __shared__ cuDoubleComplex x_dev_temp[128];
+    __shared__ cuDoubleComplex x_val;
+#else
+    __shared__ cuFloatComplex x_dev_temp[128];
+    __shared__ cuFloatComplex x_val;
+#endif
+    //b_id = blockIdx.y;
+    t_id = threadIdx.x;
+
+    if (t_id < nr)
+    {
+        /* ... accumulate conj(hs_dev) * hv_new_dev into x_dev_temp, reduce,
+           and scale by tau_new_dev into x_dev ... */
+    }
+}
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+__global__ void dot_product_c_kernel_1_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nb, int nr, int ns)
+#else
+__global__ void dot_product_c_kernel_1_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex* x_dev, cuFloatComplex *h_dev, cuFloatComplex *hv_dev, int nb, int nr, int ns)
+#endif
+{
+    int t_id = threadIdx.x;
+
+    if ((t_id > 0) && (t_id < nb))
+    {
+#ifdef DOUBLE_PRECISION_COMPLEX
+        h_dev[t_id] = cuCsub(h_dev[t_id], cuCmul(x_dev[0], hv_dev[t_id]));
+#else
+        h_dev[t_id] = cuCsubf(h_dev[t_id], cuCmulf(x_dev[0], hv_dev[t_id]));
+#endif
+        /* ... remainder of the update loop over i ... */
+    }
+}
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+__global__ void double_hh_transform_kernel_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex *hv_dev, int nb, int ns)
+#else
+__global__ void double_hh_transform_kernel_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex *hv_dev, int nb, int ns)
+#endif
+{
+    int t_id = threadIdx.x;
+    if ((t_id > 0) && (t_id < nb))
+    {
+#ifdef DOUBLE_PRECISION_COMPLEX
+        ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb] = cuCsub(ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb], cuCmul(hs_dev[0], cuConj(hv_dev[t_id])));
+#else
+        ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb] = cuCsubf(ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb], cuCmulf(hs_dev[0], cuConjf(hv_dev[t_id])));
+#endif
+    }
+
+    __syncthreads();
+}
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+__global__ void double_hh_transform_kernel_2_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev, cuDoubleComplex *hv_dev, int nc, int ns, int nb )
+#else
+__global__ void double_hh_transform_kernel_2_single( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev, cuFloatComplex *hv_dev, int nc, int ns, int nb )
+#endif
+{
+    int t_id = threadIdx.x;
+    if (t_id < nc)
+    {
+#ifdef DOUBLE_PRECISION_COMPLEX
+        ab_dev[t_id + (ns - 1) * 2 * nb] = cuCsub(cuCsub(ab_dev[t_id + (ns - 1) * 2 * nb], cuCmul(hd_dev[t_id], cuConj(hv_dev[0]))), cuCmul(hv_dev[t_id], cuConj(hd_dev[0])));
+#else
+        ab_dev[t_id + (ns - 1) * 2 * nb] = cuCsubf(cuCsubf(ab_dev[t_id + (ns - 1) * 2 * nb], cuCmulf(hd_dev[t_id], cuConjf(hv_dev[0]))), cuCmulf(hv_dev[t_id], cuConjf(hd_dev[0])));
+#endif
+    }
+
+    __syncthreads();
+}
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+extern "C" void launch_dot_product_kernel_double( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex* h_dev, cuDoubleComplex* hv_dev, int nr )
+#else
+extern "C" void launch_dot_product_kernel_single( cuFloatComplex* hs_dev, cuFloatComplex* hv_new_dev, cuFloatComplex tau_new_dev, cuFloatComplex* x_dev, cuFloatComplex* h_dev, cuFloatComplex* hv_dev, int nr )
+#endif
+{
+    dim3 grid_size;
+    grid_size = dim3(1, 1, 1);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_COMPLEX
+    dot_product_c_kernel_double<<<grid_size, nr>>>(hs_dev, hv_new_dev, tau_new_dev, x_dev, h_dev, hv_dev, nr);
+#else
+    dot_product_c_kernel_single<<<grid_size, nr>>>(hs_dev, hv_new_dev, tau_new_dev, x_dev, h_dev, hv_dev, nr);
+#endif
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n dot product kernel failed %s \n", cudaGetErrorString(err));
+    }
+}
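// The ab_dev accesses in the two double_hh_transform kernels above use a
// LAPACK-style band layout with leading dimension 2*nb: column j of the
// banded matrix starts at offset j*2*nb, and an entry is addressed by its
// row offset within that column. A host-side sketch of that addressing,
// assuming this layout (band_index is an illustrative helper, not ELPA API):
#include <assert.h>

// index of the band entry at row offset d (0 = reference diagonal,
// 1 = next subdiagonal, ...) in column j, leading dimension 2*nb
static inline long band_index(long d, long j, long nb)
{
    assert(d >= 0 && d < 2 * nb);
    return d + j * 2 * nb;
}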
\n",cudaGetErrorString(err) ); + + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_dot_product_kernel_2_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_dev,cuDoubleComplex* hd_dev, int nb ,int nr , int ne) +#else +extern "C" void launch_dot_product_kernel_2_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev, cuFloatComplex* hv_dev,cuFloatComplex* hd_dev, int nb ,int nr , int ne) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_double_hh_transform_1_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev,cuDoubleComplex* hv_dev, int nb , int ns) +#else +extern "C" void launch_double_hh_transform_1_single( cuFloatComplex* ab_dev, cuFloatComplex *hs_dev,cuFloatComplex* hv_dev, int nb , int ns) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_double_hh_transform kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + double_hh_transform_kernel_double<<>>( ab_dev, hs_dev, hv_dev, nb, ns ); +#else + double_hh_transform_kernel_single<<>>( ab_dev, hs_dev, hv_dev, nb, ns ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_double_hh_transform_2_double( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev,cuDoubleComplex* hv_dev, int nc , int ns , int nb ) +#else +extern "C" void launch_double_hh_transform_2_single( cuFloatComplex* ab_dev, cuFloatComplex *hd_dev,cuFloatComplex* hv_dev, int nc , int ns , int nb ) +#endif +{ + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_double_hh_transform kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + double_hh_transform_kernel_2_double<<>>( ab_dev, hd_dev, hv_dev, nc, ns, nb ); +#else + double_hh_transform_kernel_2_single<<>>( ab_dev, hd_dev, hv_dev, nc, ns, nb ); +#endif + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + + } + +} + +#ifdef DOUBLE_PRECISION_COMPLEX +extern "C" void launch_compute_kernel_reduce_double( cuDoubleComplex* a_dev, int lda, int n,int nbw, cuDoubleComplex* h_dev) +#else +extern "C" void launch_compute_kernel_reduce_single( cuFloatComplex* a_dev, int lda, int n,int nbw, cuFloatComplex* h_dev) +#endif +{ + + dim3 grid_size; + grid_size = dim3(1,1, 1); + cudaDeviceSynchronize(); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) printf("error prior to launch_dot_product kernel: %s, %d\n",cudaGetErrorString(err), err); +#ifdef DOUBLE_PRECISION_COMPLEX + compute_kernel_reduce_double<<>>(a_dev, lda, n, nbw,h_dev); +#else + compute_kernel_reduce_single<<>>(a_dev, lda, n, nbw,h_dev); +#endif + cudaDeviceSynchronize(); + err = cudaGetLastError(); + if ( err != cudaSuccess) + { + printf("\n dot product kernel failed %s \n",cudaGetErrorString(err) ); + + 
+#ifdef DOUBLE_PRECISION_COMPLEX
+extern "C" void launch_compute_kernel_reduce_double( cuDoubleComplex* a_dev, int lda, int n, int nbw, cuDoubleComplex* h_dev)
+#else
+extern "C" void launch_compute_kernel_reduce_single( cuFloatComplex* a_dev, int lda, int n, int nbw, cuFloatComplex* h_dev)
+#endif
+{
+    dim3 grid_size;
+    grid_size = dim3(1, 1, 1);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to compute_kernel_reduce: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_COMPLEX
+    compute_kernel_reduce_double<<<grid_size, n>>>(a_dev, lda, n, nbw, h_dev);
+#else
+    compute_kernel_reduce_single<<<grid_size, n>>>(a_dev, lda, n, nbw, h_dev);
+#endif
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n compute_kernel_reduce failed %s \n", cudaGetErrorString(err));
+    }
+}
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+extern "C" void launch_compute_kernel_reduce_1_double( cuDoubleComplex* a_dev, int lda, int n, cuDoubleComplex* h_dev)
+#else
+extern "C" void launch_compute_kernel_reduce_1_single( cuFloatComplex* a_dev, int lda, int n, cuFloatComplex* h_dev)
+#endif
+{
+    dim3 grid_size;
+    grid_size = dim3(1, 1, 1);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to compute_kernel_reduce_1: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_COMPLEX
+    compute_kernel_reduce_1_double<<<grid_size, n>>>(a_dev, lda, n, h_dev);
+#else
+    compute_kernel_reduce_1_single<<<grid_size, n>>>(a_dev, lda, n, h_dev);
+#endif
+    cudaDeviceSynchronize();
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n compute_kernel_reduce_1 failed %s \n", cudaGetErrorString(err));
+    }
+}
diff --git a/src/cuUtils_real_template.Xcu b/src/cuUtils_real_template.Xcu
new file mode 100644
index 0000000000000000000000000000000000000000..defdf8b06f40e84f9ec80e4bc517f9059606ae21
--- /dev/null
+++ b/src/cuUtils_real_template.Xcu
@@ -0,0 +1,314 @@
+// This file is part of ELPA.
+//
+// The ELPA library was originally created by the ELPA consortium,
+// consisting of the following organizations:
+//
+// - Max Planck Computing and Data Facility (MPCDF), formerly known as
+//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+//   Informatik,
+// - Technische Universität München, Lehrstuhl für Informatik mit
+//   Schwerpunkt Wissenschaftliches Rechnen,
+// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+//   and
+// - IBM Deutschland GmbH
+//
+// This particular source code file contains additions, changes and
+// enhancements authored by Intel Corporation which is not part of
+// the ELPA consortium.
+//
+// More information can be found here:
+// http://elpa.mpcdf.mpg.de/
+//
+// ELPA is free software: you can redistribute it and/or modify
+// it under the terms of the version 3 of the license of the
+// GNU Lesser General Public License as published by the Free
+// Software Foundation.
+//
+// ELPA is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with ELPA. If not, see <http://www.gnu.org/licenses/>
+//
+// ELPA reflects a substantial effort on the part of the original
+// ELPA consortium, and we ask you to respect the spirit of the
+// license that we chose: i.e., please contribute any changes you
+// may have back to the original ELPA library distribution, and keep
+// any derivatives of ELPA under the same license that we chose for
+// the original distribution, the GNU Lesser General Public License.
+//
+//
+// --------------------------------------------------------------------------------------------------
+//
+// This file was originally written by NVIDIA
+// and re-written by A. Marek, MPCDF
+
+#include
+#include
+#include
+
+// Reset a reduction block
+// Limitation: the thread-block size must be a divider of the reduction block's size
+#ifdef DOUBLE_PRECISION_REAL
+__device__ void reset_shared_block_c_double ( double * s_block, int b_size)
+#else
+__device__ void reset_shared_block_c_single ( float * s_block, int b_size)
+#endif
+{
+    int i, t_idx, s_chunk;
+    t_idx = threadIdx.x;
+    s_chunk = b_size / blockDim.x;
+    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
+        s_block[i] = 0.0;
+    __syncthreads();
+}
+
+// Reset 2 reduction blocks without an explicit synchronization at the end
+// Limitation: the thread-block size must be a divider of the reduction block's size
+#ifdef DOUBLE_PRECISION_REAL
+__device__ void reset_shared_block_pair_c_double( double *s_block_1, double *s_block_2, int b_size)
+#else
+__device__ void reset_shared_block_pair_c_single( float *s_block_1, float *s_block_2, int b_size)
+#endif
+{
+    int i, t_idx, s_chunk;
+
+    t_idx = threadIdx.x;
+    s_chunk = b_size / blockDim.x;
+    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
+    {
+        s_block_1[i] = 0.0;
+        s_block_2[i] = 0.0;
+    }
+}
+
+// Perform the final summation of a 128-element reduction block using the
+// threads of a single warp
+#ifdef DOUBLE_PRECISION_REAL
+__device__ void warp_reduce_c_double( double *s_block)
+#else
+__device__ void warp_reduce_c_single( float *s_block)
+#endif
+{
+    int t_idx;
+    t_idx = threadIdx.x;
+    __syncthreads();
+
+    if (t_idx < 32)
+    {
+        s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96];
+        if (t_idx < 8)
+            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24];
+        if (t_idx < 4)
+            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4];
+        if (t_idx < 1)
+            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
+    }
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+__global__ void my_pack_c_kernel_double(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
+#else
+__global__ void my_pack_c_kernel_single(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst)
+#endif
+{
+    int b_id, t_id;
+    int dst_ind;
+    b_id = blockIdx.y;
+    t_id = threadIdx.x;
+
+    dst_ind = b_id * stripe_width + t_id;
+    if (dst_ind < max_idx)
+    {
+        // dimension of dst - l_nev, nblk
+        // dimension of src - stripe_width, a_dim2, stripe_count
+        *(dst + dst_ind + (l_nev * blockIdx.x)) = *(src + t_id + (stripe_width * (n_offset + blockIdx.x)) + (b_id * stripe_width * a_dim2));
+    }
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+__global__ void my_unpack_c_kernel_double( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
+#else
+__global__ void my_unpack_c_kernel_single( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* src, float* dst)
+#endif
+{
+    int b_id, t_id;
+    int src_ind;
+
+    b_id = blockIdx.y;
+    t_id = threadIdx.x;
+
+    src_ind = b_id * stripe_width + t_id;
+    if (src_ind < max_idx)
+        *(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2))) = *(src + src_ind + (blockIdx.x) * l_nev);
+}
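// my_pack_c_kernel / my_unpack_c_kernel above move rows between the striped
// layout (stripe_width x a_dim2 x stripe_count) and a contiguous row group
// (l_nev x row_count). A host-side reference of the pack direction, using
// the same index arithmetic (illustrative only, not part of ELPA):
void pack_rows(const double *src, double *dst,
               int n_offset, int max_idx, int stripe_width,
               int a_dim2, int stripe_count, int l_nev, int row_count)
{
    for (int row = 0; row < row_count; row++)        // kernel: blockIdx.x
        for (int b = 0; b < stripe_count; b++)       // kernel: blockIdx.y
            for (int t = 0; t < stripe_width; t++)   // kernel: threadIdx.x
            {
                int dst_ind = b * stripe_width + t;
                if (dst_ind < max_idx)
                    dst[dst_ind + l_nev * row] =
                        src[t + stripe_width * (n_offset + row) + b * stripe_width * a_dim2];
            }
}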
+#ifdef DOUBLE_PRECISION_REAL
+__global__ void extract_hh_tau_c_kernel_double(double* hh, double* hh_tau, const int nbw, const int n, int val)
+#else
+__global__ void extract_hh_tau_c_kernel_single(float* hh, float* hh_tau, const int nbw, const int n, int val)
+#endif
+{
+    int h_idx;
+    h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;
+
+    if (h_idx < n)
+    {
+        // dimension of hh     - (nbw, max_blk_size)
+        // dimension of hh_tau - max_blk_size
+        *(hh_tau + h_idx) = *(hh + (h_idx * nbw));
+        // Replace the first element in the HH reflector with 1.0 or 0.0
+        if (val == 0)
+            *(hh + (h_idx * nbw)) = 1.0;
+        else
+            *(hh + (h_idx * nbw)) = 0.0;
+    }
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+__global__ void compute_hh_dotp_c_kernel_double(double* hh, double* v_dot, const int nbw, const int n)
+{
+    __shared__ double hh_s[128];
+#else
+__global__ void compute_hh_dotp_c_kernel_single(float* hh, float* v_dot, const int nbw, const int n)
+{
+    __shared__ float hh_s[128];
+#endif
+    int t_idx, v_idx;
+
+    // The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
+    v_idx = blockIdx.x;
+
+    // The thread index indicates the position within the two HH reflectors
+    t_idx = threadIdx.x;
+
+//  // The contents of the shared memory must be fully reset
+//  reset_shared_block_c(hh_s, 128);
+
+    // Initialize the contents of the shared buffer (preparing for reduction)
+    if (t_idx > 0)
+        *(hh_s + t_idx) = *(hh + t_idx + v_idx * nbw) * (*(hh + (t_idx - 1) + (v_idx + 1) * nbw));
+    else
+        *(hh_s + t_idx) = 0.0;
+
+    // Compute the dot product using a fast reduction
+#ifdef DOUBLE_PRECISION_REAL
+    warp_reduce_c_double(hh_s);
+#else
+    warp_reduce_c_single(hh_s);
+#endif
+
+    if (t_idx == 0)
+        *(v_dot + v_idx) = *(hh_s);
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+extern "C" void launch_my_pack_c_kernel_double(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* a_dev, double* row_group_dev)
+#else
+extern "C" void launch_my_pack_c_kernel_single(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* a_dev, float* row_group_dev)
+#endif
+{
+    dim3 grid_size;
+    grid_size = dim3(row_count, stripe_count, 1);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to my_pack kernel: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_REAL
+    my_pack_c_kernel_double<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);
+#else
+    my_pack_c_kernel_single<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);
+#endif
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n my_pack_c_kernel failed %s \n", cudaGetErrorString(err));
+    }
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+extern "C" void launch_compute_hh_dotp_c_kernel_double(double* bcast_buffer_dev, double* hh_dot_dev, const int nbw, const int n)
+#else
+extern "C" void launch_compute_hh_dotp_c_kernel_single(float* bcast_buffer_dev, float* hh_dot_dev, const int nbw, const int n)
+#endif
+{
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_REAL
+    compute_hh_dotp_c_kernel_double<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n);
+#else
+    compute_hh_dotp_c_kernel_single<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n);
+#endif
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n compute_hh_dotp kernel failed %s \n", cudaGetErrorString(err));
+    }
+}
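// extract_hh_tau_c_kernel above peels the tau factors off the broadcast
// buffer: hh_tau(j) = hh(0, j), after which hh(0, j) is overwritten with 1
// (val == 0) or 0. A host-side reference of the same transformation
// (illustrative only, not part of ELPA):
void extract_hh_tau_host(double *hh, double *hh_tau, int nbw, int n, int val)
{
    for (int j = 0; j < n; j++)
    {
        hh_tau[j] = hh[j * nbw];               // first element of reflector j
        hh[j * nbw] = (val == 0) ? 1.0 : 0.0;  // normalize the reflector head
    }
}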
+#ifdef DOUBLE_PRECISION_REAL
+extern "C" void launch_extract_hh_tau_c_kernel_double(double* bcast_buffer_dev, double* hh_tau_dev, const int nbw, const int n, const int is_zero)
+#else
+extern "C" void launch_extract_hh_tau_c_kernel_single(float* bcast_buffer_dev, float* hh_tau_dev, const int nbw, const int n, const int is_zero)
+#endif
+{
+    int grid_size;
+    grid_size = 1 + (n - 1) / 256;
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_REAL
+    extract_hh_tau_c_kernel_double<<<grid_size, 256>>>(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero);
+#else
+    extract_hh_tau_c_kernel_single<<<grid_size, 256>>>(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero);
+#endif
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n extract_hh_tau kernel failed %s \n", cudaGetErrorString(err));
+    }
+}
+
+#ifdef DOUBLE_PRECISION_REAL
+extern "C" void launch_my_unpack_c_kernel_double( const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* row_group_dev, double* a_dev)
+#else
+extern "C" void launch_my_unpack_c_kernel_single( const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, float* row_group_dev, float* a_dev)
+#endif
+{
+    dim3 grid_size;
+    grid_size = dim3(row_count, stripe_count, 1);
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n", cudaGetErrorString(err), err);
+#ifdef DOUBLE_PRECISION_REAL
+    my_unpack_c_kernel_double<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev, a_dev);
+#else
+    my_unpack_c_kernel_single<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev, a_dev);
+#endif
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        printf("\n my_unpack_c_kernel failed %s \n", cudaGetErrorString(err));
+    }
+}
+
+#ifndef MEMCPY_ALREADY_DEFINED
+// Return the value of the cudaMemcpyDeviceToDevice enum constant
+extern "C" int cuda_MemcpyDeviceToDevice(int val)
+{
+    val = cudaMemcpyDeviceToDevice;
+    return val;
+}
+#define MEMCPY_ALREADY_DEFINED 1
+#endif
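// cuda_MemcpyDeviceToDevice above exists so that Fortran code, which cannot
// see the C enum, can query the value of cudaMemcpyDeviceToDevice through a
// bind(C) interface. The same pattern works for any runtime constant; a
// hypothetical example for the host-to-device direction (the function name
// is illustrative, not part of ELPA):
extern "C" int cuda_MemcpyHostToDevice_value(void)
{
    return cudaMemcpyHostToDevice;   // enum value handed to Fortran callers
}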
diff --git a/src/cudaFunctions.cu b/src/cudaFunctions.cu
index 422692fe04ad061b228773ac757fa84d16a2bfa9..a2f0968b9274ac1c59963ab7b82a8ebaeb0c2900 100644
--- a/src/cudaFunctions.cu
+++ b/src/cudaFunctions.cu
@@ -1,6 +1,56 @@
 #include
 #include
 #include
+// This file is part of ELPA.
+//
+// The ELPA library was originally created by the ELPA consortium,
+// consisting of the following organizations:
+//
+// - Max Planck Computing and Data Facility (MPCDF), formerly known as
+//   Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+//   Informatik,
+// - Technische Universität München, Lehrstuhl für Informatik mit
+//   Schwerpunkt Wissenschaftliches Rechnen,
+// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+//   Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+//   and
+// - IBM Deutschland GmbH
+//
+// This particular source code file contains additions, changes and
+// enhancements authored by Intel Corporation which is not part of
+// the ELPA consortium.
+//
+// More information can be found here:
+// http://elpa.mpcdf.mpg.de/
+//
+// ELPA is free software: you can redistribute it and/or modify
+// it under the terms of the version 3 of the license of the
+// GNU Lesser General Public License as published by the Free
+// Software Foundation.
+//
+// ELPA is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with ELPA. If not, see <http://www.gnu.org/licenses/>
+//
+// ELPA reflects a substantial effort on the part of the original
+// ELPA consortium, and we ask you to respect the spirit of the
+// license that we chose: i.e., please contribute any changes you
+// may have back to the original ELPA library distribution, and keep
+// any derivatives of ELPA under the same license that we chose for
+// the original distribution, the GNU Lesser General Public License.
+//
+//
+// --------------------------------------------------------------------------------------------------
+//
+// This file was written by A. Marek, MPCDF
+
+
 #include
 #include
 #include
diff --git a/src/elpa1.F90 b/src/elpa1.F90
index 707f10bbc05c761c1134f3f99358c2dd13ed544d..d0ec024cf257a98f2f5804330e08e535e1fc63e4 100644
--- a/src/elpa1.F90
+++ b/src/elpa1.F90
@@ -79,6 +79,7 @@
 #include "config-f90.h"
+
 !> \brief Fortran module which provides the routines to use the one-stage ELPA solver
 module ELPA1
   use precision
@@ -97,14 +98,21 @@ module ELPA1
 
 ! The following routines are public:
 
- public :: get_elpa_row_col_comms     !< old, deprecated interface: Sets MPI row/col communicators
- public :: get_elpa_communicators     !< Sets MPI row/col communicators
-
- public :: solve_evp_real             !< old, deprecated interface: Driver routine for real eigenvalue problem
- public :: solve_evp_real_1stage      !< Driver routine for real eigenvalue problem
- public :: solve_evp_complex          !< old, deprecated interface: Driver routine for complex eigenvalue problem
- public :: solve_evp_complex_1stage   !< Driver routine for complex eigenvalue problem
+ public :: get_elpa_row_col_comms          !< old, deprecated interface: Sets MPI row/col communicators
+ public :: get_elpa_communicators          !< Sets MPI row/col communicators
+ public :: solve_evp_real                  !< old, deprecated interface: Driver routine for real double-precision eigenvalue problem
+ public :: solve_evp_real_1stage           !< Driver routine for real double-precision eigenvalue problem
+ public :: solve_evp_real_1stage_double    !< Driver routine for real double-precision eigenvalue problem
+#ifdef WANT_SINGLE_PRECISION_REAL
+ public :: solve_evp_real_1stage_single    !< Driver routine for real single-precision eigenvalue problem
+#endif
+ public :: solve_evp_complex               !< old, deprecated interface: Driver routine for complex double-precision eigenvalue problem
+ public :: solve_evp_complex_1stage        !< Driver routine for complex double-precision eigenvalue problem
+ public :: solve_evp_complex_1stage_double !< Driver routine for complex double-precision eigenvalue problem
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
+ public :: solve_evp_complex_1stage_single !< Driver routine for complex single-precision eigenvalue problem
+#endif
 !
Timing results, set by every call to solve_evp_xxx real(kind=c_double), public :: time_evp_fwd !< time for forward transformations (to tridiagonal form) @@ -169,7 +177,11 @@ module ELPA1 interface solve_evp_real - module procedure solve_evp_real_1stage + module procedure solve_evp_real_1stage_double + end interface + + interface solve_evp_real_1stage + module procedure solve_evp_real_1stage_double end interface !> \brief solve_evp_complex: old, deprecated Fortran function to solve the complex eigenvalue problem with 1-stage solver. Better use "solve_evp_complex_1stage" @@ -210,7 +222,11 @@ module ELPA1 interface solve_evp_complex - module procedure solve_evp_complex_1stage + module procedure solve_evp_complex_1stage_double + end interface + + interface solve_evp_complex_1stage + module procedure solve_evp_complex_1stage_double end interface contains @@ -256,7 +272,7 @@ function get_elpa_communicators(mpi_comm_global, my_prow, my_pcol, mpi_comm_rows end function get_elpa_communicators -!> \brief solve_evp_real_1stage: Fortran function to solve the real eigenvalue problem with 1-stage solver +!> \brief solve_evp_real_1stage_double: Fortran function to solve the real double-precision eigenvalue problem with 1-stage solver !> ! Parameters ! @@ -290,8 +306,13 @@ end function get_elpa_communicators !> !> \result success +#define DOUBLE_PRECISION_REAL 1 +#define DOUBLE_PRECISION_COMPLEX 1 +#define REAL_DATATYPE rk8 +#define COMPLEX_DATATYPE ck4 -function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) +function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, & + matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) use precision #ifdef HAVE_DETAILED_TIMINGS use timings @@ -300,19 +321,19 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp implicit none integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) + real(kind=REAL_DATATYPE) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) ! was ! real a(lda,*), q(ldq,*) integer(kind=ik) :: my_prow, my_pcol, mpierr - real(kind=rk), allocatable :: e(:), tau(:) + real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:) real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double logical :: success logical, save :: firstCall = .true. logical :: wantDebug #ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_1stage") + call timer%start("solve_evp_real_1stage_double") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) @@ -330,15 +351,149 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp allocate(e(na), tau(na)) ttt0 = MPI_Wtime() - call tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#ifdef DOUBLE_PRECISION_REAL + call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#else + call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#endif + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0 + time_evp_fwd = ttt1-ttt0 + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#endif + if (.not.(success)) return + + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 + time_evp_solve = ttt1-ttt0 + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0 + time_evp_back = ttt1-ttt0 + + deallocate(e, tau) + +#ifdef HAVE_DETAILED_TIMINGS + call timer%stop("solve_evp_real_1stage_double") +#endif + +end function solve_evp_real_1stage_double + +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#undef REAL_DATATYPE +#undef COMPLEX_DATATYPE + +#ifdef WANT_SINGLE_PRECISION_REAL +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#define REAL_DATATYPE rk4 +#define COMPLEX_DATATYPE CK4 +!> \brief solve_evp_real_1stage_single: Fortran function to solve the real single-precision eigenvalue problem with 1-stage solver +!> +! Parameters +! +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed. +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). +!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> +!> \param matrixCols distributed number of matrix columns +!> +!> \param mpi_comm_rows MPI-Communicator for rows +!> \param mpi_comm_cols MPI-Communicator for columns +!> +!> \result success + + +function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols) result(success) + use precision +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use iso_c_binding + implicit none + + integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + real(kind=REAL_DATATYPE) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) + ! was + ! real a(lda,*), q(ldq,*) + + integer(kind=ik) :: my_prow, my_pcol, mpierr + real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:) + real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double + logical :: success + logical, save :: firstCall = .true. 
+ logical :: wantDebug + +#ifdef HAVE_DETAILED_TIMINGS + call timer%start("solve_evp_real_1stage_single") +#endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + + success = .true. + + wantDebug = .false. + if (firstCall) then + ! are debug messages desired? + wantDebug = debug_messages_via_environment_variable() + firstCall = .false. + endif + + allocate(e(na), tau(na)) + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#else + call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#endif ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0 time_evp_fwd = ttt1-ttt0 ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() @@ -346,7 +501,11 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp time_evp_solve = ttt1-ttt0 ttt0 = MPI_Wtime() - call trans_ev_real(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_real_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + call trans_ev_real_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0 time_evp_back = ttt1-ttt0 @@ -354,13 +513,22 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp deallocate(e, tau) #ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_1stage") + call timer%stop("solve_evp_real_1stage_single") #endif -end function solve_evp_real_1stage +end function solve_evp_real_1stage_single +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#undef REAL_DATATYPE +#undef COMPLEX_DATATYPE +#endif /* WANT_SINGLE_PRECISION_REAL */ -!> \brief solve_evp_complex_1stage: Fortran function to solve the complex eigenvalue problem with 1-stage solver +#define DOUBLE_PRECISION_REAL 1 +#define DOUBLE_PRECISION_COMPLEX 1 +#define REAL_DATATYPE rk8 +#define COMPLEX_DATATYPE ck8 +!> \brief solve_evp_complex_1stage_double: Fortran function to solve the complex double-precision eigenvalue problem with 1-stage solver !> ! Parameters ! @@ -394,7 +562,8 @@ end function solve_evp_real_1stage !> !> \result success -function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) result(success) +function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols) result(success) #ifdef HAVE_DETAILED_TIMINGS use timings #endif @@ -403,15 +572,15 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, implicit none integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) + complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols) ! 
was ! complex a(lda,*), q(ldq,*) - real(kind=rk) :: ev(na) + real(kind=REAL_DATATYPE) :: ev(na) integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: l_rows, l_cols, l_cols_nev - real(kind=rk), allocatable :: q_real(:,:), e(:) - complex(kind=ck), allocatable :: tau(:) + real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:) + complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:) real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double logical :: success @@ -419,7 +588,7 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, logical :: wantDebug #ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_complex_1stage") + call timer%start("solve_evp_complex_1stage_double") #endif call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) @@ -446,14 +615,23 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, allocate(q_real(l_rows,l_cols)) ttt0 = MPI_Wtime() - call tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#ifdef DOUBLE_PRECISION_COMPLEX + call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#else + call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#endif ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0 time_evp_fwd = ttt1-ttt0 ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & +#ifdef DOUBLE_PRECISION_COMPLEX + call solve_tridi_double(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & mpi_comm_cols, wantDebug, success) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() @@ -462,8 +640,152 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, ttt0 = MPI_Wtime() q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) +#ifdef DOUBLE_PRECISION_COMPLEX + call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0 + time_evp_back = ttt1-ttt0 + + deallocate(q_real) + deallocate(e, tau) +#ifdef HAVE_DETAILED_TIMINGS + call timer%stop("solve_evp_complex_1stage_double") +#endif + +end function solve_evp_complex_1stage_double +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#undef REAL_DATATYPE +#undef COMPLEX_DATATYPE + + +#ifdef WANT_SINGLE_PRECISION_COMPLEX +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#define COMPLEX_DATATYPE ck4 +#define REAL_DATATYPE rk4 + +!> \brief solve_evp_complex_1stage_single: Fortran function to solve the complex single-precision eigenvalue problem with 1-stage solver +!> +! Parameters +! +!> \param na Order of matrix a +!> +!> \param nev Number of eigenvalues needed. +!> The smallest nev eigenvalues/eigenvectors are calculated. +!> +!> \param a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed. +!> Distribution is like in Scalapack. +!> The full matrix must be set (not only one half like in scalapack). +!> Destroyed on exit (upper and lower half). 
+!> +!> \param lda Leading dimension of a +!> +!> \param ev(na) On output: eigenvalues of a, every processor gets the complete set +!> +!> \param q(ldq,matrixCols) On output: Eigenvectors of a +!> Distribution is like in Scalapack. +!> Must be always dimensioned to the full size (corresponding to (na,na)) +!> even if only a part of the eigenvalues is needed. +!> +!> \param ldq Leading dimension of q +!> +!> \param nblk blocksize of cyclic distribution, must be the same in both directions! +!> +!> \param matrixCols distributed number of matrix columns +!> +!> \param mpi_comm_rows MPI-Communicator for rows +!> \param mpi_comm_cols MPI-Communicator for columns +!> +!> \result success + + +function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols) result(success) +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + use iso_c_binding + implicit none - call trans_ev_complex(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) + integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols) + ! was + ! complex a(lda,*), q(ldq,*) + real(kind=REAL_DATATYPE) :: ev(na) + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: l_rows, l_cols, l_cols_nev + real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:) + complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:) + real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double + + logical :: success + logical, save :: firstCall = .true. + logical :: wantDebug + +#ifdef HAVE_DETAILED_TIMINGS + call timer%start("solve_evp_complex_1stage_single") +#endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + + success = .true. + + wantDebug = .false. + if (firstCall) then + ! are debug messages desired? + wantDebug = debug_messages_via_environment_variable() + firstCall = .false. + endif + + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q + + l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + + allocate(e(na), tau(na)) + allocate(q_real(l_rows,l_cols)) + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#else + call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau) +#endif + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0 + time_evp_fwd = ttt1-ttt0 + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call solve_tridi_double(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q_real, l_rows, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#endif + if (.not.(success)) return + + ttt1 = MPI_Wtime() + if(my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 + time_evp_solve = ttt1-ttt0 + + ttt0 = MPI_Wtime() + q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) +#ifdef DOUBLE_PRECISION_COMPLEX + call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif ttt1 = MPI_Wtime() if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0 time_evp_back = ttt1-ttt0 @@ -471,9 +793,15 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, deallocate(q_real) deallocate(e, tau) #ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_1stage") + call timer%stop("solve_evp_complex_1stage_single") #endif -end function solve_evp_complex_1stage +end function solve_evp_complex_1stage_single +#undef DOUBLE_PRECISION_REAL +#undef DOUBLE_PRECISION_COMPLEX +#undef COMPLEX_DATATYPE +#undef REAL_DATATYPE + +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ end module ELPA1 diff --git a/src/elpa1_compute.F90 b/src/elpa1_compute.F90 index c1168b27fda8c28fe78a25f8a9fce71a1e2bd11c..8a6385dba504272a1b384bbfa98d0693caa9ff58 100644 --- a/src/elpa1_compute.F90 +++ b/src/elpa1_compute.F90 @@ -62,5226 +62,262 @@ module ELPA1_compute PRIVATE ! set default to private - public :: tridiag_real ! Transform real symmetric matrix to tridiagonal form - public :: trans_ev_real ! Transform eigenvectors of a tridiagonal matrix back - public :: mult_at_b_real ! Multiply real matrices A**T * B + public :: tridiag_real_double ! Transform real symmetric matrix to tridiagonal form + public :: tridiag_real + public :: trans_ev_real_double ! Transform real eigenvectors of a tridiagonal matrix back + public :: trans_ev_real + public :: mult_at_b_real_double ! Multiply real matrices A**T * B + public :: mult_at_b_real - public :: tridiag_complex ! Transform complex hermitian matrix to tridiagonal form - public :: trans_ev_complex ! Transform eigenvectors of a tridiagonal matrix back - public :: mult_ah_b_complex ! Multiply complex matrices A**H * B + interface tridiag_real + module procedure tridiag_real_double + end interface - public :: solve_tridi ! Solve tridiagonal eigensystem with divide and conquer method + interface trans_ev_real + module procedure trans_ev_real_double + end interface - public :: cholesky_real ! Cholesky factorization of a real matrix - public :: invert_trm_real ! Invert real triangular matrix + interface mult_at_b_real + module procedure mult_at_b_real_double + end interface - public :: cholesky_complex ! Cholesky factorization of a complex matrix - public :: invert_trm_complex ! Invert complex triangular matrix - - public :: local_index ! Get local index of a block cyclic distributed matrix - public :: least_common_multiple ! 
Get least common multiple - - public :: hh_transform_real - public :: hh_transform_complex - - public :: elpa_reduce_add_vectors_complex, elpa_reduce_add_vectors_real - public :: elpa_transpose_vectors_complex, elpa_transpose_vectors_real - - contains -#ifdef DOUBLE_PRECISION_REAL - -#define DATATYPE REAL(kind=rk) -#define BYTESIZE 8 -#define REALCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef REALCASE - -#else - -#define DATATYPE REAL(kind=rk) -#define BYTESIZE 4 -#define REALCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef REALCASE - -#endif /* DOUBLE_PRECISION_REAL */ - - subroutine tridiag_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form - ! (like Scalapack Routine PDSYTRD) - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na), tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*) -#else - real(kind=rk) :: a(lda,matrixCols) -#endif - - integer(kind=ik), parameter :: max_stored_rows = 32 - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - real(kind=rk), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! 
minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating tmp "//errorMessage - stop - endif - - allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vr "//errorMessage - stop - endif - - allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating ur "//errorMessage - stop - endif - - allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vc "//errorMessage - stop - endif - - allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating uc "//errorMessage - stop - endif - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating ur_p "//errorMessage - stop - endif - - allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating uc_p "//errorMessage - stop - endif - -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating vur "//errorMessage - stop - endif - - allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating uvc "//errorMessage - stop - endif - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if(my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if(nstor>0 .and. 
l_rows>0) then -#ifdef DOUBLE_PRECISION_REAL - call DGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), & - uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1) -#else - call SGEMV('N', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), & - uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk, vr, 1) - -#endif - endif - - if(my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. - endif - -#ifdef WITH_MPI - -#if DOUBLE_PRECISION_REAL - call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - aux2 = aux1 -#endif /* WITH_MPI */ - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if(my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! Broadcast the Householder vector (and tau) along columns - - if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - - call elpa_transpose_vectors_real (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, istep-1, 1, nblk) - - - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. -#endif - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then -#ifdef DOUBLE_PRECISION_REAL - call DGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1) - call DGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1) -#else - call SGEMV('T', l_rows, 2*nstor, 1.0_rk, vur, ubound(vur,dim=1), vr, 1, 0.0_rk, aux, 1) - call SGEMV('N', l_cols, 2*nstor, 1.0_rk, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk, uc, 1) -#endif - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_REAL (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - istep-1, 1, nblk) - endif - - ! 
Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - uc = tmp -#endif /* WITH_MPI */ - - endif - - call elpa_transpose_vectors_real (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, istep-1, 1, nblk) - - ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - x = 0 - if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) -#else - call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) -#endif - -#else /* WITH_MPI */ - - vav = x -#endif /* WITH_MPI */ - - ! store u and v in the matrices U and V - ! these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = tau(istep)*vr(j) - vur(j,2*nstor+2) = 0.5*tau(istep)*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*tau(istep)*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = tau(istep)*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo - - ! Store e(1) and d(1) - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when deallocating uvc "//errorMessage - stop - endif - - - ! distribute the arrays d and e to all processors - - allocate(tmp(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_real: error when allocating tmp "//errorMessage - stop - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - tmp = d - call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - tmp = d - call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) - tmp = e - call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - tmp = e - call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) -#else - tmp = d - call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) - tmp = d - call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) - tmp = e - call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) - tmp = e - call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - deallocate(tmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_real: error when deallocating tmp "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_real") -#endif - - end subroutine tridiag_real - - subroutine trans_ev_real(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PDORMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a and q - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), q(ldq,*) -#else - real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - - integer(kind=ik) :: max_stored_rows - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - real(kind=rk), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - real(kind=rk), allocatable :: tmat(:,:), h1(:), h2(:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_real") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating tmat "//errorMessage - stop - endif - - allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating h1 "//errorMessage - stop - endif - - allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating h2 "//errorMessage - stop - endif - - allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"trans_ev_real: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating tmp2 "//errorMessage - stop - endif - - allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating hvn "//errorMessage - stop - endif - - allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when allocating hvm "//errorMessage - stop - endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - nstor = 0 - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice0) & -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use dsyrk - - tmat = 0 - if (l_rows>0) & -#ifdef DOUBLE_PRECISION_REAL - call dsyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows) -#else - call ssyrk('U', 'T', nstor, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), 0.0_rk, tmat, max_stored_rows) -#endif - - - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - if (nc>0) h2 = h1 -#endif /* WITH_MPI */ - ! Calculate triangular matrix T - - nc = 0 - tmat(1,1) = tau(ice-nstor+1) - do n=1,nstor-1 -#ifdef DOUBLE_PRECISION_REAL - call dtrmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) -#else - call strmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) -#endif - tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) - tmat(n+1,n+1) = tau(ice-nstor+n+1) - nc = nc+n - enddo - - ! 
! Q = Q - V * T * V**T * Q - - if (l_rows>0) then -#ifdef DOUBLE_PRECISION_REAL - call dgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & - q, ldq, 0.0_rk, tmp1, nstor) -#else - call sgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & - q, ldq, 0.0_rk, tmp1, nstor) -#endif - - else - tmp1(1:l_cols*nstor) = 0 - endif - -#ifdef DOUBLE_PRECISION_REAL - -#ifdef WITH_MPI - call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - tmp2 = tmp1 -#endif - if (l_rows>0) then - call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor) - call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), & - tmp2, nstor, 1.0_rk, q, ldq) - endif -#else /* DOUBLE_PRECISION_REAL */ - -#ifdef WITH_MPI - call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#else - tmp2 = tmp1 -#endif - - if (l_rows>0) then - call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk, tmat, max_stored_rows, tmp2, nstor) - call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk, hvm, ubound(hvm,dim=1), & - tmp2, nstor, 1.0_rk, q, ldq) - endif -#endif /* DOUBLE_PRECISION_REAL */ - nstor = 0 - endif - - enddo - - deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_real: error when deallocating hvm "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_real") -#endif - - end subroutine trans_ev_real - - subroutine mult_at_b_real(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, & - mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols) - - !------------------------------------------------------------------------------- - ! mult_at_b_real: Performs C := A**T * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! uplo_c 'U' if only the upper triangular part of C is needed - ! 'L' if only the lower triangular part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! lda Leading dimension of a - ! ldaCols Columns of Matrix a - ! - ! b Matrix B - ! - ! ldb Leading dimension of b - ! ldbCols Columns of Matrix b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! ldcCols Columns of Matrix c - !-------------------------------------------------------------------------------
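! A typical call, for reference: C := A**T * B over the whole process grid, with
! 'F'/'F' (i.e. "anything else") requesting full A and full C. This is only a
! hedged usage sketch -- it assumes a, b and c are already block-cyclically
! distributed with the same nblk, and that the row/column communicators were
! obtained via get_elpa_communicators:
!
!   call mult_at_b_real('F', 'F', na, ncb, a, lda, ldaCols, &
!                       b, ldb, ldbCols, nblk,              &
!                       mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols)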
#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - - integer(kind=ik), intent(in) :: na, lda, ldaCols, ldb, ldbCols, ldc, ldcCols, nblk - integer(kind=ik) :: ncb, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - real(kind=rk), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_at_b_real") -#endif - if (na .lt. lda) then - print *,"na lt lda ",na,lda - stop - endif - if (na .lt. ldb) then - print *,"na lt ldb ",na,ldb - stop - endif - if (na .lt. ldc) then - print *,"na lt ldc ",na,ldc - stop - endif - if (na .lt. ldaCols) then - print *,"na lt ldaCols ",na,ldaCols - stop - endif - if (na .lt. ldbCols) then - print *,"na lt ldbCols ",na,ldbCols - stop - endif - if (na .lt. ldcCols) then - print *,"na lt ldcCols ",na,ldcCols - stop - endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating aux_mat "//errorMessage - stop - endif - - allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating aux_bc "//errorMessage - stop - endif - - allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating lrs_save "//errorMessage - stop - endif - - allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating lre_save "//errorMessage - stop - endif - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. - if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! 
Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - ! Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when allocating tmp1 "//errorMessage - stop - endif - - if (lrs<=lre) then -#ifdef DOUBLE_PRECISION_REAL - call dgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), & - b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor) -#else - call sgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk, aux_mat(lrs,1), ubound(aux_mat,dim=1), & - b(lrs,lcs), ldb, 0.0_rk, tmp1, nstor) -#endif - - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL8, MPI_SUM, np, mpi_comm_rows, mpierr) -#else - call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmp2 = tmp1 -#endif /* WITH_MPI */ - ! Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"mult_at_b_real: error when deallocating tmp1 "//errorMessage - stop - endif - - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_at_b_real: error when deallocating aux_mat "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_at_b_real") -#endif - - end subroutine mult_at_b_real - -#ifdef DOUBLE_PRECISION_COMPLEX - -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 16 -#define COMPLEXCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef COMPLEXCASE - -#else /* DOUBLE_PRECISION_COMPLEX */ - -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 8 -#define COMPLEXCASE 1 -#include "elpa_transpose_vectors.X90" -#include "elpa_reduce_add_vectors.X90" -#undef DATATYPE -#undef BYTESIZE -#undef COMPLEXCASE - -#endif /* DOUBLE_PRECISION_COMPLEX */ - subroutine tridiag_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) - - !------------------------------------------------------------------------------- - ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form - ! (like Scalapack Routine PZHETRD) - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the Householder vectors - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! d(na) Diagonal elements (returned), identical on all processors - ! - ! e(na) Off-Diagonal elements (returned), identical on all processors - ! - ! tau(na) Factors for the Householder vectors (returned), needed for back transformation - ! 
- !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - real(kind=rk) :: d(na), e(na) - - integer(kind=ik), parameter :: max_stored_rows = 32 - - complex(kind=ck), parameter :: CZERO = (0.0_rk,0.0_rk), CONE = (1.0_rk,0.0_rk) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, nstor - integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - -#ifdef WITH_OPENMP - integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter - integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads -#endif - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf - - complex(kind=ck), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) -#ifdef WITH_OPENMP - complex(kind=ck), allocatable :: ur_p(:,:), uc_p(:,:) -#endif - real(kind=rk), allocatable :: tmpr(:) - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = (totalblocks-1)/np_cols + 1 - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating tmp "//errorMessage - stop - endif - - allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vr "//errorMessage - stop - endif - - allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating ur "//errorMessage - stop - endif - - allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vc "//errorMessage - stop - endif - - allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating uc "//errorMessage - stop - endif - -#ifdef WITH_OPENMP - max_threads = omp_get_max_threads() - - allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_complex: error when allocating ur_p "//errorMessage - stop - endif - - allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating uc_p "//errorMessage - stop - endif -#endif - - tmp = 0 - vr = 0 - ur = 0 - vc = 0 - uc = 0 - - allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating vur "//errorMessage - stop - endif - - allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating uvc "//errorMessage - stop - endif - - d(:) = 0 - e(:) = 0 - tau(:) = 0 - - nstor = 0 - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) - - do istep=na,3,-1 - - ! Calculate number of local rows and columns of the still remaining matrix - ! on the local processor - - l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) - l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) - - ! Calculate vector for Householder transformation on all procs - ! owning column istep - - if (my_pcol==pcol(istep, nblk, np_cols)) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:l_rows) = a(1:l_rows,l_cols+1) - if (nstor>0 .and. l_rows>0) then - aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) -#ifdef DOUBLE_PRECISION_COMPLEX - call ZGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & - aux, 1, CONE, vr, 1) -#else - call CGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & - aux, 1, CONE, vr, 1) -#endif - endif - - if (my_prow==prow(istep-1, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) - aux1(2) = vr(l_rows) - else - aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) - aux1(2) = 0. - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - aux2 = aux1 -#endif /* WITH_MPI */ - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau(istep)) - - ! Scale vr and store Householder vector for back transformation - - vr(1:l_rows) = vr(1:l_rows) * xf - if (my_prow==prow(istep-1, nblk, np_rows)) then - vr(l_rows) = 1. - e(istep-1) = vrl - endif - a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation - - endif - - ! Broadcast the Householder vector (and tau) along columns - - if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(vr, l_rows+1, MPI_DOUBLE_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(vr, l_rows+1, MPI_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - tau(istep) = vr(l_rows+1) - - ! Transpose Householder vector vr -> vc - -! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & -! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & -! 
1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (vr, ubound(vr,dim=1), mpi_comm_rows, & - vc, ubound(vc,dim=1), mpi_comm_cols, & - 1, (istep-1), 1, nblk) - ! Calculate u = (A + VU**T + UV**T)*v - - ! For cache efficiency, we use only the upper half of the matrix tiles for this, - ! thus the result is partly in uc(:) and partly in ur(:) - - uc(1:l_cols) = 0 - ur(1:l_rows) = 0 - if (l_rows>0 .and. l_cols>0) then - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) - - my_thread = omp_get_thread_num() - n_threads = omp_get_num_threads() - - n_iter = 0 - - uc_p(1:l_cols,my_thread) = 0. - ur_p(1:l_rows,my_thread) = 0. -#endif - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then -#ifdef DOUBLE_PRECISION_COMPLEX - call ZGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) - call ZGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) -#else - call CGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) - call CGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) -#endif - endif - - endif - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (tile_size < istep-1) then - call elpa_reduce_add_vectors_COMPLEX (ur, ubound(ur,dim=1), mpi_comm_rows, & - uc, ubound(uc,dim=1), mpi_comm_cols, & - (istep-1), 1, nblk) - endif - - ! Sum up all the uc(:) parts, transpose uc -> ur - - if (l_cols>0) then - tmp(1:l_cols) = uc(1:l_cols) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(tmp, uc, l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(tmp, uc, l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - uc = tmp -#endif /* WITH_MPI */ - endif - -! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & -! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & -! 1, 2*(istep-1), 1, 2*nblk) - - call elpa_transpose_vectors_complex (uc, ubound(uc,dim=1), mpi_comm_cols, & - ur, ubound(ur,dim=1), mpi_comm_rows, & - 1, (istep-1), 1, nblk) - - - - ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) - - xc = 0 - if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(xc, vav, 1 , MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) -#else - call mpi_allreduce(xc, vav, 1 , MPI_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) -#endif - -#else /* WITH_MPI */ - vav = xc -#endif /* WITH_MPI */ - - ! store u and v in the matrices U and V - ! these matrices are stored combined in one here - - do j=1,l_rows - vur(j,2*nstor+1) = conjg(tau(istep))*vr(j) - vur(j,2*nstor+2) = 0.5*conjg(tau(istep))*vav*vr(j) - ur(j) - enddo - do j=1,l_cols - uvc(j,2*nstor+1) = 0.5*conjg(tau(istep))*vav*vc(j) - uc(j) - uvc(j,2*nstor+2) = conjg(tau(istep))*vc(j) - enddo - - nstor = nstor+1 - - ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T - - if (nstor==max_stored_rows .or. 
istep==3) then - - do i=0,(istep-2)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = 1 - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & - + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) - d(istep-1) = a(l_rows,l_cols) - endif - - enddo ! istep - - ! Store e(1) and d(1) - - if (my_pcol==pcol(2, nblk, np_cols)) then - if (my_prow==prow(1, nblk, np_rows)) then - ! We use last l_cols value of loop above - vrl = a(1,l_cols) - call hh_transform_complex(vrl, 0.0_rk, xf, tau(2)) - e(1) = vrl - a(1,l_cols) = 1. ! for consistency only - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr) -#else - call mpi_bcast(tau(2), 1, MPI_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call mpi_bcast(tau(2), 1, MPI_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - - - if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) - - deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when deallocating tmp "//errorMessage - stop - endif - ! distribute the arrays d and e to all processors - - allocate(tmpr(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when allocating tmpr "//errorMessage - stop - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - tmpr = d - call mpi_allreduce(tmpr, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - tmpr = d - call mpi_allreduce(tmpr, d, na, MPI_REAL8 ,MPI_SUM, mpi_comm_cols, mpierr) - tmpr = e - call mpi_allreduce(tmpr, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) - tmpr = e - call mpi_allreduce(tmpr, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) -#else - tmpr = d - call mpi_allreduce(tmpr, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) - tmpr = d - call mpi_allreduce(tmpr, d, na, MPI_REAL4 ,MPI_SUM, mpi_comm_cols, mpierr) - tmpr = e - call mpi_allreduce(tmpr, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) - tmpr = e - call mpi_allreduce(tmpr, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - deallocate(tmpr, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_complex: error when deallocating tmpr "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_complex") -#endif - - end subroutine tridiag_complex - - subroutine trans_ev_complex(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) - - !------------------------------------------------------------------------------- - ! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back - ! to the eigenvectors of the original matrix - ! (like Scalapack Routine PZUNMTR) - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nqc Number of columns of matrix q - ! - ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex) - ! Distribution is like in Scalapack. - ! - ! lda Leading dimension of a - ! - ! tau(na) Factors of the Householder vectors - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! 
On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: tau(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), q(ldq,*) -#else - complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) -#endif - integer(kind=ik) :: max_stored_rows - - complex(kind=ck), parameter :: CZERO = (0.0_rk,0.0_rk), CONE = (1.0_rk,0.0_rk) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols - integer(kind=ik) :: l_cols, l_rows, l_colh, nstor - integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol - - complex(kind=ck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) - complex(kind=ck), allocatable :: tmat(:,:), h1(:), h2(:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_complex") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - - totalblocks = (na-1)/nblk + 1 - max_blocks_row = (totalblocks-1)/np_rows + 1 - max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! - - max_local_rows = max_blocks_row*nblk - max_local_cols = max_blocks_col*nblk - - max_stored_rows = (63/nblk+1)*nblk - - allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmat "//errorMessage - stop - endif - - allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating h1 "//errorMessage - stop - endif - - allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating h2 "//errorMessage - stop - endif - - allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating tmp2 "//errorMessage - stop - endif - - allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating hvb "//errorMessage - stop - endif - - allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_complex: error when allocating hvm "//errorMessage - stop - endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - - nstor = 0 - - ! 
In the complex case tau(2) /= 0 - if (my_prow == prow(1, nblk, np_rows)) then - q(1,1:l_cols) = q(1,1:l_cols)*((1.0_rk,0.0_rk)-tau(2)) - endif - - do istep=1,na,nblk - - ics = MAX(istep,3) - ice = MIN(istep+nblk-1,na) - if (ice<ics) cycle - - cur_pcol = pcol(istep, nblk, np_cols) - - ! Gather the complete Householder vectors of this step on their owner column - - nb = 0 - do ic=ics,ice - l_colh = local_index(ic , my_pcol, np_cols, nblk, -1) ! Column of Householder vector - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - if (my_pcol==cur_pcol) then - hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - if (my_prow==prow(ic-1, nblk, np_rows)) hvb(nb+l_rows) = 1. - endif - nb = nb+l_rows - enddo - -#ifdef WITH_MPI - if (nb>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(hvb, nb, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(hvb, nb, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - nb = 0 - do ic=ics,ice - l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector - hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) - nstor = nstor+1 - nb = nb+l_rows - enddo - - ! Please note: for smaller matrix sizes (na/np_rows<=256), a value of 32 for nstor is enough! - if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then - - ! Calculate scalar products of stored vectors. - ! This can be done in different ways, we use zherk - - tmat = 0 - if (l_rows>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call zherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) -#else - call cherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) -#endif - nc = 0 - do n=1,nstor-1 - h1(nc+1:nc+n) = tmat(1:n,n+1) - nc = nc+n - enddo -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - if (nc>0) h2=h1 -#endif /* WITH_MPI */
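! What follows builds the triangular factor T of the compact-WY form of the
! nstor stored reflectors: up to ordering conventions, the accumulated product
! of the H(i) = I - tau(i)*v(i)*v(i)**H satisfies
!     H(1)*H(2)*...*H(k) = I - V * T * V**H ,
! and each new row of T is obtained from the Gram products v(i)**H * v(j)
! gathered in h2. LAPACK's ZLARFT uses the same recurrence, with T stored
! upper triangular instead of the lower-triangular tmat used here.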
0) then - print *,"trans_ev_complex: error when deallocating hvb "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_complex") -#endif - - end subroutine trans_ev_complex - - subroutine mult_ah_b_complex(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, & - mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols) - - !------------------------------------------------------------------------------- - ! mult_ah_b_complex: Performs C := A**H * B - ! - ! where: A is a square matrix (na,na) which is optionally upper or lower triangular - ! B is a (na,ncb) matrix - ! C is a (na,ncb) matrix where optionally only the upper or lower - ! triangle may be computed - ! - ! Parameters - ! - ! uplo_a 'U' if A is upper triangular - ! 'L' if A is lower triangular - ! anything else if A is a full matrix - ! Please note: This pertains to the original A (as set in the calling program) - ! whereas the transpose of A is used for calculations - ! If uplo_a is 'U' or 'L', the other triangle is not used at all, - ! i.e. it may contain arbitrary numbers - ! - ! uplo_c 'U' if only the upper diagonal part of C is needed - ! 'L' if only the upper diagonal part of C is needed - ! anything else if the full matrix C is needed - ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be - ! written to a certain extent, i.e. one shouldn't rely on the content there! - ! - ! na Number of rows/columns of A, number of rows of B and C - ! - ! ncb Number of columns of B and C - ! - ! a Matrix A - ! - ! lda Leading dimension of a - ! ldaCols Columns of Matrix a - ! - ! b Matrix B - ! - ! ldb Leading dimension of b - ! ldbCols Columns of Matrix b - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! c Matrix C - ! - ! ldc Leading dimension of c - ! ldcCols Columns of Matrix C - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - character*1 :: uplo_a, uplo_c - integer(kind=ik), intent(in) :: lda, ldaCols, ldb, ldbCols, ldc, ldcCols - integer(kind=ik) :: na, ncb, nblk, mpi_comm_rows, mpi_comm_cols - complex(kind=ck) :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols) ! removed assumed_size - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_rows_np - integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce - integer(kind=ik) :: gcol_min, gcol, goff - integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals - integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) - - logical :: a_lower, a_upper, c_lower, c_upper - - complex(kind=ck), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("mult_ah_b_complex") -#endif - if (na .lt. lda) then - print *,"na lt lda ",na,lda - stop - endif - if (na .lt. ldb) then - print *,"na lt ldb ",na,ldb - stop - endif - if (na .lt. ldc) then - print *,"na lt ldc ",na,ldc - stop - endif - if (na .lt. ldaCols) then - print *,"na lt ldaCols ",na,ldaCols - stop - endif - if (na .lt. ldbCols) then - print *,"na lt ldbCols ",na,ldbCols - stop - endif - if (na .lt. 
ldcCols) then - print *,"na lt ldcCols ",na,ldcCols - stop - endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b - l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b - - ! Block factor for matrix multiplications, must be a multiple of nblk - - if (na/np_rows<=256) then - nblk_mult = (31/nblk+1)*nblk - else - nblk_mult = (63/nblk+1)*nblk - endif - - allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating aux_mat "//errorMessage - stop - endif - - allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating aux_bc "//errorMessage - stop - endif - - allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating lrs_save "//errorMessage - stop - endif - - allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating lre_save "//errorMessage - stop - endif - - a_lower = .false. - a_upper = .false. - c_lower = .false. - c_upper = .false. - - if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. - if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. - if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. - if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. - - ! Build up the result matrix by processor rows - - do np = 0, np_rows-1 - - ! In this turn, procs of row np assemble the result - - l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors - - nr_done = 0 ! Number of rows done - aux_mat = 0 - nstor = 0 ! Number of columns stored in aux_mat - - ! Loop over the blocks on row np - - do nb=0,(l_rows_np-1)/nblk - - goff = nb*np_rows + np ! Global offset in blocks corresponding to nb - - ! Get the processor column which owns this block (A is transposed, so we need the column) - ! and the offset in blocks within this column. - ! The corresponding block column in A is then broadcast to all for multiplication with B - - np_bc = MOD(goff,np_cols) - noff = goff/np_cols - n_aux_bc = 0 - - ! Gather up the complete block column of A on the owner - - do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast - - gcol = goff*nblk + n ! global column corresponding to n - if (nstor==0 .and. n==1) gcol_min = gcol - - lrs = 1 ! 1st local row number for broadcast - lre = l_rows ! last local row number for broadcast - if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - if (lrs<=lre) then - nvals = lre-lrs+1 - if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) - n_aux_bc = n_aux_bc + nvals - endif - - lrs_save(n) = lrs - lre_save(n) = lre - - enddo - - ! Broadcast block column -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(aux_bc, n_aux_bc, MPI_DOUBLE_COMPLEX, np_bc, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(aux_bc, n_aux_bc, MPI_COMPLEX, np_bc, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - ! 
Insert what we got in aux_mat - - n_aux_bc = 0 - do n = 1, min(l_rows_np-nb*nblk,nblk) - nstor = nstor+1 - lrs = lrs_save(n) - lre = lre_save(n) - if (lrs<=lre) then - nvals = lre-lrs+1 - aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) - n_aux_bc = n_aux_bc + nvals - endif - enddo - - ! If we got nblk_mult columns in aux_mat or this is the last block - ! do the matrix multiplication - - if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then - - lrs = 1 ! 1st local row number for multiply - lre = l_rows ! last local row number for multiply - if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) - if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) - - lcs = 1 ! 1st local col number for multiply - lce = l_cols ! last local col number for multiply - if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) - if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) - - if (lcs<=lce) then - allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when allocating tmp1 "//errorMessage - stop - endif - - if (lrs<=lre) then -#ifdef DOUBLE_PRECISION_COMPLEX - call zgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk,0.0_rk), aux_mat(lrs,1), ubound(aux_mat,dim=1), & - b(lrs,lcs), ldb, (0.0_rk,0.0_rk), tmp1, nstor) -#else - call cgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk,0.0_rk), aux_mat(lrs,1), ubound(aux_mat,dim=1), & - b(lrs,lcs), ldb, (0.0_rk,0.0_rk), tmp1, nstor) -#endif - else - tmp1 = 0 - endif - - ! Sum up the results and send to processor row np -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) -#else - call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmp2 = tmp1 -#endif /* WITH_MPI */ - ! Put the result into C - if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) - - deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage - stop - endif - - endif - - nr_done = nr_done+nstor - nstor=0 - aux_mat(:,:)=0 - endif - enddo - enddo - - deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"mult_ah_b_complex: error when deallocating aux_mat "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_ah_b_complex") -#endif - - end subroutine mult_ah_b_complex
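! solve_tridi below follows Cuppen's divide-and-conquer strategy: tearing the
! symmetric tridiagonal matrix T at an off-diagonal entry e(n) gives
!     T = [ T1  0 ] + |e(n)| * v * v**T ,  v = (0,..,0, 1, s, 0,..,0)**T,  s = sign(e(n)),
!         [ 0  T2 ]
! where the diagonal entries of T1/T2 next to the split are reduced by |e(n)|
! -- exactly the "subtracting rank 1 modifications" steps in the routine.
! A minimal 2x2 check of this identity (all names illustrative only):

program rank1_split_check
  implicit none
  real(8) :: d1, d2, e, s, t(2,2), r(2,2), v(2)
  d1 = 2.0d0; d2 = 3.0d0; e = -0.5d0
  s = sign(1.0d0, e)
  v = (/ 1.0d0, s /)
  t = reshape((/ d1, e, e, d2 /), (/ 2, 2 /))      ! the untorn matrix
  r = 0.0d0
  r(1,1) = d1 - abs(e)                             ! torn diagonal blocks
  r(2,2) = d2 - abs(e)
  r = r + abs(e) * spread(v,2,2) * spread(v,1,2)   ! add the rank-1 modification back
  print *, 'max deviation:', maxval(abs(r - t))    ! ~0: the identity holds
end program rank1_split_check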
0) then - print *,"mult_ah_b_complex: error when deallocating aux_mat "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("mult_ah_b_complex") -#endif - - end subroutine mult_ah_b_complex - - subroutine solve_tridi( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success ) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: d(na), e(na) -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - - integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - - integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_tridi") -#endif - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - - ! Set Q to 0 - - q(1:l_rows, 1:l_cols) = 0. - - ! Get the limits of the subdivisons, each subdivison has as many cols - ! as fit on the respective processor column - - allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating limits "//errorMessage - stop - endif - - limits(0) = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np - - ! Check for the case that a column has have zero width. - ! This is not supported! - ! Scalapack supports it but delivers no results for these columns, - ! which is rather annoying - if (nc==0) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width' - success = .false. - return - endif - limits(np+1) = limits(np) + nc - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,np_cols-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - ! Solve sub problems on processsor columns - - nc = limits(my_pcol) ! column after which my problem starts - - if (np_cols>1) then - nev1 = l_cols ! all eigenvectors are needed - else - nev1 = MIN(nev,l_cols) - endif - - call solve_tridi_col(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & - matrixCols, mpi_comm_rows, wantDebug, success) - - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - ! If there is only 1 processor column, we are done - - if (np_cols==1) then - deallocate(limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when deallocating limits "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - ! Set index arrays for Q columns - - ! Dense distribution scheme: - - allocate(l_col(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"solve_tridi: error when allocating l_col "//errorMessage - stop - endif - - allocate(p_col(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating p_col "//errorMessage - stop - endif - - n = 0 - do np=0,np_cols-1 - nc = local_index(na, np, np_cols, nblk, -1) - do i=1,nc - n = n+1 - l_col(n) = i - p_col(n) = np - enddo - enddo - - ! Block cyclic distribution scheme, only nev columns are set: - - allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating l_col_bc "//errorMessage - stop - endif - - allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when allocating p_col_bc "//errorMessage - stop - endif - - p_col_bc(:) = -1 - l_col_bc(:) = -1 - - do i = 0, na-1, nblk*np_cols - do j = 0, np_cols-1 - do n = 1, nblk - if (i+j*nblk+n <= MIN(nev,na)) then - p_col_bc(i+j*nblk+n) = j - l_col_bc(i+j*nblk+n) = i/np_cols + n - endif - enddo - enddo - enddo - - ! Recursively merge sub problems - - call merge_recursive(0, np_cols, wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - endif - - deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi: error when deallocating l_col "//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_tridi") -#endif - return - - contains - recursive subroutine merge_recursive(np_off, nprocs, wantDebug, success) - use precision - implicit none - - ! noff is always a multiple of nblk_ev - ! nlen-noff is always > nblk_ev - - integer(kind=ik) :: np_off, nprocs - integer(kind=ik) :: np1, np2, noff, nlen, nmid, n -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(mpi_status_size) -#endif - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - - if (nprocs<=1) then - ! Safety check only - if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs - success = .false. - return - endif - ! Split problem into 2 subproblems of size np1 / np2 - - np1 = nprocs/2 - np2 = nprocs-np1 - - if (np1 > 1) call merge_recursive(np_off, np1, wantDebug, success) - if (.not.(success)) return - if (np2 > 1) call merge_recursive(np_off+np1, np2, wantDebug, success) - if (.not.(success)) return - - noff = limits(np_off) - nmid = limits(np_off+np1) - noff - nlen = limits(np_off+nprocs) - noff - -#ifdef WITH_MPI - if (my_pcol==np_off) then - do n=np_off+np1,np_off+nprocs-1 -#ifdef DOUBLE_PRECISION_REAL - call mpi_send(d(noff+1), nmid, MPI_REAL8, n, 1, mpi_comm_cols, mpierr) -#else - call mpi_send(d(noff+1), nmid, MPI_REAL4, n, 1, mpi_comm_cols, mpierr) -#endif - enddo - endif -#endif /* WITH_MPI */ - - if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) - n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries - ndiv = ndiv*2 - enddo - - ! If there is only 1 processor row and not all eigenvectors are needed - ! and the matrix size is big enough, then use 2 subdivisions - ! so that merge_systems is called once and only the needed - ! eigenvectors are calculated for the final problem. - - if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 - - allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"solve_tridi_col: error when allocating limits "//errorMessage - stop - endif - - limits(0) = 0 - limits(ndiv) = na - - n = ndiv - do while(n>1) - n = n/2 ! n is always a power of 2 - do i=0,ndiv-1,2*n - ! We want to have even boundaries (for cache line alignments) - limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 - enddo - enddo - - ! Calculate the maximum size of a subproblem - - max_size = 0 - do i=1,ndiv - max_size = MAX(max_size,limits(i)-limits(i-1)) - enddo - - ! Subdivide matrix by subtracting rank 1 modifications - - do i=1,ndiv-1 - n = limits(i) - d(n) = d(n)-abs(e(n)) - d(n+1) = d(n+1)-abs(e(n)) - enddo - - if (np_rows==1) then - - ! For 1 processor row there may be 1 or 2 subdivisions - do n=0,ndiv-1 - noff = limits(n) ! Start of subproblem - nlen = limits(n+1)-noff ! Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1), & - q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) - if (.not.(success)) return - enddo - - else - - ! Solve sub problems in parallel with solve_tridi_single - ! There is at maximum 1 subproblem per processor - - allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage - stop - endif - - allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage - stop - endif - - qmat1 = 0 ! Make sure that all elements are defined - - if (my_prow < ndiv) then - - noff = limits(my_prow) ! Start of subproblem - nlen = limits(my_prow+1)-noff ! Size of subproblem - - call solve_tridi_single(nlen,d(noff+1),e(noff+1),qmat1, & - ubound(qmat1,dim=1), wantDebug, success) - - if (.not.(success)) return - endif - - ! Fill eigenvectors in qmat1 into global matrix q - - do np = 0, ndiv-1 - - noff = limits(np) - nlen = limits(np+1)-noff -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(d(noff+1), nlen, MPI_REAL8, np, mpi_comm_rows, mpierr) - qmat2 = qmat1 - call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL8, np, mpi_comm_rows, mpierr) -#else - - call MPI_Bcast(d(noff+1), nlen, MPI_REAL4, np, mpi_comm_rows, mpierr) - qmat2 = qmat1 - call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL4, np, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - qmat2 = qmat1 ! is this correct -#endif /* WITH_MPI */ - do i=1,nlen - call distribute_global_column(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) - enddo - - enddo - - deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage - stop - endif - - endif - - ! Allocate and set index arrays l_col and p_col - - allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_col: error when allocating l_col "//errorMessage - stop - endif - - do i=1,na - l_col(i) = i - p_col_i(i) = 0 - p_col_o(i) = 0 - enddo - - ! Merge subproblems - - n = 1 - do while(n 1e-14_rk) then - write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) - else - write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) - write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' - write(error_unit,'(a)') 'In this extent, this is completely harmless.' 
- write(error_unit,'(a)') 'Still, we keep this info message just in case.' - end if - allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_tridi_single: error when allocating qtmp "//errorMessage - stop - endif - - dtmp = d(i+1) - qtmp(1:nlen) = q(1:nlen,i+1) - do j=i,1,-1 - if (dtmp=npc_0+npc_n) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Determine number of "next" and "prev" column for ring sends - - if (my_pcol == npc_0+npc_n-1) then - np_next = npc_0 - else - np_next = my_pcol + 1 - endif - - if (my_pcol == npc_0) then - np_prev = npc_0+npc_n-1 - else - np_prev = my_pcol - 1 - endif - - call check_monotony(nm,d,'Input1',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - call check_monotony(na-nm,d(nm+1),'Input2',wantDebug, success) - if (.not.(success)) then -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - return - endif - ! Get global number of processors and my processor number. - ! Please note that my_proc does not need to match any real processor number, - ! it is just used for load balancing some loops. - - n_procs = np_rows*npc_n - my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major - - - ! Local limits of the rows of Q - - l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q - l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm - l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q - - l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm - l_rows = l_rqe-l_rqs+1 ! Total number of local rows - - - ! My number of local columns - - l_cols = COUNT(p_col(1:na)==my_pcol) - - ! Get max number of local columns - - max_local_cols = 0 - do np = npc_0, npc_0+npc_n-1 - max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) - enddo - - ! Calculations start here - - beta = abs(e) - sig = sign(1.0_rk,e) - - ! Calculate rank-1 modifier z - - z(:) = 0 - - if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then - ! nm is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) - enddo - endif - - if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then - ! nm+1 is local on my row - do i = 1, na - if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) - enddo - endif - - call global_gather(z, na) - - ! Normalize z so that norm(z) = 1. Since z is the concatenation of - ! two normalized vectors, norm2(z) = sqrt(2). - - z = z/sqrt(2.0_rk) - rho = 2.*beta - - ! Calculate index for merging both systems by ascending eigenvalues -#ifdef DOUBLE_PRECISION_REAL - call DLAMRG( nm, na-nm, d, 1, 1, idx ) -#else - call SLAMRG( nm, na-nm, d, 1, 1, idx ) -#endif - -! Calculate the allowable deflation tolerance - - zmax = maxval(abs(z)) - dmax = maxval(abs(d)) -#ifdef DOUBLE_PRECISION_REAL - EPS = DLAMCH( 'Epsilon' ) -#else - EPS = SLAMCH( 'Epsilon' ) -#endif - TOL = 8.*EPS*MAX(dmax,zmax) - - ! If the rank-1 modifier is small enough, no more needs to be done - ! except to reorganize D and Q - - IF ( RHO*zmax <= TOL ) THEN - - ! Rearrange eigenvalues - - tmp = d - do i=1,na - d(i) = tmp(idx(i)) - enddo - - ! Rearrange eigenvectors - - call resort_ev(idx, na) -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("merge_systems") -#endif - - return - ENDIF - - ! Merge and deflate system - - na1 = 0 - na2 = 0 - - ! COLTYP: - ! 1 : non-zero in the upper half only; - ! 2 : dense; - ! 3 : non-zero in the lower half only; - ! 
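
! ---------------------------------------------------------------------------
! Aside: an illustrative 2x2 check of the deflation test below. With (c,s)
! built from the two z components, the rotation that maps them to (tau,0)
! turns diag(d1,d) + rho*z*z**T into a matrix whose only remaining coupling
! is t*c*s, t = d1-d; if |t*c*s| <= tol it is dropped. Standalone sketch,
! same sign conventions as the code that follows.
program demo_deflation
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  real(kind=rk) :: d1, d, z1, z, rho, c, s, tau, t
  real(kind=rk) :: a(2,2), r(2,2), b(2,2)

  d1 = 1.0_rk; d = d1 + 1.0e-9_rk              ! two nearly equal eigenvalues
  z1 = 0.6_rk; z = 0.8_rk; rho = 2.0_rk

  a = reshape([d1 + rho*z1*z1, rho*z1*z, &
               rho*z1*z,       d + rho*z*z], [2,2])

  tau = hypot(z1, z)                           ! sqrt(a**2+b**2) without overflow
  c = z1/tau
  s = -z/tau
  t = d1 - d
  r = reshape([c, s, -s, c], [2,2])            ! column-major: [[c,-s],[s,c]]
  b = matmul(matmul(r, a), transpose(r))

  print *, 'rotated z          :', s*z1 + c*z       ! 0: z deflated to (tau,0)
  print *, 'remaining coupling :', b(1,2), '= t*c*s =', t*c*s
  print *, 'deflated eigenvalue:', b(2,2), '= d*c**2 + d1*s**2'
end program demo_deflation
! ---------------------------------------------------------------------------
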
-      !   4 : deflated.
-
-      coltyp(1:nm) = 1
-      coltyp(nm+1:na) = 3
-
-      do i=1,na
-
-        if (rho*abs(z(idx(i))) <= tol) then
-
-          ! Deflate due to small z component.
-
-          na2 = na2+1
-          d2(na2)   = d(idx(i))
-          idx2(na2) = idx(i)
-          coltyp(idx(i)) = 4
-
-        else if (na1>0) then
-
-          ! Check if eigenvalues are close enough to allow deflation.
-
-          S = Z(idx(i))
-          C = Z1(na1)
-
-          ! Find sqrt(a**2+b**2) without overflow or
-          ! destructive underflow.
-#ifdef DOUBLE_PRECISION_REAL
-          TAU = DLAPY2( C, S )
-#else
-          TAU = SLAPY2( C, S )
-#endif
-          T = D1(na1) - D(idx(i))
-          C = C / TAU
-          S = -S / TAU
-          IF ( ABS( T*C*S ) <= TOL ) THEN
-
-            ! Deflation is possible.
-
-            na2 = na2+1
-
-            Z1(na1) = TAU
-
-            d2new = D(idx(i))*C**2 + D1(na1)*S**2
-            d1new = D(idx(i))*S**2 + D1(na1)*C**2
-
-            ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0
-            ! This means that after the above transformation it must be
-            !    D1(na1) <= d1new <= D(idx(i))
-            !    D1(na1) <= d2new <= D(idx(i))
-            !
-            ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1))
-            ! so there is no problem with sorting here.
-            ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1)
-            ! which makes a check (and possibly a resort) necessary.
-            !
-            ! The above relations may not hold exactly due to numeric differences
-            ! so they have to be enforced in order not to get troubles with sorting.
-
-            if (d1new<D1(na1))   d1new = D1(na1)
-            if (d1new>D(idx(i))) d1new = D(idx(i))
-
-            if (d2new<D1(na1))   d2new = D1(na1)
-            if (d2new>D(idx(i))) d2new = D(idx(i))
-
-            D1(na1) = d1new
-
-            do j=na2-1,1,-1
-              if (d2new2) then
-
-        ! Solve secular equation
-
-        z(1:na1) = 1
-#ifdef WITH_OPENMP
-        z_p(1:na1,:) = 1
-#endif
-        dbase(1:na1) = 0
-        ddiff(1:na1) = 0
-
-        info = 0
-#ifdef WITH_OPENMP
-
-#ifdef HAVE_DETAILED_TIMINGS
-        call timer%start("OpenMP parallel")
-#endif
-
-!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
-        my_thread = omp_get_thread_num()
-!$OMP DO
-#endif
-        DO i = my_proc+1, na1, n_procs ! work distributed over all processors
-#ifdef DOUBLE_PRECISION_REAL
-          call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
-#else
-          call SLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
-#endif
-          if (info/=0) then
-            ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
-            ! use the more stable bisection algorithm in solve_secular_equation
-            ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
-            call solve_secular_equation(na1, i, d1, z1, delta, rho, s)
-          endif
-
-          ! Compute updated z
-
-#ifdef WITH_OPENMP
-          do j=1,na1
-            if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
-          enddo
-          z_p(i,my_thread) = z_p(i,my_thread)*delta(i)
-#else
-          do j=1,na1
-            if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) )
-          enddo
-          z(i) = z(i)*delta(i)
-#endif
-          ! store dbase/ddiff
-
-          if (i1) then
-
-            if (np_rem==npc_0) then
-              np_rem = npc_0+npc_n-1
-            else
-              np_rem = np_rem-1
-            endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-            call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, &
-                                      np_next, 1111, np_prev, 1111, &
-                                      mpi_comm_cols, mpi_status, mpierr)
-#else
-            call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL4, &
-                                      np_next, 1111, np_prev, 1111, &
-                                      mpi_comm_cols, mpi_status, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-          endif
-
-          ! Gather the parts in d1 and z which are fitting to qtmp1.
-          ! This also delivers nnzu/nnzl for proc np_rem
-
-          nnzu = 0
-          nnzl = 0
-          do i=1,na1
-            if (p_col(idx1(i))==np_rem) then
-              if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then
-                nnzu = nnzu+1
-                d1u(nnzu) = d1(i)
-                zu (nnzu) = z (i)
-              endif
-              if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then
-                nnzl = nnzl+1
-                d1l(nnzl) = d1(i)
-                zl (nnzl) = z (i)
-              endif
-            endif
-          enddo
-
-          ! Set the deflated eigenvectors in Q (coming from proc np_rem)
-
-          ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix
-          do i = 1, na
-            j = idx(i)
-            if (j>na1) then
-              if (p_col(idx2(j-na1))==np_rem) then
-                ndef = ndef+1
-                if (p_col_out(i)==my_pcol) &
-                  q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef)
-              endif
-            endif
-          enddo
-
-          do ns = 0, nqcols1-1, max_strip ! strip-mining loop
-
-            ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip
-
-            ! Get partial result from (output) Q
-
-            do i = 1, ncnt
-              qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns)))
-            enddo
-
-            ! Compute eigenvectors of the rank-1 modified matrix.
-            ! Parts for multiplying with upper half of Q:
-
-            do i = 1, ncnt
-              j = idx(idxq1(i+ns))
-              ! Calculate the j-th eigenvector of the deflated system
-              ! See above why we are doing it this way!
-              tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
-              call v_add_s(tmp,nnzu,ddiff(j))
-              ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
-            enddo
-
-            ! Multiply old Q with eigenvectors (upper half)
-
-            if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
-#ifdef DOUBLE_PRECISION_REAL
-              call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
-                         1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
-#else
-              call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
-                         1.0_rk, qtmp2(1,1), ubound(qtmp2,dim=1))
-#endif
-            ! Compute eigenvectors of the rank-1 modified matrix.
-            ! Parts for multiplying with lower half of Q:
-
-            do i = 1, ncnt
-              j = idx(idxq1(i+ns))
-              ! Calculate the j-th eigenvector of the deflated system
-              ! See above why we are doing it this way!
-              tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
-              call v_add_s(tmp,nnzl,ddiff(j))
-              ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
-            enddo
-
-            ! Multiply old Q with eigenvectors (lower half)
-
-            if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
-#ifdef DOUBLE_PRECISION_REAL
-              call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
-                         ubound(ev,dim=1), 1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
-#else
-              call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
-                         ubound(ev,dim=1), 1.0_rk, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
-#endif
-
-            ! Put partial result into (output) Q
-
-            do i = 1, ncnt
-              q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i)
-            enddo
-
-          enddo
-        enddo
-
-        deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
-        if (istat .ne. 0) then
-          print *,"merge_systems: error when deallocating ev "//errorMessage
-          stop
-        endif
-      endif
-
-#ifdef WITH_OPENMP
-      deallocate(z_p, stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *,"merge_systems: error when deallocating z_p "//errorMessage
-        stop
-      endif
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-      call timer%stop("merge_systems")
-#endif
-
-      return
-
-    contains
-      subroutine add_tmp(d1, dbase, ddiff, z, ev_scale_value, na1,i)
-        use precision
-        implicit none
-
-        integer(kind=ik), intent(in) :: na1, i
-
-        real(kind=rk), intent(in)    :: d1(:), dbase(:), ddiff(:), z(:)
-        real(kind=rk), intent(inout) :: ev_scale_value
-        real(kind=rk)                :: tmp(1:na1)
-
-        ! tmp(1:na1) = z(1:na1) / delta(1:na1,i)  ! original code
-        ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results
-
-        ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
-        !
in exactly this order, but we want to prevent compiler optimization - - tmp(1:na1) = d1(1:na1) -dbase(i) - call v_add_s(tmp(1:na1),na1,ddiff(i)) - - tmp(1:na1) = z(1:na1) / tmp(1:na1) - - ev_scale_value = 1.0_rk/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) - - end subroutine add_tmp - - subroutine resort_ev(idx_ev, nLength) - use precision - implicit none - - integer(kind=ik), intent(in) :: nLength - integer(kind=ik) :: idx_ev(nLength) - integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out - - real(kind=rk), allocatable :: qtmp(:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage - - if (l_rows==0) return ! My processor column has no work to do - - ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) - - l_cols_out = COUNT(p_col_out(1:na)==my_pcol) - allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"resort_ev: error when allocating qtmp "//errorMessage - stop - endif - - nc = 0 - - do i=1,na - - pc1 = p_col(idx_ev(i)) - lc1 = l_col(idx_ev(i)) - pc2 = p_col_out(i) - - if (pc2<0) cycle ! This column is not needed in output - - if (pc2==my_pcol) nc = nc+1 ! Counter for output columns - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! send and recieve column are local - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) - else -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, mod(i,4096), mpi_comm_cols, mpierr) -#else - call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, mod(i,4096), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - else if (pc2==my_pcol) then -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL8, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) -#else - call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL4, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) -#endif - -#else /* WITH_MPI */ - qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,nc) -#endif /* WITH_MPI */ - endif - enddo - - ! Insert qtmp into (output) q - - nc = 0 - - do i=1,na - - pc2 = p_col_out(i) - lc2 = l_col_out(i) - - if (pc2==my_pcol) then - nc = nc+1 - q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) - endif - enddo - - deallocate(qtmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"resort_ev: error when deallocating qtmp "//errorMessage - stop - endif - end subroutine resort_ev - - subroutine transform_columns(col1, col2) - use precision - implicit none - - integer(kind=ik) :: col1, col2 - integer(kind=ik) :: pc1, pc2, lc1, lc2 - - if (l_rows==0) return ! My processor column has no work to do - - pc1 = p_col(col1) - lc1 = l_col(col1) - pc2 = p_col(col2) - lc2 = l_col(col2) - - if (pc1==my_pcol) then - if (pc2==my_pcol) then - ! 
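
! ---------------------------------------------------------------------------
! Aside on add_tmp() above: the eigenvalue lambda is kept as a nearby pole
! dbase(i) minus a small, accurately known correction ddiff(i). Forming
! lambda explicitly and then subtracting can round it back onto the pole and
! lose the correction entirely; (d1(:)-dbase(i))+ddiff(i) keeps it. Minimal
! standalone illustration:
program demo_shifted_difference
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  real(kind=rk) :: d1, dbase, ddiff, lam

  d1    = 1.0_rk           ! a pole of the secular equation
  dbase = d1               ! the eigenvalue is stored relative to this pole
  ddiff = 1.0e-18_rk       ! dbase - lambda, accurate to full precision

  lam = dbase - ddiff      ! explicit lambda rounds to dbase exactly
  print *, 'naive   d1 - lambda          :', d1 - lam             ! 0.0
  print *, 'shifted (d1 - dbase) + ddiff :', (d1 - dbase) + ddiff ! 1.0e-18
end program demo_shifted_difference
! ---------------------------------------------------------------------------
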
-          ! both columns are local
-          tmp(1:l_rows)      = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1)
-          q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
-          q(l_rqs:l_rqe,lc1) = tmp(1:l_rows)
-        else
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-          call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, 1, &
-                            tmp, l_rows, MPI_REAL8, pc2, 1, &
-                            mpi_comm_cols, mpi_status, mpierr)
-#else
-          call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, 1, &
-                            tmp, l_rows, MPI_REAL4, pc2, 1, &
-                            mpi_comm_cols, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
-          tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)
-#endif /* WITH_MPI */
-          q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1)
-        endif
-      else if (pc2==my_pcol) then
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-        call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL8, pc1, 1, &
-                          tmp, l_rows, MPI_REAL8, pc1, 1, &
-                          mpi_comm_cols, mpi_status, mpierr)
-#else
-        call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL4, pc1, 1, &
-                          tmp, l_rows, MPI_REAL4, pc1, 1, &
-                          mpi_comm_cols, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
-        tmp(1:l_rows) = q(l_rqs:l_rqe,lc2)
-#endif /* WITH_MPI */
-
-        q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
-      endif
-
-    end subroutine transform_columns
-
-    subroutine global_gather(z, n)
-
-      ! This routine sums up z over all processors.
-      ! It should only be used for gathering distributed results,
-      ! i.e. z(i) should be nonzero on exactly 1 processor column,
-      ! otherwise the results may be numerically different on different columns
-      use precision
-      implicit none
-
-      integer(kind=ik) :: n
-      real(kind=rk)    :: z(n)
-      real(kind=rk)    :: tmp(n)
-
-      if (npc_n==1 .and. np_rows==1) return ! nothing to do
-
-      ! Do an mpi_allreduce over processor rows
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-      call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
-#else
-      call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
-#endif
-
-#else /* WITH_MPI */
-      tmp = z
-#endif /* WITH_MPI */
-      ! If only 1 processor column, we are done
-      if (npc_n==1) then
-        z(:) = tmp(:)
-        return
-      endif
-
-      ! If all processor columns are involved, we can use mpi_allreduce
-      if (npc_n==np_cols) then
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-        call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
-#else
-        call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
-#endif
-
-#else /* WITH_MPI */
-        tmp = z
-#endif /* WITH_MPI */
-
-        return
-      endif
-
-      ! Do a ring send over processor columns
-      z(:) = 0
-      do np = 1, npc_n
-        z(:) = z(:) + tmp(:)
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
-        call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, &
-                                  mpi_comm_cols, mpi_status, mpierr)
-#else
-        call MPI_Sendrecv_replace(z, n, MPI_REAL4, np_next, 1111, np_prev, 1111, &
-                                  mpi_comm_cols, mpi_status, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-      enddo
-
-    end subroutine global_gather
-
-    subroutine global_product(z, n)
-
-      ! This routine calculates the global product of z.
-      use precision
-      implicit none
-
-      integer(kind=ik) :: n
-      real(kind=rk)    :: z(n)
-
-      real(kind=rk)    :: tmp(n)
-
-      if (npc_n==1 .and. np_rows==1) return ! nothing to do
-
-      !
Do an mpi_allreduce over processor rows -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_PROD, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmp = z -#endif /* WITH_MPI */ - ! If only 1 processor column, we are done - if (npc_n==1) then - z(:) = tmp(:) - return - endif - - ! If all processor columns are involved, we can use mpi_allreduce - if (npc_n==np_cols) then -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr) -#else - call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_PROD, mpi_comm_cols, mpierr) -#endif - -#else /* WITH_MPI */ - z = tmp -#endif /* WITH_MPI */ - return - endif - - ! We send all vectors to the first proc, do the product there - ! and redistribute the result. - - if (my_pcol == npc_0) then - z(1:n) = tmp(1:n) - do np = npc_0+1, npc_0+npc_n-1 -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_recv(tmp, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpi_status, mpierr) -#else - call mpi_recv(tmp, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpi_status, mpierr) -#endif - -#else /* WITH_MPI */ - tmp(1:n) = z(1:n) -#endif /* WITH_MPI */ - z(1:n) = z(1:n)*tmp(1:n) - enddo - do np = npc_0+1, npc_0+npc_n-1 -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_send(z, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpierr) -#else - call mpi_send(z, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - enddo - else -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_send(tmp, n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpierr) - call mpi_recv(z ,n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) -#else - call mpi_send(tmp, n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpierr) - call mpi_recv(z ,n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) -#endif - -#else /* WITH_MPI */ - z(1:n) = tmp(1:n) -#endif /* WITH_MPI */ - - endif - - end subroutine global_product - - subroutine check_monotony(n,d,text, wantDebug, success) - - ! This is a test routine for checking if the eigenvalues are monotonically increasing. - ! It is for debug purposes only, an error should never be triggered! - use precision - implicit none - - integer(kind=ik) :: n - real(kind=rk) :: d(n) - character*(*) :: text - - integer(kind=ik) :: i - logical, intent(in) :: wantDebug - logical, intent(out) :: success - - success = .true. - do i=1,n-1 - if (d(i+1) 0 and d(i+1) > d(i) - ! - ! but this routine will not terminate with error if these are not satisfied - ! (it will normally converge to a pole in this case). - ! - ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases - ! N=1 and N=2 which is not compatible with DLAED4. - ! Thus this routine shouldn't be used for these cases as a simple replacement - ! of DLAED4. - ! - ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): - ! - ! - ! N (input) INTEGER - ! The length of all arrays. - ! - ! I (input) INTEGER - ! The index of the eigenvalue to be computed. 1 <= I <= N. - ! - ! D (input) DOUBLE PRECISION array, dimension (N) - ! The original eigenvalues. It is assumed that they are in - ! order, D(I) < D(J) for I < J. - ! - ! Z (input) DOUBLE PRECISION array, dimension (N) - ! The components of the updating vector. - ! - ! DELTA (output) DOUBLE PRECISION array, dimension (N) - ! DELTA contains (D(j) - lambda_I) in its j-th component. - ! 
See remark above about DLAED4 compatibility! - ! - ! RHO (input) DOUBLE PRECISION - ! The scalar in the symmetric updating formula. - ! - ! DLAM (output) DOUBLE PRECISION - ! The computed lambda_I, the I-th updated eigenvalue. - !------------------------------------------------------------------------------- - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: n, i - real(kind=rk) :: d(n), z(n), delta(n), rho, dlam - - integer(kind=ik) :: iter - real(kind=rk) :: a, b, x, y, dshift - - ! In order to obtain sufficient numerical accuracy we have to shift the problem - ! either by d(i) or d(i+1), whichever is closer to the solution - - ! Upper and lower bound of the shifted solution interval are a and b - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_secular_equation") -#endif - if (i==n) then - - ! Special case: Last eigenvalue - ! We shift always by d(n), lower bound is d(n), - ! upper bound is determined by a guess: - - dshift = d(n) - delta(:) = d(:) - dshift - - a = 0. ! delta(n) - b = rho*SUM(z(:)**2) + 1._rk ! rho*SUM(z(:)**2) is the lower bound for the guess - - else - - ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) - ! We check the sign of the function in the midpoint of the interval - ! in order to determine if eigenvalue is more close to d(i) or d(i+1) - - x = 0.5_rk*(d(i)+d(i+1)) - y = 1._rk + rho*SUM(z(:)**2/(d(:)-x)) - - if (y>0) then - ! solution is next to d(i) - dshift = d(i) - else - ! solution is next to d(i+1) - dshift = d(i+1) - endif - - delta(:) = d(:) - dshift - a = delta(i) - b = delta(i+1) - - endif - - ! Bisection: - - do iter=1,200 - - ! Interval subdivision - - x = 0.5_rk*(a+b) - - if (x==a .or. x==b) exit ! No further interval subdivisions possible -#ifdef DOUBLE_PRECISION_REAL - if (abs(x) < 1.e-200_rk) exit ! x next to pole -#else - if (abs(x) < 1.e-20_rk) exit ! x next to pole -#endif - ! evaluate value at x - - y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) - - if (y==0) then - ! found exact solution - exit - elseif (y>0) then - b = x - else - a = x - endif - - enddo - - ! Solution: - - dlam = x + dshift - delta(:) = delta(:) - x -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_secular_equation") -#endif - - end subroutine solve_secular_equation - - !------------------------------------------------------------------------------- - - integer function local_index(idx, my_proc, num_procs, nblk, iflag) - - !------------------------------------------------------------------------------- - ! local_index: returns the local index for a given global index - ! If the global index has no local index on the - ! processor my_proc behaviour is defined by iflag - ! - ! Parameters - ! - ! idx Global index - ! - ! my_proc Processor row/column for which to calculate the local index - ! - ! num_procs Total number of processors along row/column - ! - ! nblk Blocksize - ! - ! iflag Controls the behaviour if idx is not on local processor - ! iflag< 0 : Return last local index before that row/col - ! iflag==0 : Return 0 - ! iflag> 0 : Return next local index after that row/col - !------------------------------------------------------------------------------- - use precision - implicit none - - integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag - - integer(kind=ik) :: iblk - - iblk = (idx-1)/nblk ! global block number, 0 based - - if (mod(iblk,num_procs) == my_proc) then - - ! 
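
! ---------------------------------------------------------------------------
! Aside: a standalone sketch (illustrative values, not ELPA API) of the same
! shift-and-bisect scheme as solve_secular_equation above, finding the root
! of 1 + rho*sum(z(:)**2/(d(:)-lambda)) between the poles d(i) and d(i+1):
program demo_secular_bisection
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  integer, parameter :: n = 3, i = 1           ! look for lambda in (d(1), d(2))
  real(kind=rk) :: d(n), z(n), delta(n), rho, a, b, x, y, dshift
  integer :: iter

  d = [1.0_rk, 2.0_rk, 4.0_rk]
  z = [0.5_rk, 0.5_rk, 0.5_rk]
  rho = 1.0_rk

  x = 0.5_rk*(d(i)+d(i+1))                     ! shift by the nearer pole,
  y = 1.0_rk + rho*sum(z**2/(d-x))             ! decided by the midpoint sign
  dshift = merge(d(i), d(i+1), y > 0)
  delta = d - dshift
  a = delta(i)
  b = delta(i+1)

  do iter = 1, 200
    x = 0.5_rk*(a+b)
    if (x == a .or. x == b) exit               ! no further subdivision possible
    y = 1.0_rk + rho*sum(z**2/(delta-x))
    if (y == 0) exit                           ! found exact solution
    if (y > 0) then
      b = x
    else
      a = x
    end if
  end do
  print *, 'lambda =', x + dshift
end program demo_secular_bisection
! ---------------------------------------------------------------------------
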
block is local, always return local row/col number - - local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1 - - else - - ! non local block - - if (iflag == 0) then - - local_index = 0 - - else - - local_index = (iblk/num_procs)*nblk - - if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk - - if (iflag>0) local_index = local_index + 1 - endif - endif - - end function local_index - - subroutine cholesky_real(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - - !------------------------------------------------------------------------------- - ! cholesky_real: Cholesky factorization of a real symmetric matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols - real(kind=rk) :: a(lda,matrixCols) - ! was - ! real a(lda, *) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmp2 "//errorMessage - stop - endif - - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_real: error when allocating tmatr "//errorMessage - stop - endif - - allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 
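
! ---------------------------------------------------------------------------
! Aside: standalone illustration (not ELPA API) of the tile sizing used in
! cholesky_real above: the global tile size must be a multiple of
! nblk*lcm(np_rows,np_cols) so tiles align with both process grids, and it is
! then rounded up until local tiles are at least 128 wide.
program demo_tile_size
  implicit none
  integer :: np_rows, np_cols, nblk, tile_size

  np_rows = 4; np_cols = 6; nblk = 16

  tile_size = nblk*lcm(np_rows, np_cols)                            ! minimum global tile size
  tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size  ! local tiles at least 128 wide
  print *, 'tile_size       =', tile_size                           ! 768
  print *, 'local tile dims =', tile_size/np_rows, 'x', tile_size/np_cols ! 192 x 128

contains

  integer function lcm(a, b)
    ! simple search over multiples of a, like least_common_multiple in this file
    integer, intent(in) :: a, b
    do lcm = a, a*b, a
      if (mod(lcm, b) == 0) exit
    end do
  end function lcm
end program demo_tile_size
! ---------------------------------------------------------------------------
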
0) then - print *,"cholesky_real: error when allocating tmatc "//errorMessage - stop - endif - - tmatr = 0 - tmatc = 0 - - do n = 1, na, nblk - - ! Calculate first local row and column of the still remaining matrix - ! on the local processor - - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) - - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) - - if (n+nblk > na) then - - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block - - if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then -#ifdef DOUBLE_PRECISION_REAL - call dpotrf('U', na-n+1, a(l_row1,l_col1), lda, info) -#else - call spotrf('U', na-n+1, a(l_row1,l_col1), lda, info) -#endif - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. - return - endif - - endif - - exit ! Loop - - endif - - if (my_prow==prow(n, nblk, np_rows)) then - - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block -#ifdef DOUBLE_PRECISION_REAL - call dpotrf('U', nblk, a(l_row1,l_col1), lda, info) -#else - call spotrf('U', nblk, a(l_row1,l_col1), lda, info) -#endif - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" - success = .false. - return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo - - if (l_cols-l_colx+1>0) & -#ifdef DOUBLE_PRECISION_REAL - call dtrsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) -#else - call strsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) -#endif - endif - - do i=1,nblk - - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) -#ifdef WITH_MPI - if (l_cols-l_colx+1>0) & -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) -#else - call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) -#endif - -#endif /* WITH_MPI */ - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_real (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) & -#ifdef DOUBLE_PRECISION_REAL - call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) -#else - call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) -#endif - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! 
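
! ---------------------------------------------------------------------------
! Aside: the factor block above travels in packed form; only the
! nblk*(nblk+1)/2 entries of the upper triangle are broadcast. Standalone
! pack/unpack round trip mirroring the tmp1/tmp2 loops (MPI_Bcast elided):
program demo_packed_triangle
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  integer, parameter :: nblk = 4
  real(kind=rk) :: a(nblk,nblk), tmp1(nblk*(nblk+1)/2), tmp2(nblk,nblk)
  integer :: i, nc

  call random_number(a)

  nc = 0                              ! pack the columns of the upper triangle
  do i = 1, nblk
    tmp1(nc+1:nc+i) = a(1:i,i)
    nc = nc + i
  end do
                                      ! ... MPI_Bcast(tmp1, nblk*(nblk+1)/2, ...) ...
  tmp2 = 0
  nc = 0                              ! unpack on the receiving side
  do i = 1, nblk
    tmp2(1:i,i) = tmp1(nc+1:nc+i)
    nc = nc + i
  end do

  print *, 'upper triangles match:', all([(all(a(1:i,i) == tmp2(1:i,i)), i=1,nblk)])
end program demo_packed_triangle
! ---------------------------------------------------------------------------
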
tmp2 has the lower left triangle 0 - - endif - - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif - - do i=1,nb -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - enddo - endif -#ifdef WITH_MPI - if (l_cols-l_col1+1>0) & -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) -#else - call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) -#endif - -#endif /* WITH_MPI */ - if (l_row1>1 .and. l_cols-l_col1+1>0) & -#ifdef DOUBLE_PRECISION_REAL - call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk, & - tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & - 1.0_rk, a(1,l_col1), lda) -#else - call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk, & - tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & - 1.0_rk, a(1,l_col1), lda) +#ifdef WANT_SINGLE_PRECISION_REAL + public :: tridiag_real_single ! Transform real single-precision symmetric matrix to tridiagonal form + public :: trans_ev_real_single ! Transform real single-precision eigenvectors of a tridiagonal matrix back + public :: mult_at_b_real_single ! Multiply real single-precision matrices A**T * B #endif - enddo - - deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"invert_trm_real: error when deallocating tmp1 "//errorMessage - stop - endif - - end subroutine invert_trm_real - - subroutine cholesky_complex(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) - !------------------------------------------------------------------------------- - ! cholesky_complex: Cholesky factorization of a complex hermitian matrix - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matriCols) Distributed matrix which should be factorized. - ! Distribution is like in Scalapack. - ! Only upper triangle is needs to be set. - ! On return, the upper triangle contains the Cholesky factor - ! and the lower triangle is set to 0. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none + public :: tridiag_complex_double ! Transform complex hermitian matrix to tridiagonal form + public :: tridiag_complex + public :: trans_ev_complex_double ! Transform eigenvectors of a tridiagonal matrix back + public :: trans_ev_complex + public :: mult_ah_b_complex_double ! 
Multiply complex matrices A**H * B + public :: mult_ah_b_complex - integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*) -#else - complex(kind=ck) :: a(lda,matrixCols) -#endif - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx - integer(kind=ik) :: n, nc, i, info - integer(kind=ik) :: lcs, lce, lrs, lre - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + interface tridiag_complex + module procedure tridiag_complex_double + end interface - complex(kind=ck), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:) + interface trans_ev_complex + module procedure trans_ev_complex_double + end interface - logical, intent(in) :: wantDebug - logical, intent(out) :: success - integer(kind=ik) :: istat - character(200) :: errorMessage + interface mult_ah_b_complex + module procedure mult_ah_b_complex_double + end interface -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("cholesky_complex") +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: tridiag_complex_single ! Transform complex single-precision hermitian matrix to tridiagonal form + public :: trans_ev_complex_single ! Transform complex single-precision eigenvectors of a tridiagonal matrix back + public :: mult_ah_b_complex_single ! Multiply complex single-precision matrices A**H * B #endif - success = .true. - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a - - allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmp1 "//errorMessage - stop - endif - - allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmp2 "//errorMessage - stop - endif - - tmp1 = 0 - tmp2 = 0 - - allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmatr "//errorMessage - stop - endif - - allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"cholesky_complex: error when allocating tmatc "//errorMessage - stop - endif - tmatr = 0 - tmatc = 0 + public :: solve_tridi_double ! Solve tridiagonal eigensystem with divide and conquer method + public :: solve_tridi - do n = 1, na, nblk + public :: cholesky_real_double ! Cholesky factorization of a real matrix + public :: cholesky_real - ! Calculate first local row and column of the still remaining matrix - ! on the local processor + public :: invert_trm_real_double ! 
Invert real triangular matrix + public :: invert_trm_real - l_row1 = local_index(n, my_prow, np_rows, nblk, +1) - l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) + interface solve_tridi + module procedure solve_tridi_double + end interface - l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) - l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) + interface cholesky_real + module procedure cholesky_real_double + end interface - if (n+nblk > na) then + interface invert_trm_real + module procedure invert_trm_real_double + end interface - ! This is the last step, just do a Cholesky-Factorization - ! of the remaining block +#ifdef WANT_SINGLE_PRECISION_REAL + public :: solve_tridi_single ! Solve tridiagonal real single-precision eigensystem with divide and conquer method - if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then -#ifdef DOUBLE_PRECISION_COMPLEX - call zpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) -#else - call cpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) + public :: cholesky_real_single ! Cholesky factorization of a real single-precision matrix + public :: invert_trm_real_single ! Invert real single-precision triangular matrix #endif - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. - return - endif - endif + public :: cholesky_complex_double ! Cholesky factorization of a complex single-precision matrix + public :: cholesky_complex + public :: invert_trm_complex_double ! Invert complex single-precision triangular matrix + public :: invert_trm_complex - exit ! Loop - endif + interface cholesky_complex + module procedure cholesky_complex_double + end interface - if (my_prow==prow(n, nblk, np_rows)) then + interface invert_trm_complex + module procedure invert_trm_complex_double + end interface - if (my_pcol==pcol(n, nblk, np_cols)) then - - ! The process owning the upper left remaining block does the - ! Cholesky-Factorization of this block -#ifdef DOUBLE_PRECISION_COMPLEX - call zpotrf('U', nblk, a(l_row1,l_col1),lda, info) -#else - call cpotrf('U', nblk, a(l_row1,l_col1),lda, info) +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: cholesky_complex_single ! Cholesky factorization of a complex matrix + public :: invert_trm_complex_single ! Invert complex triangular matrix #endif - if (info/=0) then - if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" - success = .false. - return - endif - - nc = 0 - do i=1,nblk - tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) - nc = nc+i - enddo - endif -#ifdef WITH_MPI -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#endif + public :: local_index ! Get local index of a block cyclic distributed matrix + public :: least_common_multiple ! 
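
! ---------------------------------------------------------------------------
! Aside: the naming pattern introduced here keeps the old unsuffixed entry
! points alive as generic interfaces over the new _double procedures, so
! existing callers compile unchanged. Minimal standalone sketch of the idiom
! ("demo_" names are illustrative):
module demo_precision_interfaces
  implicit none
  public :: solve_demo              ! old generic name, still callable
  public :: solve_demo_double       ! new explicit-precision name

  interface solve_demo
    module procedure solve_demo_double
  end interface

contains

  subroutine solve_demo_double(x)
    real(kind=kind(1.0d0)), intent(inout) :: x
    x = 2.0d0*x
  end subroutine solve_demo_double
end module demo_precision_interfaces

program demo_use
  use demo_precision_interfaces
  implicit none
  real(kind=kind(1.0d0)) :: v = 1.5d0
  call solve_demo(v)                ! resolves to solve_demo_double
  print *, v
end program demo_use
! ---------------------------------------------------------------------------
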
Get least common multiple -#endif /* WITH_MPI */ - nc = 0 - do i=1,nblk - tmp2(1:i,i) = tmp1(nc+1:nc+i) - nc = nc+i - enddo + public :: hh_transform_real_double + public :: hh_transform_real + public :: elpa_reduce_add_vectors_real_double + public :: elpa_reduce_add_vectors_real + public :: elpa_transpose_vectors_real_double + public :: elpa_transpose_vectors_real - if (l_cols-l_colx+1>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call ztrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), & - a(l_row1,l_colx), lda) -#else - call ctrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), & - a(l_row1,l_colx), lda) -#endif - endif + interface hh_transform_real + module procedure hh_transform_real_double + end interface - do i=1,nblk + interface elpa_reduce_add_vectors_real + module procedure elpa_reduce_add_vectors_real_double + end interface - if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) -#ifdef WITH_MPI - if (l_cols-l_colx+1>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & - mpi_comm_rows, mpierr) -#else - call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_COMPLEX, prow(n, nblk, np_rows), & - mpi_comm_rows, mpierr) -#endif + interface elpa_transpose_vectors_real + module procedure elpa_transpose_vectors_real_double + end interface -#endif /* WITH_MPI */ - enddo - ! this has to be checked since it was changed substantially when doing type safe - call elpa_transpose_vectors_complex (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & - tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & - n, na, nblk, nblk) - do i=0,(na-1)/tile_size - lcs = max(l_colx,i*l_cols_tile+1) - lce = min(l_cols,(i+1)*l_cols_tile) - lrs = l_rowx - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call ZTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) -#else - call CTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk,0.0_rk), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) #endif - if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) - if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 - - endif - if (l_row1>1) then - if (my_pcol==pcol(n, nblk, np_cols)) then - tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) - a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 - endif +! 
double precision +#define DOUBLE_PRECISION_COMPLEX 1 - do i=1,nb -#ifdef WITH_MPI +#define DATATYPE COMPLEX(kind=ck8) +#define BYTESIZE 16 +#define COMPLEXCASE 1 +#include "elpa_transpose_vectors.X90" +#include "elpa_reduce_add_vectors.X90" +#undef DATATYPE +#undef BYTESIZE +#undef COMPLEXCASE +#undef DOUBLE_PRECISION_COMPLEX -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) -#endif +#ifdef WANT_SINGLE_PRECISION_COMPLEX -#endif /* WITH_MPI */ - enddo - endif -#ifdef WITH_MPI - if (l_cols-l_col1+1>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & - mpi_comm_rows, mpierr) -#else - call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_COMPLEX, prow(n, nblk, np_rows), & - mpi_comm_rows, mpierr) -#endif +#undef DOUBLE_PRECISION_COMPLEX +#undef DOUBLE_PRECISION_REAL +#define DATATYPE COMPLEX(kind=ck4) +#define COMPLEXCASE 1 +#include "elpa_transpose_vectors.X90" +#include "elpa_reduce_add_vectors.X90" +#undef DATATYPE +#undef BYTESIZE +#undef COMPLEXCASE -#endif /* WITH_MPI */ - if (l_row1>1 .and. l_cols-l_col1+1>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk,0.0_rk), & - tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & - (1.0_rk,0.0_rk), a(1,l_col1), lda) -#else - call CGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk,0.0_rk), & - tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & - (1.0_rk,0.0_rk), a(1,l_col1), lda) -#endif - enddo +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ - deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"invert_trm_complex: error when deallocating tmp1 "//errorMessage - stop - endif - end subroutine invert_trm_complex +! real double precision +#define DOUBLE_PRECISION_REAL 1 +#define REAL_DATATYPE rk8 - integer function least_common_multiple(a, b) +#include "elpa1_compute_real_template.X90" - ! Returns the least common multiple of a and b - ! There may be more efficient ways to do this, we use the most simple approach - use precision - implicit none - integer(kind=ik), intent(in) :: a, b +#undef DOUBLE_PRECISION_REAL +#undef REAL_DATATYPE - do least_common_multiple = a, a*(b-1), a - if(mod(least_common_multiple,b)==0) exit - enddo - ! if the loop is left regularly, least_common_multiple = a*b +! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) - end function +#undef DOUBLE_PRECISION_REAL +#define REAL_DATATYPE rk4 - subroutine hh_transform_real(alpha, xnorm_sq, xf, tau) - ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. - use precision - implicit none - real(kind=rk), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - real(kind=rk), intent(out) :: xf, tau +#include "elpa1_compute_real_template.X90" - real(kind=rk) :: BETA +#undef DOUBLE_PRECISION_REAL +#undef REAL_DATATYPE - if ( XNORM_SQ==0. ) then +#endif /* WANT_SINGLE_PRECISION_REAL */ - if ( ALPHA>=0. ) then - TAU = 0. - else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. +! 
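
! ---------------------------------------------------------------------------
! Aside: a minimal sketch of the .X90 template mechanism used here (file and
! routine names are illustrative, not ELPA's). The template body is
! #include'd once per precision with the kind macros redefined in between;
! it assumes a preprocessing compiler, e.g. gfortran -cpp.
!
! --- file demo_template.X90 (never compiled on its own) ---
! #ifdef DOUBLE_PRECISION_REAL
!       subroutine demo_scale_double(x, s)
! #else
!       subroutine demo_scale_single(x, s)
! #endif
!         real(kind=REAL_DATATYPE), intent(inout) :: x
!         real(kind=REAL_DATATYPE), intent(in)    :: s
!         x = x*s
! #ifdef DOUBLE_PRECISION_REAL
!       end subroutine demo_scale_double
! #else
!       end subroutine demo_scale_single
! #endif
!
! --- file demo_templates.F90 ---
!       module demo_templates
!         implicit none
!         integer, parameter :: rk8 = selected_real_kind(15)
!         integer, parameter :: rk4 = selected_real_kind(6)
!       contains
! #define DOUBLE_PRECISION_REAL 1
! #define REAL_DATATYPE rk8
! #include "demo_template.X90"
! #undef DOUBLE_PRECISION_REAL
! #undef REAL_DATATYPE
! #define REAL_DATATYPE rk4
! #include "demo_template.X90"
! #undef REAL_DATATYPE
!       end module demo_templates
! ---------------------------------------------------------------------------
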
complex double precision +#define DOUBLE_PRECISION_COMPLEX 1 +#define REAL_DATATYPE rk8 +#define COMPLEX_DATATYPE ck8 +#include "elpa1_compute_complex_template.X90" - else +#undef DOUBLE_PRECISION_COMPLEX +#undef REAL_DATATYPE +#undef COMPLEX_DATATYPE - BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHA = XNORM_SQ / ALPHA - TAU = ALPHA / BETA - ALPHA = -ALPHA - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - end subroutine +! complex single precision +#if defined(WANT_SINGLE_PRECISION_COMPLEX) - subroutine hh_transform_complex(alpha, xnorm_sq, xf, tau) +#undef DOUBLE_PRECISION_COMPLEX +#define REAL_DATATYPE rk4 +#define COMPLEX_DATATYPE ck4 - ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) - ! and returns the factor xf by which x has to be scaled. - ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 - ! since this would be expensive for the parallel implementation. - use precision - use precision - implicit none - complex(kind=ck), intent(inout) :: alpha - real(kind=rk), intent(in) :: xnorm_sq - complex(kind=ck), intent(out) :: xf, tau +#include "elpa1_compute_complex_template.X90" - real(kind=rk) :: ALPHR, ALPHI, BETA +#undef DOUBLE_PRECISION_COMPLEX +#undef COMPLEX_DATATYPE +#undef REAL_DATATYPE - ALPHR = real( ALPHA, kind=rk ) -#ifdef DOUBLE_PRECISION_COMPLEX - ALPHI = DIMAG( ALPHA ) -#else - ALPHI = AIMAG( ALPHA ) -#endif - if ( XNORM_SQ==0. .AND. ALPHI==0. ) then +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ - if ( ALPHR>=0. ) then - TAU = 0. - else - TAU = 2. - ALPHA = -ALPHA - endif - XF = 0. - else - BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) - ALPHA = ALPHA + BETA - IF ( BETA<0 ) THEN - BETA = -BETA - TAU = -ALPHA / BETA - ELSE - ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk)) - ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk ) -#ifdef DOUBLE_PRECISION_COMPLEX - TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) - ALPHA = DCMPLX( -ALPHR, ALPHI ) -#else - TAU = CMPLX( ALPHR/BETA, -ALPHI/BETA ) - ALPHA = CMPLX( -ALPHR, ALPHI ) -#endif - END IF - XF = 1./ALPHA - ALPHA = BETA - endif - end subroutine end module ELPA1_compute diff --git a/src/elpa1_compute_complex_template.X90 b/src/elpa1_compute_complex_template.X90 new file mode 100644 index 0000000000000000000000000000000000000000..c93fc15118c573154a8983d628682f3fab7a99fe --- /dev/null +++ b/src/elpa1_compute_complex_template.X90 @@ -0,0 +1,1810 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! 
GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) +#else + subroutine tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) +#endif + !------------------------------------------------------------------------------- + ! tridiag_complex: Reduces a distributed hermitian matrix to tridiagonal form + ! (like Scalapack Routine PZHETRD) + ! + ! Parameters + ! + ! na Order of matrix + ! + ! a(lda,matrixCols) Distributed matrix which should be reduced. + ! Distribution is like in Scalapack. + ! Opposed to PZHETRD, a(:,:) must be set completely (upper and lower half) + ! a(:,:) is overwritten on exit with the Householder vectors + ! + ! lda Leading dimension of a + ! matrixCols local columns of matrix a + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! + ! d(na) Diagonal elements (returned), identical on all processors + ! + ! e(na) Off-Diagonal elements (returned), identical on all processors + ! + ! tau(na) Factors for the Householder vectors (returned), needed for back transformation + ! 
+ !------------------------------------------------------------------------------- +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + implicit none + + integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + complex(kind=COMPLEX_DATATYPE) :: tau(na) +#ifdef DESPERATELY_WANT_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE) :: a(lda,*) +#else + complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols) +#endif + real(kind=REAL_DATATYPE) :: d(na), e(na) + + integer(kind=ik), parameter :: max_stored_rows = 32 +#ifdef DOUBLE_PRECISION_COMPLEX + complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8) +#else + complex(kind=ck4), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4) +#endif + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols + integer(kind=ik) :: l_cols, l_rows, nstor + integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre + integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter + integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads +#endif + + real(kind=REAL_DATATYPE) :: vnorm2 + complex(kind=COMPLEX_DATATYPE) :: vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf + + complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) +#ifdef WITH_OPENMP + complex(kind=COMPLEX_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:) +#endif + real(kind=REAL_DATATYPE), allocatable :: tmpr(:) + integer(kind=ik) :: istat + character(200) :: errorMessage + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%start("tridiag_complex_double") +#else + call timer%start("tridiag_complex_single") +#endif +#endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide + + l_rows_tile = tile_size/np_rows ! local rows of a tile + l_cols_tile = tile_size/np_cols ! local cols of a tile + + + totalblocks = (na-1)/nblk + 1 + max_blocks_row = (totalblocks-1)/np_rows + 1 + max_blocks_col = (totalblocks-1)/np_cols + 1 + + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk + + allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating tmp "//errorMessage + stop + endif + + allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating vr "//errorMessage + stop + endif + + allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating ur "//errorMessage + stop + endif + + allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating vc "//errorMessage + stop + endif + + allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_complex: error when allocating uc "//errorMessage + stop + endif + +#ifdef WITH_OPENMP + max_threads = omp_get_max_threads() + + allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating ur_p "//errorMessage + stop + endif + + allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating uc_p "//errorMessage + stop + endif +#endif + + tmp = 0 + vr = 0 + ur = 0 + vc = 0 + uc = 0 + + allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating vur "//errorMessage + stop + endif + + allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when allocating uvc "//errorMessage + stop + endif + + d(:) = 0 + e(:) = 0 + tau(:) = 0 + + nstor = 0 + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a + if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) + + do istep=na,3,-1 + + ! Calculate number of local rows and columns of the still remaining matrix + ! on the local processor + + l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) + l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) + + ! Calculate vector for Householder transformation on all procs + ! owning column istep + + if (my_pcol==pcol(istep, nblk, np_cols)) then + + ! Get vector to be transformed; distribute last element and norm of + ! remaining elements to all procs in current column + + vr(1:l_rows) = a(1:l_rows,l_cols+1) + if (nstor>0 .and. l_rows>0) then + aux(1:2*nstor) = conjg(uvc(l_cols+1,1:2*nstor)) +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & + aux, 1, CONE, vr, 1) +#else + call CGEMV('N', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), & + aux, 1, CONE, vr, 1) +#endif + endif + + if (my_prow==prow(istep-1, nblk, np_rows)) then + aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) + aux1(2) = vr(l_rows) + else + aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) + aux1(2) = 0. + endif +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + aux2 = aux1 +#endif /* WITH_MPI */ + vnorm2 = aux2(1) + vrl = aux2(2) + + ! Householder transformation +#ifdef DOUBLE_PRECISION_COMPLEX + call hh_transform_complex_double(vrl, vnorm2, xf, tau(istep)) +#else + call hh_transform_complex_single(vrl, vnorm2, xf, tau(istep)) +#endif + ! Scale vr and store Householder vector for back transformation + + vr(1:l_rows) = vr(1:l_rows) * xf + if (my_prow==prow(istep-1, nblk, np_rows)) then + vr(l_rows) = 1. + e(istep-1) = vrl + endif + a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation + + endif + + ! 
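
! ---------------------------------------------------------------------------
! Aside: a standalone check (real-valued analogue, positive-beta branch only)
! of the Householder step performed here by hh_transform_complex: from
! alpha = x(1) and xnorm_sq = ||x(2:)||**2 it yields tau, the tail scaling
! xf and beta such that, with v = [1, xf*x(2:)], the reflector
! (I - tau*v*v**T) maps x onto (beta, 0, ..., 0).
program demo_householder
  implicit none
  integer, parameter :: rk = kind(1.0d0)
  real(kind=rk) :: x(3), v(3), alpha, xnorm_sq, beta, tau, xf

  x = [1.0_rk, 2.0_rk, 2.0_rk]
  alpha    = x(1)
  xnorm_sq = sum(x(2:)**2)

  beta  = sign(sqrt(alpha**2 + xnorm_sq), alpha)   ! same arithmetic as
  alpha = alpha + beta                             ! hh_transform_real for
  alpha = xnorm_sq/alpha                           ! beta >= 0; the routine
  tau   = alpha/beta                               ! also handles beta < 0
  xf    = -1.0_rk/alpha                            ! and xnorm_sq == 0

  v = [1.0_rk, xf*x(2:)]                           ! scaled Householder vector
  print *, 'reflected x:', x - tau*dot_product(v,x)*v  ! ~(3, 0, 0) up to rounding
  print *, 'beta       :', beta                        ! 3
end program demo_householder
! ---------------------------------------------------------------------------
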
Broadcast the Householder vector (and tau) along columns + + if (my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(vr, l_rows+1, MPI_DOUBLE_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(vr, l_rows+1, MPI_COMPLEX, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + tau(istep) = vr(l_rows+1) + + ! Transpose Householder vector vr -> vc + +! call elpa_transpose_vectors (vr, 2*ubound(vr,dim=1), mpi_comm_rows, & +! vc, 2*ubound(vc,dim=1), mpi_comm_cols, & +! 1, 2*(istep-1), 1, 2*nblk) +#ifdef DOUBLE_PRECISION_COMPLEX + call elpa_transpose_vectors_complex_double (vr, ubound(vr,dim=1), mpi_comm_rows, & + vc, ubound(vc,dim=1), mpi_comm_cols, & + 1, (istep-1), 1, nblk) +#else + call elpa_transpose_vectors_complex_single (vr, ubound(vr,dim=1), mpi_comm_rows, & + vc, ubound(vc,dim=1), mpi_comm_cols, & + 1, (istep-1), 1, nblk) +#endif + ! Calculate u = (A + VU**T + UV**T)*v + + ! For cache efficiency, we use only the upper half of the matrix tiles for this, + ! thus the result is partly in uc(:) and partly in ur(:) + + uc(1:l_cols) = 0 + ur(1:l_rows) = 0 + if (l_rows>0 .and. l_cols>0) then + +#ifdef WITH_OPENMP + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%start("OpenMP parallel_double") +#else + call timer%start("OpenMP parallel_single") +#endif +#endif + +!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) + + my_thread = omp_get_thread_num() + n_threads = omp_get_num_threads() + + n_iter = 0 + + uc_p(1:l_cols,my_thread) = 0. + ur_p(1:l_rows,my_thread) = 0. +#endif + + do i=0,(istep-2)/tile_size + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + if (lce0) then +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) + call ZGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) +#else + call CGEMV('C', l_rows, 2*nstor, CONE, vur, ubound(vur,dim=1), vr, 1, CZERO, aux, 1) + call CGEMV('N', l_cols, 2*nstor, CONE, uvc, ubound(uvc,dim=1), aux, 1, CONE, uc, 1) +#endif + endif + + endif + + ! Sum up all ur(:) parts along rows and add them to the uc(:) parts + ! on the processors containing the diagonal + ! This is only necessary if ur has been calculated, i.e. if the + ! global tile size is smaller than the global remaining matrix + + if (tile_size < istep-1) then +#ifdef DOUBLE_PRECISION_COMPLEX + call elpa_reduce_add_vectors_COMPLEX_double (ur, ubound(ur,dim=1), mpi_comm_rows, & + uc, ubound(uc,dim=1), mpi_comm_cols, & + (istep-1), 1, nblk) +#else + call elpa_reduce_add_vectors_COMPLEX_single (ur, ubound(ur,dim=1), mpi_comm_rows, & + uc, ubound(uc,dim=1), mpi_comm_cols, & + (istep-1), 1, nblk) +#endif + endif + + ! Sum up all the uc(:) parts, transpose uc -> ur + + if (l_cols>0) then + tmp(1:l_cols) = uc(1:l_cols) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(tmp, uc, l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(tmp, uc, l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + uc = tmp +#endif /* WITH_MPI */ + endif + +! call elpa_transpose_vectors (uc, 2*ubound(uc,dim=1), mpi_comm_cols, & +! ur, 2*ubound(ur,dim=1), mpi_comm_rows, & +! 
1, 2*(istep-1), 1, 2*nblk) +#ifdef DOUBLE_PRECISION_COMPLEX + call elpa_transpose_vectors_complex_double (uc, ubound(uc,dim=1), mpi_comm_cols, & + ur, ubound(ur,dim=1), mpi_comm_rows, & + 1, (istep-1), 1, nblk) +#else + call elpa_transpose_vectors_complex_single (uc, ubound(uc,dim=1), mpi_comm_cols, & + ur, ubound(ur,dim=1), mpi_comm_rows, & + 1, (istep-1), 1, nblk) +#endif + + + ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) + + xc = 0 + if (l_cols>0) xc = dot_product(vc(1:l_cols),uc(1:l_cols)) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_allreduce(xc, vav, 1 , MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(xc, vav, 1 , MPI_COMPLEX, MPI_SUM, mpi_comm_cols, mpierr) +#endif + +#else /* WITH_MPI */ + vav = xc +#endif /* WITH_MPI */ + + ! store u and v in the matrices U and V + ! these matrices are stored combined in one here + + do j=1,l_rows + vur(j,2*nstor+1) = conjg(tau(istep))*vr(j) + vur(j,2*nstor+2) = 0.5*conjg(tau(istep))*vav*vr(j) - ur(j) + enddo + do j=1,l_cols + uvc(j,2*nstor+1) = 0.5*conjg(tau(istep))*vav*vc(j) - uc(j) + uvc(j,2*nstor+2) = conjg(tau(istep))*vc(j) + enddo + + nstor = nstor+1 + + ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T + + if (nstor==max_stored_rows .or. istep==3) then + + do i=0,(istep-2)/tile_size + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + lrs = 1 + lre = min(l_rows,(i+1)*l_rows_tile) + if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & + + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) + d(istep-1) = a(l_rows,l_cols) + endif + + enddo ! istep + + ! Store e(1) and d(1) + + if (my_pcol==pcol(2, nblk, np_cols)) then + if (my_prow==prow(1, nblk, np_rows)) then + ! We use last l_cols value of loop above + vrl = a(1,l_cols) +#ifdef DOUBLE_PRECISION_COMPLEX + call hh_transform_complex_double(vrl, 0.0_rk8, xf, tau(2)) +#else + call hh_transform_complex_single(vrl, 0.0_rk4, xf, tau(2)) +#endif + e(1) = vrl + a(1,l_cols) = 1. ! for consistency only + endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr) +#else + call mpi_bcast(tau(2), 1, MPI_COMPLEX, prow(1, nblk, np_rows), mpi_comm_rows, mpierr) +#endif + +#endif /* WITH_MPI */ + endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_bcast(tau(2), 1, MPI_DOUBLE_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call mpi_bcast(tau(2), 1, MPI_COMPLEX, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + + + if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) + + deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when deallocating tmp "//errorMessage + stop + endif + ! distribute the arrays d and e to all processors + + allocate(tmpr(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_complex: error when allocating tmpr "//errorMessage + stop + endif + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + tmpr = d + call mpi_allreduce(tmpr, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) + tmpr = d + call mpi_allreduce(tmpr, d, na, MPI_REAL8 ,MPI_SUM, mpi_comm_cols, mpierr) + tmpr = e + call mpi_allreduce(tmpr, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) + tmpr = e + call mpi_allreduce(tmpr, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) +#else + tmpr = d + call mpi_allreduce(tmpr, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) + tmpr = d + call mpi_allreduce(tmpr, d, na, MPI_REAL4 ,MPI_SUM, mpi_comm_cols, mpierr) + tmpr = e + call mpi_allreduce(tmpr, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) + tmpr = e + call mpi_allreduce(tmpr, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + deallocate(tmpr, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_complex: error when deallocating tmpr "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%stop("tridiag_complex_double") +#else + call timer%stop("tridiag_complex_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine tridiag_complex_double +#else + end subroutine tridiag_complex_single +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine trans_ev_complex_double(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + subroutine trans_ev_complex_single(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif + !------------------------------------------------------------------------------- + ! trans_ev_complex: Transforms the eigenvectors of a tridiagonal matrix back + ! to the eigenvectors of the original matrix + ! (like Scalapack Routine PZUNMTR) + ! + ! Parameters + ! + ! na Order of matrix a, number of rows of matrix q + ! + ! nqc Number of columns of matrix q + ! + ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_complex) + ! Distribution is like in Scalapack. + ! + ! lda Leading dimension of a + ! + ! tau(na) Factors of the Householder vectors + ! + ! q On input: Eigenvectors of tridiagonal matrix + ! On output: Transformed eigenvectors + ! Distribution is like in Scalapack. + ! + ! ldq Leading dimension of q + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! 
+ !------------------------------------------------------------------------------- +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + implicit none + + integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + complex(kind=COMPLEX_DATATYPE) :: tau(na) +#ifdef DESPERATELY_WANT_ASSUMED_SIZE + complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*) +#else + complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols) +#endif + integer(kind=ik) :: max_stored_rows +#ifdef DOUBLE_PRECISION_COMPLEX + complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8) +#else + complex(kind=ck4), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4) +#endif + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols + integer(kind=ik) :: l_cols, l_rows, l_colh, nstor + integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol + + complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:) + complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:) + integer(kind=ik) :: istat + character(200) :: errorMessage +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%start("trans_ev_complex_double") +#else + call timer%start("trans_ev_complex_single") +#endif +#endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + + totalblocks = (na-1)/nblk + 1 + max_blocks_row = (totalblocks-1)/np_rows + 1 + max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q! + + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk + + max_stored_rows = (63/nblk+1)*nblk + + allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating tmat "//errorMessage + stop + endif + + allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating h1 "//errorMessage + stop + endif + + allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating h2 "//errorMessage + stop + endif + + allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating tmp1 "//errorMessage + stop + endif + + allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating tmp2 "//errorMessage + stop + endif + + allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating hvb "//errorMessage + stop + endif + + allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"trans_ev_complex: error when allocating hvm "//errorMessage + stop + endif + + hvm = 0 ! Must be set to 0 !!! + hvb = 0 ! Safety only + + l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q + + nstor = 0 + + ! 
In the complex case tau(2) /= 0 + if (my_prow == prow(1, nblk, np_rows)) then +#ifdef DOUBLE_PRECISION_COMPLEX + q(1,1:l_cols) = q(1,1:l_cols)*((1.0_rk8,0.0_rk8)-tau(2)) +#else + q(1,1:l_cols) = q(1,1:l_cols)*((1.0_rk4,0.0_rk4)-tau(2)) +#endif + endif + + do istep=1,na,nblk + + ics = MAX(istep,3) + ice = MIN(istep+nblk-1,na) + if (ice0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(hvb, nb, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(hvb, nb, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + nb = 0 + do ic=ics,ice + l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector + hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) + nstor = nstor+1 + nb = nb+l_rows + enddo + + ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! + if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then + + ! Calculate scalar products of stored vectors. + ! This can be done in different ways, we use zherk + + tmat = 0 + if (l_rows>0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call zherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) +#else + call cherk('U', 'C', nstor, l_rows, CONE, hvm, ubound(hvm,dim=1), CZERO, tmat, max_stored_rows) +#endif + nc = 0 + do n=1,nstor-1 + h1(nc+1:nc+n) = tmat(1:n,n+1) + nc = nc+n + enddo +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + if (nc>0) call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + if (nc>0) h2=h1 +#endif /* WITH_MPI */ + ! Calculate triangular matrix T + + nc = 0 + tmat(1,1) = tau(ice-nstor+1) + do n=1,nstor-1 +#ifdef DOUBLE_PRECISION_COMPLEX + call ztrmv('L', 'C', 'N', n, tmat, max_stored_rows, h2(nc+1),1) +#else + call ctrmv('L', 'C', 'N', n, tmat, max_stored_rows, h2(nc+1),1) +#endif + tmat(n+1,1:n) = -conjg(h2(nc+1:nc+n))*tau(ice-nstor+n+1) + tmat(n+1,n+1) = tau(ice-nstor+n+1) + nc = nc+n + enddo + + ! Q = Q - V * T * V**T * Q + + if (l_rows>0) then +#ifdef DOUBLE_PRECISION_COMPLEX + call zgemm('C', 'N', nstor, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & + q, ldq, CZERO, tmp1 ,nstor) +#else + call cgemm('C', 'N', nstor, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & + q, ldq, CZERO, tmp1 ,nstor) +#endif + else + tmp1(1:l_cols*nstor) = 0 + endif +#ifdef DOUBLE_PRECISION_COMPLEX + +#ifdef WITH_MPI + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + tmp2 = tmp1 +#endif + if (l_rows>0) then + call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor) + call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), & + tmp2, nstor, CONE, q, ldq) + endif +#else /* DOUBLE_PRECISION_COMPLEX */ + +#ifdef WITH_MPI + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) +#else + tmp2 = tmp1 +#endif + if (l_rows>0) then + call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor) + call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), & + tmp2, nstor, CONE, q, ldq) + endif +#endif /* DOUBLE_PRECISION_COMPLEX */ + nstor = 0 + endif + + enddo + + deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"trans_ev_complex: error when deallocating hvb "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%stop("trans_ev_complex_double") +#else + call timer%stop("trans_ev_complex_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine trans_ev_complex_double +#else + end subroutine trans_ev_complex_single +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine mult_ah_b_complex_double(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, & + mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols) +#else + subroutine mult_ah_b_complex_single(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, & + mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols) +#endif + + !------------------------------------------------------------------------------- + ! mult_ah_b_complex: Performs C := A**H * B + ! + ! where: A is a square matrix (na,na) which is optionally upper or lower triangular + ! B is a (na,ncb) matrix + ! C is a (na,ncb) matrix where optionally only the upper or lower + ! triangle may be computed + ! + ! Parameters + ! + ! uplo_a 'U' if A is upper triangular + ! 'L' if A is lower triangular + ! anything else if A is a full matrix + ! Please note: This pertains to the original A (as set in the calling program) + ! whereas the transpose of A is used for calculations + ! If uplo_a is 'U' or 'L', the other triangle is not used at all, + ! i.e. it may contain arbitrary numbers + ! + ! uplo_c 'U' if only the upper diagonal part of C is needed + ! 'L' if only the upper diagonal part of C is needed + ! anything else if the full matrix C is needed + ! Please note: Even when uplo_c is 'U' or 'L', the other triangle may be + ! written to a certain extent, i.e. one shouldn't rely on the content there! + ! + ! na Number of rows/columns of A, number of rows of B and C + ! + ! ncb Number of columns of B and C + ! + ! a Matrix A + ! + ! lda Leading dimension of a + ! ldaCols Columns of Matrix a + ! + ! b Matrix B + ! + ! ldb Leading dimension of b + ! ldbCols Columns of Matrix b + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! + ! c Matrix C + ! + ! ldc Leading dimension of c + ! ldcCols Columns of Matrix C + ! + !------------------------------------------------------------------------------- +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + implicit none + + character*1 :: uplo_a, uplo_c + integer(kind=ik), intent(in) :: lda, ldaCols, ldb, ldbCols, ldc, ldcCols + integer(kind=ik) :: na, ncb, nblk, mpi_comm_rows, mpi_comm_cols + complex(kind=COMPLEX_DATATYPE) :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols) ! removed assumed_size + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: l_cols, l_rows, l_rows_np + integer(kind=ik) :: np, n, nb, nblk_mult, lrs, lre, lcs, lce + integer(kind=ik) :: gcol_min, gcol, goff + integer(kind=ik) :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals + integer(kind=ik), allocatable :: lrs_save(:), lre_save(:) + + logical :: a_lower, a_upper, c_lower, c_upper + + complex(kind=COMPLEX_DATATYPE), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:) + integer(kind=ik) :: istat + character(200) :: errorMessage + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%start("mult_ah_b_complex_double") +#else + call timer%start("mult_ah_b_complex_single") +#endif +#endif +! if (na .lt. 
lda) then +! print *,"na lt lda ",na,lda +! stop +! endif +! if (na .lt. ldb) then +! print *,"na lt ldb ",na,ldb +! stop +! endif +! if (na .lt. ldc) then +! print *,"na lt ldc ",na,ldc +! stop +! endif +! if (na .lt. ldaCols) then +! print *,"na lt ldaCols ",na,ldaCols +! stop +! endif +! if (na .lt. ldbCols) then +! print *,"na lt ldbCols ",na,ldbCols +! stop +! endif +! if (na .lt. ldcCols) then +! print *,"na lt ldcCols ",na,ldcCols +! stop +! endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b + l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b + + ! Block factor for matrix multiplications, must be a multiple of nblk + + if (na/np_rows<=256) then + nblk_mult = (31/nblk+1)*nblk + else + nblk_mult = (63/nblk+1)*nblk + endif + + allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when allocating aux_mat "//errorMessage + stop + endif + + allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when allocating aux_bc "//errorMessage + stop + endif + + allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when allocating lrs_save "//errorMessage + stop + endif + + allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when allocating lre_save "//errorMessage + stop + endif + + a_lower = .false. + a_upper = .false. + c_lower = .false. + c_upper = .false. + + if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. + if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. + if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. + if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. + + ! Build up the result matrix by processor rows + + do np = 0, np_rows-1 + + ! In this turn, procs of row np assemble the result + + l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors + + nr_done = 0 ! Number of rows done + aux_mat = 0 + nstor = 0 ! Number of columns stored in aux_mat + + ! Loop over the blocks on row np + + do nb=0,(l_rows_np-1)/nblk + + goff = nb*np_rows + np ! Global offset in blocks corresponding to nb + + ! Get the processor column which owns this block (A is transposed, so we need the column) + ! and the offset in blocks within this column. + ! The corresponding block column in A is then broadcast to all for multiplication with B + + np_bc = MOD(goff,np_cols) + noff = goff/np_cols + n_aux_bc = 0 + + ! Gather up the complete block column of A on the owner + + do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast + + gcol = goff*nblk + n ! global column corresponding to n + if (nstor==0 .and. n==1) gcol_min = gcol + + lrs = 1 ! 1st local row number for broadcast + lre = l_rows ! last local row number for broadcast + if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + if (lrs<=lre) then + nvals = lre-lrs+1 + if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) + n_aux_bc = n_aux_bc + nvals + endif + + lrs_save(n) = lrs + lre_save(n) = lre + + enddo + + ! 
Broadcast block column +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(aux_bc, n_aux_bc, MPI_DOUBLE_COMPLEX, np_bc, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(aux_bc, n_aux_bc, MPI_COMPLEX, np_bc, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + ! Insert what we got in aux_mat + + n_aux_bc = 0 + do n = 1, min(l_rows_np-nb*nblk,nblk) + nstor = nstor+1 + lrs = lrs_save(n) + lre = lre_save(n) + if (lrs<=lre) then + nvals = lre-lrs+1 + aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) + n_aux_bc = n_aux_bc + nvals + endif + enddo + + ! If we got nblk_mult columns in aux_mat or this is the last block + ! do the matrix multiplication + + if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then + + lrs = 1 ! 1st local row number for multiply + lre = l_rows ! last local row number for multiply + if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + lcs = 1 ! 1st local col number for multiply + lce = l_cols ! last local col number for multiply + if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) + if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) + + if (lcs<=lce) then + allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when allocating tmp1 "//errorMessage + stop + endif + + if (lrs<=lre) then +#ifdef DOUBLE_PRECISION_COMPLEX + call zgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk8,0.0_rk8), aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, (0.0_rk8,0.0_rk8), tmp1, nstor) +#else + call cgemm('C', 'N', nstor, lce-lcs+1, lre-lrs+1, (1.0_rk4,0.0_rk4), aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, (0.0_rk4,0.0_rk4), tmp1, nstor) +#endif + else + tmp1 = 0 + endif + + ! Sum up the results and send to processor row np +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) +#else + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + tmp2 = tmp1 +#endif /* WITH_MPI */ + ! Put the result into C + if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) + + deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage + stop + endif + + endif + + nr_done = nr_done+nstor + nstor=0 + aux_mat(:,:)=0 + endif + enddo + enddo + + deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_ah_b_complex: error when deallocating aux_mat "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%stop("mult_ah_b_complex_double") +#else + call timer%stop("mult_ah_b_complex_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine mult_ah_b_complex_double +#else + end subroutine mult_ah_b_complex_single +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine cholesky_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#else + subroutine cholesky_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#endif + !------------------------------------------------------------------------------- + ! 
cholesky_complex: Cholesky factorization of a complex Hermitian matrix
+ !
+ ! Parameters
+ !
+ ! na Order of matrix
+ !
+ ! a(lda,matrixCols) Distributed matrix which should be factorized.
+ ! Distribution is like in Scalapack.
+ ! Only the upper triangle needs to be set.
+ ! On return, the upper triangle contains the Cholesky factor
+ ! and the lower triangle is set to 0.
+ !
+ ! lda Leading dimension of a
+ ! matrixCols local columns of matrix a
+ !
+ ! nblk blocksize of cyclic distribution, must be the same in both directions!
+ !
+ ! mpi_comm_rows
+ ! mpi_comm_cols
+ ! MPI-Communicators for rows/columns
+ !
+ !-------------------------------------------------------------------------------
+#ifdef HAVE_DETAILED_TIMINGS
+ use timings
+#endif
+ use precision
+ implicit none
+
+ integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+#ifdef DESPERATELY_WANT_ASSUMED_SIZE
+ complex(kind=COMPLEX_DATATYPE) :: a(lda,*)
+#else
+ complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols)
+#endif
+ integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+ integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
+ integer(kind=ik) :: n, nc, i, info
+ integer(kind=ik) :: lcs, lce, lrs, lre
+ integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
+
+ complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
+
+ logical, intent(in) :: wantDebug
+ logical, intent(out) :: success
+ integer(kind=ik) :: istat
+ character(200) :: errorMessage
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_COMPLEX
+ call timer%start("cholesky_complex_double")
+#else
+ call timer%start("cholesky_complex_single")
+#endif
+#endif
+ success = .true.
+ call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+ call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+ call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+ call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+ ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
+
+ tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
+ tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
+
+ l_rows_tile = tile_size/np_rows ! local rows of a tile
+ l_cols_tile = tile_size/np_cols ! local cols of a tile
+
+ l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
+ l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
+
+ allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"cholesky_complex: error when allocating tmp1 "//errorMessage
+ stop
+ endif
+
+ allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"cholesky_complex: error when allocating tmp2 "//errorMessage
+ stop
+ endif
+
+ tmp1 = 0
+ tmp2 = 0
+
+ allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"cholesky_complex: error when allocating tmatr "//errorMessage
+ stop
+ endif
+
+ allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"cholesky_complex: error when allocating tmatc "//errorMessage
+ stop
+ endif
+
+ tmatr = 0
+ tmatc = 0
+
+ do n = 1, na, nblk
+
+ ! Calculate first local row and column of the still remaining matrix
+ !
on the local processor + + l_row1 = local_index(n, my_prow, np_rows, nblk, +1) + l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) + + l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) + l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) + + if (n+nblk > na) then + + ! This is the last step, just do a Cholesky-Factorization + ! of the remaining block + + if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then +#ifdef DOUBLE_PRECISION_COMPLEX + call zpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) +#else + call cpotrf('U', na-n+1, a(l_row1,l_col1),lda, info) +#endif + if (info/=0) then + if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" + success = .false. + return + endif + + endif + + exit ! Loop + endif + + if (my_prow==prow(n, nblk, np_rows)) then + + if (my_pcol==pcol(n, nblk, np_cols)) then + + ! The process owning the upper left remaining block does the + ! Cholesky-Factorization of this block +#ifdef DOUBLE_PRECISION_COMPLEX + call zpotrf('U', nblk, a(l_row1,l_col1),lda, info) +#else + call cpotrf('U', nblk, a(l_row1,l_col1),lda, info) +#endif + if (info/=0) then + if (wantDebug) write(error_unit,*) "ELPA1_cholesky_complex: Error in zpotrf" + success = .false. + return + endif + + nc = 0 + do i=1,nblk + tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) + nc = nc+i + enddo + endif +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + nc = 0 + do i=1,nblk + tmp2(1:i,i) = tmp1(nc+1:nc+i) + nc = nc+i + enddo + + if (l_cols-l_colx+1>0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call ztrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk8,0.0_rk8), tmp2, ubound(tmp2,dim=1), & + a(l_row1,l_colx), lda) +#else + call ctrsm('L', 'U', 'C', 'N', nblk, l_cols-l_colx+1, (1.0_rk4,0.0_rk4), tmp2, ubound(tmp2,dim=1), & + a(l_row1,l_colx), lda) +#endif + endif + + do i=1,nblk + + if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = conjg(a(l_row1+i-1,l_colx:l_cols)) +#ifdef WITH_MPI + if (l_cols-l_colx+1>0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#endif + +#endif /* WITH_MPI */ + enddo + ! 
this has to be checked since it was changed substantially when doing type safe +#ifdef DOUBLE_PRECISION_COMPLEX + call elpa_transpose_vectors_complex_double (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & + tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & + n, na, nblk, nblk) +#else + call elpa_transpose_vectors_complex_single (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, & + tmatr, ubound(tmatr,dim=1), mpi_comm_rows, & + n, na, nblk, nblk) +#endif + do i=0,(na-1)/tile_size + lcs = max(l_colx,i*l_cols_tile+1) + lce = min(l_cols,(i+1)*l_cols_tile) + lrs = l_rowx + lre = min(l_rows,(i+1)*l_rows_tile) + if (lce0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call ZTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk8,0.0_rk8), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#else + call CTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, (1.0_rk4,0.0_rk4), tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#endif + if (l_colx<=l_cols) tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols) + if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! tmp2 has the lower left triangle 0 + + endif + + if (l_row1>1) then + if (my_pcol==pcol(n, nblk, np_cols)) then + tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1) + a(1:l_row1-1,l_col1:l_col1+nb-1) = 0 + endif + + do i=1,nb +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_DOUBLE_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_COMPLEX, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + enddo + endif +#ifdef WITH_MPI + if (l_cols-l_col1+1>0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_DOUBLE_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_COMPLEX, prow(n, nblk, np_rows), & + mpi_comm_rows, mpierr) +#endif + +#endif /* WITH_MPI */ + if (l_row1>1 .and. l_cols-l_col1+1>0) & +#ifdef DOUBLE_PRECISION_COMPLEX + call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8), & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + (1.0_rk8,0.0_rk8), a(1,l_col1), lda) +#else + call CGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk4,0.0_rk4), & + tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), & + (1.0_rk4,0.0_rk4), a(1,l_col1), lda) +#endif + enddo + + deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"invert_trm_complex: error when deallocating tmp1 "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%stop("invert_trm_complex_double") +#else + call timer%stop("invert_trm_complex_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine invert_trm_complex_double +#else + end subroutine invert_trm_complex_single +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + subroutine hh_transform_complex_double(alpha, xnorm_sq, xf, tau) +#else + subroutine hh_transform_complex_single(alpha, xnorm_sq, xf, tau) +#endif + + ! Similar to LAPACK routine ZLARFP, but uses ||x||**2 instead of x(:) + ! and returns the factor xf by which x has to be scaled. + ! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150 + ! since this would be expensive for the parallel implementation. 
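+ !
+ ! In effect (matching the code below): with alphr = Re(alpha),
+ ! alphi = Im(alpha) and
+ !
+ ! beta = sign( sqrt( alphr**2 + alphi**2 + xnorm_sq ), alphr )
+ !
+ ! the routine returns tau = (abs(beta) - alpha) / abs(beta), the scaling
+ ! factor xf = 1 / (alpha + beta), and overwrites alpha with abs(beta),
+ ! i.e. the transformed element is non-negative as in ZLARFP. The
+ ! degenerate case xnorm_sq = 0, alphi = 0 is handled separately
+ ! (tau = 0 or 2, xf = 0).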
+ use precision +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + implicit none + complex(kind=COMPLEX_DATATYPE), intent(inout) :: alpha + real(kind=REAL_DATATYPE), intent(in) :: xnorm_sq + complex(kind=COMPLEX_DATATYPE), intent(out) :: xf, tau + + real(kind=REAL_DATATYPE) :: ALPHR, ALPHI, BETA +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%start("hh_transform_complex_double") +#else + call timer%start("hh_transform_complex_single") +#endif +#endif + ALPHR = real( ALPHA, kind=REAL_DATATYPE ) +#ifdef DOUBLE_PRECISION_COMPLEX + ALPHI = DIMAG( ALPHA ) +#else + ALPHI = AIMAG( ALPHA ) +#endif + if ( XNORM_SQ==0. .AND. ALPHI==0. ) then + + if ( ALPHR>=0. ) then + TAU = 0. + else + TAU = 2. + ALPHA = -ALPHA + endif + XF = 0. + + else + + BETA = SIGN( SQRT( ALPHR**2 + ALPHI**2 + XNORM_SQ ), ALPHR ) + ALPHA = ALPHA + BETA + IF ( BETA<0 ) THEN + BETA = -BETA + TAU = -ALPHA / BETA + ELSE +#ifdef DOUBLE_PRECISION_COMPLEX + ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk8)) + ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk8 ) + + TAU = DCMPLX( ALPHR/BETA, -ALPHI/BETA ) + ALPHA = DCMPLX( -ALPHR, ALPHI ) +#else + ALPHR = ALPHI * (ALPHI/real( ALPHA , kind=rk4)) + ALPHR = ALPHR + XNORM_SQ/real( ALPHA, kind=rk4 ) + + TAU = CMPLX( ALPHR/BETA, -ALPHI/BETA ) + ALPHA = CMPLX( -ALPHR, ALPHI ) +#endif + END IF +#ifdef DOUBLE_PRECISION_COMPLEX + XF = 1._rk8/ALPHA +#else + XF = 1._rk4/ALPHA +#endif + ALPHA = BETA + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_COMPLEX + call timer%stop("hh_transform_complex_double") +#else + call timer%stop("hh_transform_complex_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_COMPLEX + end subroutine hh_transform_complex_double +#else + end subroutine hh_transform_complex_single +#endif + +#define ALREADY_DEFINED 1 diff --git a/src/elpa1_compute_real_template.X90 b/src/elpa1_compute_real_template.X90 new file mode 100644 index 0000000000000000000000000000000000000000..be37a511e695b3fc510dbf546d3cd08cd14cf998 --- /dev/null +++ b/src/elpa1_compute_real_template.X90 @@ -0,0 +1,4067 @@ +#if 0 +! This file is part of ELPA. +! +! The ELPA library was originally created by the ELPA consortium, +! consisting of the following organizations: +! +! - Max Planck Computing and Data Facility (MPCDF), formerly known as +! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG), +! - Bergische Universität Wuppertal, Lehrstuhl für angewandte +! Informatik, +! - Technische Universität München, Lehrstuhl für Informatik mit +! Schwerpunkt Wissenschaftliches Rechnen , +! - Fritz-Haber-Institut, Berlin, Abt. Theorie, +! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn, +! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition, +! and +! - IBM Deutschland GmbH +! +! This particular source code file contains additions, changes and +! enhancements authored by Intel Corporation which is not part of +! the ELPA consortium. +! +! More information can be found here: +! http://elpa.mpcdf.mpg.de/ +! +! ELPA is free software: you can redistribute it and/or modify +! it under the terms of the version 3 of the license of the +! GNU Lesser General Public License as published by the Free +! Software Foundation. +! +! ELPA is distributed in the hope that it will be useful, +! but WITHOUT ANY WARRANTY; without even the implied warranty of +! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +! GNU Lesser General Public License for more details. +! +! You should have received a copy of the GNU Lesser General Public License +! 
along with ELPA. If not, see +! +! ELPA reflects a substantial effort on the part of the original +! ELPA consortium, and we ask you to respect the spirit of the +! license that we chose: i.e., please contribute any changes you +! may have back to the original ELPA library distribution, and keep +! any derivatives of ELPA under the same license that we chose for +! the original distribution, the GNU Lesser General Public License. +! +! +! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines +! +! Copyright of the original code rests with the authors inside the ELPA +! consortium. The copyright of any additional modifications shall rest +! with their original authors, but shall adhere to the licensing terms +! distributed along with the original code in the file "COPYING". +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) +#else + subroutine tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau) +#endif + !------------------------------------------------------------------------------- + ! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form + ! (like Scalapack Routine PDSYTRD) + ! + ! Parameters + ! + ! na Order of matrix + ! + ! a(lda,matrixCols) Distributed matrix which should be reduced. + ! Distribution is like in Scalapack. + ! Opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half) + ! a(:,:) is overwritten on exit with the Householder vectors + ! + ! lda Leading dimension of a + ! matrixCols local columns of matrix + ! + ! nblk blocksize of cyclic distribution, must be the same in both directions! + ! + ! mpi_comm_rows + ! mpi_comm_cols + ! MPI-Communicators for rows/columns + ! + ! d(na) Diagonal elements (returned), identical on all processors + ! + ! e(na) Off-Diagonal elements (returned), identical on all processors + ! + ! tau(na) Factors for the Householder vectors (returned), needed for back transformation + ! 
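+ !
+ ! Method: classical blocked Householder tridiagonalization. In each step
+ ! a Householder vector v is generated, u = (A + V*U**T + U*V**T)*v is
+ ! computed tile-wise, and v and u are appended (in scaled form, two
+ ! columns per step) to the combined storage vur/uvc. Only when
+ ! max_stored_rows vectors have accumulated is the rank-2*nstor update
+ ! A := A + V*U**T + U*V**T applied, tile by tile; schematically this is
+ ! one BLAS-3 call per tile (a sketch only, with the index names used
+ ! below):
+ !
+ ! call DGEMM('N', 'T', lre-lrs+1, lce-lcs+1, 2*nstor, 1.0_rk8, &
+ ! vur(lrs,1), ubound(vur,dim=1), uvc(lcs,1), ubound(uvc,dim=1), &
+ ! 1.0_rk8, a(lrs,lcs), lda)
+ !
+ ! Deferring the update this way keeps the dominant cost in matrix-matrix
+ ! rather than matrix-vector operations.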
+ !------------------------------------------------------------------------------- +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + implicit none + + integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols + real(kind=REAL_DATATYPE) :: d(na), e(na), tau(na) +#ifdef DESPERATELY_WANT_ASSUMED_SIZE + real(kind=REAL_DATATYPE) :: a(lda,*) +#else + real(kind=REAL_DATATYPE) :: a(lda,matrixCols) +#endif + + integer(kind=ik), parameter :: max_stored_rows = 32 + + integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols + integer(kind=ik) :: l_cols, l_rows, nstor + integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre + integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile + +#ifdef WITH_OPENMP + integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter + integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads +#endif + + real(kind=REAL_DATATYPE) :: vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf + + real(kind=REAL_DATATYPE), allocatable :: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) +#ifdef WITH_OPENMP + real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:) +#endif + integer(kind=ik) :: istat + character(200) :: errorMessage + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%start("tridiag_real_double") +#else + call timer%start("tridiag_real_single") +#endif +#endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + + ! Matrix is split into tiles; work is done only for tiles on the diagonal or above + + tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size + tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide + + l_rows_tile = tile_size/np_rows ! local rows of a tile + l_cols_tile = tile_size/np_cols ! local cols of a tile + + + totalblocks = (na-1)/nblk + 1 + max_blocks_row = (totalblocks-1)/np_rows + 1 + max_blocks_col = (totalblocks-1)/np_cols + 1 + + max_local_rows = max_blocks_row*nblk + max_local_cols = max_blocks_col*nblk + + allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating tmp "//errorMessage + stop + endif + + allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating vr "//errorMessage + stop + endif + + allocate(ur(max_local_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating ur "//errorMessage + stop + endif + + allocate(vc(max_local_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating vc "//errorMessage + stop + endif + + allocate(uc(max_local_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating uc "//errorMessage + stop + endif + +#ifdef WITH_OPENMP + max_threads = omp_get_max_threads() + + allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"tridiag_real: error when allocating ur_p "//errorMessage + stop + endif + + allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating uc_p "//errorMessage + stop + endif + +#endif + + tmp = 0 + vr = 0 + ur = 0 + vc = 0 + uc = 0 + + allocate(vur(max_local_rows,2*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating vur "//errorMessage + stop + endif + + allocate(uvc(max_local_cols,2*max_stored_rows), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating uvc "//errorMessage + stop + endif + + d(:) = 0 + e(:) = 0 + tau(:) = 0 + + nstor = 0 + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a + if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) d(na) = a(l_rows,l_cols) + + do istep=na,3,-1 + + ! Calculate number of local rows and columns of the still remaining matrix + ! on the local processor + + l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1) + l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1) + + ! Calculate vector for Householder transformation on all procs + ! owning column istep + + if(my_pcol==pcol(istep, nblk, np_cols)) then + + ! Get vector to be transformed; distribute last element and norm of + ! remaining elements to all procs in current column + + vr(1:l_rows) = a(1:l_rows,l_cols+1) + if(nstor>0 .and. l_rows>0) then +#ifdef DOUBLE_PRECISION_REAL + call DGEMV('N', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), & + uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk8, vr, 1) +#else + call SGEMV('N', l_rows, 2*nstor, 1.0_rk4, vur, ubound(vur,dim=1), & + uvc(l_cols+1,1), ubound(uvc,dim=1), 1.0_rk4, vr, 1) +#endif + endif + + if(my_prow==prow(istep-1, nblk, np_rows)) then + aux1(1) = dot_product(vr(1:l_rows-1),vr(1:l_rows-1)) + aux1(2) = vr(l_rows) + else + aux1(1) = dot_product(vr(1:l_rows),vr(1:l_rows)) + aux1(2) = 0. + endif + +#ifdef WITH_MPI + +#if DOUBLE_PRECISION_REAL + call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + aux2 = aux1 +#endif /* WITH_MPI */ + vnorm2 = aux2(1) + vrl = aux2(2) + + ! Householder transformation +#ifdef DOUBLE_PRECISION_REAL + call hh_transform_real_double(vrl, vnorm2, xf, tau(istep)) +#else + call hh_transform_real_single(vrl, vnorm2, xf, tau(istep)) +#endif + ! Scale vr and store Householder vector for back transformation + + vr(1:l_rows) = vr(1:l_rows) * xf + if(my_prow==prow(istep-1, nblk, np_rows)) then + vr(l_rows) = 1. + e(istep-1) = vrl + endif + a(1:l_rows,l_cols+1) = vr(1:l_rows) ! store Householder vector for back transformation + + endif + + ! Broadcast the Householder vector (and tau) along columns + + if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(vr, l_rows+1, MPI_REAL8, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(vr, l_rows+1, MPI_REAL4, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + tau(istep) = vr(l_rows+1) + + ! 
Transpose Householder vector vr -> vc + +#ifdef DOUBLE_PRECISION_REAL + call elpa_transpose_vectors_real_double (vr, ubound(vr,dim=1), mpi_comm_rows, & + vc, ubound(vc,dim=1), mpi_comm_cols, & + 1, istep-1, 1, nblk) +#else + call elpa_transpose_vectors_real_single (vr, ubound(vr,dim=1), mpi_comm_rows, & + vc, ubound(vc,dim=1), mpi_comm_cols, & + 1, istep-1, 1, nblk) +#endif + + ! Calculate u = (A + VU**T + UV**T)*v + + ! For cache efficiency, we use only the upper half of the matrix tiles for this, + ! thus the result is partly in uc(:) and partly in ur(:) + + uc(1:l_cols) = 0 + ur(1:l_rows) = 0 + if (l_rows>0 .and. l_cols>0) then + +#ifdef WITH_OPENMP + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%start("OpenMP parallel_double") +#else + call timer%start("OpenMP parallel_single") +#endif +#endif + +!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre) + + my_thread = omp_get_thread_num() + n_threads = omp_get_num_threads() + + n_iter = 0 + + uc_p(1:l_cols,my_thread) = 0. + ur_p(1:l_rows,my_thread) = 0. +#endif + do i=0,(istep-2)/tile_size + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + if (lce0) then +#ifdef DOUBLE_PRECISION_REAL + call DGEMV('T', l_rows, 2*nstor, 1.0_rk8, vur, ubound(vur,dim=1), vr, 1, 0.0_rk8, aux, 1) + call DGEMV('N', l_cols, 2*nstor, 1.0_rk8, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk8, uc, 1) +#else + call SGEMV('T', l_rows, 2*nstor, 1.0_rk4, vur, ubound(vur,dim=1), vr, 1, 0.0_rk4, aux, 1) + call SGEMV('N', l_cols, 2*nstor, 1.0_rk4, uvc, ubound(uvc,dim=1), aux, 1, 1.0_rk4, uc, 1) +#endif + endif + + endif + + ! Sum up all ur(:) parts along rows and add them to the uc(:) parts + ! on the processors containing the diagonal + ! This is only necessary if ur has been calculated, i.e. if the + ! global tile size is smaller than the global remaining matrix + + if (tile_size < istep-1) then +#ifdef DOUBLE_PRECISION_REAL + call elpa_reduce_add_vectors_REAL_double (ur, ubound(ur,dim=1), mpi_comm_rows, & + uc, ubound(uc,dim=1), mpi_comm_cols, & + istep-1, 1, nblk) +#else + call elpa_reduce_add_vectors_REAL_single (ur, ubound(ur,dim=1), mpi_comm_rows, & + uc, ubound(uc,dim=1), mpi_comm_cols, & + istep-1, 1, nblk) +#endif + endif + + ! Sum up all the uc(:) parts, transpose uc -> ur + + if (l_cols>0) then + tmp(1:l_cols) = uc(1:l_cols) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(tmp, uc, l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + call mpi_allreduce(tmp, uc, l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + uc = tmp +#endif /* WITH_MPI */ + + endif + +#ifdef DOUBLE_PRECISION_REAL + call elpa_transpose_vectors_real_double (uc, ubound(uc,dim=1), mpi_comm_cols, & + ur, ubound(ur,dim=1), mpi_comm_rows, & + 1, istep-1, 1, nblk) +#else + call elpa_transpose_vectors_real_single (uc, ubound(uc,dim=1), mpi_comm_cols, & + ur, ubound(ur,dim=1), mpi_comm_rows, & + 1, istep-1, 1, nblk) +#endif + ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) + + x = 0 + if (l_cols>0) x = dot_product(vc(1:l_cols),uc(1:l_cols)) +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_allreduce(x, vav, 1, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) +#else + call mpi_allreduce(x, vav, 1, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) +#endif + +#else /* WITH_MPI */ + + vav = x +#endif /* WITH_MPI */ + + ! store u and v in the matrices U and V + ! 
these matrices are stored combined in one here + + do j=1,l_rows + vur(j,2*nstor+1) = tau(istep)*vr(j) + vur(j,2*nstor+2) = 0.5*tau(istep)*vav*vr(j) - ur(j) + enddo + do j=1,l_cols + uvc(j,2*nstor+1) = 0.5*tau(istep)*vav*vc(j) - uc(j) + uvc(j,2*nstor+2) = tau(istep)*vc(j) + enddo + + nstor = nstor+1 + + ! If the limit of max_stored_rows is reached, calculate A + VU**T + UV**T + + if (nstor==max_stored_rows .or. istep==3) then + + do i=0,(istep-2)/tile_size + lcs = i*l_cols_tile+1 + lce = min(l_cols,(i+1)*l_cols_tile) + lrs = 1 + lre = min(l_rows,(i+1)*l_rows_tile) + if (lce0) a(l_rows,l_cols) = a(l_rows,l_cols) & + + dot_product(vur(l_rows,1:2*nstor),uvc(l_cols,1:2*nstor)) + d(istep-1) = a(l_rows,l_cols) + endif + + enddo + + ! Store e(1) and d(1) + + if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) e(1) = a(1,l_cols) ! use last l_cols value of loop above + if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) d(1) = a(1,1) + + deallocate(tmp, vr, ur, vc, uc, vur, uvc, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when deallocating uvc "//errorMessage + stop + endif + + + ! distribute the arrays d and e to all processors + + allocate(tmp(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when allocating tmp "//errorMessage + stop + endif +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + tmp = d + call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) + tmp = d + call mpi_allreduce(tmp, d, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) + tmp = e + call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) + tmp = e + call mpi_allreduce(tmp, e, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr) +#else + tmp = d + call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) + tmp = d + call mpi_allreduce(tmp, d, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) + tmp = e + call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) + tmp = e + call mpi_allreduce(tmp, e, na, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + deallocate(tmp, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"tridiag_real: error when deallocating tmp "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("tridiag_real_double") +#else + call timer%stop("tridiag_real_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_REAL + end subroutine tridiag_real_double +#else + end subroutine tridiag_real_single +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine trans_ev_real_double(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#else + subroutine trans_ev_real_single(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols) +#endif + !------------------------------------------------------------------------------- + ! trans_ev_real: Transforms the eigenvectors of a tridiagonal matrix back + ! to the eigenvectors of the original matrix + ! (like Scalapack Routine PDORMTR) + ! + ! Parameters + ! + ! na Order of matrix a, number of rows of matrix q + ! + ! nqc Number of columns of matrix q + ! + ! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real) + ! Distribution is like in Scalapack. + ! + ! lda Leading dimension of a + ! matrixCols local columns of matrix a and q + ! + ! tau(na) Factors of the Householder vectors + ! + ! 
q On input: Eigenvectors of tridiagonal matrix
+ ! On output: Transformed eigenvectors
+ ! Distribution is like in Scalapack.
+ !
+ ! ldq Leading dimension of q
+ !
+ ! nblk blocksize of cyclic distribution, must be the same in both directions!
+ !
+ ! mpi_comm_rows
+ ! mpi_comm_cols
+ ! MPI-Communicators for rows/columns
+ !
+ !-------------------------------------------------------------------------------
+#ifdef HAVE_DETAILED_TIMINGS
+ use timings
+#endif
+ use precision
+ implicit none
+
+ integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+ real(kind=REAL_DATATYPE) :: tau(na)
+#ifdef DESPERATELY_WANT_ASSUMED_SIZE
+ real(kind=REAL_DATATYPE) :: a(lda,*), q(ldq,*)
+#else
+ real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
+#endif
+
+ integer(kind=ik) :: max_stored_rows
+
+ integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+ integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
+ integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
+ integer(kind=ik) :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
+
+ real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
+ real(kind=REAL_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:)
+ integer(kind=ik) :: istat
+ character(200) :: errorMessage
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+ call timer%start("trans_ev_real_double")
+#else
+ call timer%start("trans_ev_real_single")
+#endif
+#endif
+
+ call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+ call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+ call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+ call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+
+ totalblocks = (na-1)/nblk + 1
+ max_blocks_row = (totalblocks-1)/np_rows + 1
+ max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
+
+ max_local_rows = max_blocks_row*nblk
+ max_local_cols = max_blocks_col*nblk
+
+ max_stored_rows = (63/nblk+1)*nblk
+
+ allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating tmat "//errorMessage
+ stop
+ endif
+
+ allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating h1 "//errorMessage
+ stop
+ endif
+
+ allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating h2 "//errorMessage
+ stop
+ endif
+
+ allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating tmp1 "//errorMessage
+ stop
+ endif
+
+ allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating tmp2 "//errorMessage
+ stop
+ endif
+
+ allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating hvb "//errorMessage
+ stop
+ endif
+
+ allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"trans_ev_real: error when allocating hvm "//errorMessage
+ stop
+ endif
+
+ hvm = 0 ! Must be set to 0 !!!
+ hvb = 0 ! Safety only
+
+ l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) !
Local columns of q + + nstor = 0 + + do istep=1,na,nblk + + ics = MAX(istep,3) + ice = MIN(istep+nblk-1,na) + if (ice0) & +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(hvb, nb, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(hvb, nb, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + nb = 0 + do ic=ics,ice + l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector + hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows) + nstor = nstor+1 + nb = nb+l_rows + enddo + + ! Please note: for smaller matix sizes (na/np_rows<=256), a value of 32 for nstor is enough! + if (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32)) then + + ! Calculate scalar products of stored vectors. + ! This can be done in different ways, we use dsyrk + + tmat = 0 + if (l_rows>0) & +#ifdef DOUBLE_PRECISION_REAL + call dsyrk('U', 'T', nstor, l_rows, 1.0_rk8, hvm, ubound(hvm,dim=1), 0.0_rk8, tmat, max_stored_rows) +#else + call ssyrk('U', 'T', nstor, l_rows, 1.0_rk4, hvm, ubound(hvm,dim=1), 0.0_rk4, tmat, max_stored_rows) +#endif + + + nc = 0 + do n=1,nstor-1 + h1(nc+1:nc+n) = tmat(1:n,n+1) + nc = nc+n + enddo +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + if (nc>0) h2 = h1 +#endif /* WITH_MPI */ + ! Calculate triangular matrix T + + nc = 0 + tmat(1,1) = tau(ice-nstor+1) + do n=1,nstor-1 +#ifdef DOUBLE_PRECISION_REAL + call dtrmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) +#else + call strmv('L', 'T', 'N', n, tmat, max_stored_rows, h2(nc+1), 1) +#endif + tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1) + tmat(n+1,n+1) = tau(ice-nstor+n+1) + nc = nc+n + enddo + + ! Q = Q - V * T * V**T * Q + + if (l_rows>0) then +#ifdef DOUBLE_PRECISION_REAL + call dgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk8, hvm, ubound(hvm,dim=1), & + q, ldq, 0.0_rk8, tmp1, nstor) +#else + call sgemm('T', 'N', nstor, l_cols, l_rows, 1.0_rk4, hvm, ubound(hvm,dim=1), & + q, ldq, 0.0_rk4, tmp1, nstor) +#endif + + else + tmp1(1:l_cols*nstor) = 0 + endif + +#ifdef DOUBLE_PRECISION_REAL + +#ifdef WITH_MPI + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) +#else + tmp2 = tmp1 +#endif + if (l_rows>0) then + call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor) + call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), & + tmp2, nstor, 1.0_rk8, q, ldq) + endif +#else /* DOUBLE_PRECISION_REAL */ + +#ifdef WITH_MPI + call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) +#else + tmp2 = tmp1 +#endif + + if (l_rows>0) then + call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor) + call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), & + tmp2, nstor, 1.0_rk4, q, ldq) + endif +#endif /* DOUBLE_PRECISION_REAL */ + nstor = 0 + endif + + enddo + + deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then
+       print *,"trans_ev_real: error when deallocating hvm "//errorMessage
+       stop
+     endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+     call timer%stop("trans_ev_real_double")
+#else
+     call timer%stop("trans_ev_real_single")
+#endif
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+   end subroutine trans_ev_real_double
+#else
+   end subroutine trans_ev_real_single
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+   subroutine mult_at_b_real_double(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, &
+                                    mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols)
+#else
+   subroutine mult_at_b_real_single(uplo_a, uplo_c, na, ncb, a, lda, ldaCols, b, ldb, ldbCols, nblk, &
+                                    mpi_comm_rows, mpi_comm_cols, c, ldc, ldcCols)
+#endif
+
+  !-------------------------------------------------------------------------------
+  !  mult_at_b_real:  Performs C := A**T * B
+  !
+  !      where:  A is a square matrix (na,na) which is optionally upper or lower triangular
+  !              B is a (na,ncb) matrix
+  !              C is a (na,ncb) matrix where optionally only the upper or lower
+  !              triangle may be computed
+  !
+  !  Parameters
+  !
+  !  uplo_a      'U' if A is upper triangular
+  !              'L' if A is lower triangular
+  !              anything else if A is a full matrix
+  !              Please note: This pertains to the original A (as set in the calling program)
+  !              whereas the transpose of A is used for calculations
+  !              If uplo_a is 'U' or 'L', the other triangle is not used at all,
+  !              i.e. it may contain arbitrary numbers
+  !
+  !  uplo_c      'U' if only the upper diagonal part of C is needed
+  !              'L' if only the lower diagonal part of C is needed
+  !              anything else if the full matrix C is needed
+  !              Please note: Even when uplo_c is 'U' or 'L', the other triangle may be
+  !              written to a certain extent, i.e. one shouldn't rely on the content there!
+  !
+  !  na          Number of rows/columns of A, number of rows of B and C
+  !
+  !  ncb         Number of columns of B and C
+  !
+  !  a           Matrix A
+  !
+  !  lda         Leading dimension of a
+  !  ldaCols     Columns of Matrix a
+  !
+  !  b           Matrix B
+  !
+  !  ldb         Leading dimension of b
+  !  ldbCols     Columns of Matrix b
+  !
+  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
+  !
+  !  mpi_comm_rows
+  !  mpi_comm_cols
+  !              MPI-Communicators for rows/columns
+  !
+  !  c           Matrix C
+  !
+  !  ldc         Leading dimension of c
+  !  ldcCols     Columns of Matrix c
+  !-------------------------------------------------------------------------------
+#ifdef HAVE_DETAILED_TIMINGS
+   use timings
+#endif
+   use precision
+   implicit none
+
+   character*1                           :: uplo_a, uplo_c
+
+   integer(kind=ik), intent(in)          :: na, lda, ldaCols, ldb, ldbCols, ldc, ldcCols, nblk
+   integer(kind=ik)                      :: ncb, mpi_comm_rows, mpi_comm_cols
+   real(kind=REAL_DATATYPE)              :: a(lda,ldaCols), b(ldb,ldbCols), c(ldc,ldcCols)
+
+   integer(kind=ik)                      :: my_prow, my_pcol, np_rows, np_cols, mpierr
+   integer(kind=ik)                      :: l_cols, l_rows, l_rows_np
+   integer(kind=ik)                      :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
+   integer(kind=ik)                      :: gcol_min, gcol, goff
+   integer(kind=ik)                      :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
+   integer(kind=ik), allocatable         :: lrs_save(:), lre_save(:)
+
+   logical                               :: a_lower, a_upper, c_lower, c_upper
+
+   real(kind=REAL_DATATYPE), allocatable :: aux_mat(:,:), aux_bc(:), tmp1(:,:), tmp2(:,:)
+   integer(kind=ik)                      :: istat
+   character(200)                        :: errorMessage
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+   call timer%start("mult_at_b_real_double")
+#else
+   call timer%start("mult_at_b_real_single")
+#endif
+#endif
+!   if (na .lt. lda) then
+!     print *,"na lt lda ",na,lda
+!     stop
+!   endif
+!   if (na .lt. ldb) then
+!
print *,"na lt ldb ",na,ldb +! stop +! endif +! if (na .lt. ldc) then +! print *,"na lt ldc ",na,ldc +! stop +! endif +! if (na .lt. ldaCols) then +! print *,"na lt ldaCols ",na,ldaCols +! stop +! endif +! if (na .lt. ldbCols) then +! print *,"na lt ldbCols ",na,ldbCols +! stop +! endif +! if (na .lt. ldcCols) then +! print *,"na lt ldcCols ",na,ldcCols +! stop +! endif + + call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) + call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) + call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) + call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and b + l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b + + ! Block factor for matrix multiplications, must be a multiple of nblk + + if (na/np_rows<=256) then + nblk_mult = (31/nblk+1)*nblk + else + nblk_mult = (63/nblk+1)*nblk + endif + + allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when allocating aux_mat "//errorMessage + stop + endif + + allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when allocating aux_bc "//errorMessage + stop + endif + + allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when allocating lrs_save "//errorMessage + stop + endif + + allocate(lre_save(nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when allocating lre_save "//errorMessage + stop + endif + + a_lower = .false. + a_upper = .false. + c_lower = .false. + c_upper = .false. + + if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true. + if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true. + if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true. + if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true. + + ! Build up the result matrix by processor rows + + do np = 0, np_rows-1 + + ! In this turn, procs of row np assemble the result + + l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors + + nr_done = 0 ! Number of rows done + aux_mat = 0 + nstor = 0 ! Number of columns stored in aux_mat + + ! Loop over the blocks on row np + + do nb=0,(l_rows_np-1)/nblk + + goff = nb*np_rows + np ! Global offset in blocks corresponding to nb + + ! Get the processor column which owns this block (A is transposed, so we need the column) + ! and the offset in blocks within this column. + ! The corresponding block column in A is then broadcast to all for multiplication with B + + np_bc = MOD(goff,np_cols) + noff = goff/np_cols + n_aux_bc = 0 + + ! Gather up the complete block column of A on the owner + + do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast + + gcol = goff*nblk + n ! global column corresponding to n + if (nstor==0 .and. n==1) gcol_min = gcol + + lrs = 1 ! 1st local row number for broadcast + lre = l_rows ! last local row number for broadcast + if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + if (lrs<=lre) then + nvals = lre-lrs+1 + if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n) + n_aux_bc = n_aux_bc + nvals + endif + + lrs_save(n) = lrs + lre_save(n) = lre + + enddo + + ! 
Broadcast block column +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL8, np_bc, mpi_comm_cols, mpierr) +#else + call MPI_Bcast(aux_bc, n_aux_bc, MPI_REAL4, np_bc, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + ! Insert what we got in aux_mat + + n_aux_bc = 0 + do n = 1, min(l_rows_np-nb*nblk,nblk) + nstor = nstor+1 + lrs = lrs_save(n) + lre = lre_save(n) + if (lrs<=lre) then + nvals = lre-lrs+1 + aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals) + n_aux_bc = n_aux_bc + nvals + endif + enddo + + ! If we got nblk_mult columns in aux_mat or this is the last block + ! do the matrix multiplication + + if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then + + lrs = 1 ! 1st local row number for multiply + lre = l_rows ! last local row number for multiply + if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1) + if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1) + + lcs = 1 ! 1st local col number for multiply + lce = l_cols ! last local col number for multiply + if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1) + if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols) + + if (lcs<=lce) then + allocate(tmp1(nstor,lcs:lce),tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when allocating tmp1 "//errorMessage + stop + endif + + if (lrs<=lre) then +#ifdef DOUBLE_PRECISION_REAL + call dgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk8, aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, 0.0_rk8, tmp1, nstor) +#else + call sgemm('T', 'N', nstor, lce-lcs+1, lre-lrs+1, 1.0_rk4, aux_mat(lrs,1), ubound(aux_mat,dim=1), & + b(lrs,lcs), ldb, 0.0_rk4, tmp1, nstor) +#endif + + else + tmp1 = 0 + endif + + ! Sum up the results and send to processor row np +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL8, MPI_SUM, np, mpi_comm_rows, mpierr) +#else + call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + tmp2 = tmp1 +#endif /* WITH_MPI */ + ! Put the result into C + if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce) + + deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"mult_at_b_real: error when deallocating tmp1 "//errorMessage + stop + endif + + endif + + nr_done = nr_done+nstor + nstor=0 + aux_mat(:,:)=0 + endif + enddo + enddo + + deallocate(aux_mat, aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then
+       print *,"mult_at_b_real: error when deallocating aux_mat "//errorMessage
+       stop
+     endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+   call timer%stop("mult_at_b_real_double")
+#else
+   call timer%stop("mult_at_b_real_single")
+#endif
+
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+   end subroutine mult_at_b_real_double
+#else
+   end subroutine mult_at_b_real_single
+#endif
+
+#if defined(DOUBLE_PRECISION_REAL)
+   subroutine solve_tridi_double( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
+                                  mpi_comm_cols, wantDebug, success )
+#else
+   subroutine solve_tridi_single( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
+                                  mpi_comm_cols, wantDebug, success )
+#endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+     use timings
+#endif
+     use precision
+     implicit none
+
+     integer(kind=ik)              :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+     real(kind=REAL_DATATYPE)      :: d(na), e(na)
+#ifdef DESPERATELY_WANT_ASSUMED_SIZE
+     real(kind=REAL_DATATYPE)      :: q(ldq,*)
+#else
+     real(kind=REAL_DATATYPE)      :: q(ldq,matrixCols)
+#endif
+
+     integer(kind=ik)              :: i, j, n, np, nc, nev1, l_cols, l_rows
+     integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
+
+     integer(kind=ik), allocatable :: limits(:), l_col(:), p_col(:), l_col_bc(:), p_col_bc(:)
+
+     logical, intent(in)           :: wantDebug
+     logical, intent(out)          :: success
+     integer(kind=ik)              :: istat
+     character(200)                :: errorMessage
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+     call timer%start("solve_tridi_double")
+#else
+     call timer%start("solve_tridi_single")
+#endif
+#endif
+
+     call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+     call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+     call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+     call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+     success = .true.
+
+     l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
+     l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
+
+     ! Set Q to 0
+#ifdef DOUBLE_PRECISION_REAL
+     q(1:l_rows, 1:l_cols) = 0._rk8
+#else
+     q(1:l_rows, 1:l_cols) = 0._rk4
+#endif
+
+     ! Get the limits of the subdivisions, each subdivision has as many cols
+     ! as fit on the respective processor column
+
+     allocate(limits(0:np_cols), stat=istat, errmsg=errorMessage)
+     if (istat .ne. 0) then
+       print *,"solve_tridi: error when allocating limits "//errorMessage
+       stop
+     endif
+
+     limits(0) = 0
+     do np=0,np_cols-1
+       nc = local_index(na, np, np_cols, nblk, -1) ! number of columns on proc column np
+
+       ! Check for the case that a column has zero width.
+       ! This is not supported!
+       ! Scalapack supports it but delivers no results for these columns,
+       ! which is rather annoying
+       if (nc==0) then
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+         call timer%stop("solve_tridi_double")
+#else
+         call timer%stop("solve_tridi_single")
+#endif
+
+#endif
+         if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
+         success = .false.
+         return
+       endif
+       limits(np+1) = limits(np) + nc
+     enddo
+
+     ! Subdivide matrix by subtracting rank 1 modifications
+
+     do i=1,np_cols-1
+       n = limits(i)
+       d(n) = d(n)-abs(e(n))
+       d(n+1) = d(n+1)-abs(e(n))
+     enddo
+
+     ! Solve sub problems on processor columns
+
+     nc = limits(my_pcol) ! column after which my problem starts
+
+     if (np_cols>1) then
+       nev1 = l_cols !
all eigenvectors are needed + else + nev1 = MIN(nev,l_cols) + endif +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_col_double(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & + matrixCols, mpi_comm_rows, wantDebug, success) +#else + call solve_tridi_col_single(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, & + matrixCols, mpi_comm_rows, wantDebug, success) +#endif + if (.not.(success)) then +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("solve_tridi_double") +#else + call timer%stop("solve_tridi_single") +#endif +#endif + return + endif + ! If there is only 1 processor column, we are done + + if (np_cols==1) then + deallocate(limits, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when deallocating limits "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("solve_tridi_double") +#else + call timer%stop("solve_tridi_single") +#endif +#endif + return + endif + + ! Set index arrays for Q columns + + ! Dense distribution scheme: + + allocate(l_col(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating l_col "//errorMessage + stop + endif + + allocate(p_col(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating p_col "//errorMessage + stop + endif + + n = 0 + do np=0,np_cols-1 + nc = local_index(na, np, np_cols, nblk, -1) + do i=1,nc + n = n+1 + l_col(n) = i + p_col(n) = np + enddo + enddo + + ! Block cyclic distribution scheme, only nev columns are set: + + allocate(l_col_bc(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating l_col_bc "//errorMessage + stop + endif + + allocate(p_col_bc(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when allocating p_col_bc "//errorMessage + stop + endif + + p_col_bc(:) = -1 + l_col_bc(:) = -1 + + do i = 0, na-1, nblk*np_cols + do j = 0, np_cols-1 + do n = 1, nblk + if (i+j*nblk+n <= MIN(nev,na)) then + p_col_bc(i+j*nblk+n) = j + l_col_bc(i+j*nblk+n) = i/np_cols + n + endif + enddo + enddo + enddo + + ! Recursively merge sub problems +#ifdef DOUBLE_PRECISION_REAL + call merge_recursive_double(0, np_cols, wantDebug, success) +#else + call merge_recursive_single(0, np_cols, wantDebug, success) +#endif + if (.not.(success)) then +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("solve_tridi_double") +#else + call timer%stop("solve_tridi_single") +#endif + +#endif + return + endif + + deallocate(limits,l_col,p_col,l_col_bc,p_col_bc, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi: error when deallocating l_col "//errorMessage + stop + endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("solve_tridi_double") +#else + call timer%stop("solve_tridi_single") +#endif + +#endif + return + + contains +#ifdef DOUBLE_PRECISION_REAL + recursive subroutine merge_recursive_double(np_off, nprocs, wantDebug, success) +#else + recursive subroutine merge_recursive_single(np_off, nprocs, wantDebug, success) +#endif + use precision + implicit none + + ! noff is always a multiple of nblk_ev + ! 
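+     ! Worked example for the block-cyclic index arrays built above
+     ! (illustration only): for nblk=2 and np_cols=2, global columns
+     !      1 2 3 4 5 6 7 8
+     ! get p_col_bc (owning process column)
+     !      0 0 1 1 0 0 1 1
+     ! and l_col_bc (local column on the owner)
+     !      1 2 1 2 3 4 3 4
+     ! Only columns up to MIN(nev,na) are assigned; the rest keep -1.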
nlen-noff is always > nblk_ev + + integer(kind=ik) :: np_off, nprocs + integer(kind=ik) :: np1, np2, noff, nlen, nmid, n +#ifdef WITH_MPI + integer(kind=ik) :: mpi_status(mpi_status_size) +#endif + logical, intent(in) :: wantDebug + logical, intent(out) :: success + + success = .true. + + if (nprocs<=1) then + ! Safety check only + if (wantDebug) write(error_unit,*) "ELPA1_merge_recursive: INTERNAL error merge_recursive: nprocs=",nprocs + success = .false. + return + endif + ! Split problem into 2 subproblems of size np1 / np2 + + np1 = nprocs/2 + np2 = nprocs-np1 + +#ifdef DOUBLE_PRECISION_REAL + if (np1 > 1) call merge_recursive_double(np_off, np1, wantDebug, success) + if (.not.(success)) return + if (np2 > 1) call merge_recursive_double(np_off+np1, np2, wantDebug, success) + if (.not.(success)) return +#else + if (np1 > 1) call merge_recursive_single(np_off, np1, wantDebug, success) + if (.not.(success)) return + if (np2 > 1) call merge_recursive_single(np_off+np1, np2, wantDebug, success) + if (.not.(success)) return +#endif + noff = limits(np_off) + nmid = limits(np_off+np1) - noff + nlen = limits(np_off+nprocs) - noff + +#ifdef WITH_MPI + if (my_pcol==np_off) then + do n=np_off+np1,np_off+nprocs-1 +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(d(noff+1), nmid, MPI_REAL8, n, 1, mpi_comm_cols, mpierr) +#else + call mpi_send(d(noff+1), nmid, MPI_REAL4, n, 1, mpi_comm_cols, mpierr) +#endif + enddo + endif +#endif /* WITH_MPI */ + + if (my_pcol>=np_off+np1 .and. my_pcol=np_off .and. my_pcol2*min_submatrix_size) + n = ((n+3)/4)*2 ! the bigger one of the two halves, we want EVEN boundaries + ndiv = ndiv*2 + enddo + + ! If there is only 1 processor row and not all eigenvectors are needed + ! and the matrix size is big enough, then use 2 subdivisions + ! so that merge_systems is called once and only the needed + ! eigenvectors are calculated for the final problem. + + if (np_rows==1 .and. nev2*min_submatrix_size) ndiv = 2 + + allocate(limits(0:ndiv), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating limits "//errorMessage + stop + endif + + limits(0) = 0 + limits(ndiv) = na + + n = ndiv + do while(n>1) + n = n/2 ! n is always a power of 2 + do i=0,ndiv-1,2*n + ! We want to have even boundaries (for cache line alignments) + limits(i+n) = limits(i) + ((limits(i+2*n)-limits(i)+3)/4)*2 + enddo + enddo + + ! Calculate the maximum size of a subproblem + + max_size = 0 + do i=1,ndiv + max_size = MAX(max_size,limits(i)-limits(i-1)) + enddo + + ! Subdivide matrix by subtracting rank 1 modifications + + do i=1,ndiv-1 + n = limits(i) + d(n) = d(n)-abs(e(n)) + d(n+1) = d(n+1)-abs(e(n)) + enddo + + if (np_rows==1) then + + ! For 1 processor row there may be 1 or 2 subdivisions + do n=0,ndiv-1 + noff = limits(n) ! Start of subproblem + nlen = limits(n+1)-noff ! Size of subproblem + +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_single_problem_double(nlen,d(noff+1),e(noff+1), & + q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) +#else + call solve_tridi_single_problem_single(nlen,d(noff+1),e(noff+1), & + q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success) +#endif + if (.not.(success)) return + enddo + + else + + ! Solve sub problems in parallel with solve_tridi_single + ! There is at maximum 1 subproblem per processor + + allocate(qmat1(max_size,max_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then + print *,"solve_tridi_col: error when allocating qmat1 "//errorMessage + stop + endif + + allocate(qmat2(max_size,max_size), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating qmat2 "//errorMessage + stop + endif + + qmat1 = 0 ! Make sure that all elements are defined + + if (my_prow < ndiv) then + + noff = limits(my_prow) ! Start of subproblem + nlen = limits(my_prow+1)-noff ! Size of subproblem +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_single_problem_double(nlen,d(noff+1),e(noff+1),qmat1, & + ubound(qmat1,dim=1), wantDebug, success) +#else + call solve_tridi_single_problem_single(nlen,d(noff+1),e(noff+1),qmat1, & + ubound(qmat1,dim=1), wantDebug, success) +#endif + if (.not.(success)) return + endif + + ! Fill eigenvectors in qmat1 into global matrix q + + do np = 0, ndiv-1 + + noff = limits(np) + nlen = limits(np+1)-noff +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(d(noff+1), nlen, MPI_REAL8, np, mpi_comm_rows, mpierr) + qmat2 = qmat1 + call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL8, np, mpi_comm_rows, mpierr) +#else + + call MPI_Bcast(d(noff+1), nlen, MPI_REAL4, np, mpi_comm_rows, mpierr) + qmat2 = qmat1 + call MPI_Bcast(qmat2, max_size*max_size, MPI_REAL4, np, mpi_comm_rows, mpierr) +#endif + +#else /* WITH_MPI */ + qmat2 = qmat1 ! is this correct +#endif /* WITH_MPI */ + do i=1,nlen +#ifdef DOUBLE_PRECISION_REAL + call distribute_global_column_double(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) +#else + call distribute_global_column_single(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk) +#endif + enddo + + enddo + + deallocate(qmat1, qmat2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when deallocating qmat2 "//errorMessage + stop + endif + + endif + + ! Allocate and set index arrays l_col and p_col + + allocate(l_col(na), p_col_i(na), p_col_o(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_col: error when allocating l_col "//errorMessage + stop + endif + + do i=1,na + l_col(i) = i + p_col_i(i) = 0 + p_col_o(i) = 0 + enddo + + ! Merge subproblems + + n = 1 + do while(n 1e-14_rk8) then +#else + if (abs(d(i+1) - d(i)) / abs(d(i+1) + d(i)) > 1e-14_rk4) then +#endif + write(error_unit,'(a,i8,2g25.16)') '***WARNING: Monotony error dste**:',i+1,d(i),d(i+1) + else + write(error_unit,'(a,i8,2g25.16)') 'Info: Monotony error dste{dc,qr}:',i+1,d(i),d(i+1) + write(error_unit,'(a)') 'The eigenvalues from a lapack call are not sorted to machine precision.' + write(error_unit,'(a)') 'In this extent, this is completely harmless.' + write(error_unit,'(a)') 'Still, we keep this info message just in case.' + end if + allocate(qtmp(nlen), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_tridi_single: error when allocating qtmp "//errorMessage + stop + endif + + dtmp = d(i+1) + qtmp(1:nlen) = q(1:nlen,i+1) + do j=i,1,-1 + if (dtmp=npc_0+npc_n) then +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("merge_systems_double") +#else + call timer%stop("merge_systems_single") +#endif +#endif + return + endif + ! 
Determine number of "next" and "prev" column for ring sends + + if (my_pcol == npc_0+npc_n-1) then + np_next = npc_0 + else + np_next = my_pcol + 1 + endif + + if (my_pcol == npc_0) then + np_prev = npc_0+npc_n-1 + else + np_prev = my_pcol - 1 + endif +#ifdef DOUBLE_PRECISION_REAL + call check_monotony_double(nm,d,'Input1',wantDebug, success) +#else + call check_monotony_single(nm,d,'Input1',wantDebug, success) +#endif + if (.not.(success)) then +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("merge_systems_double") +#else + call timer%stop("merge_systems_single") +#endif +#endif + return + endif +#ifdef DOUBLE_PRECISION_REAL + call check_monotony_double(na-nm,d(nm+1),'Input2',wantDebug, success) +#else + call check_monotony_single(na-nm,d(nm+1),'Input2',wantDebug, success) +#endif + if (.not.(success)) then +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("merge_systems_double") +#else + call timer%stop("merge_systems_single") +#endif +#endif + return + endif + ! Get global number of processors and my processor number. + ! Please note that my_proc does not need to match any real processor number, + ! it is just used for load balancing some loops. + + n_procs = np_rows*npc_n + my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major + + + ! Local limits of the rows of Q + + l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q + l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm + l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q + + l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm + l_rows = l_rqe-l_rqs+1 ! Total number of local rows + + + ! My number of local columns + + l_cols = COUNT(p_col(1:na)==my_pcol) + + ! Get max number of local columns + + max_local_cols = 0 + do np = npc_0, npc_0+npc_n-1 + max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np)) + enddo + + ! Calculations start here + + beta = abs(e) +#ifdef DOUBLE_PRECISION_REAL + sig = sign(1.0_rk8,e) +#else + sig = sign(1.0_rk4,e) +#endif + + ! Calculate rank-1 modifier z + + z(:) = 0 + + if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then + ! nm is local on my row + do i = 1, na + if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i)) + enddo + endif + + if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then + ! nm+1 is local on my row + do i = 1, na + if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i)) + enddo + endif + +#ifdef DOUBLE_PRECISION_REAL + call global_gather_double(z, na) +#else + call global_gather_single(z, na) +#endif + ! Normalize z so that norm(z) = 1. Since z is the concatenation of + ! two normalized vectors, norm2(z) = sqrt(2). +#ifdef DOUBLE_PRECISION_REAL + z = z/sqrt(2.0_rk8) + rho = 2._rk8*beta +#else + z = z/sqrt(2.0_rk4) + rho = 2._rk4*beta +#endif + ! Calculate index for merging both systems by ascending eigenvalues +#ifdef DOUBLE_PRECISION_REAL + call DLAMRG( nm, na-nm, d, 1, 1, idx ) +#else + call SLAMRG( nm, na-nm, d, 1, 1, idx ) +#endif + +! Calculate the allowable deflation tolerance + + zmax = maxval(abs(z)) + dmax = maxval(abs(d)) +#ifdef DOUBLE_PRECISION_REAL + EPS = DLAMCH( 'Epsilon' ) + TOL = 8._rk8*EPS*MAX(dmax,zmax) +#else + EPS = SLAMCH( 'Epsilon' ) + TOL = 8._rk4*EPS*MAX(dmax,zmax) +#endif + + ! If the rank-1 modifier is small enough, no more needs to be done + ! except to reorganize D and Q + + IF ( RHO*zmax <= TOL ) THEN + + ! Rearrange eigenvalues + + tmp = d + do i=1,na + d(i) = tmp(idx(i)) + enddo + + ! 
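+     ! Background for the deflation below (a sketch): the merged problem is
+     !   D + rho * z*z**T
+     ! with diagonal D and the rank-1 vector z taken from the middle rows
+     ! of Q. Whenever rho*|z(i)| <= TOL = 8*eps*max(|d|,|z|), the i-th
+     ! eigenpair is essentially unchanged by the modification, so it is
+     ! deflated, i.e. copied over directly instead of being recomputed.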
Rearrange eigenvectors +#ifdef DOUBLE_PRECISION_REAL + call resort_ev_double(idx, na) +#else + call resort_ev_single(idx, na) +#endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("merge_systems_double") +#else + call timer%stop("merge_systems_single") +#endif +#endif + + return + ENDIF + + ! Merge and deflate system + + na1 = 0 + na2 = 0 + + ! COLTYP: + ! 1 : non-zero in the upper half only; + ! 2 : dense; + ! 3 : non-zero in the lower half only; + ! 4 : deflated. + + coltyp(1:nm) = 1 + coltyp(nm+1:na) = 3 + + do i=1,na + + if (rho*abs(z(idx(i))) <= tol) then + + ! Deflate due to small z component. + + na2 = na2+1 + d2(na2) = d(idx(i)) + idx2(na2) = idx(i) + coltyp(idx(i)) = 4 + + else if (na1>0) then + + ! Check if eigenvalues are close enough to allow deflation. + + S = Z(idx(i)) + C = Z1(na1) + + ! Find sqrt(a**2+b**2) without overflow or + ! destructive underflow. +#ifdef DOUBLE_PRECISION_REAL + TAU = DLAPY2( C, S ) +#else + TAU = SLAPY2( C, S ) +#endif + T = D1(na1) - D(idx(i)) + C = C / TAU + S = -S / TAU + IF ( ABS( T*C*S ) <= TOL ) THEN + + ! Deflation is possible. + + na2 = na2+1 + + Z1(na1) = TAU + + d2new = D(idx(i))*C**2 + D1(na1)*S**2 + d1new = D(idx(i))*S**2 + D1(na1)*C**2 + + ! D(idx(i)) >= D1(na1) and C**2 + S**2 == 1.0 + ! This means that after the above transformation it must be + ! D1(na1) <= d1new <= D(idx(i)) + ! D1(na1) <= d2new <= D(idx(i)) + ! + ! D1(na1) may get bigger but it is still smaller than the next D(idx(i+1)) + ! so there is no problem with sorting here. + ! d2new <= D(idx(i)) which means that it might be smaller than D2(na2-1) + ! which makes a check (and possibly a resort) necessary. + ! + ! The above relations may not hold exactly due to numeric differences + ! so they have to be enforced in order not to get troubles with sorting. + + + if (d1newD(idx(i))) d1new = D(idx(i)) + + if (d2newD(idx(i))) d2new = D(idx(i)) + + D1(na1) = d1new + + do j=na2-1,1,-1 + if (d2new2) then + + ! Solve secular equation + + z(1:na1) = 1 +#ifdef WITH_OPENMP + z_p(1:na1,:) = 1 +#endif + dbase(1:na1) = 0 + ddiff(1:na1) = 0 + + info = 0 +#ifdef WITH_OPENMP + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%start("OpenMP parallel_double") +#else + call timer%start("OpenMP parallel_single") +#endif +#endif + +!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j) + my_thread = omp_get_thread_num() +!$OMP DO +#endif + DO i = my_proc+1, na1, n_procs ! work distributed over all processors +#ifdef DOUBLE_PRECISION_REAL + call DLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! +#else + call SLAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! +#endif + if (info/=0) then + ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) + ! use the more stable bisection algorithm in solve_secular_equation + ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' +#ifdef DOUBLE_PRECISION_REAL + call solve_secular_equation_double(na1, i, d1, z1, delta, rho, s) +#else + call solve_secular_equation_single(na1, i, d1, z1, delta, rho, s) +#endif + endif + + ! Compute updated z + +#ifdef WITH_OPENMP + do j=1,na1 + if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) ) + enddo + z_p(i,my_thread) = z_p(i,my_thread)*delta(i) +#else + do j=1,na1 + if (i/=j) z(j) = z(j)*( delta(j) / (d1(j)-d1(i)) ) + enddo + z(i) = z(i)*delta(i) +#endif + ! 
store dbase/ddiff + + if (i1) then + + if (np_rem==npc_0) then + np_rem = npc_0+npc_n-1 + else + np_rem = np_rem-1 + endif +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL8, & + np_next, 1111, np_prev, 1111, & + mpi_comm_cols, mpi_status, mpierr) +#else + call MPI_Sendrecv_replace(qtmp1, l_rows*max_local_cols, MPI_REAL4, & + np_next, 1111, np_prev, 1111, & + mpi_comm_cols, mpi_status, mpierr) +#endif + +#endif /* WITH_MPI */ + endif + + ! Gather the parts in d1 and z which are fitting to qtmp1. + ! This also delivers nnzu/nnzl for proc np_rem + + nnzu = 0 + nnzl = 0 + do i=1,na1 + if (p_col(idx1(i))==np_rem) then + if (coltyp(idx1(i))==1 .or. coltyp(idx1(i))==2) then + nnzu = nnzu+1 + d1u(nnzu) = d1(i) + zu (nnzu) = z (i) + endif + if (coltyp(idx1(i))==3 .or. coltyp(idx1(i))==2) then + nnzl = nnzl+1 + d1l(nnzl) = d1(i) + zl (nnzl) = z (i) + endif + endif + enddo + + ! Set the deflated eigenvectors in Q (comming from proc np_rem) + + ndef = MAX(nnzu,nnzl) ! Remote counter in input matrix + do i = 1, na + j = idx(i) + if (j>na1) then + if (p_col(idx2(j-na1))==np_rem) then + ndef = ndef+1 + if (p_col_out(i)==my_pcol) & + q(l_rqs:l_rqe,l_col_out(i)) = qtmp1(1:l_rows,ndef) + endif + endif + enddo + + do ns = 0, nqcols1-1, max_strip ! strimining loop + + ncnt = MIN(max_strip,nqcols1-ns) ! number of columns in this strip + + ! Get partial result from (output) Q + + do i = 1, ncnt + qtmp2(1:l_rows,i) = q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) + enddo + + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with upper half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! + tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) +#ifdef DOUBLE_PRECISION_REAL + call v_add_s_double(tmp,nnzu,ddiff(j)) +#else + call v_add_s_single(tmp,nnzu,ddiff(j)) +#endif + ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) + enddo + + ! Multiply old Q with eigenvectors (upper half) + + if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & +#ifdef DOUBLE_PRECISION_REAL + call dgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk8, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & + 1.0_rk8, qtmp2(1,1), ubound(qtmp2,dim=1)) +#else + call sgemm('N', 'N', l_rnm, ncnt, nnzu, 1.0_rk4, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & + 1.0_rk4, qtmp2(1,1), ubound(qtmp2,dim=1)) +#endif + ! Compute eigenvectors of the rank-1 modified matrix. + ! Parts for multiplying with lower half of Q: + + do i = 1, ncnt + j = idx(idxq1(i+ns)) + ! Calculate the j-th eigenvector of the deflated system + ! See above why we are doing it this way! + tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) +#ifdef DOUBLE_PRECISION_REAL + call v_add_s_double(tmp,nnzl,ddiff(j)) +#else + call v_add_s_single(tmp,nnzl,ddiff(j)) +#endif + ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) + enddo + + ! Multiply old Q with eigenvectors (lower half) + + if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & +#ifdef DOUBLE_PRECISION_REAL + call dgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk8, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & + ubound(ev,dim=1), 1.0_rk8, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) +#else + call sgemm('N', 'N', l_rows-l_rnm, ncnt, nnzl, 1.0_rk4, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & + ubound(ev,dim=1), 1.0_rk4, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) +#endif + + ! 
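+     ! Why dbase/ddiff is used above (a sketch): an eigenvector component
+     ! of the rank-1 modified system is z(k) / (d(k) - lambda(j)). lambda(j)
+     ! is kept in the split form dbase(j) - ddiff(j), and v_add_s adds
+     ! ddiff(j) in a separate call so that (d(k) - dbase(j)) + ddiff(j) is
+     ! evaluated in exactly this order; this keeps full accuracy when
+     ! lambda(j) lies very close to the pole d(k).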
Put partial result into (output) Q + + do i = 1, ncnt + q(l_rqs:l_rqe,l_col_out(idxq1(i+ns))) = qtmp2(1:l_rows,i) + enddo + + enddo + enddo + + deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"merge_systems: error when deallocating ev "//errorMessage + stop + endif + endif + +#ifdef WITH_OPENMP + deallocate(z_p, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"merge_systems: error when deallocating z_p "//errorMessage + stop + endif +#endif + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("merge_systems_double") +#else + call timer%stop("merge_systems_single") +#endif +#endif + + + return + + contains +#ifdef DOUBLE_PRECISION_REAL + subroutine add_tmp_double(d1, dbase, ddiff, z, ev_scale_value, na1,i) +#else + subroutine add_tmp_single(d1, dbase, ddiff, z, ev_scale_value, na1,i) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: na1, i + + real(kind=REAL_DATATYPE), intent(in) :: d1(:), dbase(:), ddiff(:), z(:) + real(kind=REAL_DATATYPE), intent(inout) :: ev_scale_value + real(kind=REAL_DATATYPE) :: tmp(1:na1) + + ! tmp(1:na1) = z(1:na1) / delta(1:na1,i) ! original code + ! tmp(1:na1) = z(1:na1) / (d1(1:na1)-d(i))! bad results + + ! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i) + ! in exactly this order, but we want to prevent compiler optimization + + tmp(1:na1) = d1(1:na1) -dbase(i) +#ifdef DOUBLE_PRECISION_REAL + call v_add_s_double(tmp(1:na1),na1,ddiff(i)) +#else + call v_add_s_single(tmp(1:na1),na1,ddiff(i)) +#endif + tmp(1:na1) = z(1:na1) / tmp(1:na1) +#ifdef DOUBLE_PRECISION_REAL + ev_scale_value = 1.0_rk8/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) +#else + ev_scale_value = 1.0_rk4/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) + +#endif + +#ifdef DOUBLE_PRECISION_REAL + end subroutine add_tmp_double +#else + end subroutine add_tmp_single +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine resort_ev_double(idx_ev, nLength) +#else + subroutine resort_ev_single(idx_ev, nLength) +#endif + use precision + implicit none + + integer(kind=ik), intent(in) :: nLength + integer(kind=ik) :: idx_ev(nLength) + integer(kind=ik) :: i, nc, pc1, pc2, lc1, lc2, l_cols_out + + real(kind=REAL_DATATYPE), allocatable :: qtmp(:,:) + integer(kind=ik) :: istat + character(200) :: errorMessage + + if (l_rows==0) return ! My processor column has no work to do + + ! Resorts eigenvectors so that q_new(:,i) = q_old(:,idx_ev(i)) + + l_cols_out = COUNT(p_col_out(1:na)==my_pcol) + allocate(qtmp(l_rows,l_cols_out), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"resort_ev: error when allocating qtmp "//errorMessage + stop + endif + + nc = 0 + + do i=1,na + + pc1 = p_col(idx_ev(i)) + lc1 = l_col(idx_ev(i)) + pc2 = p_col_out(i) + + if (pc2<0) cycle ! This column is not needed in output + + if (pc2==my_pcol) nc = nc+1 ! Counter for output columns + + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! 
send and recieve column are local + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) + else +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, mod(i,4096), mpi_comm_cols, mpierr) +#else + call mpi_send(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, mod(i,4096), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + endif + else if (pc2==my_pcol) then +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL8, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_recv(qtmp(1,nc), l_rows, MPI_REAL4, pc1, mod(i,4096), mpi_comm_cols, mpi_status, mpierr) +#endif + +#else /* WITH_MPI */ + qtmp(1:l_rows,nc) = q(l_rqs:l_rqe,lc1) +#endif /* WITH_MPI */ + endif + enddo + + ! Insert qtmp into (output) q + + nc = 0 + + do i=1,na + + pc2 = p_col_out(i) + lc2 = l_col_out(i) + + if (pc2==my_pcol) then + nc = nc+1 + q(l_rqs:l_rqe,lc2) = qtmp(1:l_rows,nc) + endif + enddo + + deallocate(qtmp, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"resort_ev: error when deallocating qtmp "//errorMessage + stop + endif +#ifdef DOUBLE_PRECISION_REAL + end subroutine resort_ev_double +#else + end subroutine resort_ev_single +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine transform_columns_double(col1, col2) +#else + subroutine transform_columns_single(col1, col2) +#endif + use precision + implicit none + + integer(kind=ik) :: col1, col2 + integer(kind=ik) :: pc1, pc2, lc1, lc2 + + if (l_rows==0) return ! My processor column has no work to do + + pc1 = p_col(col1) + lc1 = l_col(col1) + pc2 = p_col(col2) + lc2 = l_col(col2) + + if (pc1==my_pcol) then + if (pc2==my_pcol) then + ! both columns are local + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + q(l_rqs:l_rqe,lc2)*qtrans(2,1) + q(l_rqs:l_rqe,lc2) = q(l_rqs:l_rqe,lc1)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + q(l_rqs:l_rqe,lc1) = tmp(1:l_rows) + else +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL8, pc2, 1, & + tmp, l_rows, MPI_REAL8, pc2, 1, & + mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_sendrecv(q(l_rqs,lc1), l_rows, MPI_REAL4, pc2, 1, & + tmp, l_rows, MPI_REAL4, pc2, 1, & + mpi_comm_cols, mpi_status, mpierr) +#endif + +#else /* WITH_MPI */ + tmp(1:l_rows) = q(l_rqs:l_rqe,lc1) +#endif /* WITH_MPI */ + q(l_rqs:l_rqe,lc1) = q(l_rqs:l_rqe,lc1)*qtrans(1,1) + tmp(1:l_rows)*qtrans(2,1) + endif + else if (pc2==my_pcol) then +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL8, pc1, 1, & + tmp, l_rows, MPI_REAL8, pc1, 1, & + mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_sendrecv(q(l_rqs,lc2), l_rows, MPI_REAL4, pc1, 1, & + tmp, l_rows, MPI_REAL4, pc1, 1, & + mpi_comm_cols, mpi_status, mpierr) +#endif + +#else /* WITH_MPI */ + tmp(1:l_rows) = q(l_rqs:l_rqe,lc2) +#endif /* WITH_MPI */ + + q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) + endif +#ifdef DOUBLE_PRECISION_REAL + end subroutine transform_columns_double +#else + end subroutine transform_columns_single +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine global_gather_double(z, n) +#else + subroutine global_gather_single(z, n) +#endif + ! This routine sums up z over all processors. + ! It should only be used for gathering distributed results, + ! i.e. z(i) should be nonzero on exactly 1 processor column, + ! 
+     ! otherwise the results may be numerically different on different columns
+     use precision
+     implicit none
+
+     integer(kind=ik)         :: n
+     real(kind=REAL_DATATYPE) :: z(n)
+     real(kind=REAL_DATATYPE) :: tmp(n)
+
+     if (npc_n==1 .and. np_rows==1) return ! nothing to do
+
+     ! Do an mpi_allreduce over processor rows
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+     call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
+#else
+     call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
+#endif
+
+#else /* WITH_MPI */
+     tmp = z
+#endif /* WITH_MPI */
+     ! If only 1 processor column, we are done
+     if (npc_n==1) then
+       z(:) = tmp(:)
+       return
+     endif
+
+     ! If all processor columns are involved, we can use mpi_allreduce
+     if (npc_n==np_cols) then
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+       call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
+#else
+       call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_SUM, mpi_comm_cols, mpierr)
+#endif
+
+#else /* WITH_MPI */
+       tmp = z
+#endif /* WITH_MPI */
+
+       return
+     endif
+
+     ! Do a ring send over processor columns
+     z(:) = 0
+     do np = 1, npc_n
+       z(:) = z(:) + tmp(:)
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+       call MPI_Sendrecv_replace(z, n, MPI_REAL8, np_next, 1111, np_prev, 1111, &
+                                 mpi_comm_cols, mpi_status, mpierr)
+#else
+       call MPI_Sendrecv_replace(z, n, MPI_REAL4, np_next, 1111, np_prev, 1111, &
+                                 mpi_comm_cols, mpi_status, mpierr)
+#endif
+
+#endif /* WITH_MPI */
+     enddo
+#ifdef DOUBLE_PRECISION_REAL
+   end subroutine global_gather_double
+#else
+   end subroutine global_gather_single
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+   subroutine global_product_double(z, n)
+#else
+   subroutine global_product_single(z, n)
+#endif
+     ! This routine calculates the global product of z.
+     use precision
+     implicit none
+
+     integer(kind=ik)         :: n
+     real(kind=REAL_DATATYPE) :: z(n)
+
+     real(kind=REAL_DATATYPE) :: tmp(n)
+
+     if (npc_n==1 .and. np_rows==1) return ! nothing to do
+
+     ! Do an mpi_allreduce over processor rows
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+     call mpi_allreduce(z, tmp, n, MPI_REAL8, MPI_PROD, mpi_comm_rows, mpierr)
+#else
+     call mpi_allreduce(z, tmp, n, MPI_REAL4, MPI_PROD, mpi_comm_rows, mpierr)
+#endif
+
+#else /* WITH_MPI */
+     tmp = z
+#endif /* WITH_MPI */
+     ! If only 1 processor column, we are done
+     if (npc_n==1) then
+       z(:) = tmp(:)
+       return
+     endif
+
+     ! If all processor columns are involved, we can use mpi_allreduce
+     if (npc_n==np_cols) then
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+       call mpi_allreduce(tmp, z, n, MPI_REAL8, MPI_PROD, mpi_comm_cols, mpierr)
+#else
+       call mpi_allreduce(tmp, z, n, MPI_REAL4, MPI_PROD, mpi_comm_cols, mpierr)
+#endif
+
+#else /* WITH_MPI */
+       z = tmp
+#endif /* WITH_MPI */
+       return
+     endif
+
+     ! We send all vectors to the first proc, do the product there
+     ! and redistribute the result.
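+     ! Descriptive note: unlike the sum in global_gather, the code below
+     ! does not use a ring. Process column npc_0 receives all partial
+     ! vectors, accumulates the product into z, and sends the result back
+     ! out, i.e. 2*(npc_n-1) point-to-point messages in total.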
+ + if (my_pcol == npc_0) then + z(1:n) = tmp(1:n) + do np = npc_0+1, npc_0+npc_n-1 +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_recv(tmp, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_recv(tmp, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpi_status, mpierr) +#endif + +#else /* WITH_MPI */ + tmp(1:n) = z(1:n) +#endif /* WITH_MPI */ + z(1:n) = z(1:n)*tmp(1:n) + enddo + do np = npc_0+1, npc_0+npc_n-1 +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(z, n, MPI_REAL8, np, 1111, mpi_comm_cols, mpierr) +#else + call mpi_send(z, n, MPI_REAL4, np, 1111, mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + enddo + else +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call mpi_send(tmp, n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpierr) + call mpi_recv(z ,n, MPI_REAL8, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) +#else + call mpi_send(tmp, n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpierr) + call mpi_recv(z ,n, MPI_REAL4, npc_0, 1111, mpi_comm_cols, mpi_status, mpierr) +#endif + +#else /* WITH_MPI */ + z(1:n) = tmp(1:n) +#endif /* WITH_MPI */ + + endif +#ifdef DOUBLE_PRECISION_REAL + end subroutine global_product_double +#else + end subroutine global_product_single +#endif + +#ifdef DOUBLE_PRECISION_REAL + subroutine check_monotony_double(n,d,text, wantDebug, success) +#else + subroutine check_monotony_single(n,d,text, wantDebug, success) +#endif + ! This is a test routine for checking if the eigenvalues are monotonically increasing. + ! It is for debug purposes only, an error should never be triggered! + use precision + implicit none + + integer(kind=ik) :: n + real(kind=REAL_DATATYPE) :: d(n) + character*(*) :: text + + integer(kind=ik) :: i + logical, intent(in) :: wantDebug + logical, intent(out) :: success + + success = .true. + do i=1,n-1 + if (d(i+1) 0 and d(i+1) > d(i) + ! + ! but this routine will not terminate with error if these are not satisfied + ! (it will normally converge to a pole in this case). + ! + ! The output in DELTA(j) is always (D(j) - lambda_I), even for the cases + ! N=1 and N=2 which is not compatible with DLAED4. + ! Thus this routine shouldn't be used for these cases as a simple replacement + ! of DLAED4. + ! + ! The arguments are the same as in DLAED4 (with the exception of the INFO argument): + ! + ! + ! N (input) INTEGER + ! The length of all arrays. + ! + ! I (input) INTEGER + ! The index of the eigenvalue to be computed. 1 <= I <= N. + ! + ! D (input) DOUBLE PRECISION array, dimension (N) + ! The original eigenvalues. It is assumed that they are in + ! order, D(I) < D(J) for I < J. + ! + ! Z (input) DOUBLE PRECISION array, dimension (N) + ! The components of the updating vector. + ! + ! DELTA (output) DOUBLE PRECISION array, dimension (N) + ! DELTA contains (D(j) - lambda_I) in its j-th component. + ! See remark above about DLAED4 compatibility! + ! + ! RHO (input) DOUBLE PRECISION + ! The scalar in the symmetric updating formula. + ! + ! DLAM (output) DOUBLE PRECISION + ! The computed lambda_I, the I-th updated eigenvalue. + !------------------------------------------------------------------------------- + +#ifdef HAVE_DETAILED_TIMINGS + use timings +#endif + use precision + implicit none + + integer(kind=ik) :: n, i + real(kind=REAL_DATATYPE) :: d(n), z(n), delta(n), rho, dlam + + integer(kind=ik) :: iter + real(kind=REAL_DATATYPE) :: a, b, x, y, dshift + + ! In order to obtain sufficient numerical accuracy we have to shift the problem + ! 
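+  ! For reference, the function being solved here is the secular equation
+  !   f(x) = 1 + rho * SUM( z(:)**2 / (d(:) - x) )
+  ! For rho > 0, f is strictly increasing between two neighbouring poles
+  ! d(i) and d(i+1), so bisection on the shifted interval [a,b] below is
+  ! safe: halve the interval, move b down if f(midpoint) > 0, otherwise
+  ! move a up, until no further subdivision is possible.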
either by d(i) or d(i+1), whichever is closer to the solution + + ! Upper and lower bound of the shifted solution interval are a and b + +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%start("solve_secular_equation_double") +#else + call timer%start("solve_secular_equation_single") +#endif +#endif + if (i==n) then + + ! Special case: Last eigenvalue + ! We shift always by d(n), lower bound is d(n), + ! upper bound is determined by a guess: + + dshift = d(n) + delta(:) = d(:) - dshift + +#ifdef DOUBLE_PRECISION_REAL + a = 0._rk8 ! delta(n) + b = rho*SUM(z(:)**2) + 1._rk8 ! rho*SUM(z(:)**2) is the lower bound for the guess +#else + a = 0._rk4 ! delta(n) + b = rho*SUM(z(:)**2) + 1._rk4 ! rho*SUM(z(:)**2) is the lower bound for the guess +#endif + else + + ! Other eigenvalues: lower bound is d(i), upper bound is d(i+1) + ! We check the sign of the function in the midpoint of the interval + ! in order to determine if eigenvalue is more close to d(i) or d(i+1) +#ifdef DOUBLE_PRECISION_REAL + x = 0.5_rk8*(d(i)+d(i+1)) + y = 1._rk8 + rho*SUM(z(:)**2/(d(:)-x)) +#else + x = 0.5_rk4*(d(i)+d(i+1)) + y = 1._rk4 + rho*SUM(z(:)**2/(d(:)-x)) +#endif + if (y>0) then + ! solution is next to d(i) + dshift = d(i) + else + ! solution is next to d(i+1) + dshift = d(i+1) + endif + + delta(:) = d(:) - dshift + a = delta(i) + b = delta(i+1) + + endif + + ! Bisection: + + do iter=1,200 + + ! Interval subdivision +#ifdef DOUBLE_PRECISION_REAL + x = 0.5_rk8*(a+b) +#else + x = 0.5_rk4*(a+b) +#endif + if (x==a .or. x==b) exit ! No further interval subdivisions possible +#ifdef DOUBLE_PRECISION_REAL + if (abs(x) < 1.e-200_rk8) exit ! x next to pole +#else + if (abs(x) < 1.e-20_rk4) exit ! x next to pole +#endif + ! evaluate value at x + + y = 1. + rho*SUM(z(:)**2/(delta(:)-x)) + + if (y==0) then + ! found exact solution + exit + elseif (y>0) then + b = x + else + a = x + endif + + enddo + + ! Solution: + + dlam = x + dshift + delta(:) = delta(:) - x +#ifdef HAVE_DETAILED_TIMINGS +#ifdef DOUBLE_PRECISION_REAL + call timer%stop("solve_secular_equation_double") +#else + call timer%stop("solve_secular_equation_single") +#endif +#endif + +#ifdef DOUBLE_PRECISION_REAL + end subroutine solve_secular_equation_double +#else + end subroutine solve_secular_equation_single +#endif + !------------------------------------------------------------------------------- + +#ifndef ALREADY_DEFINED + integer function local_index(idx, my_proc, num_procs, nblk, iflag) + + !------------------------------------------------------------------------------- + ! local_index: returns the local index for a given global index + ! If the global index has no local index on the + ! processor my_proc behaviour is defined by iflag + ! + ! Parameters + ! + ! idx Global index + ! + ! my_proc Processor row/column for which to calculate the local index + ! + ! num_procs Total number of processors along row/column + ! + ! nblk Blocksize + ! + ! iflag Controls the behaviour if idx is not on local processor + ! iflag< 0 : Return last local index before that row/col + ! iflag==0 : Return 0 + ! iflag> 0 : Return next local index after that row/col + !------------------------------------------------------------------------------- + use precision + implicit none + + integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag + + integer(kind=ik) :: iblk + + iblk = (idx-1)/nblk ! global block number, 0 based + + if (mod(iblk,num_procs) == my_proc) then + + ! 
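+     ! Worked example (illustration only): take nblk=4, num_procs=2,
+     ! my_proc=0, so proc 0 owns blocks 0,2,... i.e. global rows 1-4, 9-12.
+     ! Then idx=11 (block 2) is local and yields local row 7, while
+     ! idx=6 (block 1) is remote and yields 4 for iflag<0 (last local row
+     ! before it) or 5 for iflag>0 (next local row after it).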
+       ! block is local, always return local row/col number
+
+       local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1
+
+     else
+
+       ! non local block
+
+       if (iflag == 0) then
+
+         local_index = 0
+
+       else
+
+         local_index = (iblk/num_procs)*nblk
+
+         if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk
+
+         if (iflag>0) local_index = local_index + 1
+       endif
+     endif
+
+   end function local_index
+#endif /* ALREADY_DEFINED */
+
+#ifdef DOUBLE_PRECISION_REAL
+   subroutine cholesky_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                   wantDebug, success)
+#else
+   subroutine cholesky_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                   wantDebug, success)
+#endif
+
+  !-------------------------------------------------------------------------------
+  !  cholesky_real: Cholesky factorization of a real symmetric matrix
+  !
+  !  Parameters
+  !
+  !  na          Order of matrix
+  !
+  !  a(lda,matrixCols)  Distributed matrix which should be factorized.
+  !              Distribution is like in Scalapack.
+  !              Only the upper triangle needs to be set.
+  !              On return, the upper triangle contains the Cholesky factor
+  !              and the lower triangle is set to 0.
+  !
+  !  lda         Leading dimension of a
+  !  matrixCols  local columns of matrix a
+  !
+  !  nblk        blocksize of cyclic distribution, must be the same in both directions!
+  !
+  !  mpi_comm_rows
+  !  mpi_comm_cols
+  !              MPI-Communicators for rows/columns
+  !
+  !-------------------------------------------------------------------------------
+#ifdef HAVE_DETAILED_TIMINGS
+   use timings
+#endif
+   use precision
+   implicit none
+
+   integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+   real(kind=REAL_DATATYPE)      :: a(lda,matrixCols)
+   ! was
+   ! real a(lda, *)
+
+   integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
+   integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
+   integer(kind=ik)              :: n, nc, i, info
+   integer(kind=ik)              :: lcs, lce, lrs, lre
+   integer(kind=ik)              :: tile_size, l_rows_tile, l_cols_tile
+
+   real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:,:), tmatr(:,:), tmatc(:,:)
+
+   logical, intent(in)           :: wantDebug
+   logical, intent(out)          :: success
+   integer(kind=ik)              :: istat
+   character(200)                :: errorMessage
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+   call timer%start("cholesky_real_double")
+#else
+   call timer%start("cholesky_real_single")
+#endif
+#endif
+   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+   success = .true.
+
+   ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
+
+   tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
+   tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
+
+   l_rows_tile = tile_size/np_rows ! local rows of a tile
+   l_cols_tile = tile_size/np_cols ! local cols of a tile
+
+   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
+   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
+
+   allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
+   if (istat .ne. 0) then
+     print *,"cholesky_real: error when allocating tmp1 "//errorMessage
+     stop
+   endif
+
+   allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
+   if (istat .ne.
0) then + print *,"cholesky_real: error when allocating tmp2 "//errorMessage + stop + endif + + tmp1 = 0 + tmp2 = 0 + + allocate(tmatr(l_rows,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"cholesky_real: error when allocating tmatr "//errorMessage + stop + endif + + allocate(tmatc(l_cols,nblk), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"cholesky_real: error when allocating tmatc "//errorMessage + stop + endif + + tmatr = 0 + tmatc = 0 + + do n = 1, na, nblk + + ! Calculate first local row and column of the still remaining matrix + ! on the local processor + + l_row1 = local_index(n, my_prow, np_rows, nblk, +1) + l_col1 = local_index(n, my_pcol, np_cols, nblk, +1) + + l_rowx = local_index(n+nblk, my_prow, np_rows, nblk, +1) + l_colx = local_index(n+nblk, my_pcol, np_cols, nblk, +1) + + if (n+nblk > na) then + + ! This is the last step, just do a Cholesky-Factorization + ! of the remaining block + + if (my_prow==prow(n, nblk, np_rows) .and. my_pcol==pcol(n, nblk, np_cols)) then +#ifdef DOUBLE_PRECISION_REAL + call dpotrf('U', na-n+1, a(l_row1,l_col1), lda, info) +#else + call spotrf('U', na-n+1, a(l_row1,l_col1), lda, info) +#endif + if (info/=0) then + if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" + success = .false. + return + endif + + endif + + exit ! Loop + + endif + + if (my_prow==prow(n, nblk, np_rows)) then + + if (my_pcol==pcol(n, nblk, np_cols)) then + + ! The process owning the upper left remaining block does the + ! Cholesky-Factorization of this block +#ifdef DOUBLE_PRECISION_REAL + call dpotrf('U', nblk, a(l_row1,l_col1), lda, info) +#else + call spotrf('U', nblk, a(l_row1,l_col1), lda, info) +#endif + if (info/=0) then + if (wantDebug) write(error_unit,*) "ELPA1_cholesky_real: Error in dpotrf" + success = .false. + return + endif + + nc = 0 + do i=1,nblk + tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1) + nc = nc+i + enddo + endif +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#else + call MPI_Bcast(tmp1, nblk*(nblk+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr) +#endif + +#endif /* WITH_MPI */ + nc = 0 + do i=1,nblk + tmp2(1:i,i) = tmp1(nc+1:nc+i) + nc = nc+i + enddo + + if (l_cols-l_colx+1>0) & +#ifdef DOUBLE_PRECISION_REAL + call dtrsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#else + call strsm('L', 'U', 'T', 'N', nblk, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda) +#endif + endif + + do i=1,nblk + + if (my_prow==prow(n, nblk, np_rows)) tmatc(l_colx:l_cols,i) = a(l_row1+i-1,l_colx:l_cols) +#ifdef WITH_MPI + if (l_cols-l_colx+1>0) & +#ifdef DOUBLE_PRECISION_REAL + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#else + call MPI_Bcast(tmatc(l_colx,i), l_cols-l_colx+1, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr) +#endif + +#endif /* WITH_MPI */ + enddo + ! 
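+     ! Summary of one block step above (a sketch): with A11 the current
+     ! nblk x nblk diagonal block,
+     !   1. dpotrf/spotrf factors A11 = U11**T * U11 on its owner
+     !   2. the packed upper triangle of U11 is broadcast across the
+     !      process columns (tmp1/tmp2)
+     !   3. dtrsm/strsm solves U11**T * X = A12 for the remaining block
+     !      row, overwriting a
+     !   4. the block row is broadcast along the rows (tmatc) and, after
+     !      the transpose below, used for the trailing update
+     !      A22 <- A22 - X**T * X on the tiles on and above the diagonal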
+        ! this still has to be checked: it was changed substantially
+        ! during the type-safe rework
+#ifdef DOUBLE_PRECISION_REAL
+        call elpa_transpose_vectors_real_double (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
+                                                 tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
+                                                 n, na, nblk, nblk)
+#else
+        call elpa_transpose_vectors_real_single (tmatc, ubound(tmatc,dim=1), mpi_comm_cols, &
+                                                 tmatr, ubound(tmatr,dim=1), mpi_comm_rows, &
+                                                 n, na, nblk, nblk)
+#endif
+
+        do i=0,(na-1)/tile_size
+          lcs = max(l_colx,i*l_cols_tile+1)
+          lce = min(l_cols,(i+1)*l_cols_tile)
+          lrs = l_rowx
+          lre = min(l_rows,(i+1)*l_rows_tile)
+          if (lce<lcs .or. lre<lrs) cycle
+#ifdef DOUBLE_PRECISION_REAL
+          call dgemm('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk8, &
+                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
+                     1.0_rk8, a(lrs,lcs), lda)
+#else
+          call sgemm('N', 'T', lre-lrs+1, lce-lcs+1, nblk, -1.0_rk4, &
+                     tmatr(lrs,1), ubound(tmatr,dim=1), tmatc(lcs,1), ubound(tmatc,dim=1), &
+                     1.0_rk4, a(lrs,lcs), lda)
+#endif
+        enddo
+
+      enddo
+
+      deallocate(tmp1, tmp2, tmatr, tmatc, stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"cholesky_real: error when deallocating tmp1 "//errorMessage
+        stop
+      endif
+
+      ! Set the lower triangle to 0, it is unused
+
+      do i=1,na
+        if (my_pcol==pcol(i, nblk, np_cols)) then
+          ! column i is on local processor
+          l_col1 = local_index(i  , my_pcol, np_cols, nblk, +1) ! local column number
+          l_row1 = local_index(i+1, my_prow, np_rows, nblk, +1) ! first row below diagonal
+          a(l_row1:l_rows,l_col1) = 0
+        endif
+      enddo
+
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_REAL
+      call timer%stop("cholesky_real_double")
+#else
+      call timer%stop("cholesky_real_single")
+#endif
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+    end subroutine cholesky_real_double
+#else
+    end subroutine cholesky_real_single
+#endif
+
+#ifdef DOUBLE_PRECISION_REAL
+    subroutine invert_trm_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                      wantDebug, success)
+#else
+    subroutine invert_trm_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                      wantDebug, success)
+#endif
+
+    !-------------------------------------------------------------------------------
+    !  invert_trm_real: Inverts an upper triangular matrix
+    !
+    !  Parameters are as for cholesky_real; only the upper triangle of
+    !  a(lda,matrixCols) needs to be set, and it is overwritten with the inverse.
+    !-------------------------------------------------------------------------------
+      use precision
+      implicit none
+
+      integer(kind=ik)              :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+      real(kind=REAL_DATATYPE)      :: a(lda,matrixCols)
+
+      integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, mpierr
+      integer(kind=ik)              :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
+      integer(kind=ik)              :: n, nc, i, info, ns, nb
+
+      real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:,:), tmat1(:,:), tmat2(:,:)
+
+      logical, intent(in)           :: wantDebug
+      logical, intent(out)          :: success
+      integer(kind=ik)              :: istat
+      character(200)                :: errorMessage
+
+      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+      success = .true.
+
+      l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
+      l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
+
+      allocate(tmp1(nblk*nblk), stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"invert_trm_real: error when allocating tmp1 "//errorMessage
+        stop
+      endif
+
+      allocate(tmp2(nblk,nblk), stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"invert_trm_real: error when allocating tmp2 "//errorMessage
+        stop
+      endif
+
+      tmp1 = 0
+      tmp2 = 0
+
+      allocate(tmat1(l_rows,nblk), stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"invert_trm_real: error when allocating tmat1 "//errorMessage
+        stop
+      endif
+
+      allocate(tmat2(nblk,l_cols), stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"invert_trm_real: error when allocating tmat2 "//errorMessage
+        stop
+      endif
+
+      tmat1 = 0
+      tmat2 = 0
+
+      ns = ((na-1)/nblk)*nblk + 1
+
+      do n = ns,1,-nblk
+
+        l_row1 = local_index(n, my_prow, np_rows, nblk, +1)
+        l_col1 = local_index(n, my_pcol, np_cols, nblk, +1)
+
+        nb = nblk
+        if (na-n+1 < nblk) nb = na-n+1
+
+        l_rowx = local_index(n+nb, my_prow, np_rows, nblk, +1)
+        l_colx = local_index(n+nb, my_pcol, np_cols, nblk, +1)
+
+        if (my_prow==prow(n, nblk, np_rows)) then
+
+          if (my_pcol==pcol(n, nblk, np_cols)) then
+#ifdef DOUBLE_PRECISION_REAL
+            call dtrtri('U', 'N', nb, a(l_row1,l_col1), lda, info)
+#else
+            call strtri('U', 'N', nb, a(l_row1,l_col1), lda, info)
+#endif
+            if (info/=0) then
+              if (wantDebug) write(error_unit,*) "ELPA1_invert_trm_real: Error in dtrtri/strtri"
+              success = .false.
+              return
+            endif
+
+            nc = 0
+            do i=1,nb
+              tmp1(nc+1:nc+i) = a(l_row1:l_row1+i-1,l_col1+i-1)
+              nc = nc+i
+            enddo
+          endif
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+          call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
+#else
+          call MPI_Bcast(tmp1, nb*(nb+1)/2, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
+#endif
+
+#endif /* WITH_MPI */
+          nc = 0
+          do i=1,nb
+            tmp2(1:i,i) = tmp1(nc+1:nc+i)
+            nc = nc+i
+          enddo
+
+          if (l_cols-l_colx+1>0) &
+#ifdef DOUBLE_PRECISION_REAL
+            call DTRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk8, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
+#else
+            call STRMM('L', 'U', 'N', 'N', nb, l_cols-l_colx+1, 1.0_rk4, tmp2, ubound(tmp2,dim=1), a(l_row1,l_colx), lda)
+#endif
+          if (l_colx<=l_cols)   tmat2(1:nb,l_colx:l_cols) = a(l_row1:l_row1+nb-1,l_colx:l_cols)
+          if (my_pcol==pcol(n, nblk, np_cols)) tmat2(1:nb,l_col1:l_col1+nb-1) = tmp2(1:nb,1:nb) ! the lower left triangle of tmp2 is 0
+
+        endif
+
+        if (l_row1>1) then
+          if (my_pcol==pcol(n, nblk, np_cols)) then
+            tmat1(1:l_row1-1,1:nb) = a(1:l_row1-1,l_col1:l_col1+nb-1)
+            a(1:l_row1-1,l_col1:l_col1+nb-1) = 0
+          endif
+
+          do i=1,nb
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_REAL
+            call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL8, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
+#else
+            call MPI_Bcast(tmat1(1,i), l_row1-1, MPI_REAL4, pcol(n, nblk, np_cols), mpi_comm_cols, mpierr)
+#endif
+
+#endif /* WITH_MPI */
+          enddo
+        endif
+#ifdef WITH_MPI
+        if (l_cols-l_col1+1>0) &
+#ifdef DOUBLE_PRECISION_REAL
+          call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL8, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
+#else
+          call MPI_Bcast(tmat2(1,l_col1), (l_cols-l_col1+1)*nblk, MPI_REAL4, prow(n, nblk, np_rows), mpi_comm_rows, mpierr)
+#endif
+
+#endif /* WITH_MPI */
+        if (l_row1>1 .and. l_cols-l_col1+1>0) &
+#ifdef DOUBLE_PRECISION_REAL
+          call dgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk8, &
+                     tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
+                     1.0_rk8, a(1,l_col1), lda)
+#else
+          call sgemm('N', 'N', l_row1-1, l_cols-l_col1+1, nb, -1.0_rk4, &
+                     tmat1, ubound(tmat1,dim=1), tmat2(1,l_col1), ubound(tmat2,dim=1), &
+                     1.0_rk4, a(1,l_col1), lda)
+#endif
+      enddo
+
+      deallocate(tmp1, tmp2, tmat1, tmat2, stat=istat, errmsg=errorMessage)
+      if (istat .ne. 0) then
+        print *,"invert_trm_real: error when deallocating tmp1 "//errorMessage
+        stop
+      endif
+#ifdef DOUBLE_PRECISION_REAL
+    end subroutine invert_trm_real_double
+#else
+    end subroutine invert_trm_real_single
+#endif
+
+#ifndef ALREADY_DEFINED
+    integer function least_common_multiple(a, b)
+
+      ! Returns the least common multiple of a and b
+      ! There may be more efficient ways to do this; we use the simplest approach
+      use precision
+      implicit none
+      integer(kind=ik), intent(in) :: a, b
+
+      do least_common_multiple = a, a*(b-1), a
+        if(mod(least_common_multiple,b)==0) exit
+      enddo
+      ! if the loop is left regularly, least_common_multiple = a*b
+
+    end function least_common_multiple
+#endif /* ALREADY_DEFINED */
+
+#ifdef DOUBLE_PRECISION_REAL
+    subroutine hh_transform_real_double(alpha, xnorm_sq, xf, tau)
+#else
+    subroutine hh_transform_real_single(alpha, xnorm_sq, xf, tau)
+#endif
+      ! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
+      ! and returns the factor xf by which x has to be scaled.
+      ! It also lacks the special handling for numbers < 1.d-300 or > 1.d150
+      ! 
since this would be expensive for the parallel implementation. + use precision + implicit none + real(kind=REAL_DATATYPE), intent(inout) :: alpha + real(kind=REAL_DATATYPE), intent(in) :: xnorm_sq + real(kind=REAL_DATATYPE), intent(out) :: xf, tau + + real(kind=REAL_DATATYPE) :: BETA + + if ( XNORM_SQ==0. ) then + + if ( ALPHA>=0. ) then + TAU = 0. + else + TAU = 2. + ALPHA = -ALPHA + endif + XF = 0. + + else + + BETA = SIGN( SQRT( ALPHA**2 + XNORM_SQ ), ALPHA ) + ALPHA = ALPHA + BETA + IF ( BETA<0 ) THEN + BETA = -BETA + TAU = -ALPHA / BETA + ELSE + ALPHA = XNORM_SQ / ALPHA + TAU = ALPHA / BETA + ALPHA = -ALPHA + END IF + XF = 1./ALPHA + ALPHA = BETA + endif +#ifdef DOUBLE_PRECISION_REAL + end subroutine hh_transform_real_double +#else + end subroutine hh_transform_real_single +#endif +#define ALREADY_DEFINED 1 diff --git a/src/elpa2.F90 b/src/elpa2.F90 index f9defdf01fbee8d08920a5c8edbbc3f107d27b99..74a6eb30d21801a1317112966eb618a1a128b281 100644 --- a/src/elpa2.F90 +++ b/src/elpa2.F90 @@ -88,13 +88,30 @@ module ELPA2 ! The following routines are public: - public :: solve_evp_real_2stage - public :: solve_evp_complex_2stage + public :: solve_evp_real_2stage_double + public :: solve_evp_complex_2stage_double + + interface solve_evp_real_2stage + module procedure solve_evp_real_2stage_double + end interface + + interface solve_evp_complex_2stage + module procedure solve_evp_complex_2stage_double + end interface + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: solve_evp_real_2stage_single +#endif + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: solve_evp_complex_2stage_single +#endif + !****** contains !------------------------------------------------------------------------------- -!> \brief solve_evp_real_2stage: Fortran function to solve the real eigenvalue problem with a 2 stage approach +!> \brief solve_evp_real_2stage_double: Fortran function to solve the double-precision real eigenvalue problem with a 2 stage approach !> !> Parameters !> @@ -133,11 +150,21 @@ contains !> \result success logical, false if error occured !------------------------------------------------------------------------------- - function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, & +#define DOUBLE_PRECISION_REAL + +#ifdef DOUBLE_PRECISION_REAL + function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, & + matrixCols, & + mpi_comm_rows, mpi_comm_cols, & + mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& + useQR) result(success) +#else + function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, & matrixCols, & mpi_comm_rows, mpi_comm_cols, & mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& useQR) result(success) +#endif #ifdef HAVE_DETAILED_TIMINGS @@ -157,14 +184,14 @@ contains integer(kind=ik), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, & mpi_comm_cols, mpi_comm_all integer(kind=ik), intent(in) :: nblk - real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) + real(kind=rk8), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) ! was ! real a(lda,*), q(ldq,*) - real(kind=rk), allocatable :: hh_trans_real(:,:) + real(kind=rk8), allocatable :: hh_trans_real(:,:) integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: nbw, num_blocks - real(kind=rk), allocatable :: tmat(:,:,:), e(:) + real(kind=rk8), allocatable :: tmat(:,:,:), e(:) real(kind=c_double) :: ttt0, ttt1, ttts ! 
MPI_WTIME always needs double integer(kind=ik) :: i logical :: success @@ -176,7 +203,7 @@ contains integer(kind=ik) :: numberOfGPUDevices #ifdef HAVE_DETAILED_TIMINGS - call timer%start("solve_evp_real_2stage") + call timer%start("solve_evp_real_2stage_double") #endif call mpi_comm_rank(mpi_comm_all,my_pe,mpierr) @@ -295,8 +322,13 @@ contains ttt0 = MPI_Wtime() ttts = ttt0 - call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & +#ifdef DOUBLE_PRECISION_REAL + call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & tmat, wantDebug, useGPU, success, useQRActual) +#else + call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & + tmat, wantDebug, useGPU, success, useQRActual) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & @@ -311,8 +343,13 @@ contains endif ttt0 = MPI_Wtime() - call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & +#ifdef DOUBLE_PRECISION_REAL + call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#else + call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#endif ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & @@ -335,9 +372,13 @@ contains ! Solve tridiagonal system ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & mpi_comm_cols, wantDebug, success) - +#else + call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() @@ -354,9 +395,15 @@ contains ! Backtransform stage 1 ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & + THIS_REAL_ELPA_KERNEL) +#else + call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & THIS_REAL_ELPA_KERNEL) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() @@ -374,8 +421,13 @@ contains ! Backtransform stage 2 print *,"useGPU== ",useGPU ttt0 = MPI_Wtime() - call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & + mpi_comm_cols, useGPU, useQRActual) +#else + call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & mpi_comm_cols, useGPU, useQRActual) +#endif ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. 
elpa_print_times) & @@ -389,13 +441,20 @@ contains endif #ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_real_2stage") + call timer%stop("solve_evp_real_2stage_double") #endif 1 format(a,f10.3) - end function solve_evp_real_2stage +#ifdef DOUBLE_PRECISION_REAL + end function solve_evp_real_2stage_double +#else + end function solve_evp_real_2stage_single +#endif -!> \brief solve_evp_complex_2stage: Fortran function to solve the complex eigenvalue problem with a 2 stage approach +#ifdef WANT_SINGLE_PRECISION_REAL +#undef DOUBLE_PRECISION_REAL +!------------------------------------------------------------------------------- +!> \brief solve_evp_real_2stage_single: Fortran function to solve the single-precision real eigenvalue problem with a 2 stage approach !> !> Parameters !> @@ -429,45 +488,62 @@ contains !> !> \param THIS_REAL_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API !> +!> \param use_qr (optional) use QR decomposition +!> !> \result success logical, false if error occured !------------------------------------------------------------------------------- -function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & - matrixCols, mpi_comm_rows, mpi_comm_cols, & - mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success) + +#ifdef DOUBLE_PRECISION_REAL + function solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, & + matrixCols, & + mpi_comm_rows, mpi_comm_cols, & + mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& + useQR) result(success) +#else + function solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, & + matrixCols, & + mpi_comm_rows, mpi_comm_cols, & + mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,& + useQR) result(success) +#endif #ifdef HAVE_DETAILED_TIMINGS - use timings + use timings #endif + use precision use cuda_functions use mod_check_for_gpu use iso_c_binding implicit none - integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API - integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all - complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols) + logical, intent(in), optional :: useQR + logical :: useQRActual, useQREnvironment + integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API + integer(kind=ik) :: THIS_REAL_ELPA_KERNEL + + integer(kind=ik), intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, & + mpi_comm_cols, mpi_comm_all + integer(kind=ik), intent(in) :: nblk + real(kind=rk4), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) ! was - ! complex a(lda,*), q(ldq,*) - real(kind=rk), intent(inout) :: ev(na) - complex(kind=ck), allocatable :: hh_trans_complex(:,:) + ! real a(lda,*), q(ldq,*) + real(kind=rk4), allocatable :: hh_trans_real(:,:) - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes - integer(kind=ik) :: l_cols, l_rows, l_cols_nev, nbw, num_blocks - complex(kind=ck), allocatable :: tmat(:,:,:) - real(kind=rk), allocatable :: q_real(:,:), e(:) + integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr + integer(kind=ik) :: nbw, num_blocks + real(kind=rk4), allocatable :: tmat(:,:,:), e(:) real(kind=c_double) :: ttt0, ttt1, ttts ! MPI_WTIME always needs double integer(kind=ik) :: i - - logical :: success, wantDebug + logical :: success logical, save :: firstCall = .true. 
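+      ! Illustrative call of this routine (hypothetical caller; a, ev and q
+      ! must be single precision, kind rk4):
+      !   success = solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, &
+      !                nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+      !                mpi_comm_all, useQR=.false.)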
+      logical                                :: wantDebug
       integer(kind=ik)                       :: istat
       character(200)                         :: errorMessage
       logical                                :: useGPU
       integer(kind=ik)                       :: numberOfGPUDevices
 
 #ifdef HAVE_DETAILED_TIMINGS
-      call timer%start("solve_evp_complex_2stage")
+      call timer%start("solve_evp_real_2stage_single")
 #endif
 
       call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
@@ -478,7 +554,7 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
       call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
       call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
 
-      useGPU = .false.
+      wantDebug = .false.
       if (firstCall) then
         ! are debug messages desired?
@@ -486,42 +562,69 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
         firstCall = .false.
       endif
 
-      success = .true.
-      if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
+      useQRActual = .false.
+      useGPU = .false.
+
+      ! set usage of qr decomposition via API call
+      if (present(useQR)) then
+        if (useQR) useQRActual = .true.
+        if (.not.(useQR)) useQRActual = .false.
+      endif
+
+      ! overwrite this with environment variable settings
+      if (qr_decomposition_via_environment_variable(useQREnvironment)) then
+        useQRActual = useQREnvironment
+      endif
+
+      if (useQRActual) then
+        if (mod(na,nblk) .ne. 0) then
+          if (wantDebug) then
+            write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: matrix size is not a multiple of the blocksize"
+          endif
+          print *, "Do not use QR-decomposition for this matrix and blocksize."
+          success = .false.
+          return
+        endif
+      endif
+
+
+      if (present(THIS_REAL_ELPA_KERNEL_API)) then
         ! user defined kernel via the optional argument in the API call
-        THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
+        THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
       else
+        ! if kernel is not chosen via API
        ! check whether set by environment variable
-        THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
+        THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
       endif
 
       ! check whether the chosen kernel is allowed
-      if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
+      if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
 
         if (my_pe == 0) then
           write(error_unit,*) " "
-          write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
+          write(error_unit,*) "The chosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
           write(error_unit,*) "is not in the list of the allowed kernels!"
          write(error_unit,*) " "
           write(error_unit,*) "Allowed kernels are:"
-          do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
-            if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
-              write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
+          do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
+            if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
+              write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
             endif
           enddo
 
           write(error_unit,*) " "
-          write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
+          write(error_unit,*) "The default kernel REAL_ELPA_KERNEL_GENERIC will be used !"
         endif
-        THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
+        THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
+
       endif
 
-      if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
-        if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
-          useGPU=.true.
+      if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
+        if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
+          useGPU = .true.
        endif
        if (nblk .ne. 128) then
          print *,"At the moment GPU version needs blocksize 128"
@@ -537,90 +640,725 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
       endif
 
       ! 
Choose bandwidth, must be a multiple of nblk, set to a value >= 32 - - nbw = (31/nblk+1)*nblk + ! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal. + ! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32! + ! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye + ! on this and maybe allow a run-time optimization here + if (useGPU) then + nbw = nblk + else + nbw = (63/nblk+1)*nblk + endif num_blocks = (na-1)/nbw + 1 allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage) if (istat .ne. 0) then - print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage + print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage stop endif + ! Reduction full -> band ttt0 = MPI_Wtime() ttts = ttt0 - call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, useGPU, success) - if (.not.(success)) then - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop() +#ifdef DOUBLE_PRECISION_REAL + call bandred_real_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & + tmat, wantDebug, useGPU, success, useQRActual) +#else + call bandred_real_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & + tmat, wantDebug, useGPU, success, useQRActual) #endif - return - endif + if (.not.(success)) return ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0 - - ! Reduction band -> tridiagonal + write(error_unit,*) 'Time bandred_real :',ttt1-ttt0 - allocate(e(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage - stop - endif + ! Reduction band -> tridiagonal + allocate(e(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_real_2stage: error when allocating e "//errorMessage + stop + endif - ttt0 = MPI_Wtime() - call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm_all) + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call tridiag_band_real_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#else + call tridiag_band_real_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#endif - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0 + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0 #ifdef WITH_MPI -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr) - call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr) +#ifdef DOUBLE_PRECISION_REAL + call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr) + call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr) #else - call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr) - call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr) + call mpi_bcast(ev,na,MPI_REAL4,0,mpi_comm_all,mpierr) + call mpi_bcast(e,na,MPI_REAL4,0,mpi_comm_all,mpierr) #endif #endif /* WITH_MPI */ - ttt1 = MPI_Wtime() - time_evp_fwd = ttt1-ttts + ttt1 = MPI_Wtime() + time_evp_fwd = ttt1-ttts - l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! 
Local rows of a and q - l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q - l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + ! Solve tridiagonal system - allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage - stop - endif + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call solve_tridi_double(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, & + mpi_comm_cols, wantDebug, success) +#endif + if (.not.(success)) return - ! Solve tridiagonal system + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 + time_evp_solve = ttt1-ttt0 + ttts = ttt1 - ttt0 = MPI_Wtime() - call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & - mpi_comm_rows, mpi_comm_cols, wantDebug, success) - if (.not.(success)) return + deallocate(e, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage + stop + endif + ! Backtransform stage 1 - ttt1 = MPI_Wtime() - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 - time_evp_solve = ttt1-ttt0 - ttts = ttt1 + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_tridi_to_band_real_double(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & + THIS_REAL_ELPA_KERNEL) +#else + call trans_ev_tridi_to_band_real_single(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & + mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & + THIS_REAL_ELPA_KERNEL) +#endif - q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) + if (.not.(success)) return + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0 - deallocate(e, q_real, stat=istat, errmsg=errorMessage) + ! We can now deallocate the stored householder vectors + deallocate(hh_trans_real, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage + stop + endif + + + ! Backtransform stage 2 + print *,"useGPU== ",useGPU + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_REAL + call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & + mpi_comm_cols, useGPU, useQRActual) +#else + call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, & + mpi_comm_cols, useGPU, useQRActual) +#endif + + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0 + time_evp_back = ttt1-ttts + + deallocate(tmat, stat=istat, errmsg=errorMessage) + if (istat .ne. 
0) then
+     print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
+     stop
+   endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+   call timer%stop("solve_evp_real_2stage_single")
+#endif
+1 format(a,f10.3)
+
+#ifdef DOUBLE_PRECISION_REAL
+   end function solve_evp_real_2stage_double
+#else
+   end function solve_evp_real_2stage_single
+#endif
+
+#endif /* WANT_SINGLE_PRECISION_REAL */
+
+ !> \brief solve_evp_complex_2stage_double: Fortran function to solve the double-precision complex eigenvalue problem with a 2 stage approach
+!>
+!>  Parameters
+!>
+!>  \param na                       Order of matrix a
+!>
+!>  \param nev                      Number of eigenvalues needed
+!>
+!>  \param a(lda,matrixCols)        Distributed matrix for which eigenvalues are to be computed.
+!>                                  Distribution is like in Scalapack.
+!>                                  The full matrix must be set (not only one half like in scalapack).
+!>                                  Destroyed on exit (upper and lower half).
+!>
+!>  \param lda                      Leading dimension of a
+!>
+!>  \param ev(na)                   On output: eigenvalues of a, every processor gets the complete set
+!>
+!>  \param q(ldq,matrixCols)        On output: Eigenvectors of a
+!>                                  Distribution is like in Scalapack.
+!>                                  Must be always dimensioned to the full size (corresponding to (na,na))
+!>                                  even if only a part of the eigenvalues is needed.
+!>
+!>  \param ldq                      Leading dimension of q
+!>
+!>  \param nblk                     blocksize of cyclic distribution, must be the same in both directions!
+!>
+!>  \param matrixCols               local columns of matrix a and q
+!>
+!>  \param mpi_comm_rows            MPI communicator for rows
+!>  \param mpi_comm_cols            MPI communicator for columns
+!>  \param mpi_comm_all             MPI communicator for the total processor set
+!>
+!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
+!>
+!>  \result success                 logical, false if error occurred
+!-------------------------------------------------------------------------------
+#define DOUBLE_PRECISION_COMPLEX 1
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
+                                         matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                         mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
+#else
+function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
+                                         matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                         mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
+#endif
+
+
+#ifdef HAVE_DETAILED_TIMINGS
+   use timings
+#endif
+   use precision
+   use cuda_functions
+   use mod_check_for_gpu
+   use iso_c_binding
+   implicit none
+   integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
+   integer(kind=ik)                       :: THIS_COMPLEX_ELPA_KERNEL
+   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
+   complex(kind=ck8), intent(inout)       :: a(lda,matrixCols), q(ldq,matrixCols)
+   ! was
+   ! complex a(lda,*), q(ldq,*)
+   real(kind=rk8), intent(inout)          :: ev(na)
+   complex(kind=ck8), allocatable         :: hh_trans_complex(:,:)
+
+   integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
+   integer(kind=ik)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
+   complex(kind=ck8), allocatable         :: tmat(:,:,:)
+   real(kind=rk8), allocatable            :: q_real(:,:), e(:)
+   real(kind=c_double)                    :: ttt0, ttt1, ttts   ! MPI_WTIME always needs double
+   integer(kind=ik)                       :: i
+
+   logical                                :: success, wantDebug
+   logical, save                          :: firstCall = .true.
+   integer(kind=ik)                       :: istat
+   character(200)                         :: errorMessage
+   logical                                :: useGPU
+   integer(kind=ik)                       :: numberOfGPUDevices
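+   ! Kernel choice below: an explicit THIS_COMPLEX_ELPA_KERNEL_API argument
+   ! takes precedence; otherwise get_actual_complex_kernel() checks the
+   ! environment; a kernel that is not allowed falls back to
+   ! COMPLEX_ELPA_KERNEL_GENERIC with a warning.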
+
+#ifdef HAVE_DETAILED_TIMINGS
+   call timer%start("solve_evp_complex_2stage_double")
+#endif
+
+   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
+   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
+
+   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+
+   useGPU = .false.
+   wantDebug = .false.
+   if (firstCall) then
+     ! are debug messages desired?
+     wantDebug = debug_messages_via_environment_variable()
+     firstCall = .false.
+   endif
+
+
+   success = .true.
+
+   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
+     ! user defined kernel via the optional argument in the API call
+     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
+   else
+     ! if kernel is not chosen via API
+     ! check whether set by environment variable
+     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
+   endif
+
+   ! check whether the chosen kernel is allowed
+   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
+
+     if (my_pe == 0) then
+       write(error_unit,*) " "
+       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
+       write(error_unit,*) "is not in the list of the allowed kernels!"
+       write(error_unit,*) " "
+       write(error_unit,*) "Allowed kernels are:"
+       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
+         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
+           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
+         endif
+       enddo
+
+       write(error_unit,*) " "
+       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
+     endif
+     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
+   endif
+
+   if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
+     if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
+       useGPU=.true.
+     endif
+     if (nblk .ne. 128) then
+       print *,"At the moment GPU version needs blocksize 128"
+       stop
+     endif
+
+     ! set the necessary parameters
+     cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
+     cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
+     cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
+     cudaHostRegisterPortable = cuda_hostRegisterPortable()
+     cudaHostRegisterMapped   = cuda_hostRegisterMapped()
+   endif
+
+   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
+
+   nbw = (31/nblk+1)*nblk
+
+   num_blocks = (na-1)/nbw + 1
+
+   allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
+   if (istat .ne. 
0) then + print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage + stop + endif + + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call tridiag_band_complex_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#else + call tridiag_band_complex_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#endif + + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0 + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr) + call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr) +#else + call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr) + call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr) +#endif + +#endif /* WITH_MPI */ + ttt1 = MPI_Wtime() + time_evp_fwd = ttt1-ttts + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q + l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + + allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage + stop + endif + + ! Solve tridiagonal system + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call solve_tridi_double(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#endif + if (.not.(success)) return + + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 + time_evp_solve = ttt1-ttt0 + ttts = ttt1 + + q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) + + deallocate(e, q_real, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage + stop + endif + + + ! Backtransform stage 1 + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, & + matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, & + wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL) +#else + call trans_ev_tridi_to_band_complex_single(na, nev, nblk, nbw, q, ldq, & + matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, & + wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL) +#endif + if (.not.(success)) return + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0 + + ! We can now deallocate the stored householder vectors + deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage + stop + endif + + ! 
Backtransform stage 2
+
+   ttt0 = MPI_Wtime()
+#ifdef DOUBLE_PRECISION_COMPLEX
+   call trans_ev_band_to_full_complex_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
+                                             mpi_comm_rows, mpi_comm_cols, useGPU)
+#else
+   call trans_ev_band_to_full_complex_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, &
+                                             mpi_comm_rows, mpi_comm_cols, useGPU)
+#endif
+   ttt1 = MPI_Wtime()
+   if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
+      write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
+   time_evp_back = ttt1-ttts
+
+   deallocate(tmat, stat=istat, errmsg=errorMessage)
+   if (istat .ne. 0) then
+     print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
+     stop
+   endif
+
+#ifdef HAVE_DETAILED_TIMINGS
+   call timer%stop("solve_evp_complex_2stage_double")
+#endif
+
+1 format(a,f10.3)
+#ifdef DOUBLE_PRECISION_COMPLEX
+end function solve_evp_complex_2stage_double
+#else
+end function solve_evp_complex_2stage_single
+#endif
+
+#ifdef WANT_SINGLE_PRECISION_COMPLEX
+#undef DOUBLE_PRECISION_COMPLEX
+
+!> \brief solve_evp_complex_2stage_single: Fortran function to solve the single-precision complex eigenvalue problem with a 2 stage approach
+!>
+!>  Parameters
+!>
+!>  \param na                       Order of matrix a
+!>
+!>  \param nev                      Number of eigenvalues needed
+!>
+!>  \param a(lda,matrixCols)        Distributed matrix for which eigenvalues are to be computed.
+!>                                  Distribution is like in Scalapack.
+!>                                  The full matrix must be set (not only one half like in scalapack).
+!>                                  Destroyed on exit (upper and lower half).
+!>
+!>  \param lda                      Leading dimension of a
+!>
+!>  \param ev(na)                   On output: eigenvalues of a, every processor gets the complete set
+!>
+!>  \param q(ldq,matrixCols)        On output: Eigenvectors of a
+!>                                  Distribution is like in Scalapack.
+!>                                  Must be always dimensioned to the full size (corresponding to (na,na))
+!>                                  even if only a part of the eigenvalues is needed.
+!>
+!>  \param ldq                      Leading dimension of q
+!>
+!>  \param nblk                     blocksize of cyclic distribution, must be the same in both directions!
+!>
+!>  \param matrixCols               local columns of matrix a and q
+!>
+!>  \param mpi_comm_rows            MPI communicator for rows
+!>  \param mpi_comm_cols            MPI communicator for columns
+!>  \param mpi_comm_all             MPI communicator for the total processor set
+!>
+!>  \param THIS_COMPLEX_ELPA_KERNEL_API (optional) specify used ELPA2 kernel via API
+!>
+!>  \result success                 logical, false if error occurred
+!-------------------------------------------------------------------------------
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+function solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
+                                         matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                         mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
+#else
+function solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
+                                         matrixCols, mpi_comm_rows, mpi_comm_cols, &
+                                         mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
+#endif
+
+
+#ifdef HAVE_DETAILED_TIMINGS
+   use timings
+#endif
+   use precision
+   use cuda_functions
+   use mod_check_for_gpu
+   use iso_c_binding
+   implicit none
+   integer(kind=ik), intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
+   integer(kind=ik)                       :: THIS_COMPLEX_ELPA_KERNEL
+   integer(kind=ik), intent(in)           :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
+   complex(kind=ck4), intent(inout)       :: a(lda,matrixCols), q(ldq,matrixCols)
+   ! was
+   ! complex a(lda,*), q(ldq,*)
+   real(kind=rk4), intent(inout)          :: ev(na)
+   complex(kind=ck4), allocatable         :: hh_trans_complex(:,:)
+
+   integer(kind=ik)                       :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
+   integer(kind=ik)                       :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
+   complex(kind=ck4), allocatable         :: tmat(:,:,:)
+   real(kind=rk4), allocatable            :: q_real(:,:), e(:)
+   real(kind=c_double)                    :: ttt0, ttt1, ttts   ! MPI_WTIME always needs double
+   integer(kind=ik)                       :: i
+
+   logical                                :: success, wantDebug
+   logical, save                          :: firstCall = .true.
+   integer(kind=ik)                       :: istat
+   character(200)                         :: errorMessage
+   logical                                :: useGPU
+   integer(kind=ik)                       :: numberOfGPUDevices
+
+#ifdef HAVE_DETAILED_TIMINGS
+   call timer%start("solve_evp_complex_2stage_single")
+#endif
+
+   call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
+   call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
+
+   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+   call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+
+   useGPU = .false.
+   wantDebug = .false.
+   if (firstCall) then
+     ! are debug messages desired?
+     wantDebug = debug_messages_via_environment_variable()
+     firstCall = .false.
+   endif
+
+
+   success = .true.
+
+   if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
+     ! user defined kernel via the optional argument in the API call
+     THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
+   else
+     ! if kernel is not chosen via API
+     ! check whether set by environment variable
+     THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
+   endif
+
+   ! check whether the chosen kernel is allowed
+   if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
+
+     if (my_pe == 0) then
+       write(error_unit,*) " "
+       write(error_unit,*) "The chosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
+       write(error_unit,*) "is not in the list of the allowed kernels!"
+       write(error_unit,*) " "
+       write(error_unit,*) "Allowed kernels are:"
+       do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
+         if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
+           write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
+         endif
+       enddo
+
+       write(error_unit,*) " "
+       write(error_unit,*) "The default kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
+     endif
+     THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
+   endif
+
+   if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
+     if (check_for_gpu(my_pe, numberOfGPUDevices, wantDebug=wantDebug)) then
+       useGPU=.true.
+     endif
+     if (nblk .ne. 128) then
+       print *,"At the moment GPU version needs blocksize 128"
+       stop
+     endif
+
+     ! set the necessary parameters
+     cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
+     cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
+     cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
+     cudaHostRegisterPortable = cuda_hostRegisterPortable()
+     cudaHostRegisterMapped   = cuda_hostRegisterMapped()
+   endif
+
+   ! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
+
+   nbw = (31/nblk+1)*nblk
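+   ! Illustrative arithmetic: with integer division, (31/nblk+1)*nblk rounds
+   ! 32 up to the next multiple of nblk, e.g. nblk=16 -> nbw=32,
+   ! nblk=24 -> nbw=48, nblk=64 -> nbw=64.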
+
+   num_blocks = (na-1)/nbw + 1
+
+   allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
+   if (istat .ne. 0) then
+     print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
+     stop
+   endif
+   ! 
Reduction full -> band + + ttt0 = MPI_Wtime() + ttts = ttt0 +#ifdef DOUBLE_PRECISION_COMPLEX + call bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & + tmat, wantDebug, useGPU, success) +#else + call bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, & + tmat, wantDebug, useGPU, success) +#endif + if (.not.(success)) then + +#ifdef HAVE_DETAILED_TIMINGS + call timer%stop("solve_evp_complex_2stage_single") +#endif + return + endif + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0 + + ! Reduction band -> tridiagonal + + allocate(e(na), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage + stop + endif + + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call tridiag_band_complex_double(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#else + call tridiag_band_complex_single(na, nbw, nblk, a, lda, ev, e, matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, mpi_comm_all) +#endif + + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0 + +#ifdef WITH_MPI + +#ifdef DOUBLE_PRECISION_COMPLEX + call mpi_bcast(ev, na, mpi_real8, 0, mpi_comm_all, mpierr) + call mpi_bcast(e, na, mpi_real8, 0, mpi_comm_all, mpierr) +#else + call mpi_bcast(ev, na, mpi_real4, 0, mpi_comm_all, mpierr) + call mpi_bcast(e, na, mpi_real4, 0, mpi_comm_all, mpierr) +#endif + +#endif /* WITH_MPI */ + ttt1 = MPI_Wtime() + time_evp_fwd = ttt1-ttts + + l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q + l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q + l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev + + allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage) + if (istat .ne. 0) then + print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage + stop + endif + + ! Solve tridiagonal system + + ttt0 = MPI_Wtime() +#ifdef DOUBLE_PRECISION_COMPLEX + call solve_tridi_double(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#else + call solve_tridi_single(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, & + mpi_comm_rows, mpi_comm_cols, wantDebug, success) +#endif + if (.not.(success)) return + + ttt1 = MPI_Wtime() + if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & + write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0 + time_evp_solve = ttt1-ttt0 + ttts = ttt1 + + q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev) + + deallocate(e, q_real, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage stop @@ -630,10 +1368,17 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ! 
Backtransform stage 1 ttt0 = MPI_Wtime() - call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, & +#ifdef DOUBLE_PRECISION_COMPLEX + call trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, & + matrixCols, hh_trans_complex, & + mpi_comm_rows, mpi_comm_cols, & + wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL) +#else + call trans_ev_tridi_to_band_complex_single(na, nev, nblk, nbw, q, ldq, & matrixCols, hh_trans_complex, & mpi_comm_rows, mpi_comm_cols, & wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL) +#endif if (.not.(success)) return ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & @@ -649,8 +1394,13 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ! Backtransform stage 2 ttt0 = MPI_Wtime() - call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, & +#ifdef DOUBLE_PRECISION_COMPLEX + call trans_ev_band_to_full_complex_double(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, & + mpi_comm_rows, mpi_comm_cols, useGPU) +#else + call trans_ev_band_to_full_complex_single(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, & mpi_comm_rows, mpi_comm_cols, useGPU) +#endif ttt1 = MPI_Wtime() if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0 @@ -663,11 +1413,16 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & endif #ifdef HAVE_DETAILED_TIMINGS - call timer%stop("solve_evp_complex_2stage") + call timer%stop("solve_evp_complex_2stage_single") #endif 1 format(a,f10.3) +#ifdef DOUBLE_PRECISION_COMPLEX +end function solve_evp_complex_2stage_double +#else +end function solve_evp_complex_2stage_single +#endif -end function solve_evp_complex_2stage +#endif /* WANT_SINGLE_PRECISION_COMPLEX */ end module ELPA2 diff --git a/src/elpa2_compute.F90 b/src/elpa2_compute.F90 index e817e0441b22327f3a5bc3ae09ee10a967e1cc5e..23df210fb483f007864665339d8b14e1d04a1319 100644 --- a/src/elpa2_compute.F90 +++ b/src/elpa2_compute.F90 @@ -78,10341 +78,136 @@ module ELPA2_compute PRIVATE ! By default, all routines contained are private - public :: bandred_real - public :: tridiag_band_real - public :: trans_ev_tridi_to_band_real - public :: trans_ev_band_to_full_real - - public :: bandred_complex - public :: tridiag_band_complex - public :: trans_ev_tridi_to_band_complex - public :: trans_ev_band_to_full_complex - - public :: band_band_real - public :: divide_band - - integer(kind=ik), public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used - ! 0 for unblocked - ! 1 for blocked (maxrank: nblk) - contains - - subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, & - tmat, wantDebug, useGPU, success, useQR) - - !------------------------------------------------------------------------------- - ! bandred_real: Reduces a distributed symmetric matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwith of output matrix - ! 
- ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- - - use cuda_functions - use iso_c_binding - -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif -#ifdef WITH_OPENMP - use omp_lib -#endif - use precision - implicit none - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: a(lda,*), tmat(nbw,nbw,*) -#else - real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - real(kind=rk) :: eps - logical, intent(in) :: useGPU - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows, vmrCols - integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - real(kind=rk), allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:) - real(kind=rk), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:) - real(kind=rk), allocatable :: vr(:) - ! needed for blocked QR decomposition - integer(kind=ik) :: PQRPARAM(11), work_size - real(kind=rk) :: dwork_size(1) - real(kind=rk), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:) - - integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev -#ifdef WITH_MPI - integer(kind=ik), external :: numroc -#endif - integer(kind=ik) :: ierr - integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size - integer(kind=c_size_t) :: lc_start, lc_end - integer(kind=ik) :: lr_end - integer(kind=ik) :: na_rows, na_cols - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - logical :: successCUDA - integer(kind=ik) :: istat - character(200) :: errorMessage - - logical, intent(in) :: useQR - - integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, ii, pp, transformChunkSize - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_real") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - - ! Semibandwith nbw must be a multiple of blocksize nblk - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - if (useGPU) then -#ifdef WITH_MPI - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) -#else - na_rows = na - na_cols = na -#endif - endif - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! 
local cols of a tile - - if (useQR) then - - if (useGPU) then - print *,"qr decomposition at the moment not supported with GPU" - stop - endif - - if (which_qr_decomposition == 1) then - call qr_pqrparam_init(pqrparam(1:11), nblk,'M',0, nblk,'M',0, nblk,'M',1,'s') - allocate(tauvector(na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating tauvector "//errorMessage - stop - endif - - allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating blockheuristic "//errorMessage - stop - endif - - l_rows = local_index(na, my_prow, np_rows, nblk, -1) - allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vmrCPU "//errorMessage - stop - endif - - vmrCols = na -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), & - nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), & - vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, & - nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), & - mpi_comm_rows, mpi_comm_cols, blockheuristic) -#endif - work_size = dwork_size(1) - allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating work_blocked "//errorMessage - stop - endif - - work_blocked = 0.0_rk - deallocate(vmrCPU, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when deallocating vmrCPU "//errorMessage - stop - endif - - endif ! which_qr_decomposition - - endif ! useQr - - if (useGPU) then - ! Here we convert the regular host array into a pinned host array - successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMalloc" - stop - endif - - successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMalloc" - stop - endif - - successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMalloc" - stop - endif - - cur_l_rows = 0 - cur_l_cols = 0 - - successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_real_datatype, cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - endif ! useGPU - - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - if (useGPU) then - cur_l_rows = max(l_rows, 1) - cur_l_cols = max(l_cols, 1) - - vmr_size = cur_l_rows * 2 * n_cols - umc_size = cur_l_cols * 2 * n_cols - - ! Allocate vmr and umc only if the inew size exceeds their current capacity - ! Added for FORTRAN CALLS - if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then - if (allocated(vr)) then - deallocate(vr, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"bandred_real: error when deallocating vr "//errorMessage - stop - endif - endif - allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vr "//errorMessage - stop - endif - - endif - - if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then - if (allocated(vmrCUDA)) then - deallocate(vmrCUDA, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vmrCUDA "//errorMessage - stop - endif - - successCUDA = cuda_free(vmr_dev) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cuda_free" - stop - endif - endif - - allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vmrCUDA "//errorMessage - stop - endif - - successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMalloc" - stop - endif - - endif - - if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then - if (allocated(umcCUDA)) then - deallocate(umcCUDA, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when deallocating umcCUDA "//errorMessage - stop - endif - - successCUDA = cuda_free(umc_dev) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaFree" - stop - endif - - endif - - allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when deallocating umcCUDA "//errorMessage - stop - endif - - successCUDA = cuda_malloc(umc_dev, umc_size*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMalloc" - stop - endif - - endif - else ! GPU not used - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vmrCPU "//errorMessage - stop - endif - - allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating umcCPU "//errorMessage - stop - endif - - allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating vr "//errorMessage - stop - endif - endif ! use GPU - - if (useGPU) then - vmrCUDA(1 : cur_l_rows * n_cols) = 0._rk - else - vmrCPU(1:l_rows,1:n_cols) = 0._rk - endif - - vr(:) = 0 - tmat(:,:,istep) = 0 - - if (useGPU) then - umcCUDA(1 : umc_size) = 0._rk - - lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) - lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) - lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) - - if(lc_start .le. 0) lc_start = 1 - - ! Here we assume that the processor grid and the block grid are aligned - cur_pcol = pcol(istep*nbw+1, nblk, np_cols) - - if(my_pcol == cur_pcol) then - - successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*size_of_real_datatype, & - (a_dev + ((lc_start-1) * lda*size_of_real_datatype)), & - lda*size_of_real_datatype, lr_end*size_of_real_datatype, & - (lc_end - lc_start+1), cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy2d" - stop - endif - - endif - endif ! useGPU - - ! 
Reduce current block to lower triangular form - - if (useQR) then - if (which_qr_decomposition == 1) then - vmrCols = 2*n_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE_QR - call qr_pdgeqrf_2dcomm(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), & - na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) - -#else - call qr_pdgeqrf_2dcomm(a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , & - max(l_rows,1), vmrCols, tauvector(1:na), na, & - tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, & - work_size, na, n_cols, nblk, nblk, & - istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,& - 0, PQRPARAM(1:11), mpi_comm_rows, mpi_comm_cols,& - blockheuristic) -#endif - endif - else !useQR - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if (nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0._rk - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(aux1, aux2, 2, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - aux2 = aux1 ! this should be optimized -#endif /* WITH_MPI */ - - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_real(vrl, vnorm2, xf, tau) - - ! Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1._rk - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(vr, lr+1, MPI_REAL8, cur_pcol, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(vr, lr+1, MPI_REAL4, cur_pcol, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - if (useGPU) then - vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) - else - vmrCPU(1:lr,lc) = vr(1:lr) - endif - - tau = vr(lr+1) - tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - ! Local dot product - - aux1 = 0 - -#ifdef WITH_OPENMP - !Open up one omp region to avoid paying openmp overhead. - !This does not help performance due to the addition of two openmp barriers around the MPI call, - !But in the future this may be beneficial if these barriers are replaced with a faster implementation - - !$omp parallel private(mynlc, j, lcx, ii, pp ) shared(aux1) - mynlc = 0 ! 
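hh_transform_real above turns the two reduced scalars (vrl, the pivot element, and vnorm2, the squared norm of the rest) into a reflector: tau and the scaling xf are chosen so that (I - tau*v*v**T) maps the column onto a multiple of the first unit vector. A serial sketch of that computation in the LAPACK dlarfg convention (the sign convention is an assumption, ELPA's handling may differ; the routine name is hypothetical):

    subroutine make_householder(n, x, v, tau, beta)
      implicit none
      integer, parameter :: rk = kind(1.0d0)
      integer, intent(in)        :: n
      real(kind=rk), intent(in)  :: x(n)
      real(kind=rk), intent(out) :: v(n), tau, beta
      real(kind=rk) :: alpha, xnorm
      alpha = x(1)
      xnorm = sqrt(sum(x(2:n)**2))        ! norm of the part to annihilate
      if (xnorm == 0.0_rk) then
        tau = 0.0_rk; beta = alpha        ! nothing to do
        v = 0.0_rk; v(1) = 1.0_rk
        return
      end if
      beta = -sign(sqrt(alpha**2 + xnorm**2), alpha)
      tau  = (beta - alpha) / beta
      v(1) = 1.0_rk
      v(2:n) = x(2:n) / (alpha - beta)    ! then (I - tau*v*v**T)*x = beta*e_1
    end subroutine make_householder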
number of local columns - - !This loop does not have independent iterations, - !'mynlc' is incremented each iteration, and it is difficult to remove this dependency - !Thus each thread executes every iteration of the loop, except it only does the work if it 'owns' that iteration - !That is, a thread only executes the work associated with an iteration if its thread id is congruent to - !the iteration number modulo the number of threads - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0 ) then - mynlc = mynlc+1 - if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then - if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - endif - enddo - - ! Get global dot products - - !$omp barrier - !$omp single -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - if (mynlc>0) call mpi_allreduce(aux1, aux2, mynlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - if (mynlc>0) aux2 = aux1 - -#endif /* WITH_MPI */ - !$omp end single - !$omp barrier - - ! Transform - transformChunkSize=32 - mynlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - mynlc = mynlc+1 - !This loop could be parallelized with an openmp pragma with static scheduling and chunk size 32 - !However, for some reason this is slower than doing it manually, so it is parallelized as below. - do ii=omp_get_thread_num()*transformChunkSize,lr,omp_get_num_threads()*transformChunkSize - do pp = 1,transformChunkSize - if (pp + ii > lr) exit - a(ii+pp,lcx) = a(ii+pp,lcx) - tau*aux2(mynlc)*vr(ii+pp) - enddo - enddo - endif - enddo - !$omp end parallel -#else /* WITH_OPENMP */ - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - if (nlc>0) aux2=aux1 -#endif /* WITH_MPI */ - ! Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr) - endif - enddo -#endif /* WITH_OPENMP */ - enddo ! lc - - if (useGPU) then - ! store column tiles back to GPU - cur_pcol = pcol(istep*nbw+1, nblk, np_cols) - if (my_pcol == cur_pcol) then - successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*size_of_real_datatype)), & - lda*size_of_real_datatype, loc(a(1, lc_start)), & - lda*size_of_real_datatype, lr_end*size_of_real_datatype, & - (lc_end - lc_start+1),cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy2d" - stop - endif - - endif - endif - - ! Calculate scalar products of stored Householder vectors. - ! 
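The comment block explains the key constraint: mynlc is a loop-carried counter, so the iterations cannot simply be divided among threads. Instead every thread traces the whole loop, keeps its own copy of the counter in sync, and only the owning thread (id congruent to the iteration index modulo the thread count) does the actual work. The idiom in isolation, as a self-contained example:

    program modulo_ownership
      use omp_lib
      implicit none
      integer, parameter :: njobs = 100
      integer :: results(njobs)
      integer :: j, k, tid, nth
      results = 0
      !$omp parallel private(j, k, tid, nth) shared(results)
      tid = omp_get_thread_num()
      nth = omp_get_num_threads()
      k = 0
      do j = 1, njobs
        k = k + 1                       ! dependent counter, advanced by every thread
        if (mod(j-1, nth) == tid) then
          results(k) = j*j              ! only the owning thread does the work
        end if
      end do
      !$omp end parallel
      print *, sum(results)
    end program modulo_ownership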
This can be done in different ways, we use dsyrk - - vav = 0 - -#ifdef DOUBLE_PRECISION_REAL - if (useGPU) then - if (l_rows>0) & - call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1)) - else - if (l_rows>0) & - call dsyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1)) - endif -#else - if (useGPU) then - if (l_rows>0) & - call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCUDA, cur_l_rows, 0.0_rk, vav, ubound(vav,dim=1)) - else - if (l_rows>0) & - call ssyrk('U', 'T', n_cols, l_rows, 1.0_rk, vmrCPU, ubound(vmrCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1)) - endif -#endif - - call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc vmc (stored in umc, second half) - - if (useGPU) then - call elpa_transpose_vectors_real (vmrCUDA, cur_l_rows, mpi_comm_rows, & - umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - else - call elpa_transpose_vectors_real (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, & - umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - endif - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - - ! here the GPU version and CPU version diverged substantially, due to the newest - ! optimizations due to Intel. The GPU version has to be re-written - if (useGPU) then - umcCUDA(1 : l_cols * n_cols) = 0.0_rk - vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0 - - if (l_cols>0 .and. l_rows>0) then - successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0 .and. l_rows>0 - - else ! do not useGPU version - !Code for Algorithm 4 - - n_way = 1 -#ifdef WITH_OPENMP - n_way = omp_get_max_threads() -#endif - !umc(1:l_cols,1:n_cols) = 0.d0 - !vmr(1:l_rows,n_cols+1:2*n_cols) = 0 -#ifdef WITH_OPENMP - !$omp parallel private( i,lcs,lce,lrs,lre) -#endif - if (n_way > 1) then - !$omp do - do i=1,min(l_cols_tile, l_cols) - umcCPU(i,1:n_cols) = 0.0_rk - enddo - - !$omp do - do i=1,l_rows - vmrCPU(i,n_cols+1:2*n_cols) = 0.0_rk - enddo - if (l_cols>0 .and. l_rows>0) then - - !SYMM variant 4 - !Partitioned Matrix Expression: - ! Ct = Atl Bt + Atr Bb - ! Cb = Atr' Bt + Abl Bb - ! - !Loop invariant: - ! Ct = Atl Bt + Atr Bb - ! - !Update: - ! C1 = A10'B0 + A11B1 + A21 B2 - ! - !This algorithm chosen because in this algoirhtm, the loop around the dgemm calls - !is easily parallelized, and regardless of choise of algorithm, - !the startup cost for parallelizing the dgemms inside the loop is too great - - !$omp do schedule(static,1) - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 ! local column start - lce = min(l_cols, (i+1)*l_cols_tile) ! local column end - - lrs = i*l_rows_tile+1 ! local row start - lre = min(l_rows, (i+1)*l_rows_tile) ! 
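symm_matrix_allreduce exploits symmetry: only one triangle of vav needs to be summed over the communicator, after which it is mirrored into the other triangle, halving the reduction volume. A hedged sketch of what such a routine does (double precision only; the real ELPA routine may differ in detail):

    subroutine symm_allreduce_sketch(n, a, lda, comm)
      use mpi
      implicit none
      integer, parameter :: rk = kind(1.0d0)
      integer, intent(in) :: n, lda, comm
      real(kind=rk), intent(inout) :: a(lda, n)
      real(kind=rk) :: h1(n*(n+1)/2), h2(n*(n+1)/2)
      integer :: i, nc, ierr
      nc = 0
      do i = 1, n                       ! pack the upper triangle column by column
        h1(nc+1:nc+i) = a(1:i, i)
        nc = nc + i
      end do
      call mpi_allreduce(h1, h2, nc, MPI_REAL8, MPI_SUM, comm, ierr)
      nc = 0
      do i = 1, n                       ! unpack and mirror into the lower triangle
        a(1:i, i) = h2(nc+1:nc+i)
        a(i, 1:i) = h2(nc+1:nc+i)
        nc = nc + i
      end do
    end subroutine symm_allreduce_sketch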
local row end - - !C1 += [A11 A12] [B1 - ! B2] - if ( lre > lrs .and. l_cols > lcs ) then -#ifdef DOUBLE_PRECISION_REAL - call DGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, & - 1.0_rk, a(lrs,lcs), ubound(a,dim=1), & - umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), & - 0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1)) -#else - call SGEMM('N', 'N', lre-lrs+1, n_cols, l_cols-lcs+1, & - 1.0_rk, a(lrs,lcs), ubound(a,dim=1), & - umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), & - 0.0_rk, vmrCPU(lrs,n_cols+1), ubound(vmrCPU,dim=1)) -#endif - endif - - ! C1 += A10' B0 - if ( lce > lcs .and. i > 0 ) then -#ifdef DOUBLE_PRECISION_REAL - call DGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, & - 1.0_rk, a(1,lcs), ubound(a,dim=1), & - vmrCPU(1,1), ubound(vmrCPU,dim=1), & - 0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1)) -#else - call SGEMM('T', 'N', lce-lcs+1, n_cols, lrs-1, & - 1.0_rk, a(1,lcs), ubound(a,dim=1), & - vmrCPU(1,1), ubound(vmrCPU,dim=1), & - 0.0_rk, umcCPU(lcs,1), ubound(umcCPU,dim=1)) -#endif - endif - enddo - endif ! l_cols>0 .and. l_rows>0 - else ! n_way > 1 - umcCPU(1:l_cols,1:n_cols) = 0.0_rk - vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce 1 -#ifdef WITH_OPENMP - !$omp end parallel -#endif - endif ! do not useGPU version - - ! Sum up all ur(:) parts along rows and add them to the uc(:) parts - ! on the processors containing the diagonal - ! This is only necessary if ur has been calculated, i.e. if the - ! global tile size is smaller than the global remaining matrix - - if (useGPU) then - ! here the GPU version and CPU version divereged due to the same reasons as above - - if (tile_size < istep*nbw) then - call elpa_reduce_add_vectors_real (vmrCUDA(cur_l_rows * n_cols + 1),cur_l_rows,mpi_comm_rows, & - umcCUDA, cur_l_cols, mpi_comm_cols, & - istep*nbw, n_cols, nblk) - endif - - if (l_cols>0) then - allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when allocating tmpCUDA "//errorMessage - stop - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, ierr) -#else - call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, ierr) -#endif - -#else /* WITH_MPI */ - tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols) -#endif /* WITH_MPI */ - umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols) - - if (allocated(tmpCUDA)) then - deallocate(tmpCUDA, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage - stop - endif - endif - endif ! l_cols - - ! U = U * Tmat**T - successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype, cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif -#ifdef DOUBLE_PRECISION_REAL - call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, & - 1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols) -#else - call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols, n_cols, & - 1.0_rk, tmat_dev, nbw, umc_dev, cur_l_cols) -#endif - ! 
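Several comparisons in the tile loops above lost their operators in transit (fragments like "if (lce0 .and. l_rows>0"). Under the assumption that they are the usual emptiness guards, the skeleton of one strip of the tiled update reads (a reconstruction sketch, not verbatim source):

    do i = 0, (istep*nbw - 1)/tile_size
      lcs = i*l_cols_tile + 1                 ! first local column of tile i
      lce = min(l_cols, (i+1)*l_cols_tile)    ! last local column of tile i
      lre = min(l_rows, (i+1)*l_rows_tile)    ! last local row of tile i
      if (lce < lcs .or. lre < 1) cycle       ! nothing stored locally for this tile
      ! ... GEMM on a(1:lre, lcs:lce) as in the branches above ...
    end do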
VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif -#ifdef DOUBLE_PRECISION_REAL - call cublas_dgemm('T', 'N', n_cols, n_cols, l_cols, & - 1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, & - 0.0_rk, vav_dev, nbw) - - call cublas_dtrmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, & - 1.0_rk, tmat_dev, nbw, vav_dev, nbw) -#else - call cublas_sgemm('T', 'N', n_cols, n_cols, l_cols, & - 1.0_rk, umc_dev, cur_l_cols, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, & - 0.0_rk, vav_dev, nbw) - - call cublas_strmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, & - 1.0_rk, tmat_dev, nbw, vav_dev, nbw) -#endif - - - - successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_real_datatype, cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - call symm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols) - - successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - ! U = U - 0.5 * V * VAV -#ifdef DOUBLE_PRECISION_REAL - call cublas_dgemm('N', 'N', l_cols, n_cols, n_cols,& - -0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,& - 1.0_rk, umc_dev, cur_l_cols) -#else - call cublas_sgemm('N', 'N', l_cols, n_cols, n_cols,& - -0.5_rk, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,& - 1.0_rk, umc_dev, cur_l_cols) -#endif - successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_real_datatype, cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_real (umcCUDA, cur_l_cols, mpi_comm_cols, & - vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_real_datatype, cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype, cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"bandred_real: error in cudaMemcpy" - stop - endif - - ! A = A - V*U**T - U*V**T - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce 1) then - call elpa_reduce_add_vectors_real (vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1),mpi_comm_rows, & - umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & - istep*nbw, n_cols, nblk) - endif - - if (l_cols>0) then - allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"bandred_real: error when allocating tmpCPU "//errorMessage - stop - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols) -#endif /* WITH_MPI */ - umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols) - - deallocate(tmpCPU, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_real: error when deallocating tmpCPU "//errorMessage - stop - endif - endif - - ! U = U * Tmat**T -#ifdef DOUBLE_PRECISION_REAL - call dtrmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, 1.0_rk, tmat(1,1,istep), ubound(tmat,dim=1), & - umcCPU, ubound(umcCPU,dim=1)) - - ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call dgemm('T', 'N', n_cols, n_cols, l_cols, 1.0_rk, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), & - ubound(umcCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1)) - - call dtrmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, 1.0_rk, tmat(1,1,istep), & - ubound(tmat,dim=1), vav, ubound(vav,dim=1)) -#else - call strmm('Right', 'Upper', 'Trans', 'Nonunit', l_cols,n_cols, 1.0_rk, tmat(1,1,istep), ubound(tmat,dim=1), & - umcCPU, ubound(umcCPU,dim=1)) - - ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - - call sgemm('T', 'N', n_cols, n_cols, l_cols, 1.0_rk, umcCPU, ubound(umcCPU,dim=1), umcCPU(1,n_cols+1), & - ubound(umcCPU,dim=1), 0.0_rk, vav, ubound(vav,dim=1)) - - call strmm('Right', 'Upper', 'Trans', 'Nonunit', n_cols, n_cols, 1.0_rk, tmat(1,1,istep), & - ubound(tmat,dim=1), vav, ubound(vav,dim=1)) -#endif - - call symm_matrix_allreduce(n_cols,vav, nbw, nbw ,mpi_comm_cols) - - ! U = U - 0.5 * V * VAV -#ifdef DOUBLE_PRECISION_REAL - call dgemm('N', 'N', l_cols, n_cols, n_cols, -0.5_rk, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, & - ubound(vav,dim=1), 1.0_rk, umcCPU, ubound(umcCPU,dim=1)) -#else - call sgemm('N', 'N', l_cols, n_cols, n_cols, -0.5_rk, umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, & - ubound(vav,dim=1), 1.0_rk, umcCPU, ubound(umcCPU,dim=1)) -#endif - - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_real(umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, & - vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - ! 
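Written out, the sequence of BLAS calls above realizes one blocked two-sided reduction step. With V the panel of Householder vectors, T = tmat the triangular factor, and u0 = A**T * V already stored in umcCPU:

    U   = u0 * T**T                                    (dtrmm)
    VAV = U**T * V * T**T = T * V**T * A * V * T**T    (dgemm with the transposed
                                                        copy of V, dtrmm, allreduce)
    U   = U - 0.5 * V * VAV                            (dgemm)
    A   = A - V * U**T - U * V**T                      (tiled dgemm that follows)

Together these are algebraically identical to A <- (I - V*T*V**T)**T * A * (I - V*T*V**T); the -0.5 correction folds the quadratic cross term of the product symmetrically into the two rank-2k updates, so the trailing matrix is touched only once.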
A = A - V*U**T - U*V**T -#ifdef WITH_OPENMP - !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) - n_threads = omp_get_num_threads() - if (mod(n_threads, 2) == 0) then - n_way = 2 - else - n_way = 1 - endif - - m_way = n_threads / n_way - - m_id = mod(omp_get_thread_num(), m_way) - n_id = omp_get_thread_num() / m_way - - do ii=n_id*tile_size,(istep*nbw-1),tile_size*n_way - i = ii / tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce lre ) myend = lre - if ( myend-mystart+1 < 1) cycle -#ifdef DOUBLE_PRECISION_REAL - call dgemm('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -1.0_rk, & - vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), & - 1.0_rk, a(mystart,lcs), ubound(a,1)) -#else - call sgemm('N', 'T', myend-mystart+1, lce-lcs+1, 2*n_cols, -1.0_rk, & - vmrCPU(mystart, 1), ubound(vmrCPU,1), umcCPU(lcs,1), ubound(umcCPU,1), & - 1.0_rk, a(mystart,lcs), ubound(a,1)) -#endif - enddo - !$omp end parallel -#else /* WITH_OPENMP */ - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce0) then -#ifdef DOUBLE_PRECISION_REAL - call cublas_dgemm('T', 'N', n_cols, l_cols, l_rows, 1.0_rk, hvm_dev, max_local_rows, & - q_dev, ldq , 0.0_rk, tmp_dev, n_cols) -#else - call cublas_sgemm('T', 'N', n_cols, l_cols, l_rows, 1.0_rk, hvm_dev, max_local_rows, & - q_dev, ldq , 0.0_rk, tmp_dev, n_cols) -#endif - successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_real_datatype, cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaMemcpy" - stop - endif - - else - !#ifdef WITH_GPU_VERSION - ! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_real_datatype) - ! if (istat .ne. 0) then - ! print *,"trans_ev_band_to_full_real: error in cudaMemset" - ! stop - ! endif - ! - !#else - tmp1(1:l_cols*n_cols) = 0 - !#endif - endif - - !#ifdef WITH_GPU_VERSION - ! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost) - ! if (istat .ne. 0) then - ! print *,"error in cudaMemcpy" - ! stop - ! endif - !#endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmp2(1:n_cols*l_cols) = tmp1(n_cols*l_cols) -#endif /* WITH_MPI */ - !#ifdef WITH_GPU_VERSION - ! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) - ! if (istat .ne. 0) then - ! print *,"error in cudaMemcpy" - ! stop - ! 
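In the OpenMP variant of this update, threads are arranged as an m_way x n_way grid (rows times column tiles), and inside each tile every thread takes one contiguous chunk of rows; the chunking code lost some comparison operators in transit. A sketch of the per-thread bounds under that assumption (names as in the loop above; this fragment sits inside the do ii loop):

    work_per_thread = (lre + m_way - 1) / m_way   ! ceiling division over lre rows
    mystart = m_id * work_per_thread + 1
    myend   = mystart + work_per_thread - 1
    if (myend > lre) myend = lre                  ! clip the last chunk
    if (myend - mystart + 1 < 1) cycle            ! no rows left for this thread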
endif - !#endif - - if (l_rows>0) then - successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaMemcpy" - stop - endif - - successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaMemcpy" - stop - endif -#ifdef DOUBLE_PRECISION_REAL - call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk, tmat_dev, nbw, tmp_dev, n_cols) - call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk, hvm_dev, max_local_rows, & - tmp_dev, n_cols, 1.0_rk, q_dev, ldq) -#else - call cublas_strmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk, tmat_dev, nbw, tmp_dev, n_cols) - call cublas_sgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk, hvm_dev, max_local_rows, & - tmp_dev, n_cols, 1.0_rk, q_dev, ldq) -#endif - successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaMemcpy" - stop - endif - - endif ! l_rows > 0 - !#ifdef WITH_GPU_VERSION - ! istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost) - ! if (istat .ne. 0) then - ! print *,"error in cudaMemcpy" - ! stop - ! endif - ! - !#endif - enddo ! istep - - else ! do not useGPU - - ! t_blocking was formerly 2; 3 is a better choice - t_blocking = 3 ! number of matrices T (tmat) which are aggregated into a new (larger) T matrix (tmat_complete) and applied at once - - ! we only use the t_blocking if we could call it fully, this is might be better but needs to benchmarked. -! if ( na >= ((t_blocking+1)*nbw) ) then - cwy_blocking = t_blocking * nbw - - allocate(tmp1(max_local_cols*cwy_blocking)) - allocate(tmp2(max_local_cols*cwy_blocking)) - allocate(hvb(max_local_rows*cwy_blocking)) - allocate(hvm(max_local_rows,cwy_blocking)) - allocate(tmat_complete(cwy_blocking,cwy_blocking)) - allocate(t_tmp(cwy_blocking,nbw)) - allocate(t_tmp2(cwy_blocking,nbw)) -! else -! allocate(tmp1(max_local_cols*nbw)) -! allocate(tmp2(max_local_cols*nbw)) -! allocate(hvb(max_local_rows*nbw)) -! allocate(hvm(max_local_rows,nbw)) -! endif - - hvm = 0 ! Must be set to 0 !!! - hvb = 0 ! Safety only - - l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q - -! if ( na >= ((t_blocking+1)*nbw) ) then - - do istep=1,((na-1)/nbw-1)/t_blocking + 1 - ! This the call when using na >= ((t_blocking+1)*nbw) - ! n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - ! As an alternative we add some special case handling if na < cwy_blocking - IF (na < cwy_blocking) THEN - n_cols = MAX(0, na-nbw) - IF ( n_cols .eq. 0 ) THEN - EXIT - END IF - ELSE - n_cols = MIN(na,istep*cwy_blocking+nbw) - (istep-1)*cwy_blocking - nbw ! Number of columns in current step - END IF - ! Broadcast all Householder vectors for current step compressed in hvb - - nb = 0 - ns = 0 - - do lc = 1, n_cols - ncol = (istep-1)*cwy_blocking + nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! absolute number of pivot row - - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! 
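The loop above compresses Householder vectors of varying local length back-to-back into hvb and flushes one MPI_Bcast per block-column boundary, so many vectors travel in a single message. The packing idiom in isolation (a sketch; row_len and col are hypothetical placeholders):

    nb = 0
    do lc = 1, n_cols
      l = row_len(lc)                   ! length of vector lc (varies per column)
      buf(nb+1 : nb+l) = col(1:l, lc)   ! append at the current fill position
      nb = nb + l
    end do
    ! buf(1:nb) now holds all vectors back to back; the counters ns/nb mark the
    ! not-yet-flushed range when broadcasting in batches, as in the loop above.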
HV local column number - - if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh) - - nb = nb+l_rows - - if (lc==n_cols .or. mod(ncol,nblk)==0) then -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Bcast(hvb(ns+1), nb-ns, MPI_REAL8, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr) -#else - call MPI_Bcast(hvb(ns+1), nb-ns, MPI_REAL4, pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - ns = nb - endif - enddo - - ! Expand compressed Householder vectors into matrix hvm - - nb = 0 - do lc = 1, n_cols - nrow = (istep-1)*cwy_blocking + lc ! absolute number of pivot row - l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast - - hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows) - if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1._rk - - nb = nb+l_rows - enddo - - l_rows = local_index(MIN(na,(istep+1)*cwy_blocking), my_prow, np_rows, nblk, -1) - - ! compute tmat2 out of tmat(:,:,) - tmat_complete = 0 - do i = 1, t_blocking - t_cols = MIN(nbw, n_cols - (i-1)*nbw) - if (t_cols <= 0) exit - t_rows = (i - 1) * nbw - tmat_complete(t_rows+1:t_rows+t_cols,t_rows+1:t_rows+t_cols) = tmat(1:t_cols,1:t_cols,(istep-1)*t_blocking + i) - if (i > 1) then -#ifdef DOUBLE_PRECISION_REAL - call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.0_rk, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & - max_local_rows, 0.0_rk, t_tmp, cwy_blocking) -#ifdef WITH_MPI - call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr) -#else - t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw) -#endif - call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking) - call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, & - t_tmp2, cwy_blocking) -#else /* DOUBLE_PRECISION_REAL */ - call sgemm('T', 'N', t_rows, t_cols, l_rows, 1.0_rk, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), & - max_local_rows, 0.0_rk, t_tmp, cwy_blocking) -#ifdef WITH_MPI - call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr) -#else - t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw) -#endif - call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking) - call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk, tmat_complete(t_rows+1,t_rows+1), & - cwy_blocking, t_tmp2, cwy_blocking) -#endif /* DOUBLE_PRECISION_REAL */ - - tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols) - endif - enddo - - ! 
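The tmat_complete assembly implements the standard aggregation of compact-WY factors: for two blocked reflectors (V1, T1) and (V2, T2),

    (I - V1*T1*V1**T)(I - V2*T2*V2**T) = I - V*T*V**T,  with

    V = [V1 V2],   T = [ T1   -T1*(V1**T*V2)*T2 ]
                       [ 0            T2        ]

The gemm computes V1**T * V2 (summed over mpi_comm_rows), and the two trmm calls multiply by T1 from the left and by -T2 from the right before the coupling block is stored in the upper-right corner of tmat_complete.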
Q = Q - V * T**T * V**T * Q - - if (l_rows>0) then -#ifdef DOUBLE_PRECISION_REAL - call dgemm('T', 'N', n_cols, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & - q, ldq, 0.0_rk, tmp1, n_cols) -#else - call sgemm('T', 'N', n_cols, l_cols, l_rows, 1.0_rk, hvm, ubound(hvm,dim=1), & - q, ldq, 0.0_rk, tmp1, n_cols) -#endif - - else - tmp1(1:l_cols*n_cols) = 0 - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows ,mpierr) -#else - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows ,mpierr) -#endif - -#else /* WITH_MPI */ - tmp2 = tmp1 - -#endif /* WITH_MPI */ - - if (l_rows>0) then -#ifdef DOUBLE_PRECISION_REAL - call dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk, tmat_complete, cwy_blocking, tmp2, n_cols) - call dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk, hvm, ubound(hvm,dim=1), tmp2, n_cols, 1.0_rk, q, ldq) -#else - call strmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk, tmat_complete, cwy_blocking, tmp2, n_cols) - call sgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk, hvm, ubound(hvm,dim=1), tmp2, n_cols, 1.0_rk, q, ldq) -#endif - - endif - enddo ! istep - - endif ! useGPU - - deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_band_to_full_real: error when deallocating tmp1 tmp2 hvb "//errorMessage - stop - endif - - if (useGPU) then - successCUDA = cuda_free(hvm_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaFree" - stop - endif - - successCUDA = cuda_free(tmp_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaFree" - stop - endif - - successCUDA = cuda_free(tmat_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaFree" - stop - endif - - successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_real_datatype, cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaFree" - stop - endif - - ! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols) - - successCUDA = cuda_free(q_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_real: error in cudaFree" - stop - endif - - ! deallocate(q_temp, stat=istat, errmsg=errorMessage) - ! if (istat .ne. 0) then - ! print *,"error when deallocating q_temp "//errorMessage - ! stop - ! endif - ! deallocate(tmat_temp, stat=istat, errmsg=errorMessage) - ! if (istat .ne. 0) then - ! print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage - ! stop - ! endif - - endif ! useGPU - - deallocate(hvm, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_band_to_full_real: error when deallocating hvm "//errorMessage - stop - endif - - if (useQr) then - deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_band_to_full_real: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage - stop - endif - - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_real") -#endif - end subroutine trans_ev_band_to_full_real - - subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_real: - ! Reduces a real symmetric band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwith - ! - ! 
nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk), intent(in) :: a(lda,*) -#else - real(kind=rk), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 - real(kind=rk), intent(out), & - allocatable :: hh_trans_real(:,:) - - real(kind=rk) :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - real(kind=rk) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - integer(kind=ik), allocatable :: mpi_statuses(:,:), global_id_tmp(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - real(kind=rk), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - real(kind=rk), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) - -#ifdef WITH_OPENMP - integer(kind=ik) :: omp_get_max_threads -#endif - integer :: istat - character(200) :: errorMessage - -#ifndef WITH_MPI - integer(kind=ik) :: startAddr -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_real") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - ! Get global_id mapping 2D procssor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating global_id "//errorMessage - stop - endif - - - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_OPENMP - allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"tridiag_band_real: error when allocating global_id_tmp "//errorMessage - stop - endif -#endif - -#ifdef WITH_MPI - -#ifndef WITH_OPENMP - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#else - global_id_tmp(:,:) = global_id(:,:) - call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) - deallocate(global_id_tmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating global_id_tmp "//errorMessage - stop - endif -#endif /* WITH_OPENMP */ - -#endif /* WITH_MPI */ - ! Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating block_limits"//errorMessage - stop - endif - - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating ab"//errorMessage - stop - endif - - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_real(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating limits"//errorMessage - stop - endif - - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! Allocate space for HH vectors - - allocate(hh_trans_real(nb,num_hh_vecs), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hh_trans_real"//errorMessage - stop - endif - - - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating ireq_hhr"//errorMessage - stop - endif - allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating ireq_hhs"//errorMessage - stop - endif - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. 
nx>1) then - num_chunks = num_chunks+1 -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real8, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#else - call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real4, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#endif - -#else /* WITH_MPI */ - ! carefull non-block recv data copy must be done at wait or send - ! hh_trans_real(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) -#endif /* WITH_MPI */ - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo -#ifdef WITH_MPI - ireq_hhs(:) = MPI_REQUEST_NULL -#endif - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hh_gath"//errorMessage - stop - endif - - allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hh_send"//errorMessage - stop - endif - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hh_cnt"//errorMessage - stop - endif - - allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hh_dst"//errorMessage - stop - endif - - hh_cnt(:) = 1 ! The first transfomation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! Limits for sending - - allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating snd_limits"//errorMessage - stop - endif - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 - max_threads = omp_get_max_threads() - ! For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating omp_block_limits"//errorMessage - stop - endif - - ! Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating hv_t, tau_t"//errorMessage - stop - endif - - hv_t = 0 - tau_t = 0 -#endif /* WITH_OPENMP */ - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! 
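The band matrix ab uses a LAPACK-style lower band layout (an inference from the indexing: ab(1,j) holds the diagonal entry of column j, ab(2,j) the first subdiagonal, and d/e are later read from ab(1,.) and ab(2,.)). Filling such a layout from a full symmetric matrix would look like this (a_full is a hypothetical dense array):

    do j = 1, na
      do i = j, min(na, j + nb)
        ab(i - j + 1, j) = a_full(i, j)   ! row index relative to the diagonal
      end do
    end do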
Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(ab_s, nb+1, mpi_real8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s, nb+1, mpi_real4, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - -#ifndef WITH_MPI - startAddr = ubound(hh_trans_real,dim=2) -#endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na-1) then - ! Transform first column of remaining matrix - vnorm2 = sum(ab(3:n+1,na_s-n_off)**2) - call hh_transform_real(ab(2,na_s-n_off),vnorm2,hf,tau) - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - endif - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! Receive Householder vector from previous task, from PE owning subdiagonal - -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_recv(hv, nb, mpi_real8, my_pe-1, 2, mpi_comm, MPI_STATUS, mpierr) -#else - call mpi_recv(hv, nb, mpi_real4, my_pe-1, 2, mpi_comm, MPI_STATUS, mpierr) -#endif - -#else /* WITH_MPI */ - - hv(1:nb) = hv_s(1:nb) - -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_recv(hv, nb, mpi_real8, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#else - call mpi_recv(hv, nb, mpi_real4, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - hv(1:nb) = hv_s(1:nb) -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - tau = hv(1) - hv(1) = 1._rk - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif - -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! The code would be considerably easier, if the MPI communication would be made within - ! the parallel region - this is avoided here since this would require - ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! 
with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istepna) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block -#ifdef DOUBLE_PRECISION_REAL - call DSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) -#else - call SSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) -#endif - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc) -#ifdef DOUBLE_PRECISION_REAL - call DSYR2('L', nc, -1.0_rk ,hd, 1, hv, 1, ab(1,ns), 2*nb-1) -#else - call SSYR2('L', nc, -1.0_rk ,hd, 1, hv, 1, ab(1,ns), 2*nb-1) -#endif - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! Transform subdiagonal block -#ifdef DOUBLE_PRECISION_REAL - call DGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) -#else - call SGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) -#endif - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1._rk - hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0.0_rk - - ! update subdiagonal block for old and new Householder transformation - ! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster -#ifdef DOUBLE_PRECSION_REAL - call DGEMV('T',nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, 0.0_rk, h(2), 1) -#else - call SGEMV('T',nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, 0.0_rk, h(2), 1) -#endif - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! 
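As the comment in the loop says, BLAS has no fused kernel for the nonsymmetric rank-2 update ("DGER2" does not exist), so the update is written out per column, which also respects the per-column row shift of the band storage. On a plainly stored m x n matrix the same update could be phrased as two rank-1 calls (v1, v2, h, w, x, ldx are illustrative names):

    ! X <- X - v1*h**T - v2*w**T  (plain column-major storage, leading dimension ldx)
    call dger(m, n, -1.0_rk, v1, 1, h, 1, x, ldx)
    call dger(m, n, -1.0_rk, v2, 1, w, 1, x, ldx)

The hand-written loop fuses both updates in a single pass per column, which is friendlier to cache than two separate DGER sweeps.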
my_thread -!$omp end parallel do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then -#ifdef WITH_MPI - call mpi_wait(ireq_ab, mpi_status, mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(ab_s, nb+1, mpi_real8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s, nb+1, mpi_real4, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - ! Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 -#ifdef WITH_MPI - if (istep>=max_threads .and. ne <= na) then -#ifdef DOUBLE_PRECISION_REAL - call mpi_recv(ab(1,ne-n_off), nb+1, mpi_real8, my_pe+1, 1, mpi_comm, mpi_status, mpierr) -#else - call mpi_recv(ab(1,ne-n_off), nb+1, mpi_real4, my_pe+1, 1, mpi_comm, mpi_status, mpierr) -#endif - endif -#else /* WITH_MPI */ - if (istep>=max_threads .and. ne <= na) then - ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) - endif -#endif /* WITH_MPI */ - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne < na) then -#ifdef WITH_MPI - call mpi_wait(ireq_hv, mpi_status, mpierr) -#endif - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(hv_s, nb, mpi_real8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#else - call mpi_isend(hv_s, nb, mpi_real4, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif /* WITH_OPENMP */ - - do iblk=1,nblocks - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! last column in block - - if (ns+n_off>na) exit - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real4, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#endif - -#else /* WITH_MPI */ - ! 
do the post-poned irecv here - startAddr = startAddr - hh_cnt(iblk) - hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif /* WITH_MPI */ - - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. -#endif /* WITH_OPENMP */ - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Multiply diagonal block and subdiagonal block with Householder vector - - if (iblk==nblocks .and. nc==nb) then - - ! We need the last column from the next PE. - ! First do the matrix multiplications without last column ... - - ! Diagonal block, the contribution of the last element is added below! - ab(1,ne) = 0 -#ifdef DOUBLE_PRECISION_REAL - call DSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) - - ! Subdiagonal block - if (nr>0) call DGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) - - ! ... then request last column ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne), nb+1, mpi_real8, my_pe+1, 1, mpi_comm, MPI_STATUS, mpierr) -#else - call mpi_recv(ab(1,ne), nb+1, mpi_real8, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - - ab(1:nb+1,ne) = ab_s(1:nb+1) - -#endif /* WITH_MPI */ - -#else /* DOUBLE_PRECISION_REAL */ - call SSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) - - ! Subdiagonal block - if (nr>0) call SGEMV('N', nr, nb-1, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) - - ! ... then request last column ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_recv(ab(1,ne), nb+1, mpi_real4, my_pe+1, 1, mpi_comm, MPI_STATUS, mpierr) -#else - call mpi_recv(ab(1,ne), nb+1, mpi_real4, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - - ab(1:nb+1,ne) = ab_s(1:nb+1) - -#endif /* WITH_MPI */ - -#endif /* DOUBLE_PRECISION_REAL */ - - ! ... and complete the result - hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb) - hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau - - else - - ! Normal matrix multiply -#ifdef DOUBLE_PRECISION_REAL - call DSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) - if (nr>0) call DGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) -#else - call SSYMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, 0.0_rk, hd, 1) - if (nr>0) call SGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, 0.0_rk, hs, 1) -#endif - endif - - ! Calculate first column of subdiagonal block and calculate new - ! Householder transformation for this column - - hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb - tau_new = 0 - - if (nr>0) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation ... - if (nr>1) then - vnorm2 = sum(ab(nb+2:nb+nr,ns)**2) - call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new) - hv_new(1) = 1. - hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - endif - - ! ... 
and send it away immediatly if this is the last block - - if (iblk==nblocks) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) -#endif - -#endif /* WITH_MPI */ - hv_s(1) = tau_new - hv_s(2:) = hv_new(2:) - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(hv_s, nb, mpi_real8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#else - call mpi_isend(hv_s, nb, mpi_real4, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - endif - - ! Transform diagonal block - x = dot_product(hv(1:nc),hd(1:nc))*tau - hd(1:nc) = hd(1:nc) - 0.5_rk*x*hv(1:nc) - - if (my_pe>0 .and. iblk==1) then - - ! The first column of the diagonal block has to be send to the previous PE - ! Calculate first column only ... - - ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1) - - ! ... send it away ... -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) -#else - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) -#endif - -#endif /* WITH_MPI */ - ab_s(1:nb+1) = ab(1:nb+1,ns) - -#ifdef DOUBLE_PRECISION_REAL - -#ifdef WITH_MPI - call mpi_isend(ab_s, nb+1, mpi_real8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - ! ... and calculate remaining columns with rank-2 update - if (nc>1) call DSYR2('L', nc-1, -1.0_rk, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1) - else - ! No need to send, just a rank-2 update - call DSYR2('L', nc, -1.0_rk, hd, 1, hv, 1, ab(1,ns), 2*nb-1) - endif -#else /* DOUBLE_PRECISION_REAL */ - -#ifdef WITH_MPI - call mpi_isend(ab_s, nb+1, mpi_real4, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - ! ... and calculate remaining columns with rank-2 update - if (nc>1) call SSYR2('L', nc-1, -1.0_rk, hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1) - else - ! No need to send, just a rank-2 update - call SSYR2('L', nc, -1.0_rk, hd, 1, hv, 1, ab(1,ns), 2*nb-1) - endif -#endif /* DOUBLE_PRECISION_REAL */ - - - ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb - - if (nr>0) then - if (nr>1) then -#ifdef DOUBLE_PRECISION_REAL - call DGEMV('T', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, 0.0_rk, h(2), 1) -#else - call SGEMV('T', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, 0.0_rk, h(2), 1) -#endif - x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i) - enddo - else - ! No double Householder transformation for nr=1, just complete the row - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i) - enddo - endif - endif - - ! Use new HH vector for the next block - hv(:) = hv_new(:) - tau = tau_new - - enddo - -#ifdef WITH_OPENMP - endif - - do iblk = 1, nblocks - - if (hh_dst(iblk) >= np_rows) exit - if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit - - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! 
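The diagonal-block transformation just above is the classic symmetric rank-2 reformulation, worth spelling out once. With H = I - tau*v*v**T and d0 = tau*A*v (the SYMV result):

    H*A*H = A - v*d0**T - d0*v**T + x*v*v**T,   where x = tau*(v**T * d0)

Substituting d = d0 - (x/2)*v absorbs the last term:

    H*A*H = A - v*d**T - d*v**T

which is exactly the DSYR2 call once hd has been corrected by -0.5*x*hv.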
- ! Send to destination - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real4, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#endif - -#else /* WITH_MPI */ - ! do the postponed irecv here - startAddr = startAddr - hh_cnt(iblk) - hh_trans_real(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif /* WITH_MPI */ - - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif - - enddo -#endif /* WITH_OPENMP */ - enddo ! istep - - ! Finish the last outstanding requests - -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call mpi_wait(ireq_ab,MPI_STATUS,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS,mpierr) - - allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage - stop - endif - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr) - deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage - stop - endif -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr) - call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr) - - call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr) - call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - -#ifdef WITH_MPI - call mpi_barrier(mpi_comm,mpierr) -#endif - deallocate(ab, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating ab"//errorMessage - stop - endif - - deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating ireq_hhr, ireq_hhs"//errorMessage - stop - endif - - deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating hh_cnt, hh_dst"//errorMessage - stop - endif - - deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating hh_gath, hh_send"//errorMessage - stop - endif - - deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating limits, snd_limits"//errorMessage - stop - endif - - deallocate(block_limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating block_limits"//errorMessage - stop - endif
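mpi_waitall accepts either a statuses array of shape (MPI_STATUS_SIZE, count), as allocated in the OpenMP branch above, or the constant MPI_STATUSES_IGNORE when the statuses are not needed. A minimal sketch of the two idioms, assuming the Fortran mpi module (finish_requests is an illustration name, not a routine from this patch):

! Sketch only: the two legal ways to complete a batch of requests.
subroutine finish_requests(nreq, requests)
  use mpi
  implicit none
  integer, intent(in)    :: nreq
  integer, intent(inout) :: requests(nreq)
  integer :: mpierr
  integer, allocatable :: statuses(:,:)
  ! Variant 1: keep the statuses (sources, tags, counts can be inspected)
  allocate(statuses(MPI_STATUS_SIZE, nreq))
  call mpi_waitall(nreq, requests, statuses, mpierr)
  deallocate(statuses)
  ! Variant 2: discard the statuses
  ! call mpi_waitall(nreq, requests, MPI_STATUSES_IGNORE, mpierr)
end subroutine finish_requests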
- deallocate(global_id, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_real: error when deallocating global_id"//errorMessage - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("tridiag_band_real") -#endif - - end subroutine tridiag_band_real - - subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, hh_trans_real, & - mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, & - THIS_REAL_ELPA_KERNEL) - !------------------------------------------------------------------------------- - ! trans_ev_tridi_to_band_real: - ! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix - ! - ! Parameters - ! - ! na Order of matrix a, number of rows of matrix q - ! - ! nev Number of eigenvectors to compute (= columns of matrix q) - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwidth - ! - ! q On input: Eigenvectors of tridiagonal matrix - ! On output: Transformed eigenvectors - ! Distribution is like in Scalapack. - ! - ! ldq Leading dimension of q - ! matrixCols local columns of matrix q - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns/both - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use cuda_functions - use precision - use pack_unpack_real - use pack_unpack_real_gpu - use compute_hh_trafo_real - use iso_c_binding - implicit none - logical, intent(in) :: useGPU - - integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL - integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - real(kind=rk) :: q(ldq,*) -#else - real(kind=rk) :: q(ldq,matrixCols) -#endif - real(kind=rk), intent(in) :: hh_trans_real(:,:) - integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol - - integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2 - integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end - integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end - integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length - integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count -#ifdef WITH_OPENMP - integer(kind=ik) :: thread_width, csw, b_off, b_len -#endif - integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd - integer(kind=ik) :: a_off, current_tv_off, max_blk_size - integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk -#ifdef WITH_OPENMP -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif -#endif - logical :: flag - -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: a(:,:,:,:), row(:) -#else - real(kind=rk), allocatable :: a(:,:,:), row(:) -#endif - - real(kind=rk) , allocatable :: row_group(:,:) - -#ifdef WITH_OPENMP - real(kind=rk), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:) -#else - real(kind=rk), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:) - real(kind=rk), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:) -#endif - real(kind=rk), allocatable :: result_buffer(:,:,:) - real(kind=rk), allocatable :: bcast_buffer(:,:) - integer(kind=ik) :: tmp - -! real*8, allocatable, device :: a_dev(:,:,:) -! real*8, allocatable, device :: bcast_buffer_dev(:,:) -! real*8, allocatable, device :: row_dev(:) -!
real*8, allocatable, device :: row_group_dev(:,:) -! real*8, allocatable, device :: hh_dot_dev(:) -! real*8, allocatable, device :: hh_tau_dev(:) - - integer(kind=c_intptr_t) :: a_dev - integer(kind=c_intptr_t) :: bcast_buffer_dev - integer(kind=c_size_t) :: num - integer(kind=c_size_t) :: dev_offset, dev_offset_1 - - - integer(kind=c_intptr_t) :: row_dev - integer(kind=c_intptr_t) :: row_group_dev - integer(kind=c_intptr_t) :: hh_dot_dev - integer(kind=c_intptr_t) :: hh_tau_dev - Integer(kind=ik) :: top, chunk, this_chunk - integer(kind=ik) :: row_group_size, unpack_idx - - integer(kind=ik) :: n_off - integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:) - integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:) - integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:) -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) -#endif - ! MPI send/recv tags, arbitrary - - integer(kind=ik), parameter :: bottom_recv_tag = 111 - integer(kind=ik), parameter :: top_recv_tag = 222 - integer(kind=ik), parameter :: result_recv_tag = 333 - - ! Just for measuring the kernel performance - real(kind=c_double) :: kernel_time ! MPI_WTIME always needs double - ! long integer - integer(kind=lik) :: kernel_flops - -#ifdef WITH_OPENMP - integer(kind=ik) :: max_threads, my_thread - integer(kind=ik) :: omp_get_max_threads -#endif - - logical, intent(in) :: wantDebug - logical :: success - integer(kind=ik) :: istat - character(200) :: errorMessage - logical :: successCUDA -#ifndef WITH_MPI - integer(kind=ik) :: j1 -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("trans_ev_tridi_to_band_real") -#endif - -#ifdef WITH_GPU_VERSION - unpack_idx = 0 - row_group_size = 0 -#endif - success = .true. - kernel_time = 0.0 - kernel_flops = 0 - -#ifdef WITH_OPENMP - max_threads = 1 - max_threads = omp_get_max_threads() -#endif - call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr) - call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr) - call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr) - call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr) - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif - - nfact = nbw / nblk - - - ! local number of eigenvectors - l_nev = local_index(nev, my_pcol, np_cols, nblk, -1) - - if (l_nev==0) then -#ifdef WITH_OPENMP - thread_width = 0 -#endif - stripe_width = 0 - stripe_count = 0 - last_stripe_width = 0 - else - - ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into - ! every primary cache - if (.not.(useGPU)) then - -#ifdef WITH_OPENMP - thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread -#endif - stripe_width = 48 ! Must be a multiple of 4 -#ifdef WITH_OPENMP - stripe_count = (thread_width-1)/stripe_width + 1 -#else - stripe_count = (l_nev-1)/stripe_width + 1 -#endif - ! Adapt stripe width so that last one doesn't get too small -#ifdef WITH_OPENMP - stripe_width = (thread_width-1)/stripe_count + 1 -#else - stripe_width = (l_nev-1)/stripe_count + 1 -#endif - stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!! - else ! GPUs are used - stripe_width = 256 ! 
Must be a multiple of 4 - stripe_count = (l_nev - 1) / stripe_width + 1 - endif - - last_stripe_width = l_nev - (stripe_count-1)*stripe_width - endif - - ! Determine the matrix distribution at the beginning - - allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating limits"//errorMessage - stop - endif - call determine_workload(na, nbw, np_rows, limits) - - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - a_dim2 = max_blk_size + nbw - - if (useGPU) then - num = (stripe_width*a_dim2*stripe_count)*size_of_real_datatype - successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage - stop - endif - - successCUDA = cuda_memset(a_dev , 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage - stop - endif - - else ! GPUs are not used -!DEC$ ATTRIBUTES ALIGN: 64:: a -#ifdef WITH_OPENMP - allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage - stop - endif - - ! a(:,:,:,:) should be set to 0 in a parallel region, not here! -#else - allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage - stop - endif - - a(:,:,:) = 0 -#endif - endif !useGPU - - allocate(row(l_nev), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating row"//errorMessage - stop - endif - - row(:) = 0 - - if (useGPU) then - num = (l_nev)*size_of_real_datatype - successCUDA = cuda_malloc( row_dev,num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc "//errorMessage - stop - endif - - successCUDA = cuda_memset(row_dev , 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage - stop - endif - - ! "row_group" and "row_group_dev" are needed for GPU optimizations - allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating row_group"//errorMessage - stop - endif - row_group(:, :) = 0 - - num = (l_nev*nblk)*size_of_real_datatype - ! call cuda_malloc2d( row_group_dev,l_nev*size_of_real_datatype,nblk*size_of_real_datatype) - successCUDA = cuda_malloc(row_group_dev, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage - stop - endif - successCUDA = cuda_memset(row_group_dev , 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage - stop - endif - - endif ! useGPU - - ! Copy q from a block cyclic distribution into a distribution with contiguous rows, - ! and transpose the matrix using stripes of given stripe_width for cache blocking. - - ! The peculiar way it is done below is due to the fact that the last row should be - ! ready first since it is the first one to start below - -#ifdef WITH_OPENMP - ! Please note about the OMP usage below: - ! This is not for speed, but because we want the matrix a in the memory and - ! 
in the cache of the correct thread (if possible) -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - !$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - a(:,:,:,my_thread) = 0.0_rk ! if possible, do first touch allocation! - enddo - !$omp end parallel do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif /* WITH_OPENMP */ - - do ip = np_rows-1, 0, -1 - if (my_prow == ip) then - ! Receive my rows which have not yet been received - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src < my_prow) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#endif - -#else /* WITH_MPI */ - row(1:l_nev) = row(1:l_nev) -#endif /* WITH_MPI */ - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row, i-limits(ip), my_thread, stripe_count, & - thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - - if (useGPU) then - ! An unpacking of the current row group may occur before queuing the next row - call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, & - stripe_width, last_stripe_width, a_dim2, l_nev,& - row_group_size, nblk, unpack_idx, i - limits(ip), .false.) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct? -#endif /* WITH_MPI */ - - else -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - row(1:l_nev) = row(1:l_nev) -#endif /* WITH_MPI */ - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) - endif - -#endif /* WITH_OPENMP */ - elseif (src==my_prow) then - src_offset = src_offset+1 - if (.not.(useGPU)) row(:) = q(src_offset, 1:l_nev) - -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row, i-limits(ip), my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - if (useGPU) then - ! 
An unpacking of the current row group may occur before queuing the next row - call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, & - stripe_width, last_stripe_width, a_dim2, l_nev,& - row_group_size, nblk, unpack_idx, i - limits(ip), .false.) - row_group(:, row_group_size) = q(src_offset, 1:l_nev) - else - call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width) - endif -#endif /* WITH_OPENMP */ - endif - enddo - ! Send all rows which have not yet been sent - src_offset = 0 - do dst = 0, ip-1 - do i=limits(dst)+1,limits(dst+1) - if (mod((i-1)/nblk, np_rows) == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Send(row, l_nev, MPI_REAL8, dst, 0, mpi_comm_rows, mpierr) -#else - call MPI_Send(row, l_nev, MPI_REAL4, dst, 0, mpi_comm_rows, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - enddo - enddo - - else if (my_prow < ip) then - ! Send all rows going to PE ip - src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1) - do i=limits(ip)+1,limits(ip+1) - src = mod((i-1)/nblk, np_rows) - if (src == my_prow) then - src_offset = src_offset+1 - row(:) = q(src_offset, 1:l_nev) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Send(row, l_nev, MPI_REAL8, ip, 0, mpi_comm_rows, mpierr) -#else - call MPI_Send(row, l_nev, MPI_REAL4, ip, 0, mpi_comm_rows, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - enddo - ! Receive all rows from PE ip - do i=limits(my_prow)+1,limits(my_prow+1) - src = mod((i-1)/nblk, np_rows) - if (src == ip) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS, mpierr) -#endif - -#else /* WITH_MPI */ - row(1:l_nev) = row(1:l_nev) - -#endif /* WITH_MPI */ - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call unpack_row_real_cpu_openmp(a, row, i-limits(my_prow), my_thread, & - stripe_count, thread_width, stripe_width, l_nev) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - if (useGPU) then - ! An unpacking of the current row group may occur before queuing the next row - call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, & - last_stripe_width, a_dim2, l_nev, row_group_size, nblk, & - unpack_idx, i - limits(my_prow), .false.) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
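The ownership tests above all derive from the 1-D block-cyclic layout: global row i sits in block (i-1)/nblk, and blocks are dealt round-robin over the np_rows process rows. A minimal sketch of that mapping (owner_of_row is an illustration helper, not a routine from this file):

! Sketch only: owner of global row i (1-based) in a block-cyclic layout.
integer function owner_of_row(i, nblk, np_rows)
  implicit none
  integer, intent(in) :: i, nblk, np_rows
  ! rows 1..nblk -> rank 0, nblk+1..2*nblk -> rank 1, ..., wrapping around
  owner_of_row = mod((i-1)/nblk, np_rows)
end function owner_of_row

With nblk = 2 and np_rows = 2, rows 1,2 and 5,6 live on process row 0 and rows 3,4 and 7,8 on process row 1, which is exactly the src/dst test used in the loops above.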
-#endif /* WITH_MPI */ - else -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#else - call MPI_Recv(row, l_nev, MPI_REAL4, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - row(1:l_nev) = row(1:l_nev) - -#endif /* WITH_MPI */ - call unpack_row_real_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width) - endif - -#endif /* WITH_OPENMP */ - - endif - enddo - endif - enddo - - if (useGPU) then - ! Force an unpacking of all remaining rows that haven't been unpacked yet - call unpack_and_prepare_row_group_real_gpu(row_group, row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, & - a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.) - successCUDA = cuda_devicesynchronize() - - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaDeviceSynchronize"//errorMessage - stop - endif - endif - - ! Set up result buffer queue - - num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows - - num_result_buffers = 4*nfact - allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating result_buffer"//errorMessage - stop - endif - - allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating result_send_request"//errorMessage - stop - endif - - allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating result_recv_request"//errorMessage - stop - endif - -#ifdef WITH_MPI - result_send_request(:) = MPI_REQUEST_NULL - result_recv_request(:) = MPI_REQUEST_NULL -#endif - ! Queue up buffers - - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j = 1, min(num_result_buffers, num_result_blocks) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) -#else - call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL4, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(j), mpierr) - -#endif - -#else /* WITH_MPI */ - - ! careful: the "recv" has to be done at the corresponding wait or send - ! result_buffer(1: l_nev*nblk,1,j) =result_buffer(1:l_nev*nblk,1,nbuf) - -#endif /* WITH_MPI */ - enddo - endif - - num_bufs_recvd = 0 ! No buffers received yet - - ! Initialize top/bottom requests - - allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_send_request"//errorMessage - stop - endif - - allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_recv_request"//errorMessage - stop - endif - - allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_send_request"//errorMessage - stop - endif
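The request arrays are set to MPI_REQUEST_NULL right after allocation (see below) so that the first MPI_Wait or MPI_Test on a slot with no outstanding operation completes immediately instead of blocking. A minimal sketch of the idiom, assuming the Fortran mpi module (init_requests_sketch is an illustration name):

! Sketch only: pre-initialized request handles make the first wait a no-op.
subroutine init_requests_sketch(n)
  use mpi
  implicit none
  integer, intent(in) :: n
  integer, allocatable :: reqs(:)
  integer :: mpierr
  allocate(reqs(n))
  reqs(:) = MPI_REQUEST_NULL
  ! Waiting on a null request returns at once with an empty status, so the
  ! main loop may wait on a slot before the first real Isend was posted.
  call mpi_wait(reqs(1), MPI_STATUS_IGNORE, mpierr)
  deallocate(reqs)
end subroutine init_requests_sketch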
- allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_recv_request"//errorMessage - stop - endif - -#ifdef WITH_MPI - top_send_request(:) = MPI_REQUEST_NULL - top_recv_request(:) = MPI_REQUEST_NULL - bottom_send_request(:) = MPI_REQUEST_NULL - bottom_recv_request(:) = MPI_REQUEST_NULL -#endif - -#ifdef WITH_OPENMP - allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_buffer"//errorMessage - stop - endif - - allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage - stop - endif - - allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage - stop - endif - - allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage - stop - endif - top_border_send_buffer(:,:) = 0 - top_border_recv_buffer(:,:) = 0 - bottom_border_send_buffer(:,:) = 0 - bottom_border_recv_buffer(:,:) = 0 - - ! Initialize broadcast buffer -#else /* WITH_OPENMP */ - allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_buffer"//errorMessage - stop - endif - - allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage - stop - endif - - allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage - stop - endif - - allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage - stop - endif - top_border_send_buffer(:,:,:) = 0 - top_border_recv_buffer(:,:,:) = 0 - bottom_border_send_buffer(:,:,:) = 0 - bottom_border_recv_buffer(:,:,:) = 0 -#endif /* WITH_OPENMP */ - - allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage) - if (istat .ne.
0) then - print *,"trans_ev_tridi_to_band_real: error when allocating bcast_buffer"//errorMessage - stop - endif - - bcast_buffer = 0 - - if (useGPU) then - num = ( nbw * max_blk_size) * size_of_real_datatype - successCUDA = cuda_malloc(bcast_buffer_dev, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" - stop - endif - - successCUDA = cuda_memset( bcast_buffer_dev, 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset" - stop - endif - - num = ((max_blk_size-1))*size_of_real_datatype - successCUDA = cuda_malloc( hh_dot_dev, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" - stop - endif - - successCUDA = cuda_memset( hh_dot_dev, 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset" - stop - endif - - num = (max_blk_size)*size_of_real_datatype - successCUDA = cuda_malloc( hh_tau_dev, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMalloc" - stop - endif - - successCUDA = cuda_memset( hh_tau_dev, 0, num) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset" - stop - endif - endif ! useGPU - - current_tv_off = 0 ! Offset of next row to be broadcast - - ! ------------------- start of work loop ------------------- - - a_off = 0 ! offset in A (to avoid unnecessary shifts) - - top_msg_length = 0 - bottom_msg_length = 0 - - do sweep = 0, (na-1)/nbw - - current_n = na - sweep*nbw - call determine_workload(current_n, nbw, np_rows, limits) - current_n_start = limits(my_prow) - current_n_end = limits(my_prow+1) - current_local_n = current_n_end - current_n_start - - next_n = max(current_n - nbw, 0) - call determine_workload(next_n, nbw, np_rows, limits) - next_n_start = limits(my_prow) - next_n_end = limits(my_prow+1) - next_local_n = next_n_end - next_n_start - - if (next_n_end < next_n) then - bottom_msg_length = current_n_end - next_n_end - else - bottom_msg_length = 0 - endif - - if (next_local_n > 0) then - next_top_msg_length = current_n_start - next_n_start - else - next_top_msg_length = 0 - endif - - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - do i = 1, stripe_count -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width" - b_len = csw*nbw*max_threads -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL4, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! careful: the "receive" has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL4, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -!
careful: the receive has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - enddo - endif - - if (current_local_n > 1) then - if (my_pcol == mod(sweep,np_cols)) then - bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n) - current_tv_off = current_tv_off + current_local_n - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL8, mod(sweep,np_cols), mpi_comm_cols, mpierr) -#else - call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL4, mod(sweep,np_cols), mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - - if (useGPU) then - successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), & - nbw * current_local_n * size_of_real_datatype , cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - - call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, current_local_n, .false.) - call compute_hh_dot_products_real_gpu(bcast_buffer_dev, hh_dot_dev, nbw, current_local_n) - endif - - else - ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real - bcast_buffer(:,1) = 0 - - if (useGPU) then - successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_real_datatype) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemset" - stop - endif - - call extract_hh_tau_real_gpu(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.) - endif - endif - - if (l_nev == 0) cycle - - if (current_local_n > 0) then - - do i = 1, stripe_count -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - - ! Get real stripe width for stripe i; - ! The last OpenMP tasks may have an even smaller stripe width, - ! but we don't care about this, i.e. we send/recv a bit too much in this case.
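The per-stripe width computed below is plain ceiling-division bookkeeping: every stripe gets stripe_width columns except possibly the last one, which gets the remainder. A minimal sketch with illustration values (thread_width = 100 is not taken from this file):

! Sketch only: splitting thread_width columns into stripes of stripe_width.
program stripe_sketch
  implicit none
  integer, parameter :: thread_width = 100, stripe_width = 48
  integer :: stripe_count, i, csw
  stripe_count = (thread_width - 1)/stripe_width + 1   ! ceiling division: 3
  do i = 1, stripe_count
    csw = min(stripe_width, thread_width - (i-1)*stripe_width)
    print '(a,i0,a,i0)', 'stripe ', i, ': width ', csw  ! 48, 48, 4
  enddo
end program stripe_sketch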
- ! csw: current_stripe_width - - csw = min(stripe_width, thread_width-(i-1)*stripe_width) -#endif /* WITH_OPENMP */ - - !wait_b - if (current_n_end < current_n) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -#ifdef WITH_MPI - call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - n_off = current_local_n+a_off - b_len = csw*nbw - b_off = (my_thread-1)*b_len - a(1:csw,n_off+1:n_off+nbw,i,my_thread) = & - reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /)) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - n_off = current_local_n+a_off - - if (useGPU) then - dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *size_of_real_datatype - successCUDA = cuda_memcpy( a_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), & - stripe_width*nbw*size_of_real_datatype ,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - - else - a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i) - endif - -#endif /* WITH_OPENMP */ - - if (next_n_end < next_n) then - -#ifdef WITH_OPENMP - - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#else - call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, & - MPI_REAL4, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! careful: the receive has to be done at the corresponding wait or send -! bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) - -#else - call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL4, my_prow+1, bottom_recv_tag, & - mpi_comm_rows, bottom_recv_request(i), mpierr) - -#endif - -#else /* WITH_MPI */ - -!! careful: the receive has to be done at the corresponding wait or send -!! bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) - -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - endif - endif - - if (current_local_n <= bottom_msg_length + top_msg_length) then - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - - if (useGPU) then - dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - !
host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8 - successCUDA = cuda_memcpy( a_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), & - stripe_width*top_msg_length*size_of_real_datatype , cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - else - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) - endif ! useGPU -#endif /* WITH_OPENMP */ - endif ! top_msg_length - - !compute -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, & - my_thread, thread_width, THIS_REAL_ELPA_KERNEL) - -! call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & -! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -!0, current_local_n, i, my_thread, thread_width, & -! THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_real_cpu(a, a_dev, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif /* WITH_OPENMP */ - - !send_b -#ifdef WITH_OPENMP - - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#endif - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL4, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif - -#endif /* WITH_MPI */ - endif -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length>0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - - if (useGPU) then - dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & - stripe_width * bottom_msg_length * 
size_of_real_datatype ,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - else - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL4, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & - bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) - endif - -#endif /* WITH_MPI */ - endif -#endif /* WITH_OPENMP */ - else ! current_local_n <= bottom_msg_length + top_msg_length - - !compute -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & - thread_width, THIS_REAL_ELPA_KERNEL) -! call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & -! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -!current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, thread_width, & -! 
THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - !send_b -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), mpi_status, mpierr) -#endif - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - b_len = csw*bottom_msg_length*max_threads - bottom_border_send_buffer(1:b_len,i) = & - reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /)) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL4, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = & - bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - endif - -#endif /* WITH_MPI */ - endif -#else /* WITH_OPENMP */ - call compute_hh_trafo_real_cpu(a, a_dev, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, & - current_local_n - bottom_msg_length, bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) - - !send_b -#ifdef WITH_MPI - call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (bottom_msg_length > 0) then - n_off = current_local_n+nbw-bottom_msg_length+a_off - - if (useGPU) then - dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, & - stripe_width*bottom_msg_length*size_of_real_datatype ,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error cudaMemcpy" - stop - endif - else - bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i) - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#else - call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL4, my_prow+1, & - top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (next_top_msg_length > 0) then - top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & - bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) - endif - -#endif /* WITH_MPI */ - endif -#endif /* WITH_OPENMP */ - - !compute -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread), schedule(static, 1) - do my_thread = 1, max_threads - call compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width ,a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - my_thread, thread_width, THIS_REAL_ELPA_KERNEL) -! call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, l_nev, & -! 
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -! top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, thread_width, & -! THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_real_cpu(a, a_dev, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, top_msg_length, & - current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif /* WITH_OPENMP */ - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (useGPU) then - dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - successCUDA = cuda_memcpy( a_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), & - stripe_width * top_msg_length *size_of_real_datatype ,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - else - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) - endif -#endif /* WITH_OPENMP */ - endif - - !compute -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) -! call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, l_nev, & -! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & -! 
0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - call compute_hh_trafo_real_cpu(a, a_dev, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, & - hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, & - last_stripe_width, THIS_REAL_ELPA_KERNEL) -#endif /* WITH_OPENMP */ - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - - b_len = csw*next_top_msg_length*max_threads -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL4, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! careful: the "receive" has to be done at the corresponding wait or send -! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL4, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! careful: the "receive" has to be done at the corresponding wait or send -! top_border_recv_buffer(1:stripe_width,1:next_top_msg_length,i) = & -! bottom_border_send_buffer(1:stripe_width,1:next_top_msg_length,i) -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#endif - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL4, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (sweep==0 .and. current_n_end < current_n .and.
l_nev > 0) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (useGPU) then - dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_real_datatype - successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, & - stripe_width*nbw*size_of_real_datatype ,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - - else - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL4, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:stripe_width,1:nbw,i) = top_border_send_buffer(1:stripe_width,1:nbw,i) - endif -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - endif - - ! Care that there are not too many outstanding top_recv_request's - if (stripe_count > 1) then - if (i>1) then - -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - else - -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - endif - endif - - enddo - - top_msg_length = next_top_msg_length - - else - ! wait for last top_send_request - do i = 1, stripe_count -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - enddo - endif - - ! Care about the result - - if (my_prow == 0) then - - ! topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block - -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - dst = mod(num_blk, np_rows) - - if (dst == 0) then - if (useGPU) then - row_group_size = min(na - num_blk*nblk, nblk) - call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & - row_group(:, :), j * nblk + a_off, row_group_size) - - do i = 1, row_group_size - q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) - enddo - else - - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, row, j*nblk+i+a_off, stripe_width, & - stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - endif - else - if (useGPU) then - call pack_row_group_real_gpu(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, & - result_buffer(:, :, nbuf), j * nblk + a_off, nblk) - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_real_cpu_openmp(a, result_buffer(:,i,nbuf), j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_real_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) -#endif - enddo - endif ! useGPU -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#else - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL4, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#endif - -#else /* WITH_MPI */ - if (j+num_result_buffers < num_result_blocks) & - result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) - if (my_prow > 0 .and. l_nev>0) then - do j1 = 1, min(num_result_buffers, num_result_blocks) - result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) - enddo - endif - -#endif /* WITH_MPI */ - endif ! else - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr) -#else /* WITH_MPI */ - flag = .true. -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) -#else /* WITH_MPI */ - flag = .true. 
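While there is still local work in the sweep, the code above only probes the next result request with MPI_Test and goes back to computing if it is not complete; it blocks in MPI_Wait only when nothing is left to overlap. A minimal sketch of that polling pattern, assuming the Fortran mpi module (poll_or_wait is an illustration name):

! Sketch only: probe while busy, block when idle.
subroutine poll_or_wait(request, work_left, ready)
  use mpi
  implicit none
  integer, intent(inout) :: request
  logical, intent(in)    :: work_left
  logical, intent(out)   :: ready
  integer :: mpierr
  if (work_left) then
    ! still computing: just test, never block
    call mpi_test(request, ready, MPI_STATUS_IGNORE, mpierr)
  else
    ! nothing left to overlap: block until the message arrives
    call mpi_wait(request, MPI_STATUS_IGNORE, mpierr)
    ready = .true.
  endif
end subroutine poll_or_wait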
-#endif - -#endif /* WITH_OPENMP */ - - if (.not.flag) exit - else - -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! Queue result buffer again if there are outstanding blocks left -#ifdef WITH_MPI - if (j+num_result_buffers < num_result_blocks) & - -#ifdef DOUBLE_PRECISION_REAL - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#else - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL4, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#endif - ! careful: the "receive" has to be done at the corresponding wait or send -! if (j+num_result_buffers < num_result_blocks) & -! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) - -#else /* WITH_MPI */ - -#endif /* WITH_MPI */ - enddo - num_bufs_recvd = j - - endif - - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - if (offset<0) then - if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset - success = .false. - return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - - !$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo - enddo - enddo -#else /* WITH_OPENMP */ - do i = 1, stripe_count - if (useGPU) then - chunk = min(next_local_n - 1, a_off) - do j = top_msg_length + 1, top_msg_length + next_local_n, chunk - top = min(j + chunk, top_msg_length + next_local_n) - this_chunk = top - j + 1 - dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype - - ! it is not sensible to set the parameter "cudaMemcpyDeviceToDevice" anew - ! on every call; do this ONCE at startup - ! tmp = cuda_d2d(1) - successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev +dev_offset_1, & - stripe_width*this_chunk*size_of_real_datatype, cudaMemcpyDeviceToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy" - stop - endif - enddo - else ! not useGPU - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) - enddo - endif - enddo ! stripe_count -#endif /* WITH_OPENMP */ - -#ifdef WITH_OPENMP -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif -#endif - a_off = 0 - endif - - enddo - - !
Just for safety: -#ifdef WITH_MPI - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol -#endif - if (my_prow == 0) then - -#ifdef WITH_OPENMP - if (useGPU) then - print *,"trans_ev_tridi_to_band_real: not yet implemented" - stop - endif -#ifdef WITH_MPI - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when allocating mpi_statuses"//errorMessage - stop - endif - - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating mpi_statuses"//errorMessage - stop - endif -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif - -#endif /* WITH_OPENMP */ - endif -#ifdef WITH_MPI - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol -#endif - - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! deallocate all working space - - if (.not.(useGPU)) then - deallocate(a, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage - stop - endif - endif - - deallocate(row, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating row "//errorMessage - stop - endif - - deallocate(limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating limits"//errorMessage - stop - endif - - deallocate(result_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating result_send_request "//errorMessage - stop - endif - - deallocate(result_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating result_recv_request "//errorMessage - stop - endif - - deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_send_buffer "//errorMessage - stop - endif - - deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_recv_buffer "//errorMessage - stop - endif - - deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_send_buffer "//errorMessage - stop - endif - - deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_recv_buffer "//errorMessage - stop - endif - - deallocate(result_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating result_buffer "//errorMessage - stop - endif - - deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating bcast_buffer "//errorMessage - stop - endif - - deallocate(top_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating top_send_request "//errorMessage - stop - endif - - deallocate(top_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating top_recv_request "//errorMessage - stop - endif - - deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_send_request "//errorMessage - stop - endif - - deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_recv_request "//errorMessage - stop - endif - - if (useGPU) then - successCUDA = cuda_free(hh_dot_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage - stop - endif - - successCUDA = cuda_free(hh_tau_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage - stop - endif - - successCUDA = cuda_free(row_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage - stop - endif - - deallocate(row_group, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_real: error when deallocating row_group "//errorMessage - stop - endif - - successCUDA = cuda_free(row_group_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage - stop - endif - - successCUDA = cuda_free(bcast_buffer_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage - stop - endif - endif ! useGPU - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_real") -#endif - return - - end subroutine trans_ev_tridi_to_band_real - - subroutine determine_workload(na, nb, nprocs, limits) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nprocs - integer(kind=ik), intent(out) :: limits(0:nprocs) - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("determine_workload") -#endif - - if (na <= 0) then - limits(:) = 0 -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("determine_workload") -#endif - return - endif - - if (nb*nprocs > na) then - ! 
there is not enough work for all - do i = 0, nprocs - limits(i) = min(na, i*nb) - enddo - else - do i = 0, nprocs - limits(i) = (i*na)/nprocs - enddo - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("determine_workload") -#endif - end subroutine - - subroutine bandred_complex(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, & - useGPU, success) - !------------------------------------------------------------------------------- - ! bandred_complex: Reduces a distributed Hermitian matrix to band form - ! - ! Parameters - ! - ! na Order of matrix - ! - ! a(lda,matrixCols) Distributed matrix which should be reduced. - ! Distribution is like in Scalapack. - ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half) - ! a(:,:) is overwritten on exit with the band and the Householder vectors - ! in the upper half. - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! nbw semi bandwidth of output matrix - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! - ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1 - ! Factors for the Householder vectors (returned), needed for back transformation - ! - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - use cuda_functions - use iso_c_binding - - implicit none - - logical, intent(in) :: useGPU - - integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck) :: a(lda,*), tmat(nbw,nbw,*) -#else - complex(kind=ck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) -#endif - complex(kind=ck), parameter :: CZERO = (0.0_rk, 0.0_rk), CONE = (1.0_rk, 0.0_rk) - - integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr - integer(kind=ik) :: l_cols, l_rows - integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow - integer(kind=ik) :: istep, ncol, lch, lcx, nlc - integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) - - complex(kind=ck), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:) - integer(kind=c_intptr_t) :: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev - integer(kind=ik) :: cur_l_rows, cur_l_cols,vmr_size ,umc_size - integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1 - integer(kind=ik) :: na_rows, na_cols -#ifdef WITH_MPI - integer(kind=ik), external :: numroc -#endif - - logical, intent(in) :: wantDebug - logical, intent(out) :: success - character(200) :: errorMessage - integer(kind=ik) :: istat - logical :: successCUDA -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("bandred_complex") -#endif - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - success = .true. - - ! Semi bandwidth nbw must be a multiple of blocksize nblk - - if (mod(nbw,nblk)/=0) then - if (my_prow==0 .and. my_pcol==0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk - write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk' - endif - success = .false. - return - endif - endif
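For reference, the numroc calls just below compute the local number of rows/columns a process owns under the block-cyclic distribution. A minimal sketch of that computation (the helper name numroc_sketch is hypothetical, shown only as a reading aid; the real routine is ScaLAPACK's numroc):

      integer function numroc_sketch(n, nb, iproc, nprocs) result(nloc)
        ! what numroc(n, nb, iproc, 0, nprocs) returns when the first block lives on process 0
        implicit none
        integer, intent(in) :: n, nb, iproc, nprocs
        integer :: nblocks, extra
        nblocks = n/nb                   ! complete blocks in this dimension
        nloc    = (nblocks/nprocs)*nb    ! full rounds of blocks owned by every process
        extra   = mod(nblocks, nprocs)   ! leftover complete blocks
        if (iproc < extra) then
          nloc = nloc + nb               ! the first 'extra' processes get one more complete block
        else if (iproc == extra) then
          nloc = nloc + mod(n, nb)       ! the trailing partial block, if any
        endif
      end function numroc_sketch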
- if (useGPU) then -#ifdef WITH_MPI - na_rows = numroc(na, nblk, my_prow, 0, np_rows) - na_cols = numroc(na, nblk, my_pcol, 0, np_cols) -#else - na_rows = na - na_cols = na -#endif - successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_complex_datatype) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda malloc failed tmat_dev ", istat - stop - endif - - successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_complex_datatype) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda malloc failed vav_dev ", istat - stop - endif - - successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_complex_datatype) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda malloc failed a_dev ", istat - stop - endif - endif ! useGPU - - ! Matrix is split into tiles; work is done only for tiles on the diagonal or above - - tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size - tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide - - l_rows_tile = tile_size/np_rows ! local rows of a tile - l_cols_tile = tile_size/np_cols ! local cols of a tile
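As a worked example of the tile sizing above (illustrative values, not from the source): for np_rows = 4, np_cols = 3 and nblk = 16,

      ! tile_size   = 16 * lcm(4,3)             = 192  (minimum global tile size)
      ! tile_size   = ((128*4-1)/192 + 1) * 192 = 576  (local tiles at least 128 wide)
      ! l_rows_tile = 576/4 = 144,  l_cols_tile = 576/3 = 192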
- - if (useGPU) then - if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then - print *,"bandred_complex: sizes of a wrong? ",lda,size(a,dim=1),na_cols,size(a,dim=2) - endif - successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed a_dev ", istat - stop - endif - endif - - do istep = (na-1)/nbw, 1, -1 - - n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step - - ! Number of local columns/rows of remaining matrix - l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1) - l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1) - - ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces - - if (useGPU) then - cur_l_rows = max(l_rows, 1) - cur_l_cols = max(l_cols, 1) - - vmr_size = cur_l_rows * 2 * n_cols - umc_size = cur_l_cols * 2 * n_cols - - if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then - if (allocated(umc)) then - deallocate(umc, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when deallocating umc "//errorMessage - stop - endif - successCUDA = cuda_free(umc_dev) - if (.not.(successCUDA))then - print *,"bandred_complex: error in cudaFree" - stop - endif - endif - - allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating umc "//errorMessage - stop - endif - - if (max(l_cols,1) * 2*n_cols .gt. umc_size) then - print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size - endif - - successCUDA = cuda_malloc(umc_dev, umc_size*size_of_complex_datatype) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda malloc failed umc_dev ", istat - stop - endif - endif - - if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then - if (allocated(vmr)) then - deallocate(vmr, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when deallocating vmr "//errorMessage - stop - endif - successCUDA = cuda_free(vmr_dev) - if (.not.(successCUDA))then - print *,"bandred_complex: error in cudaFree" - stop - endif - endif - - allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating vmr "//errorMessage - stop - endif - - if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then - print *,"bandred_complex: vmr_size ",max(l_rows,1) * 2*n_cols,vmr_size - endif - - - successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_complex_datatype) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda malloc failed vmr_dev ", istat - stop - endif - - endif - - if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then - if (allocated(vr)) then - deallocate(vr, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when deallocating vr "//errorMessage - stop - endif - endif - - allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating vr "//errorMessage - stop - endif - endif - - else ! GPU not used - allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating vmr "//errorMessage - stop - endif - - allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating umc "//errorMessage - stop - endif - - allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating vr "//errorMessage - stop - endif - endif ! useGPU - - vmr(1:l_rows,1:n_cols) = 0._ck - vr(:) = 0._ck - tmat(:,:,istep) = 0._ck - - if (useGPU) then - lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1) - lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1) - lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1) - - if (lc_start .le. 0) lc_start = 1 - cur_pcol = pcol(istep*nbw+1, nblk, np_cols) - if (my_pcol == cur_pcol) then - successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_complex_datatype,kind=c_size_t), & - (a_dev + int( ( (lc_start-1) * lda*size_of_complex_datatype),kind=c_size_t )), & - int(lda*size_of_complex_datatype,kind=c_size_t), & - int(lr_end*size_of_complex_datatype,kind=c_size_t), & - int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int)) - if (.not.(successCUDA)) then - print *, "bandred_complex: error in cudaMemcpy2d" - stop - endif - endif - endif - - ! Reduce current block to lower triangular form - - do lc = n_cols, 1, -1 - - ncol = istep*nbw + lc ! absolute column number of householder vector - nrow = ncol - nbw ! Absolute number of pivot row - - lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length - lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number - - tau = 0 - - if(nrow == 1) exit ! Nothing to do - - cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block - - if (my_pcol==cur_pcol) then - - ! Get vector to be transformed; distribute last element and norm of - ! remaining elements to all procs in current column - - vr(1:lr) = a(1:lr,lch) ! vector to be transformed - - if (my_prow==prow(nrow, nblk, np_rows)) then - aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1)) - aux1(2) = vr(lr) - else - aux1(1) = dot_product(vr(1:lr),vr(1:lr)) - aux1(2) = 0.
- endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - aux2 = aux1 -#endif /* WITH_MPI */ - vnorm2 = aux2(1) - vrl = aux2(2) - - ! Householder transformation - - call hh_transform_complex(vrl, vnorm2, xf, tau) - - ! Scale vr and store Householder vector for back transformation - - vr(1:lr) = vr(1:lr) * xf - if (my_prow==prow(nrow, nblk, np_rows)) then - a(1:lr-1,lch) = vr(1:lr-1) - a(lr,lch) = vrl - vr(lr) = 1._ck - else - a(1:lr,lch) = vr(1:lr) - endif - - endif - - ! Broadcast Householder vector and tau along columns - - vr(lr+1) = tau -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Bcast(vr, lr+1, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) -#else - call MPI_Bcast(vr, lr+1, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr) -#endif - -#endif /* WITH_MPI */ - vmr(1:lr,lc) = vr(1:lr) - tau = vr(lr+1) - tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat - - ! Transform remaining columns in current block with Householder vector - - ! Local dot product - - aux1 = 0 - - nlc = 0 ! number of local columns - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx)) - endif - enddo - - ! Get global dot products -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - if (nlc>0) aux2=aux1 -#endif /* WITH_MPI */ - ! Transform - - nlc = 0 - do j=1,lc-1 - lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0) - if (lcx>0) then - nlc = nlc+1 - a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) - endif - enddo - - enddo - - ! Calculate scalar products of stored Householder vectors. - ! This can be done in different ways, we use zherk - - if (useGPU) then - cur_pcol = pcol(istep*nbw+1, nblk, np_cols) - if (my_pcol == cur_pcol) then - successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_complex_datatype),kind=c_size_t)), & - int(lda*size_of_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), & - int(lda*size_of_complex_datatype,kind=c_size_t), & - int(lr_end*size_of_complex_datatype,kind=c_size_t), & - int((lc_end - lc_start+1),kind=c_size_t) & - ,int(cudaMemcpyHostToDevice,kind=c_int)) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy a_dev failed ", istat - stop - endif - endif - endif - - vav = 0 - if (l_rows>0) & -#ifdef DOUBLE_PRECISION_COMPLEX - call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1)) -#else - call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1)) -#endif - call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_rows) - - ! Calculate triangular matrix T for block Householder Transformation - - do lc=n_cols,1,-1 - tau = tmat(lc,lc,istep) - if (lc vmc (stored in umc, second half) - - call elpa_transpose_vectors_complex (vmr, ubound(vmr,dim=1), mpi_comm_rows, & - umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, & - 1, istep*nbw, n_cols, nblk) - - ! Calculate umc = A**T * vmr - ! Note that the distributed A has to be transposed - ! 
Opposed to direct tridiagonalization there is no need to use the cache locality - ! of the tiles, so we can use strips of the matrix - - umc(1:l_cols,1:n_cols) = 0.0_rk - vmr(1:l_rows,n_cols+1:2*n_cols) = 0 - if (l_cols>0 .and. l_rows>0) then - if (useGPU) then - if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then - print *,"bandred_complex: vmr size 2 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size - stop - endif - successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat - stop - endif - if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then - print *,"bandred_complex: umc size 2 :",size(umc,dim=1)*size(umc,dim=2),umc_size - stop - endif - - successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy umc_dev failed ", istat - stop - endif - endif - do i=0,(istep*nbw-1)/tile_size - - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - if (lce0) then - allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating tmp "//errorMessage - stop - endif -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when deallocating tmp "//errorMessage - stop - endif - endif -#else /* WITH_MPI */ - if (l_cols>0) then - allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when allocating tmp "//errorMessage - stop - endif - tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols) - - umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols) - deallocate(tmp, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"bandred_complex: error when deallocating tmp "//errorMessage - stop - endif - endif - -#endif /* WITH_MPI */ - ! U = U * Tmat**T - if (useGPU) then - if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then - print *,"bandred_complex: umc size 4 :",size(umc,dim=1)*size(umc,dim=2),umc_size - stop - endif - - successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed umc_dev ", istat - stop - endif - - successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed tmat_dev ", istat - stop - endif -#ifdef DOUBLE_PRECISION_COMPLEX - call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols) -#else - call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat_dev, nbw, umc_dev, cur_l_cols) -#endif - else ! not useGPU -#ifdef DOUBLE_PRECISION_COMPLEX - call ztrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), & - umc, ubound(umc,dim=1)) -#else - call ctrmm('Right', 'Upper', 'C', 'Nonunit', l_cols, n_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), & - umc, ubound(umc,dim=1)) -#endif - endif
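Why the two expressions in the comment below agree (a short derivation, assuming A is Hermitian so that with U = A**H * V one has U**H = V**H * A): the trmm above formed U' = U * Tmat**H in umc, and the gemm below multiplies it against the V columns held in umc(:,n_cols+1:):

      ! vav = U'**H * V = Tmat * (V**H * A * V) * Tmat**H
      ! which is again Hermitian - hence the herm_matrix_allreduce that follows.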
- - ! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T - if (useGPU) then - successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed vav_dev ", istat - stop - endif -#ifdef DOUBLE_PRECISION_COMPLEX - call cublas_zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) & - *size_of_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw) - - call cublas_ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw) -#else - call cublas_cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc_dev, cur_l_cols, (umc_dev +( cur_l_cols *n_cols) & - *size_of_complex_datatype ), cur_l_cols, CZERO, vav_dev, nbw) - - call cublas_ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat_dev, nbw, vav_dev, nbw) -#endif - successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_complex_datatype,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed vav ", istat - stop - endif - - call herm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_cols) - - successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed vav_dev ", istat - stop - endif - else -#ifdef DOUBLE_PRECISION_COMPLEX - call zgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), & - ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1)) - call ztrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), & - ubound(tmat,dim=1), vav, ubound(vav,dim=1)) -#else - call cgemm('C', 'N', n_cols, n_cols, l_cols, CONE, umc, ubound(umc,dim=1), umc(1,n_cols+1), & - ubound(umc,dim=1), CZERO, vav, ubound(vav,dim=1)) - call ctrmm('Right', 'Upper', 'C', 'Nonunit', n_cols, n_cols, CONE, tmat(1,1,istep), & - ubound(tmat,dim=1), vav, ubound(vav,dim=1)) -#endif - call herm_matrix_allreduce(n_cols,vav,nbw,nbw,mpi_comm_cols) - endif - - ! U = U - 0.5 * V * VAV - - if (useGPU) then -#ifdef DOUBLE_PRECISION_COMPLEX - call cublas_zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk, 0.0_rk), (umc_dev + & - (cur_l_cols * n_cols )*size_of_complex_datatype), & - cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols) -#else - call cublas_cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk, 0.0_rk), (umc_dev + & - (cur_l_cols * n_cols )*size_of_complex_datatype), & - cur_l_cols, vav_dev, nbw, CONE, umc_dev, cur_l_cols) -#endif - ! Transpose umc -> umr (stored in vmr, second half) - - if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then - print *,"bandred_complex: umc size 5 :",size(umc,dim=1)*size(umc,dim=2),umc_size - stop - endif - - successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_complex_datatype,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed umc ", istat - stop - endif - - call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - if (size(vmr,dim=1)*size(vmr,dim=2) .gt.
vmr_size) then - print *,"bandred_complex: vmr size 4 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size - stop - endif - - successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed vav_dev", istat - stop - endif - - if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then - print *,"bandred_complex: umc size 6 :",size(umc,dim=1)*size(umc,dim=2),umc_size - stop - endif - - successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *, "bandred_complex: cuda memcpy failed umc_dev ", istat - stop - endif - else ! not useGPU -#ifdef DOUBLE_PRECISION_COMPLEX - call zgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk, 0.0_rk), umc(1,n_cols+1), ubound(umc,dim=1), & - vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1)) -#else - call cgemm('N', 'N', l_cols, n_cols, n_cols, (-0.5_rk, 0.0_rk), umc(1,n_cols+1), ubound(umc,dim=1), & - vav, ubound(vav,dim=1), CONE, umc, ubound(umc,dim=1)) -#endif - ! Transpose umc -> umr (stored in vmr, second half) - - call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, & - vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, & - 1, istep*nbw, n_cols, nblk) - - endif - ! A = A - V*U**T - U*V**T - - do i=0,(istep*nbw-1)/tile_size - lcs = i*l_cols_tile+1 - lce = min(l_cols,(i+1)*l_cols_tile) - lre = min(l_rows,(i+1)*l_rows_tile) - if (lce 0) then - if (useGPU) then -#ifdef DOUBLE_PRECISION_COMPLEX - call cublas_zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, & - q_dev, ldq, CZERO, tmp_dev, n_cols) -#else - call cublas_cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm_dev, max_local_rows, & - q_dev, ldq, CZERO, tmp_dev, n_cols) -#endif - successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_complex_datatype, & - cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" - stop - endif - else -#ifdef DOUBLE_PRECISION_COMPLEX - call zgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & - q, ldq, CZERO, tmp1, n_cols) -#else - call cgemm('C', 'N', n_cols, l_cols, l_rows, CONE, hvm, ubound(hvm,dim=1), & - q, ldq, CZERO, tmp1, n_cols) -#endif - endif - else ! l_rows > 0 - if (useGPU) then - if (l_cols*n_cols .gt. (max_local_cols)*(nbw)) then - print *,"trans_ev_band_to_full_complex: tmp_dev ",l_cols*n_cols,max_local_cols - stop - endif - - ! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_complex_datatype) - ! if (istat .ne. 0) then - ! print *,"trans_ev_band_to_full_complex: error in cudaMemset" - ! stop - ! endif - endif - - tmp1(1:l_cols*n_cols) = 0 - - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#else - call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr) -#endif - -#else /* WITH_MPI */ - tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols) -#endif /* WITH_MPI */ - if (l_rows>0) then - - if (useGPU) then - - successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" - stop - endif - - ! 
tmat_temp(1:nbw,1:nbw) = tmat(1:nbw,1:nbw,istep) - - successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" - stop - endif -#ifdef DOUBLE_PRECISION_COMPLEX - call cublas_ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols) - call cublas_zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, & - tmp_dev, n_cols, CONE, q_dev, ldq) -#else - call cublas_ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat_dev, nbw, tmp_dev, n_cols) - call cublas_cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm_dev, max_local_rows, & - tmp_dev, n_cols, CONE, q_dev, ldq) -#endif - else ! not useGPU -#ifdef DOUBLE_PRECISION_COMPLEX - call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols) - call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), & - tmp2, n_cols, CONE, q, ldq) -#else - call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols) - call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), & - tmp2, n_cols, CONE, q, ldq) -#endif - endif - endif - - !#ifdef WITH_GPU_VERSION - ! istat =cuda_memcpy(loc(hvm(1,1)),hvm_dev,((max_local_rows)*nbw*size_of_complex_datatype),cudaMemcpyDeviceToHost) - ! if (istat .ne. 0) then - ! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" - ! stop - ! endif - !#endif - - enddo - - deallocate(tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_band_to_full_complex: error when deallocating tmp1, tmp2, hvb, hvm "//errorMessage - stop - endif - - if (useGPU) then - - successCUDA = cuda_free(hvm_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(tmp_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(tmat_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_complex_datatype, cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaMemcpy" - stop - endif - ! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols) - - successCUDA = cuda_free(q_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_band_to_full_complex: error in cudaFree" - stop - endif - - ! deallocate(q_temp, stat=istat, errmsg=errorMessage) - ! if (istat .ne. 0) then - ! print *,"trans_ev_band_to_full_complex: error when deallocating q_temp "//errorMessage - ! endif - - !deallocate(tmat_temp, stat=istat, errmsg=errorMessage) - !if (istat .ne. 0) then - !print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage - !endif - endif ! use GPU -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_band_to_full_complex") -#endif - - end subroutine trans_ev_band_to_full_complex
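To summarize the back transformation just completed: each pass of the istep loop applies one block reflector in compact-WY form, schematically

      ! tmp := V**H * Q     (zgemm/cgemm 'C','N' into tmp1, allreduced over mpi_comm_rows)
      ! tmp := T**H * tmp   (ztrmm/ctrmm 'L','U','C','N' with tmat(:,:,istep))
      ! Q   := Q - V * tmp  (zgemm/cgemm 'N','N' with alpha = -CONE)
      ! i.e. Q <- (I - V*T**H*V**H) * Q for the nbw reflectors of the block.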
- - subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, matrixCols, hh_trans_complex, & - mpi_comm_rows, mpi_comm_cols, mpi_comm) - - !------------------------------------------------------------------------------- - ! tridiag_band_complex: - ! Reduces a complex Hermitian band matrix to tridiagonal form - ! - ! na Order of matrix a - ! - ! nb Semi bandwidth - ! - ! nblk blocksize of cyclic distribution, must be the same in both directions! - ! - ! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal - ! - ! lda Leading dimension of a - ! matrixCols local columns of matrix a - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output) - ! - ! mpi_comm_rows - ! mpi_comm_cols - ! MPI-Communicators for rows/columns - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - !#ifdef WITH_GPU_VERSION - ! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev - ! complex*16, allocatable :: ab_temp(:,:) - !#endif - - integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm -#ifdef DESPERATELY_WANT_ASSUMED_SIZE - complex(kind=ck),intent(in) :: a(lda,*) -#else - complex(kind=ck), intent(in) :: a(lda,matrixCols) -#endif - real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 - complex(kind=ck), intent(inout), & - allocatable :: hh_trans_complex(:,:) - - real(kind=rk) :: vnorm2 - complex(kind=ck) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf - complex(kind=ck) :: hd(nb), hs(nb) - - integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols - integer(kind=ik) :: ireq_ab, ireq_hv - integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off -#ifdef WITH_OPENMP - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik), allocatable :: omp_block_limits(:) - integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter - integer(kind=ik) :: omp_get_max_threads -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - complex(kind=ck), allocatable :: hv_t(:,:), tau_t(:) -#endif - integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:) - integer(kind=ik), allocatable :: limits(:), snd_limits(:,:) - integer(kind=ik), allocatable :: block_limits(:) - complex(kind=ck), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:) - integer(kind=ik) :: istat - character(200) :: errorMessage -#ifndef WITH_MPI - integer(kind=ik) :: startAddr -#endif - -! ! dummies for calling redist_band -! real*8 :: r_a(1,1), r_ab(1,1) - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("tridiag_band_complex") -#endif - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - - call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr) - call mpi_comm_size(mpi_comm_rows,np_rows,mpierr) - call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr) - call mpi_comm_size(mpi_comm_cols,np_cols,mpierr) - -!#ifdef WITH_GPU_VERSION -! t_1 = 0 -! t_2 = 0 -!#endif - ! Get global_id mapping 2D processor coordinates to global id - - allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating global_id "//errorMessage - stop - endif - global_id(:,:) = 0 - global_id(my_prow, my_pcol) = my_pe -#ifdef WITH_MPI - call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr) -#endif - - !
Total number of blocks in the band: - - nblocks_total = (na-1)/nb + 1 - - ! Set work distribution - - allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating block_limits "//errorMessage - stop - endif - - call divide_band(nblocks_total, n_pes, block_limits) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - - ! allocate the part of the band matrix which is needed by this PE - ! The size is 1 block larger than needed to avoid extensive shifts - allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating ab "//errorMessage - stop - endif - - !#ifdef WITH_GPU_VERSION - ! allocate(ab_temp(2*nb,nblocks*nb), stat=istat, errmsg=errorMessage) - ! if (istat .ne. 0) then - ! print *,"error when allocating ab_temp "//errorMessage - ! stop - ! endif - !#endif - ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety - - - - !#ifdef WITH_GPU_VERSION - ! - ! istat = cuda_malloc(ab_dev, 2*nb*(nblocks+1)*nb*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed ab_dev", istat - ! - ! istat = cuda_malloc(hv_new_dev, nb*size_of_complex_datatype ) - ! if (istat .ne. 0) print *, " cuda malloc failed hv_new_dev", istat - ! - !! istat = cuda_malloc(temp_c_dev, nb*nb*size_of_complex_datatype ) - !! if(istat .ne. 0) print *, " cuda malloc failed temp_c", istat - ! - ! istat = cuda_malloc(h_dev , nb*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed h_dev", istat - ! - ! istat = cuda_malloc(hs_dev , nb*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed hs_dev", istat - ! - ! istat = cuda_malloc(x_dev , 1*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed x_dev", istat - ! - ! istat = cuda_malloc( tau_new_dev , 1*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed tau_new_dev", istat - ! - ! istat = cuda_malloc(hv_dev , nb*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed hv_dev", istat - ! - ! istat = cuda_malloc(hd_dev , nb*size_of_complex_datatype) - ! if (istat .ne. 0) print *, " cuda malloc failed hd_dev", istat - !#endif - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - - ! Redistribute band in a to ab - call redist_band_complex(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab) - - ! Calculate the workload for each sweep in the back transformation - ! and the space requirements to hold the HH vectors - - allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating limits "//errorMessage - stop - endif - - call determine_workload(na, nb, np_rows, limits) - max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1)) - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - ! add to number of householder vectors - ! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor send below! - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_hh_vecs = num_hh_vecs + local_size - num_chunks = num_chunks+1 - endif - nx = nx - nb - enddo - - ! 
Allocate space for HH vectors - - allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hh_trans_complex "//errorMessage - stop - endif - ! Allocate and init MPI requests - - allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating ireq_hhr "//errorMessage - stop - endif - - allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating ireq_hhs "//errorMessage - stop - endif - - num_hh_vecs = 0 - num_chunks = 0 - nx = na - nt = 0 - do n = 1, nblocks_total - call determine_workload(nx, nb, np_rows, limits) - local_size = limits(my_prow+1) - limits(my_prow) - if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then - num_chunks = num_chunks+1 -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#else - call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX8, nt, & - 10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr) -#endif - -#else /* WITH_MPI */ - ! careful: with a non-blocking recv the data copy must be done at the corresponding wait or send - ! hh_trans_complex(1:nb*local_size,num_hh_vecs+1) = hh_send(1:nb*hh_cnt(iblk),1,iblk) - -#endif /* WITH_MPI */ - num_hh_vecs = num_hh_vecs + local_size - endif - nx = nx - nb - if (n == block_limits(nt+1)) then - nt = nt + 1 - endif - enddo -#ifdef WITH_MPI - ireq_hhs(:) = MPI_REQUEST_NULL -#endif - ! Buffers for gathering/sending the HH vectors - - allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hh_gath "//errorMessage - stop - endif - - allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hh_send "//errorMessage - stop - endif - - hh_gath(:,:,:) = 0 - hh_send(:,:,:) = 0 - - ! Some counters - - allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hh_cnt "//errorMessage - stop - endif - allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hh_dst "//errorMessage - stop - endif - - hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all - hh_dst(:) = 0 ! PE number for receive -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! Limits for sending - - allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating snd_limits "//errorMessage - stop - endif - - do iblk=1,nblocks - call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk)) - enddo - -#ifdef WITH_OPENMP - ! OpenMP work distribution: - - max_threads = 1 -!$ max_threads = omp_get_max_threads() - - ! For OpenMP we need at least 2 blocks for every thread - max_threads = MIN(max_threads, nblocks/2) - if (max_threads==0) max_threads = 1 - - allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage) - if (istat .ne.
0) then - print *,"tridiag_band_complex: error when allocating omp_block_limits "//errorMessage - stop - endif - - ! Get the OpenMP block limits - call divide_band(nblocks, max_threads, omp_block_limits) - - allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"tridiag_band_complex: error when allocating hv_t, tau_t "//errorMessage - stop - endif - hv_t = 0 - tau_t = 0 -#endif - - - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first column to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - -#ifndef WITH_MPI - startAddr = ubound(hh_trans_complex,dim=2) -#endif - -#ifdef WITH_OPENMP - do istep=1,na-1-block_limits(my_pe)*nb -#else - do istep=1,na-1 -#endif - if (my_pe==0) then - n = MIN(na-na_s,nb) ! number of rows to be reduced - hv(:) = 0 - tau = 0 - ! Transform first column of remaining matrix - ! Opposed to the real case, the last step (istep=na-1) is needed here for making - ! the last subdiagonal element a real number -#ifdef DOUBLE_PRECISION_COMPLEX - vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk)**2+dimag(ab(3:n+1,na_s-n_off))**2) -#else - vnorm2 = sum(real(ab(3:n+1,na_s-n_off),kind=rk)**2+aimag(ab(3:n+1,na_s-n_off))**2) -#endif - if (n<2) vnorm2 = 0. ! Safety only - call hh_transform_complex(ab(2,na_s-n_off),vnorm2,hf,tau) - hv(1) = 1 - hv(2:n) = ab(3:n+1,na_s-n_off)*hf - - d(istep) = ab(1,na_s-n_off) - e(istep) = ab(2,na_s-n_off) - if (istep == na-1) then - d(na) = ab(1,na_s+1-n_off) - e(na) = 0 - endif - else - if (na>na_s) then - ! Receive Householder vector from previous task, from PE owning subdiagonal -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, mpi_status, mpierr) -#else - call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, mpi_status, mpierr) -#endif - -#else /* WITH_MPI */ - hv(1:nb) = hv_s(1:nb) -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_recv(hv, nb, MPI_COMPLEX16, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#else - call mpi_recv(hv, nb, MPI_COMPLEX8, my_pe-1, 2, mpi_comm, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - hv(1:nb) = hv_s(1:nb) -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - tau = hv(1) - hv(1) = 1._CK - endif - endif - - na_s = na_s+1 - if (na_s-n_off > nb) then - !#ifdef WITH_GPU_VERSION - ! ab_temp(:,1:nblocks*nb) = ab(:,nb+1:(nblocks +1)*nb) - ! ab(:, 1:nblocks*nb) = ab_temp(:, 1:nblocks*nb) - !#else - ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb) - !#endif - ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0 - n_off = n_off + nb - endif -#ifdef WITH_OPENMP - if (max_threads > 1) then - - ! Codepath for OpenMP - - ! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread! - ! Every thread is one reduction cycle behind its predecessor and thus starts one step later. - ! This simulates the behaviour of the MPI tasks which also work after each other. - ! 
The code would be considerably easier if the MPI communication were made within - ! the parallel region - this is avoided here since it would require - ! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program. - - hv_t(:,1) = hv - tau_t(1) = tau - - do iter = 1, 2 - - ! iter=1 : work on first block - ! iter=2 : work on remaining blocks - ! This is done in 2 iterations so that we have a barrier in between: - ! After the first iteration, it is guaranteed that the last row of the last block - ! is completed by the next thread. - ! After the first iteration it is also the place to exchange the last row - ! with MPI calls -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, & -!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads) - do my_thread = 1, max_threads - - if (iter == 1) then - my_block_s = omp_block_limits(my_thread-1) + 1 - my_block_e = my_block_s - else - my_block_s = omp_block_limits(my_thread-1) + 2 - my_block_e = omp_block_limits(my_thread) - endif - - do iblk = my_block_s, my_block_e - - ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block - ne = ns+nb-1 ! last column in block - - if (istep<my_thread .or. ns+n_off>na) exit - - hv = hv_t(:,my_thread) - tau = tau_t(my_thread) - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block - nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!) - ! Note that nr>=0 implies that diagonal block is full (nc==nb)! - - ! Transform diagonal block -#ifdef DOUBLE_PRECISION_COMPLEX - call ZHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk, 0.0_rk), hd, 1) -#else - call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk, 0.0_rk), hd, 1) -#endif - x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau) - hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc) -#ifdef DOUBLE_PRECISION_COMPLEX - call ZHER2('L', nc, (-1.0_rk, 0.0_rk), hd, 1, hv, 1, ab(1,ns), 2*nb-1) -#else - call CHER2('L', nc, (-1.0_rk, 0.0_rk), hd, 1, hv, 1, ab(1,ns), 2*nb-1) -#endif - - hv_t(:,my_thread) = 0 - tau_t(my_thread) = 0 - - if (nr<=0) cycle ! No subdiagonal block present any more - - ! Transform subdiagonal block -#ifdef DOUBLE_PRECISION_COMPLEX - call ZGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk,0.0_rk), hs, 1) -#else - call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk,0.0_rk), hs, 1) -#endif - - if (nr>1) then - - ! complete (old) Householder transformation for first column - - ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1 - - ! calculate new Householder transformation for first column - ! (stored in hv_t(:,my_thread) and tau_t(my_thread)) - - vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2) - call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread)) - hv_t(1 ,my_thread) = 1. - hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf - ab(nb+2:,ns) = 0 - - ! update subdiagonal block for old and new Householder transformation - !
This way we can use a nonsymmetric rank 2 update which is (hopefully) faster -#ifdef DOUBLE_PRECISION_COMPLEX - call ZGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk,0.0_rk), h(2), 1) -#else - call CGEMV('C', nr, nb-1, tau_t(my_thread), ab(nb,ns+1), 2*nb-1, hv_t(1,my_thread), 1, (0.0_rk,0.0_rk), h(2), 1) -#endif - x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread) - h(2:nb) = h(2:nb) - x*hv(2:nb) - ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2") - do i=2,nb - ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) & - - hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i)) - enddo - - else - - ! No new Householder transformation for nr=1, just complete the old one - ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1 - do i=2,nb - ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i)) - enddo - ! For safety: there is one remaining dummy transformation (but tau is 0 anyways) - hv_t(1,my_thread) = 1. - - endif - - enddo - - enddo ! my_thread -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - - if (iter==1) then - ! We are at the end of the first block - - ! Send our first column to previous PE - if (my_pe>0 .and. na_s <= na) then -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) -#endif - ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - ! Request last column from next PE - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne <= na) then - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX16, my_pe+1, 1, mpi_comm, mpi_status, mpierr) -#else - call mpi_recv(ab(1,ne-n_off), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr) -#endif - -#else /* WITH_MPI */ - ab(1:nb+1,ne-n_off) = ab_s(1:nb+1) - -#endif /* WITH_MPI */ - endif - - else - ! We are at the end of all blocks - - ! Send last HH vector and TAU to next PE if it has been calculated above - ne = na_s + nblocks*nb - (max_threads-1) - 1 - if (istep>=max_threads .and. ne < na) then - -#ifdef WITH_MPI - call mpi_wait(ireq_hv,mpi_status,mpierr) -#endif - hv_s(1) = tau_t(max_threads) - hv_s(2:) = hv_t(2:,max_threads) -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#else - call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - ! "Send" HH vector and TAU to next OpenMP thread - do my_thread = max_threads, 2, -1 - hv_t(:,my_thread) = hv_t(:,my_thread-1) - tau_t(my_thread) = tau_t(my_thread-1) - enddo - - endif - enddo ! iter - - else - - ! Codepath for 1 thread without OpenMP - - ! The following code is structured in a way to keep waiting times for - ! other PEs at a minimum, especially if there is only one block. - ! For this reason, it requests the last column as late as possible - ! and sends the Householder vector and the first column as early - ! as possible. - -#endif /* WITH_OPENMP */ - - !#ifdef WITH_GPU_VERSION - ! call cpu_time(start) - !#endif - do iblk=1,nblocks - - ns = na_s + (iblk-1)*nb - n_off ! first column in block - ne = ns+nb-1 ! 
last column in block - - if (ns+n_off>na) exit - - ! Store Householder vector for back transformation - - hh_cnt(iblk) = hh_cnt(iblk) + 1 - - hh_gath(1 ,hh_cnt(iblk),iblk) = tau - hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb) - - -#ifndef WITH_OPENMP - if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then - ! Wait for last transfer to finish -#ifdef WITH_MPI - call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr) -#endif - ! Copy vectors into send buffer - hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk) - ! Send to destination -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#else - call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX8, & - global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols)), & - 10+iblk, mpi_comm, ireq_hhs(iblk), mpierr) -#endif -#else /* WITH_MPI */ - startAddr = startAddr - hh_cnt(iblk) - hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk) -#endif /* WITH_MPI */ - ! Reset counter and increase destination row - hh_cnt(iblk) = 0 - hh_dst(iblk) = hh_dst(iblk)+1 - endif
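Spelling out the chunking above: block iblk accumulates Householder vectors in hh_gath until one destination row's worth is complete, then ships the chunk asynchronously.

      ! chunk size  = snd_limits(hh_dst(iblk)+1,iblk) - snd_limits(hh_dst(iblk),iblk)
      !               (the per-row workload from determine_workload)
      ! destination = global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1,np_cols))
      ! message tag = 10+iblk, so chunks of different blocks stay apart;
      ! the matching mpi_irecv into hh_trans_complex was posted further up.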
-#ifdef WITH_MPI
-
-#ifdef WITH_OPENMP
-             call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
-
-#else
-             call mpi_recv(ab(1,ne), nb+1, MPI_COMPLEX8, my_pe+1, 1, mpi_comm, MPI_STATUS_IGNORE, mpierr)
-#endif
-
-#else /* WITH_MPI */
-             ab(1:nb+1,ne) = ab_s(1:nb+1)
-#endif /* WITH_MPI */
-
-             ! ... and complete the result
-             hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
-             hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
-
-           else
-             ! Normal matrix multiply
-             call CHEMV('L', nc, tau, ab(1,ns), 2*nb-1, hv, 1, (0.0_rk,0.0_rk), hd, 1)
-             if (nr>0) call CGEMV('N', nr, nb, tau, ab(nb+1,ns), 2*nb-1, hv, 1, (0.0_rk,0.0_rk), hs, 1)
-
-           endif
-#endif /* DOUBLE_PRECISION_COMPLEX */
-
-           ! Calculate first column of subdiagonal block and calculate new
-           ! Householder transformation for this column
-
-           hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
-           !#ifdef WITH_GPU_VERSION
-           !  istat = cuda_memset(hv_new_dev, 0, nb*size_of_complex_datatype)
-           !  if (istat .ne. 0) print *, " cuda memset failed hv_new_dev", istat
-           !#endif
-           tau_new = 0
-
-           if (nr>0) then
-
-             ! complete (old) Householder transformation for first column
-
-             ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
-
-             ! calculate new Householder transformation ...
-             if (nr>1) then
-#ifdef DOUBLE_PRECISION_COMPLEX
-               vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk)**2+dimag(ab(nb+2:nb+nr,ns))**2)
-#else
-               vnorm2 = sum(real(ab(nb+2:nb+nr,ns),kind=rk)**2+aimag(ab(nb+2:nb+nr,ns))**2)
-#endif
-               call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_new)
-               hv_new(1) = 1.
-               hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
-               ab(nb+2:,ns) = 0
-             endif
-
-             ! ... and send it away immediately if this is the last block
-
-             if (iblk==nblocks) then
-#ifdef WITH_MPI
-
-#ifdef WITH_OPENMP
-               call mpi_wait(ireq_hv,mpi_status,mpierr)
-#else
-               call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
-#endif
-
-#endif /* WITH_MPI */
-               hv_s(1) = tau_new
-               hv_s(2:) = hv_new(2:)
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-               call mpi_isend(hv_s, nb, MPI_COMPLEX16, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
-#else
-               call mpi_isend(hv_s, nb, MPI_COMPLEX8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-             endif
-
-           endif
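hh_transform_complex, whose interface is taken from the call above, determines tau_new and a scaling hf such that hv_new represents a Householder vector v with v(1) = 1. A minimal, self-contained sketch (with illustrative values) of how such a reflector H = I - tau*v*v^H acts on a vector:

    program reflector_demo
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: n = 3
      complex(dp) :: v(n), y(n), tau
      v   = [(1.0_dp,0.0_dp), (0.5_dp,0.5_dp), (0.0_dp,-0.5_dp)] ! v(1) == 1 by construction
      tau = (1.2_dp,0.3_dp)                                      ! illustrative value
      y   = [(2.0_dp,0.0_dp), (1.0_dp,1.0_dp), (0.0_dp,1.0_dp)]
      ! H*y = y - tau*v*(v^H y); dot_product conjugates its first argument
      y = y - tau * v * dot_product(v, y)
      print *, y
    end program reflector_demo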
-
-
-           ! Transform diagonal block
-           x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
-           hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
-
-           !#ifdef WITH_GPU_VERSION
-           !  istat = cuda_memcpy2d((ab_dev + (ns-1)*2*nb*size_of_complex_datatype), 2*nb*size_of_complex_datatype, loc(a(1,ns)), 2*nb*size_of_complex_datatype, 2*size_of_complex_datatype, &
-           !                        2*nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
-           !  if (istat .ne. 0) print *, "cuda memcpy a_dev H2D failed ", istat
-           !  istat = cuda_memcpy(hv_dev, loc(hv), nc*size_of_complex_datatype, cudaMemcpyHostToDevice)
-           !  if (istat .ne. 0) print *, "cuda memcpy failed hv_dev", istat
-           !  istat = cuda_memcpy(hd_dev, loc(hd), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
-           !  if (istat .ne. 0) print *, "cuda memcpy failed hd_dev", istat
-           !#endif
-
-           if (my_pe>0 .and. iblk==1) then
-
-             ! The first column of the diagonal block has to be sent to the previous PE
-             ! Calculate first column only ...
-
-             !#ifdef WITH_GPU_VERSION
-             !  call double_hh_transform_2( ns, nc, nb )
-             !  istat = cuda_memcpy(loc(ab), ab_dev, (2*nb*(nblocks+1)*nb)*size_of_complex_datatype, cudaMemcpyDeviceToHost)
-             !  if (istat .ne. 0) print *, " cuda memcpy failed ab ", istat
-             !#else
-             ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
-             !#endif
-
-             ! ... send it away ...
-
-#ifdef WITH_MPI
-
-#ifdef WITH_OPENMP
-             call mpi_wait(ireq_ab,mpi_status,mpierr)
-#else
-             call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
-#endif
-
-#endif /* WITH_MPI */
-             ab_s(1:nb+1) = ab(1:nb+1,ns)
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-             call mpi_isend(ab_s, nb+1, MPI_COMPLEX16, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
-#else
-             call mpi_isend(ab_s, nb+1, MPI_COMPLEX8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-
-             ! ... and calculate remaining columns with rank-2 update
-             if (nc>1) then
-
-               !#ifdef WITH_GPU_VERSION
-               !  call cublas_ZHER2( 'L', nc-1, (-1.d0,0.d0), hd_dev + 1*16, 1, hv_dev + 1*16, 1, ab_dev + (ns*2*nb)*16, 2*nb-1)
-               !#else
-#ifdef DOUBLE_PRECISION_COMPLEX
-               call ZHER2('L', nc-1, (-1.0_rk,0.0_rk), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
-#else
-               call CHER2('L', nc-1, (-1.0_rk,0.0_rk), hd(2), 1, hv(2), 1, ab(1,ns+1), 2*nb-1)
-#endif
-               !#endif
-             endif
-           else
-
-             ! No need to send, just a rank-2 update
-             !#ifdef WITH_GPU_VERSION
-             !  call cublas_ZHER2( 'L', nc, (-1.d0,0.d0), hd_dev, 1, hv_dev, 1, ab_dev + ((ns-1)*2*nb)*16, 2*nb-1)
-             !#else
-#ifdef DOUBLE_PRECISION_COMPLEX
-             call ZHER2('L', nc, (-1.0_rk,0.0_rk), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
-#else
-             call CHER2('L', nc, (-1.0_rk,0.0_rk), hd, 1, hv, 1, ab(1,ns), 2*nb-1)
-#endif
-             !#endif
-           endif
-
-           !#ifdef WITH_GPU_VERSION
-           !  istat = cuda_memcpy(loc(hd), hd_dev, nb*size_of_complex_datatype, cudaMemcpyDeviceToHost)
-           !  if (istat .ne. 0) print *, "cuda memcpy failed hd_dev", istat
-           !#endif
-
-           ! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
-
-           !#ifdef WITH_GPU_VERSION
-           !  istat = cuda_memcpy(hs_dev, loc(hs), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
-           !  if (istat .ne. 0) print *, "cuda memcpy failed hs_dev", istat
-           !#endif
-
-           if (nr>0) then
-             if (nr>1) then
-               !#ifdef WITH_GPU_VERSION
-               !  istat = cuda_memcpy(hv_new_dev, loc(hv_new), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
-               !  if (istat .ne. 0) print *, "cuda memcpy failed hv_new_dev", istat
-               !
-               !  istat = cuda_memcpy(h_dev, loc(h), nb*size_of_complex_datatype, cudaMemcpyHostToDevice)
-               !  if (istat .ne. 0) print *, "cuda memcpy failed h_dev", istat
-               !
-               !  call cublas_ZGEMV('C', nr, nb-1, tau_new, ab_dev + (nb-1 + ns*2*nb)*16, 2*nb-1, hv_new_dev, 1, (0.d0,0.d0), h_dev + 1*16, 1)
-               !
-               !  istat = cuda_memcpy(tau_new_dev, loc(tau_new), 1*size_of_complex_datatype, cudaMemcpyHostToDevice)
-               !  if (istat .ne. 0) print *, "cuda memcpy failed tau_new_dev", istat
-               !
-               !  call dot_product_kernel(nr, tau_new)
-               !  call dot_product_kernel_1( nb, nr, ns)
-               !
-               !  istat = cuda_memcpy(loc(x), x_dev, 1*size_of_complex_datatype, cudaMemcpyDeviceToHost)
-               !  if (istat .ne. 0) print *, " cuda memcpy failed x_dev ", istat
-               !
-               !  istat = cuda_memcpy(loc(h), h_dev, nb*size_of_complex_datatype, cudaMemcpyDeviceToHost)
-               !  if (istat .ne. 0) print *, " cuda memcpy failed h ", istat
-               !#else
-#ifdef DOUBLE_PRECISION_COMPLEX
-               call ZGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk, 0.0_rk), h(2), 1)
-#else
-               call CGEMV('C', nr, nb-1, tau_new, ab(nb,ns+1), 2*nb-1, hv_new, 1, (0.0_rk, 0.0_rk), h(2), 1)
-#endif
-               x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
-               h(2:nb) = h(2:nb) - x*hv(2:nb)
-               ! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
-               do i=2,nb
-                 ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
-               enddo
-               !#endif
-             else
-               ! No double Householder transformation for nr=1, just complete the row
-               !#ifdef WITH_GPU_VERSION
-               !  call double_hh_transform_1(nb, ns)
-               !#else
-               do i=2,nb
-                 ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
-               enddo
-               !#endif
-
-             endif
-           endif
-
-           ! Use new HH vector for the next block
-           hv(:) = hv_new(:)
-           tau = tau_new
-
-         enddo
-         !#ifdef WITH_GPU_VERSION
-         !  call cpu_time(finish)
-         !  tstep2 = finish-start
-         !  t_2 = t_2 + tstep2
-         !#endif
-#ifdef WITH_OPENMP
-       endif
-#endif
-
-#ifdef WITH_OPENMP
-       do iblk = 1, nblocks
-
-         if (hh_dst(iblk) >= np_rows) exit
-         if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
-
-         if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
-           ! Wait for last transfer to finish
-#ifdef WITH_MPI
-           call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr)
-#endif
-           ! Copy vectors into send buffer
-           hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
-           ! Send to destination
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-           call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, &
-                          global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
-                          10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
-#else
-           call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX8, &
-                          global_id(hh_dst(iblk), mod(iblk+block_limits(my_pe)-1, np_cols)), &
-                          10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
-#endif
-
-#else /* WITH_MPI */
-           startAddr = startAddr - hh_cnt(iblk)
-           hh_trans_complex(1:nb,startAddr+1:startAddr+hh_cnt(iblk)) = hh_send(1:nb,1:hh_cnt(iblk),iblk)
-#endif /* WITH_MPI */
-           ! Reset counter and increase destination row
-           hh_cnt(iblk) = 0
-           hh_dst(iblk) = hh_dst(iblk)+1
-         endif
-       enddo
-#endif /* WITH_OPENMP */
-     enddo
-!#ifdef WITH_GPU_VERSION
-!  call cpu_time(finish_1)
-!  tstep1 = finish_1-start_1
-!  t_1 = t_1 + tstep1
-!#endif
-
-     ! Finish the last outstanding requests
-#ifdef WITH_OPENMP
-
-#ifdef WITH_MPI
-     call mpi_wait(ireq_ab,mpi_status,mpierr)
-     call mpi_wait(ireq_hv,mpi_status,mpierr)
-
-     allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when allocating mpi_statuses "//errorMessage
-       stop
-     endif
-     call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr)
-     call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr)
-     deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating mpi_statuses "//errorMessage
-       stop
-     endif
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_MPI
-     call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
-     call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
-
-     call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
-     call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
-#endif
-
-#endif /* WITH_OPENMP */
-
-#ifdef WITH_MPI
-     call mpi_barrier(mpi_comm,mpierr)
-#endif
-     deallocate(ab, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating ab "//errorMessage
-       stop
-     endif
-
-!#ifdef WITH_GPU_VERSION
-!  deallocate(ab_temp, stat=istat, errmsg=errorMessage)
-!  if (istat .ne. 0) then
-!    print *,"error when deallocating ab_temp "//errorMessage
-!    stop
-!  endif
-!
-!#endif
-
-     deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating ireq_hhr, ireq_hhs "//errorMessage
-       stop
-     endif
-
-     deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating hh_cnt, hh_dst "//errorMessage
-       stop
-     endif
-
-     deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating hh_gath, hh_send "//errorMessage
-       stop
-     endif
-
-     deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating limits, snd_limits "//errorMessage
-       stop
-     endif
-
-     deallocate(block_limits, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating block_limits "//errorMessage
-       stop
-     endif
-
-     deallocate(global_id, stat=istat, errmsg=errorMessage)
-     if (istat .ne. 0) then
-       print *,"tridiag_band_complex: error when deallocating global_id "//errorMessage
-       stop
-     endif
-
-     !#ifdef WITH_GPU_VERSION
-     !  istat = cuda_free(ab_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !  istat = cuda_free(hv_new_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !  istat = cuda_free(hs_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !  istat = cuda_free(h_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !  istat = cuda_free(tau_new_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !  istat = cuda_free(x_dev)
-     !  if (istat .ne. 0) then
-     !    print *,"error in cudaFree"
-     !    stop
-     !  endif
-     !
-     !#endif
-#ifdef HAVE_DETAILED_TIMINGS
-     call timer%stop("tridiag_band_complex")
-#endif
-
-     !#ifdef WITH_GPU_VERSION
-     !  contains
-     !
-     !    subroutine dot_product_kernel(nr, tau_new)
-     !      implicit none
-     !      integer, intent(in)    :: nr
-     !      complex*16, intent(in) :: tau_new
-     !
-     !      call launch_dot_product_kernel( hs_dev, hv_new_dev, tau_new, x_dev, h_dev, hv_dev, nr )
-     !    end subroutine
-     !
-     !    subroutine dot_product_kernel_1( nb, nr, ns)
-     !      implicit none
-     !      integer, intent(in) :: nb, nr, ns
-     !
-     !      call launch_dot_product_kernel_1(ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns)
-     !    end subroutine
-     !
-     !    subroutine double_hh_transform_1( nb, ns)
-     !      implicit none
-     !      integer, intent(in) :: nb, ns
-     !
-     !      call launch_double_hh_transform_1(ab_dev, hs_dev, hv_dev, nb, ns)
-     !    end subroutine
-     !
-     !    subroutine double_hh_transform_2( ns, nc, nb)
-     !      implicit none
-     !      integer, intent(in) :: nc, ns, nb
-     !
-     !      call launch_double_hh_transform_2(ab_dev, hd_dev, hv_dev, nc, ns, nb)
-     !    end subroutine
-     !#endif
-   end subroutine tridiag_band_complex ! has to be checked for GPU
-
-#define ATODEV  istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyHostToDevice)
-#define ATOHOST istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
-
-   subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, &
-                                             hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
-                                             wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
-
-    !-------------------------------------------------------------------------------
-    !  trans_ev_tridi_to_band_complex:
-    !  Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
-    !
-    !  Parameters
-    !
-    !  na          Order of matrix a, number of rows of matrix q
-    !
-    !  nev         Number of eigenvectors to compute (= columns of matrix q)
-    !
-    !  nblk        blocksize of cyclic distribution, must be the same in both directions!
-    !
-    !  nbw         semi bandwidth
-    !
-    !  q           On input: Eigenvectors of tridiagonal matrix
-    !              On output: Transformed eigenvectors
-    !              Distribution is like in Scalapack.
-    !
-    !  ldq         Leading dimension of q
-    !  matrixCols  local columns of matrix q
-    !
-    !  mpi_comm_rows
-    !  mpi_comm_cols
-    !              MPI-Communicators for rows/columns
-    !
-    !-------------------------------------------------------------------------------
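A minimal call sketch for this routine; the distributed setup (communicators, block-cyclic q, and hh_trans_complex from the band reduction) is assumed to exist as in the accompanying test programs, and kernel_choice is only a placeholder name:

    ! sketch only: na, nev, nblk, nbw, q, ldq, matrixCols, the communicators
    ! and kernel_choice are assumed to be set up beforehand
    logical :: success
    call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, &
                                        hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
                                        .false., .false., success, kernel_choice)
    if (.not. success) stop 'trans_ev_tridi_to_band_complex failed'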
-#ifdef HAVE_DETAILED_TIMINGS
-    use timings
-#endif
-    use pack_unpack_complex
-    use compute_hh_trafo_complex
-    use precision
-    use cuda_functions
-    use iso_c_binding
-    implicit none
-
-    logical, intent(in)           :: useGPU
-    integer(kind=ik), intent(in)  :: THIS_COMPLEX_ELPA_KERNEL
-    integer(kind=ik), intent(in)  :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
-#ifdef DESPERATELY_WANT_ASSUMED_SIZE
-    complex(kind=ck)              :: q(ldq,*)
-#else
-    complex(kind=ck)              :: q(ldq,matrixCols)
-#endif
-    complex(kind=ck)              :: hh_trans_complex(:,:)
-    integer(kind=ik)              :: np_rows, my_prow, np_cols, my_pcol
-    integer(kind=ik)              :: tmp
-
-    integer(kind=ik)              :: i, j, ip, sweep, nbuf, l_nev, a_dim2
-    integer(kind=ik)              :: current_n, current_local_n, current_n_start, current_n_end
-    integer(kind=ik)              :: next_n, next_local_n, next_n_start, next_n_end
-    integer(kind=ik)              :: bottom_msg_length, top_msg_length, next_top_msg_length
-    integer(kind=ik)              :: stripe_width, last_stripe_width, stripe_count
-#ifdef WITH_OPENMP
-    integer(kind=ik)              :: thread_width, csw, b_off, b_len
-#endif
-    integer(kind=ik)              :: num_result_blocks, num_result_buffers, num_bufs_recvd
-    integer(kind=ik)              :: a_off, current_tv_off, max_blk_size
-    integer(kind=ik)              :: mpierr, src, src_offset, dst, offset, nfact, num_blk
-    logical                       :: flag
-
-#ifdef WITH_OPENMP
-    complex(kind=ck), allocatable :: a(:,:,:,:), row(:)
-#else
-    complex(kind=ck), allocatable :: a(:,:,:), row(:)
-#endif
-
-    complex(kind=ck), allocatable :: row_group(:,:)
-
-#ifdef WITH_OPENMP
-    complex(kind=ck), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
-    complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
-#else
-    complex(kind=ck), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
-    complex(kind=ck), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
-#endif
-    integer(kind=c_intptr_t)      :: a_dev
-    integer(kind=c_intptr_t)      :: bcast_buffer_dev
-    integer(kind=c_size_t)        :: num
-    integer(kind=c_size_t)        :: dev_offset, dev_offset_1, dev_offset_2
-
-
-    integer(kind=c_intptr_t)      :: row_dev
-    integer(kind=c_intptr_t)      :: row_group_dev
-    integer(kind=c_intptr_t)      :: hh_tau_dev
-    integer(kind=c_intptr_t)      :: hh_dot_dev
-    integer(kind=ik)              :: row_group_size, unpack_idx
-    integer(kind=ik)              :: n_times
-
-    integer(kind=ik)              :: top, chunk, this_chunk
-    complex(kind=ck), allocatable :: result_buffer(:,:,:)
-    complex(kind=ck), allocatable :: bcast_buffer(:,:)
-
-    integer(kind=ik)              :: n_off
-    integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
-    integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
-    integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
-#ifdef WITH_OPENMP
-    integer(kind=ik), allocatable :: mpi_statuses(:,:)
-#ifdef WITH_MPI
-    integer(kind=ik)              :: mpi_status(MPI_STATUS_SIZE)
-#endif
-#endif
-
-#ifdef WITH_MPI
-    integer(kind=ik), external    :: numroc
-#endif
-    integer(kind=ik)              :: na_rows, na_cols
-!    real*8                       :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel, t1_compute_kernel, &
-!                                    t0_mpi_time, t1_mpi_time, t2_mpi_time
-!    real*8                       :: t0_cpu_code, t1_cpu_code, t2_cpu_code, t0_block_time, t1_block_time, t2_block_time, t0_cuda_memcpy
-!    real*8                       :: t0_inner_do_time, t1_inner_do_time, t2_inner_do_time, t0_outer_do_time, t1_outer_do_time, &
-!                                    t2_outer_do_time, t0_result_time, t1_result_time, t2_result_time, t0_mpi_recv_time, &
-!                                    t1_mpi_recv_time, t2_mpi_recv_time
-!    real*8                       :: t1_mpi_wait_time, t0_mpi_wait_time, t2_mpi_wait_time, t1_memcpy_time, t0_memcpy_time, t2_memcpy_time, &
-!                                    t1_mpi_irecv_time, t0_mpi_irecv_time, t2_mpi_irecv_time, t0_mpi_outer_wait_time, t1_mpi_outer_wait_time, &
-!                                    t2_mpi_outer_wait_time, time0
-!    real*4                       :: time1
-
-    ! MPI send/recv tags, arbitrary
-
-    integer(kind=ik), parameter   :: bottom_recv_tag = 111
-    integer(kind=ik), parameter   :: top_recv_tag    = 222
-    integer(kind=ik), parameter   :: result_recv_tag = 333
-
-#ifdef WITH_OPENMP
-    integer(kind=ik)              :: max_threads, my_thread
-    integer(kind=ik)              :: omp_get_max_threads
-#endif
-
-    ! Just for measuring the kernel performance
-    real(kind=c_double)           :: kernel_time ! MPI_WTIME always needs double
-    ! long integer
-    integer(kind=lik)             :: kernel_flops
-
-    logical, intent(in)           :: wantDebug
-    integer(kind=ik)              :: istat
-    character(200)                :: errorMessage
-    logical                       :: successCUDA
-    logical                       :: success
-#ifndef WITH_MPI
-    integer(kind=ik)              :: j1
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-    call timer%start("trans_ev_tridi_to_band_complex")
-#endif
-
-    if (useGPU) then
-      n_times = 0
-      ! n_times_1 = 0
-      unpack_idx = 0
-      row_group_size = 0
-      ! time0 = 0
-      ! t0_compute_kernel = 0
-    endif
-
-    kernel_time  = 0.0
-    kernel_flops = 0
-
-#ifdef WITH_OPENMP
-    max_threads = 1
-    max_threads = omp_get_max_threads()
-#endif
-    call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
-    call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
-    call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
-    call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
-
-    if (useGPU) then
-#ifdef WITH_MPI
-      na_rows = numroc(na, nblk, my_prow, 0, np_rows)
-      na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
-#else
-      na_rows = na
-      na_cols = na
-#endif
-    endif
-
-    success = .true.
-    if (mod(nbw,nblk)/=0) then
-      if (my_prow==0 .and. my_pcol==0) then
-        if (wantDebug) then
-          write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk
-          write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk'
-        endif
-
-        success = .false.
-        return
-      endif
-    endif
-
-    nfact = nbw / nblk
-
-
-    ! local number of eigenvectors
-    l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
-
-    if (l_nev==0) then
-#ifdef WITH_OPENMP
-      if (useGPU) then
-        print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-        stop
-      endif
-      thread_width = 0
-#endif
-      stripe_width = 0
-      stripe_count = 0
-      last_stripe_width = 0
-    else
-      ! Suggested stripe width is 48 - should this be reduced for the complex case ???
-#ifdef WITH_OPENMP
-      thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
-#endif
-      if (useGPU) then
-        stripe_width = 256
-      else
-        stripe_width = 48 ! Must be a multiple of 4
-      endif
-
-#ifdef WITH_OPENMP
-      if (useGPU) then
-        print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-        stop
-      endif
-      stripe_count = (thread_width-1)/stripe_width + 1
-#else /* WITH_OPENMP */
-
-      stripe_count = (l_nev-1)/stripe_width + 1
-#endif /* WITH_OPENMP */
-
-      ! Adapt stripe width so that last one doesn't get too small
-#ifdef WITH_OPENMP
-      if (useGPU) then
-        print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-        stop
-      endif
-
-      stripe_width = (thread_width-1)/stripe_count + 1
-#else /* WITH_OPENMP */
-
-      if (.not.(useGPU)) then
-        stripe_width = (l_nev-1)/stripe_count + 1
-      endif
-
-#endif /* WITH_OPENMP */
-      if (.not.(useGPU)) then
-        stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
-      endif
-#ifndef WITH_OPENMP
-      last_stripe_width = l_nev - (stripe_count-1)*stripe_width
-#endif /* WITH_OPENMP */
-    endif
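The net effect of this bookkeeping for a concrete, made-up local eigenvector count (the non-GPU, non-OpenMP branch), as a self-contained check:

    program stripe_demo
      implicit none
      integer :: l_nev, stripe_width, stripe_count, last_stripe_width
      l_nev = 100                                  ! illustrative local number of eigenvectors
      stripe_width = 48                            ! suggested width (CPU case)
      stripe_count = (l_nev-1)/stripe_width + 1    ! -> 3 stripes
      stripe_width = (l_nev-1)/stripe_count + 1    ! -> 34, so the last stripe is not tiny
      stripe_width = ((stripe_width+3)/4)*4        ! -> 36, rounded up to a multiple of 4
      last_stripe_width = l_nev - (stripe_count-1)*stripe_width ! -> 28
      print *, stripe_count, stripe_width, last_stripe_width
    end program stripe_demo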
-
-    ! Determine the matrix distribution at the beginning
-
-    allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage
-      stop
-    endif
-
-    call determine_workload(na, nbw, np_rows, limits)
-
-    max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
-
-    a_dim2 = max_blk_size + nbw
-!DEC$ ATTRIBUTES ALIGN: 64:: a
-
-#ifdef WITH_OPENMP
-    if (useGPU) then
-      print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-      stop
-    endif
-
-    if (.not.(useGPU)) then
-      allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
-        stop
-      endif
-
-      ! a(:,:,:,:) should be set to 0 in a parallel region, not here!
-    endif
-
-#else /* OpenMP */
-
-    if (.not.(useGPU)) then
-      allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
-        stop
-      endif
-
-      a(:,:,:) = 0
-    endif
-
-#endif /* WITH_OPENMP */
-
-    allocate(row(l_nev), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage
-      stop
-    endif
-
-    row(:) = 0
-
-    if (useGPU) then
-      num = (stripe_width*a_dim2*stripe_count)*size_of_complex_datatype
-      if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
-        print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
-        ! stop
-      endif
-
-      successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
-        stop
-      endif
-
-      if (num .gt. na_rows * na_cols) then
-        print *,"trans_ev_tridi_to_band_complex a_dev 1",num, na_rows * na_cols
-        ! stop
-      endif
-      successCUDA = cuda_memset(a_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
-        stop
-      endif
-
-      num = (l_nev)*size_of_complex_datatype
-      successCUDA = cuda_malloc( row_dev, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
-        stop
-      endif
-
-      successCUDA = cuda_memset(row_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
-        stop
-      endif
-
-      ! "row_group" and "row_group_dev" are needed for GPU optimizations
-      allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
-      if (istat .ne. 0) then
-        print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage
-        stop
-      endif
-
-      row_group(:, :) = 0
-
-      num = (l_nev*nblk)*size_of_complex_datatype
-      successCUDA = cuda_malloc(row_group_dev, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
-        stop
-      endif
-
-      successCUDA = cuda_memset(row_group_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
-        stop
-      endif
-    endif ! useGPU
-
-    ! Copy q from a block cyclic distribution into a distribution with contiguous rows,
-    ! and transpose the matrix using stripes of given stripe_width for cache blocking.
-
-    ! The peculiar way it is done below is due to the fact that the last row should be
-    ! ready first since it is the first one to start below
-#ifdef WITH_OPENMP
-    if (useGPU) then
-      print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-      stop
-    endif
-
-    ! Please note about the OMP usage below:
-    ! This is not for speed, but because we want the matrix a in the memory and
-    ! in the cache of the correct thread (if possible)
-#ifdef HAVE_DETAILED_TIMINGS
-    call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread), schedule(static, 1)
-    do my_thread = 1, max_threads
-      a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation!
-    enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-    call timer%stop("OpenMP parallel")
-#endif
-
-#endif /* WITH_OPENMP */
-
-    do ip = np_rows-1, 0, -1
-      if (my_prow == ip) then
-        ! Receive my rows which have not yet been received
-        src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
-        do i=limits(ip)+1,limits(ip+1)
-          src = mod((i-1)/nblk, np_rows)
-          if (src < my_prow) then
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-            call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
-#else
-            call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
-            row(1:l_nev) = row(1:l_nev)
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-            if (useGPU) then
-              call unpack_and_prepare_row_group_complex_gpu(i - limits(ip), .false.)
-#ifdef WITH_MPI
-              call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
-#endif
-            else
-#ifdef WITH_MPI
-              call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row(1:l_nev) = row(1:l_nev)
-#endif
-            endif
-#else /* DOUBLE_PRECISION_COMPLEX */
-
-            if (useGPU) then
-              call unpack_and_prepare_row_group_complex_gpu(i - limits(ip), .false.)
-#ifdef WITH_MPI
-              call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
-#endif
-            else
-#ifdef WITH_MPI
-              call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row(1:l_nev) = row(1:l_nev)
-
-#endif
-            endif
-#endif /* DOUBLE_PRECISION_COMPLEX */
-
-#endif /* WITH_OPENMP */
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread), schedule(static, 1)
-            do my_thread = 1, max_threads
-              call unpack_row_complex_cpu_openmp(a, row, i-limits(ip), my_thread, &
-                                                 stripe_count, thread_width, stripe_width, l_nev)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-
-            if (.not.(useGPU)) then
-              call unpack_row_complex_cpu(a, row, i-limits(ip), stripe_count, stripe_width, last_stripe_width)
-            endif
-
-#endif /* WITH_OPENMP */
-
-          elseif (src==my_prow) then
-            src_offset = src_offset+1
-            if (useGPU) then
-              call unpack_and_prepare_row_group_complex_gpu(i - limits(ip), .false.)
-              row_group(:, row_group_size) = q(src_offset, 1:l_nev)
-            else
-              row(:) = q(src_offset, 1:l_nev)
-            endif
-
-#ifdef WITH_OPENMP
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread), schedule(static, 1)
-            do my_thread = 1, max_threads
-              call unpack_row_complex_cpu_openmp(a, row, i-limits(ip), my_thread, &
-                                                 stripe_count, thread_width, stripe_width, l_nev)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-
-            if (.not.(useGPU)) then
-              call unpack_row_complex_cpu(a, row, i-limits(ip), stripe_count, stripe_width, last_stripe_width)
-            endif
-
-#endif /* WITH_OPENMP */
-
-          endif
-        enddo
-        ! Send all rows which have not yet been sent
-        src_offset = 0
-        do dst = 0, ip-1
-          do i=limits(dst)+1,limits(dst+1)
-            if (mod((i-1)/nblk, np_rows) == my_prow) then
-              src_offset = src_offset+1
-              row(:) = q(src_offset, 1:l_nev)
-
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr)
-#else
-              call MPI_Send(row, l_nev, MPI_COMPLEX8, dst, 0, mpi_comm_rows, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-            endif
-          enddo
-        enddo
-      else if (my_prow < ip) then
-        ! Send all rows going to PE ip
-        src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
-        do i=limits(ip)+1,limits(ip+1)
-          src = mod((i-1)/nblk, np_rows)
-          if (src == my_prow) then
-            src_offset = src_offset+1
-            row(:) = q(src_offset, 1:l_nev)
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-            call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr)
-#else
-            call MPI_Send(row, l_nev, MPI_COMPLEX8, ip, 0, mpi_comm_rows, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-          endif
-        enddo
-        ! Receive all rows from PE ip
-        do i=limits(my_prow)+1,limits(my_prow+1)
-          src = mod((i-1)/nblk, np_rows)
-          if (src == ip) then
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-            call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
-#else
-            call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
-            row(1:l_nev) = row(1:l_nev)
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-            if (useGPU) then
-              call unpack_and_prepare_row_group_complex_gpu(i - limits(my_prow), .false.)
-#ifdef WITH_MPI
-              call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
-#endif
-            else
-#ifdef WITH_MPI
-              call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row(1:l_nev) = row(1:l_nev)
-#endif
-            endif
-#else /* DOUBLE_PRECISION_COMPLEX */
-            if (useGPU) then
-              call unpack_and_prepare_row_group_complex_gpu(i - limits(my_prow), .false.)
-#ifdef WITH_MPI
-              call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
-#endif
-            else
-#ifdef WITH_MPI
-              call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
-#else
-              row(1:l_nev) = row(1:l_nev)
-#endif
-            endif
-#endif /* DOUBLE_PRECISION_COMPLEX */
-
-#endif /* WITH_OPENMP */
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-!$omp parallel do private(my_thread), schedule(static, 1)
-            do my_thread = 1, max_threads
-              call unpack_row_complex_cpu_openmp(a, row, i-limits(my_prow), my_thread, &
-                                                 stripe_count, thread_width, stripe_width, l_nev)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-
-            if (.not.(useGPU)) then
-              call unpack_row_complex_cpu(a, row, i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
-            endif
-
-#endif /* WITH_OPENMP */
-
-          endif
-        enddo
-      endif
-    enddo
-
-    if (useGPU) then
-      call unpack_and_prepare_row_group_complex_gpu(-1, .true.)
-      successCUDA = cuda_devicesynchronize()
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaDeviceSynchronize"
-        stop
-      endif
-    endif
-
-    ! Set up result buffer queue
-
-    num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
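Worked example for the block count above, with made-up values: for na = 1000, nblk = 32, np_rows = 4 and my_prow = 1, the formula gives ((1000-1)/32 + 4 - 1)/4 = (31 + 3)/4 = 8 in integer arithmetic, i.e. this process row owns 8 of the nblk-row blocks of the result.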
-
-    num_result_buffers = 4*nfact
-    allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating result_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating result_send_request "//errorMessage
-      stop
-    endif
-
-    allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating result_recv_request "//errorMessage
-      stop
-    endif
-
-#ifdef WITH_MPI
-    result_send_request(:) = MPI_REQUEST_NULL
-    result_recv_request(:) = MPI_REQUEST_NULL
-#endif
-
-    ! Queue up buffers
-#ifdef WITH_MPI
-    if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
-      do j = 1, min(num_result_buffers, num_result_blocks)
-#ifdef DOUBLE_PRECISION_COMPLEX
-        call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, &
-                       mpi_comm_rows, result_recv_request(j), mpierr)
-#else
-        call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, &
-                       mpi_comm_rows, result_recv_request(j), mpierr)
-#endif
-      enddo
-    endif
-#else /* WITH_MPI */
-    ! careful: the "receive" has to be done at the corresponding wait or send
-    !if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
-    !  do j = 1, min(num_result_buffers, num_result_blocks)
-    !    result_buffer(1:l_nev*nblk,1,j) = result_buffer(1:l_nev*nblk,1,nbuf)
-    !  enddo
-    !endif
-
-#endif /* WITH_MPI */
-    num_bufs_recvd = 0 ! No buffers received yet
-
-    ! Initialize top/bottom requests
-
-    allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_send_request "//errorMessage
-      stop
-    endif
-
-    allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_recv_request "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_send_request "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_recv_request "//errorMessage
-      stop
-    endif
-
-#ifdef WITH_MPI
-    top_send_request(:) = MPI_REQUEST_NULL
-    top_recv_request(:) = MPI_REQUEST_NULL
-    bottom_send_request(:) = MPI_REQUEST_NULL
-    bottom_recv_request(:) = MPI_REQUEST_NULL
-#endif
-
-#ifdef WITH_OPENMP
-
-    if (useGPU) then
-      print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-      stop
-    endif
-
-    allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
-      stop
-    endif
-
-
-    top_border_send_buffer(:,:) = 0
-    top_border_recv_buffer(:,:) = 0
-    bottom_border_send_buffer(:,:) = 0
-    bottom_border_recv_buffer(:,:) = 0
-#else /* OpenMP */
-    allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
-      stop
-    endif
-
-    allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
-      stop
-    endif
-
-    top_border_send_buffer(:,:,:) = 0
-    top_border_recv_buffer(:,:,:) = 0
-    bottom_border_send_buffer(:,:,:) = 0
-    bottom_border_recv_buffer(:,:,:) = 0
-#endif /* WITH_OPENMP */
-
-    ! Initialize broadcast buffer
-
-    allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
-    if (istat .ne. 0) then
-      print *,"trans_ev_tridi_to_band_complex: error allocating bcast_buffer "//errorMessage
-      stop
-    endif
-    bcast_buffer = 0
-
-    if (useGPU) then
-      num = ( nbw * max_blk_size) * size_of_complex_datatype
-      successCUDA = cuda_malloc(bcast_buffer_dev, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
-        stop
-      endif
-
-      successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
-        stop
-      endif
-
-      num = ((max_blk_size-1))*size_of_complex_datatype
-      successCUDA = cuda_malloc( hh_dot_dev, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
-        stop
-      endif
-
-      successCUDA = cuda_memset( hh_dot_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
-        stop
-      endif
-
-      num = (max_blk_size)*size_of_complex_datatype
-      successCUDA = cuda_malloc( hh_tau_dev, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
-        stop
-      endif
-
-      successCUDA = cuda_memset( hh_tau_dev, 0, num)
-      if (.not.(successCUDA)) then
-        print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
-        stop
-      endif
-    endif ! useGPU
-
-    current_tv_off = 0 ! Offset of next row to be broadcast
-
-
-    ! ------------------- start of work loop -------------------
-
-    a_off = 0 ! offset in A (to avoid unnecessary shifts)
-
-    top_msg_length = 0
-    bottom_msg_length = 0
-
-#ifdef WITH_GPU_VERSION
-    !! istat = cuda_ProfilerStart()
-    !! istat = cudaFuncSetCacheConfig ( launch_compute_hh_trafo_c_kernel_complex, cudaFuncCachePreferShared)
-    !! t0_compute_kernel = 0
-    ! t0_mpi_time = 0
-    ! t0_cuda_memcpy = 0
-    ! t0_cpu_code = 0
-    ! t0_outer_do_time = 0
-    ! t0_inner_do_time = 0
-    ! t1_outer_do_time = MPI_Wtime()
-    ! t0_block_time = 0
-    ! t0_mpi_wait_time = 0
-    ! t0_memcpy_time = 0
-    ! t0_mpi_outer_wait_time = 0
-#endif
-
-    do sweep = 0, (na-1)/nbw
-
-#ifdef WITH_GPU_VERSION
-      ! t1_cpu_code = MPI_Wtime()
-#endif
-
-      current_n = na - sweep*nbw
-      call determine_workload(current_n, nbw, np_rows, limits)
-      current_n_start = limits(my_prow)
-      current_n_end   = limits(my_prow+1)
-      current_local_n = current_n_end - current_n_start
-
-      next_n = max(current_n - nbw, 0)
-      call determine_workload(next_n, nbw, np_rows, limits)
-      next_n_start = limits(my_prow)
-      next_n_end   = limits(my_prow+1)
-      next_local_n = next_n_end - next_n_start
-
-      if (next_n_end < next_n) then
-        bottom_msg_length = current_n_end - next_n_end
-      else
-        bottom_msg_length = 0
-      endif
-
-      if (next_local_n > 0) then
-        next_top_msg_length = current_n_start - next_n_start
-      else
-        next_top_msg_length = 0
-      endif
-
-#ifdef WITH_GPU_VERSION
-      ! t2_cpu_code = MPI_Wtime()
-      ! t0_cpu_code = t0_cpu_code + (t2_cpu_code - t1_cpu_code)
-#endif
-
-      if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
-        do i = 1, stripe_count
-#ifdef WITH_OPENMP
-          if (useGPU) then
-            print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-            stop
-          endif
-
-          csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
-          b_len = csw*nbw*max_threads
-
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-          call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
-                         mpi_comm_rows, bottom_recv_request(i), mpierr)
-#else
-          call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
-                         mpi_comm_rows, bottom_recv_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-! careful: the "receive" has to be done at the corresponding wait or send
-!          bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-          call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
-                         mpi_comm_rows, bottom_recv_request(i), mpierr)
-#else
-          call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
-                         mpi_comm_rows, bottom_recv_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-! careful: the "receive" has to be done at the corresponding wait or send
-!          bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
-#endif /* WITH_MPI */
-
-#endif /* WITH_OPENMP */
-
-        enddo
-      endif
-
-#ifdef WITH_GPU_VERSION
-      ! t1_block_time = MPI_Wtime()
-#endif
-      if (current_local_n > 1) then
-        if (my_pcol == mod(sweep,np_cols)) then
-          bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n)
-          current_tv_off = current_tv_off + current_local_n
-        endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-        call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr)
-#else
-        call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX8, mod(sweep,np_cols), mpi_comm_cols, mpierr)
-#endif
-
-#endif /* WITH_MPI */
-        if (useGPU) then
-          successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * &
-                                    current_local_n * size_of_complex_datatype, &
-                                    cudaMemcpyHostToDevice)
-          call extract_hh_tau_complex_gpu(nbw, current_local_n, .false.)
-          call compute_hh_dot_products_complex_gpu(nbw, current_local_n)
-        endif
-      else
-        ! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex
-        bcast_buffer(:,1) = 0
-
-        if (useGPU) then
-          successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_complex_datatype)
-          if (.not.(successCUDA)) then
-            print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
-            stop
-          endif
-
-          call extract_hh_tau_complex_gpu(nbw, 1, .true.)
-
-          !NOTE(ca): I commented out the following line
-          ! istat = cuda_memcpy(loc(bcast_buffer(1,1)), bcast_buffer_dev, nbw*current_local_n * size_of_complex_datatype, &
-          !                     cudaMemcpyDeviceToHost)
-          ! if (istat .ne. 0) then
-          !   print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
-          !   stop
-          ! endif
-
-        endif ! useGPU
-      endif
-
-#ifdef WITH_GPU_VERSION
-      ! t2_block_time = MPI_Wtime()
-      ! t0_block_time = t0_block_time + ( t2_block_time - t1_block_time)
-#endif
-
-      if (l_nev == 0) cycle
-
-      if (current_local_n > 0) then
-#ifdef WITH_GPU_VERSION
-        ! t1_inner_do_time = MPI_Wtime()
-#endif
-
-        do i = 1, stripe_count
-
-#ifdef WITH_OPENMP
-          if (useGPU) then
-            print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-            stop
-          endif
-          ! Get real stripe width for strip i;
-          ! The last OpenMP tasks may have an even smaller stripe width,
-          ! but we don't care about this, i.e. we send/recv a bit too much in this case.
-          ! csw: current_stripe_width
-
-          csw = min(stripe_width, thread_width-(i-1)*stripe_width)
-#endif /* WITH_OPENMP */
-
-          !wait_b
-          if (current_n_end < current_n) then
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr)
-#endif
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_GPU_VERSION
-            ! t1_mpi_wait_time = MPI_Wtime()
-#endif
-
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
-#endif
-
-#ifdef WITH_GPU_VERSION
-            ! t2_mpi_wait_time = MPI_Wtime()
-            ! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
-            !
-            ! t1_memcpy_time = MPI_Wtime()
-#endif
-
-#endif /* WITH_OPENMP */
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
-            do my_thread = 1, max_threads
-              n_off = current_local_n+a_off
-              b_len = csw*nbw
-              b_off = (my_thread-1)*b_len
-              a(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
-                reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-
-            n_off = current_local_n+a_off
-            if (useGPU) then
-!             t1_memcpy_time = MPI_Wtime()
-              dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
-              successCUDA = cuda_memcpy( a_dev + dev_offset, loc(bottom_border_recv_buffer(1,1,i)), &
-                                         stripe_width*nbw*size_of_complex_datatype, cudaMemcpyHostToDevice)
-              if (.not.(successCUDA)) then
-                print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
-                stop
-              endif
-
-!             t2_memcpy_time = MPI_Wtime()
-!             t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
-            else
-              a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
-            endif
-
-#endif /* WITH_OPENMP */
-
-            if (next_n_end < next_n) then
-
-#ifdef WITH_OPENMP
-              if (useGPU) then
-                print *,"not yet implemented"
-                stop
-              endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
-                             MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
-                             mpi_comm_rows, bottom_recv_request(i), mpierr)
-#else
-              call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
-                             MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
-                             mpi_comm_rows, bottom_recv_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-! careful: the "receive" has to be done at the corresponding wait or send
-!              bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i)
-#endif
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
-                             mpi_comm_rows, bottom_recv_request(i), mpierr)
-#else
-              call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow+1, bottom_recv_tag, &
-                             mpi_comm_rows, bottom_recv_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-! careful: the "receive" has to be done at the corresponding wait or send
-!              bottom_border_recv_buffer(1:nbw*stripe_width,1,i) = top_border_send_buffer(1:nbw*stripe_width,1,i)
-#endif /* WITH_MPI */
-
-#endif /* WITH_OPENMP */
-            endif
-          endif
-
-          if (current_local_n <= bottom_msg_length + top_msg_length) then
-
-            !wait_t
-            if (top_msg_length>0) then
-#ifdef WITH_OPENMP
-              if (useGPU) then
-                print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-                stop
-              endif
-#ifdef WITH_MPI
-              call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
-#endif
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_GPU_VERSION
-              ! t1_mpi_wait_time = MPI_Wtime()
-#endif
-
-#ifdef WITH_MPI
-              call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
-#endif
-              if (useGPU) then
-                ! t2_mpi_wait_time = MPI_Wtime()
-                ! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
-                ! t1_memcpy_time = MPI_Wtime()
-                !
-                dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
-!               host_offset = (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw )) * 16
-                successCUDA = cuda_memcpy( a_dev+dev_offset, loc(top_border_recv_buffer(1,1,i)), &
-                                           stripe_width*top_msg_length*size_of_complex_datatype, cudaMemcpyHostToDevice)
-                if (.not.(successCUDA)) then
-                  print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
-                  stop
-                endif
-
-!               t2_memcpy_time = MPI_Wtime()
-!               t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
-              else
-                a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
-              endif ! useGPU
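The dev_offset expressions above address the flat device copy of the stripe array a(stripe_width, a_dim2, stripe_count) in column-major order: a column offset advances by stripe_width elements, a stripe by stripe_width*a_dim2 elements. A self-contained check of the indexing, with illustrative sizes:

    program offset_demo
      implicit none
      integer, parameter :: stripe_width = 256, a_dim2 = 1024
      integer, parameter :: size_of_complex = 16   ! bytes per double complex
      integer(kind=8) :: dev_offset
      integer :: i, a_off
      i = 2                                        ! stripe index
      a_off = 7                                    ! column offset within the stripe
      ! byte offset of element (1, a_off+1, i), as in the code above
      dev_offset = ( int(a_off,8)*stripe_width + int(i-1,8)*stripe_width*a_dim2 ) * size_of_complex
      print *, dev_offset                          ! -> 4222976
    end program offset_demo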
-
-#endif /* WITH_OPENMP */
-            endif
-
-            !compute
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
-            do my_thread = 1, max_threads
-              if (top_msg_length>0) then
-                b_len = csw*top_msg_length
-                b_off = (my_thread-1)*b_len
-                a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
-                  reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
-              endif
-              call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
-                                                       a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
-                                                       0, current_local_n, i, my_thread, thread_width, &
-                                                       THIS_COMPLEX_ELPA_KERNEL)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-            if (useGPU) then
-              call compute_hh_trafo_complex_gpu(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
-!             call compute_hh_trafo_complex_gpu(0, current_local_n, i)
-            else
-              call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, &
-                                                a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
-                                                0, current_local_n, i, last_stripe_width, &
-                                                THIS_COMPLEX_ELPA_KERNEL)
-            endif
-
-#endif /* WITH_OPENMP */
-
-            !send_b
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
-#endif
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_GPU_VERSION
-!           t1_mpi_wait_time = MPI_Wtime()
-#endif
-
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
-#endif
-
-#ifdef WITH_GPU_VERSION
-!           t2_mpi_wait_time = MPI_Wtime()
-!           t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
-#endif
-
-#endif /* WITH_OPENMP */
-
-            if (bottom_msg_length>0) then
-              n_off = current_local_n+nbw-bottom_msg_length+a_off
-
-#ifdef WITH_OPENMP
-              if (useGPU) then
-                print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-                stop
-              endif
-
-              b_len = csw*bottom_msg_length*max_threads
-              bottom_border_send_buffer(1:b_len,i) = &
-                reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#else
-              call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-              if (next_top_msg_length > 0) then
-                top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
-                  bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
-              endif
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-              if (useGPU) then
-!               t1_memcpy_time = MPI_Wtime()
-                dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
-                successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
-                                           stripe_width * bottom_msg_length * size_of_complex_datatype, cudaMemcpyDeviceToHost)
-                if (.not.(successCUDA)) then
-                  print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
-                  stop
-                endif
-
-!               t2_memcpy_time = MPI_Wtime()
-!               t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
-              else
-                bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
-              endif
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#else
-              call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-              if (next_top_msg_length > 0) then
-                top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = &
-                  bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i)
-              endif
-
-#endif /* WITH_MPI */
-
-#endif /* WITH_OPENMP */
-            endif
-
-          else
-
-            !compute
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
-            do my_thread = 1, max_threads
-              call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
-                                                       a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
-                                                       current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
-                                                       thread_width, THIS_COMPLEX_ELPA_KERNEL)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-
-            if (useGPU) then
-              call compute_hh_trafo_complex_gpu(current_local_n - bottom_msg_length, bottom_msg_length, i, a_off, &
-                                                dev_offset, dev_offset_1, dev_offset_2)
-!             call compute_hh_trafo_complex_gpu(current_local_n - bottom_msg_length, bottom_msg_length, i)
-            else
-              call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, &
-                                                a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
-                                                current_local_n - bottom_msg_length, bottom_msg_length, i, &
-                                                last_stripe_width, THIS_COMPLEX_ELPA_KERNEL)
-
-            endif
-
-#endif /* WITH_OPENMP */
-
-            !send_b
-
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
-#endif
-
-#else /* WITH_OPENMP */
-
-#ifdef WITH_GPU_VERSION
-!           t1_mpi_wait_time = MPI_Wtime()
-#endif
-
-#ifdef WITH_MPI
-            call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
-#endif
-
-#ifdef WITH_GPU_VERSION
-!           t2_mpi_wait_time = MPI_Wtime()
-!           t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
-#endif
-
-#endif /* WITH_OPENMP */
-            if (bottom_msg_length > 0) then
-              n_off = current_local_n+nbw-bottom_msg_length+a_off
-#ifdef WITH_OPENMP
-              if (useGPU) then
-                print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-                stop
-              endif
-
-              b_len = csw*bottom_msg_length*max_threads
-              bottom_border_send_buffer(1:b_len,i) = &
-                reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#else
-              call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX8, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-              if (next_top_msg_length > 0) then
-                top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = &
-                  bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i)
-              endif
-#endif /* WITH_MPI */
-
-#else /* WITH_OPENMP */
-
-              if (useGPU) then
-!               t1_memcpy_time = MPI_Wtime()
-                dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
-                successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
-                                           stripe_width * bottom_msg_length * size_of_complex_datatype, cudaMemcpyDeviceToHost)
-                if (.not.(successCUDA)) then
-                  print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
-                  stop
-                endif
-
-!               t2_memcpy_time = MPI_Wtime()
-!               t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
-              else
-                bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
-              endif
-
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_COMPLEX
-              call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#else
-              call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX8, my_prow+1, &
-                             top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
-#endif
-
-#else /* WITH_MPI */
-              if (next_top_msg_length > 0) then
-                top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = &
-                  bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i)
-              endif
-
-#endif /* WITH_MPI */
-
-#endif /* WITH_OPENMP */
-            endif
-
-            !compute
-#ifdef WITH_OPENMP
-            if (useGPU) then
-              print *,"trans_ev_tridi_to_band_complex: not yet implemented"
-              stop
-            endif
-
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%start("OpenMP parallel")
-#endif
-
-!$omp parallel do private(my_thread), schedule(static, 1)
-            do my_thread = 1, max_threads
-              call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
-                                                       a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, &
-                                                       kernel_time, top_msg_length, &
-                                                       current_local_n-top_msg_length-bottom_msg_length, i, &
-                                                       my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
-            enddo
-!$omp end parallel do
-#ifdef HAVE_DETAILED_TIMINGS
-            call timer%stop("OpenMP parallel")
-#endif
-
-#else /* WITH_OPENMP */
-            if (useGPU) then
call compute_hh_trafo_complex_gpu(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i) - - call compute_hh_trafo_complex_gpu(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i, a_off, & - dev_offset, dev_offset_1, dev_offset_2) - else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & - last_stripe_width, THIS_COMPLEX_ELPA_KERNEL) - endif -#endif /* WITH_OPENMP */ - - !wait_t - if (top_msg_length>0) then -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), mpi_status, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_GPU_VERSION -! t1_mpi_wait_time =MPI_Wtime() -#endif - -#ifdef WITH_MPI - call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - if (useGPU) then -! t2_mpi_wait_time =MPI_Wtime() -! t0_mpi_wait_time = t0_mpi_wait_time +(t2_mpi_wait_time-t1_mpi_wait_time) -! -! t1_memcpy_time =MPI_Wtime() - dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *size_of_complex_datatype - successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), & - stripe_width * top_msg_length *size_of_complex_datatype ,cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" - stop - endif - -! -! t2_memcpy_time =MPI_Wtime() -! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time-t1_memcpy_time) - else - a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i) - endif - -#endif /* WITH_OPENMP */ - endif - - !compute -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) - do my_thread = 1, max_threads - if (top_msg_length>0) then - b_len = csw*top_msg_length - b_off = (my_thread-1)*b_len - a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & - reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) - endif - call compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, my_thread, thread_width, & - THIS_COMPLEX_ELPA_KERNEL) - enddo -!$omp end parallel do -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /* WITH_OPENMP */ - if (useGPU) then - call compute_hh_trafo_complex_gpu(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2) -! call compute_hh_trafo_complex_gpu(0, top_msg_length, i) - else - call compute_hh_trafo_complex_cpu(a, stripe_width, a_dim2, stripe_count, & - a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, & - 0, top_msg_length, i, last_stripe_width, & - THIS_COMPLEX_ELPA_KERNEL) - endif - -#endif /* WITH_OPENMP */ - endif - - if (next_top_msg_length > 0) then - !request top_border data -#ifdef WITH_OPENMP - b_len = csw*next_top_msg_length*max_threads -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! 
carefull the "recieve" has to be done at the corresponding send or wait -! top_border_recv_buffer(1:csw*next_top_msg_length*max_threads,i) = bottom_border_send_buffer(1:csw*next_top_msg_length*max_threads,i) - -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#else - call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX8, my_prow-1, & - top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr) -#endif - -#else /* WITH_MPI */ -! carefull the "recieve" has to be done at the corresponding send or wait -! top_border_recv_buffer(1:next_top_msg_length*stripe_width,1,i) = & -! bottom_border_send_buffer(1:bottom_msg_length*stripe_width,1,i) -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - endif - - !send_t - if (my_prow > 0) then -#ifdef WITH_OPENMP - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#endif - -#else /* WITH_OPENMP */ - -#ifdef WITH_GPU_VERSION -! t1_mpi_wait_time =MPI_Wtime() -#endif - -#ifdef WITH_MPI - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#ifdef WITH_GPU_VERSION -! t2_mpi_wait_time =MPI_Wtime() -! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time) -#endif - -#endif /* WITH_OPENMP */ - -#ifdef WITH_OPENMP - b_len = csw*nbw*max_threads - top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /)) -#ifdef WITH_MPI - -#ifdef WITH_DOUBLE_PRECISION_COMPLEX - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX8, & - my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:csw*nbw*max_threads,i) = top_border_send_buffer(1:csw*nbw*max_threads,i) - endif -#endif /* WITH_MPI */ - -#else /* WITH_OPENMP */ - - if (useGPU) then - ! t1_memcpy_time =MPI_Wtime() - dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_complex_datatype - successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, & - stripe_width*nbw*size_of_complex_datatype ,cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" - stop - endif - -! t2_memcpy_time =MPI_Wtime() -! t0_memcpy_time = t0_memcpy_time + (t2_memcpy_time-t1_memcpy_time) -! - else - top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i) - endif -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#else - call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX8, my_prow-1, bottom_recv_tag, & - mpi_comm_rows, top_send_request(i), mpierr) -#endif - -#else /* WITH_MPI */ - if (sweep==0 .and. current_n_end < current_n .and. 
l_nev > 0) then - bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) - endif - if (next_n_end < next_n) then - bottom_border_recv_buffer(1:nbw,1:stripe_width,i) = top_border_send_buffer(1:nbw,1:stripe_width,i) - endif -#endif /* WITH_MPI */ - -#endif /* WITH_OPENMP */ - endif - - ! Care that there are not too many outstanding top_recv_request's -#ifdef WITH_GPU_VERSION -! t1_mpi_wait_time =MPI_Wtime() -#endif - if (stripe_count > 1) then - if (i>1) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - else -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr) -#else - call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - endif -#ifdef WITH_GPU_VERSION -! t2_mpi_wait_time =MPI_Wtime() -! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time) -#endif - enddo - -#ifdef WITH_GPU_VERSION -! t2_inner_do_time =MPI_Wtime() -! t0_inner_do_time = t0_inner_do_time + ( t2_inner_do_time - t1_inner_do_time) -#endif - - top_msg_length = next_top_msg_length - - else - ! wait for last top_send_request -#ifdef WITH_GPU_VERSION -! t1_mpi_outer_wait_time =MPI_Wtime() -#endif - - do i = 1, stripe_count -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(top_send_request(i), mpi_status, mpierr) -#else - call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - enddo -#ifdef WITH_GPU_VERSION -! t2_mpi_outer_wait_time =MPI_Wtime() -! t0_mpi_outer_wait_time =t0_mpi_outer_wait_time+(t2_mpi_outer_wait_time-t1_mpi_outer_wait_time) -#endif - endif -#ifdef WITH_GPU_VERSION -! t0_result_time = MPI_Wtime() -#endif - - ! Care about the result - - if (my_prow == 0) then - - ! topmost process sends nbw rows to destination processes - - do j=0,nfact-1 - - num_blk = sweep*nfact+j ! global number of destination block, 0 based - if (num_blk*nblk >= na) exit - - nbuf = mod(num_blk, num_result_buffers) + 1 ! 
buffer number to get this block - -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr) -#else - call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - - dst = mod(num_blk, np_rows) - - if (dst == 0) then - if (useGPU) then - row_group_size = min(na - num_blk*nblk, nblk) - call pack_row_group_complex_gpu(row_group(:, :), j * nblk + a_off,row_group_size) - do i = 1, row_group_size - q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i) - enddo - else - do i = 1, min(na - num_blk*nblk, nblk) -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, row, j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count) -#endif - q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:) - enddo - endif - else - if (useGPU) then - call pack_row_group_complex_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk) - else - do i = 1, nblk -#ifdef WITH_OPENMP - call pack_row_complex_cpu_openmp(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, & - stripe_width, stripe_count, max_threads, thread_width, l_nev) -#else - call pack_row_complex_cpu(a, result_buffer(:,i,nbuf),j*nblk+i+a_off, stripe_width, & - last_stripe_width, stripe_count) -#endif - enddo - endif - -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#else - call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, dst, & - result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr) -#endif -#else /* WITH_MPI */ - if (j+num_result_buffers < num_result_blocks) & - result_buffer(1:l_nev,1:nblk,nbuf) = result_buffer(1:l_nev,1:nblk,nbuf) - if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends - do j1 = 1, min(num_result_buffers, num_result_blocks) - result_buffer(1:l_nev,1:nblk,j1) = result_buffer(1:l_nev,1:nblk,nbuf) - enddo - endif -#endif /* WITH_MPI */ - endif - enddo - - else - - ! receive and store final result - - do j = num_bufs_recvd, num_result_blocks-1 - - nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block - - ! If there is still work to do, just test for the next result request - ! and leave the loop if it is not ready, otherwise wait for all - ! outstanding requests - - if (next_local_n > 0) then -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr) - -#else - call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - flag = .true. -#endif /* WITH_MPI */ - if (.not.flag) exit - else -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr) -#else - call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr) -#endif - -#else /* WITH_MPI */ - -#endif /* WITH_MPI */ - - endif - - ! Fill result buffer into q - num_blk = j*np_rows + my_prow ! global number of current block, 0 based - do i = 1, min(na - num_blk*nblk, nblk) - q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf) - enddo - - ! 
Queue result buffer again if there are outstanding blocks left -#ifdef WITH_MPI - if (j+num_result_buffers < num_result_blocks) & - -#ifdef DOUBLE_PRECISION_COMPLEX - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#else - call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX8, 0, result_recv_tag, & - mpi_comm_rows, result_recv_request(nbuf), mpierr) -#endif - -#else /* WITH_MPI */ -! carefull "recieve" has to be done at corresponding wait or send -! if (j+num_result_buffers < num_result_blocks) & -! result_buffer(1:l_nev*nblk,1,nbuf) = result_buffer(1:l_nev*nblk,1,nbuf) -#endif /* WITH_MPI */ - enddo - num_bufs_recvd = j - - endif - -#ifdef WITH_GPU_VERSION -! t2_result_time =MPI_Wtime() -! t0_result_time = t0_result_time + ( t2_result_time - t1_result_time) -#endif - ! Shift the remaining rows to the front of A (if necessary) - - offset = nbw - top_msg_length - - if (offset<0) then - if (wantDebug) then - write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset - endif - success = .false. - return - endif - - a_off = a_off + offset - if (a_off + next_local_n + nbw > a_dim2) then -#ifdef WITH_OPENMP - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("OpenMP parallel") -#endif - -!$omp parallel do private(my_thread, i, j), schedule(static, 1) - do my_thread = 1, max_threads - do i = 1, stripe_count - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread) - enddo - enddo - enddo -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("OpenMP parallel") -#endif - -#else /*WITH_OPENMP */ - do i = 1, stripe_count - if (useGPU) then - chunk = min(next_local_n - 1, a_off) - do j = top_msg_length + 1, top_msg_length + next_local_n, chunk - top = min(j + chunk, top_msg_length + next_local_n) - this_chunk = top - j + 1 - dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype - dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_complex_datatype - ! it is not logical to set here always the parameter "cudaMemcpyDeviceToDevice" do this ONCE at startup - ! tmp = cuda_d2d(1) - successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, & - stripe_width*this_chunk*size_of_complex_datatype, cudaMemcpyDeviceToDevice) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy" - stop - endif - - enddo - else ! useGPU - do j = top_msg_length+1, top_msg_length+next_local_n - A(:,j,i) = A(:,j+a_off,i) - enddo - endif - enddo -#endif /*WITH_OPENMP */ - a_off = 0 - endif - enddo - -#ifdef WITH_GPU_VERSION -! t2_outer_do_time =MPI_Wtime() -! t0_outer_do_time = t0_outer_do_time + ( t2_outer_do_time - t1_outer_do_time) -! -! istat = cuda_ProfilerStop() -#endif - - ! 
Just for safety: -#ifdef WITH_MPI - if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol - if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol - if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol - if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol -#endif - - if (my_prow == 0) then - -#ifdef WITH_MPI - -#ifdef WITH_OPENMP - allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error allocating mpi_statuses "//errorMessage - stop - endif - - call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr) - deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating mpi_statuses "//errorMessage - stop - endif -#else - call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - -#ifdef WITH_MPI - if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol - if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol -#endif - if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) & - write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6 - - ! deallocate all working space - - if (.not.(useGPU)) then - deallocate(a, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage - stop - endif - endif - - deallocate(row, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating row "//errorMessage - stop - endif - - deallocate(limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating limits "//errorMessage - stop - endif - - deallocate(result_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating result_send_request "//errorMessage - stop - endif - - deallocate(result_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating result_recv_request "//errorMessage - stop - endif - - deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage - stop - endif - - deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_recv_buffer "//errorMessage - stop - endif - - deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage - stop - endif - - deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_border_recv_buffer "//errorMessage - stop - endif - - deallocate(result_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating result_buffer "//errorMessage - stop - endif - - deallocate(bcast_buffer, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating bcast_buffer "//errorMessage - stop - endif - - deallocate(top_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating top_send_request "//errorMessage - stop - endif - - deallocate(top_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating top_recv_request "//errorMessage - stop - endif - - deallocate(bottom_send_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_send_request "//errorMessage - stop - endif - - deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_recv_request "//errorMessage - stop - endif - - if (useGPU) then - successCUDA = cuda_free(a_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(hh_tau_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(hh_dot_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(row_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - deallocate(row_group, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"trans_ev_tridi_to_band_complex: error deallocating row_group "//errorMessage - stop - endif - - successCUDA= cuda_free(row_group_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - successCUDA = cuda_free(bcast_buffer_dev) - if (.not.(successCUDA)) then - print *,"trans_ev_tridi_to_band_complex: error in cudaFree" - stop - endif - - endif ! useGPU -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("trans_ev_tridi_to_band_complex") -#endif - return - contains - - ! The host wrapper for extracting "tau" from the HH reflectors (see the - ! kernel below) - subroutine extract_hh_tau_complex_gpu(nbw, n, is_zero) - use cuda_c_kernel - use precision - implicit none - integer(kind=ik), value :: nbw, n - logical, value :: is_zero - integer(kind=ik) :: val_is_zero - - if (is_zero) then - val_is_zero = 1 - else - val_is_zero = 0 - endif - call launch_extract_hh_tau_c_kernel_complex(bcast_buffer_dev,hh_tau_dev, nbw, n,val_is_zero) - end subroutine - - subroutine compute_hh_dot_products_complex_gpu(nbw, n) - use cuda_c_kernel - use precision - implicit none - integer(kind=ik), value :: nbw, n - - if (n .le. 
1) return - call launch_compute_hh_dotp_c_kernel_complex( bcast_buffer_dev, hh_dot_dev, nbw,n) - end subroutine - - subroutine pack_row_group_complex_gpu(rows, n_offset, row_count) - use cuda_c_kernel - use precision - implicit none - integer(kind=ik), intent(in) :: n_offset, row_count - complex(kind=ck) :: rows(:,:) - integer(kind=ik) :: max_idx - logical :: successCUDA - - max_idx = (stripe_count - 1) * stripe_width + last_stripe_width - call launch_my_pack_c_kernel_complex(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, & - l_nev, a_dev, row_group_dev) - successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_complex_datatype, & - cudaMemcpyDeviceToHost) - if (.not.(successCUDA)) then - print *,"pack_row_group_complex_gpu: error in cudaMemcpy" - stop - endif - - end subroutine - - subroutine unpack_row_group_complex_gpu(rows, n_offset, row_count) - use cuda_c_kernel - use precision - implicit none - integer(kind=ik), intent(in) :: n_offset, row_count - complex(kind=ck), intent(in) :: rows(:, :) - integer(kind=ik) :: max_idx - integer(kind=ik) :: i - logical :: successCUDA - - max_idx = (stripe_count - 1) * stripe_width + last_stripe_width - successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_complex_datatype , & - cudaMemcpyHostToDevice) - if (.not.(successCUDA)) then - print *,"unpack_row_group_complex_gpu: error in cudaMemcpy" - stop - endif - - call launch_my_unpack_c_kernel_complex( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, & - row_group_dev,a_dev) - end subroutine - - subroutine unpack_and_prepare_row_group_complex_gpu(next_unpack_idx, force) - - use precision - implicit none - integer(kind=ik), intent(in) :: next_unpack_idx - logical, intent(in) :: force - - if (row_group_size == 0) then - ! Nothing to flush, just prepare for the upcoming row - row_group_size = 1 - else - if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then - ! A flush and a reset must performed - call unpack_row_group_complex_gpu(row_group(:, :), unpack_idx - row_group_size, row_group_size) - row_group_size = 1 - else - ! Just prepare for the upcoming row - row_group_size = row_group_size + 1 - endif - endif - ! Always update the index for the upcoming row - unpack_idx = next_unpack_idx - - end subroutine - - subroutine compute_hh_trafo_complex_gpu(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2) - - use iso_c_binding - use cuda_c_kernel - use precision - implicit none - integer(kind=ik), intent(in) :: off, ncols, istripe - integer(kind=ik) :: nl - real(kind=c_double) :: ttt ! MPI_WTIME always needs double - - integer(kind=ik) :: a_off - integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2 - - if (ncols < 1) return - ttt = mpi_wtime() - nl = merge(stripe_width, last_stripe_width, istripe < stripe_count) - - dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) *size_of_complex_datatype - dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_complex_datatype - dev_offset_2 =( off-1 )*size_of_complex_datatype - -! t1_compute_kernel =MPI_Wtime() - call launch_compute_hh_trafo_c_kernel_complex(a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, & - hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols) - -! time0 = time0 + time1 -! t2_compute_kernel =MPI_Wtime() -! 
t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel - - kernel_flops = kernel_flops + 4 * int(nl, 8) * int(ncols, 8) * int(nbw,8) - kernel_time = kernel_time + mpi_wtime() - ttt - n_times =n_times +1 - end subroutine compute_hh_trafo_complex_gpu + public :: bandred_real_double + public :: tridiag_band_real_double + public :: trans_ev_tridi_to_band_real_double + public :: trans_ev_band_to_full_real_double + +#ifdef WANT_SINGLE_PRECISION_REAL + public :: bandred_real_single + public :: tridiag_band_real_single + public :: trans_ev_tridi_to_band_real_single + public :: trans_ev_band_to_full_real_single +#endif + + public :: bandred_complex_double + public :: tridiag_band_complex_double + public :: trans_ev_tridi_to_band_complex_double + public :: trans_ev_band_to_full_complex_double + +#ifdef WANT_SINGLE_PRECISION_COMPLEX + public :: bandred_complex_single + public :: tridiag_band_complex_single + public :: trans_ev_tridi_to_band_complex_single + public :: trans_ev_band_to_full_complex_single +#endif + public :: band_band_real_double + public :: divide_band -end subroutine + integer(kind=ik), public :: which_qr_decomposition = 1 ! defines, which QR-decomposition algorithm will be used + ! 0 for unblocked + ! 1 for blocked (maxrank: nblk) + contains -#ifdef DOUBLE_PRECISION_REAL +! real double precision first +#define DOUBLE_PRECISION_REAL 1 -#define DATATYPE REAL(kind=rk) +#define REAL_DATATYPE rk8 #define BYTESIZE 8 #define REALCASE 1 #include "redist_band.X90" -#undef DATATYPE +#undef DOUBLE_PRECISION_REAL +#undef REAL_DATATYPE #undef BYTESIZE #undef REALCASE -#else /* DOUBLE_PRECISION_REAL */ +! single precision +#ifdef WANT_SINGLE_PRECISION_REAL -#define DATATYPE REAL(kind=rk) +#undef DOUBLE_PRECISION_REAL +#define REAL_DATATYPE rk4 #define BYTESIZE 4 #define REALCASE 1 #include "redist_band.X90" -#undef DATATYPE +#undef REAL_DATATYPE #undef BYTESIZE #undef REALCASE -#endif /* DOUBLE_PRECISION_REAL */ +#endif -#ifdef DOUBLE_PRECISION_COMPLEX +! double precision +#define DOUBLE_PRECISION_COMPLEX 1 -#define DATATYPE COMPLEX(kind=ck) +#define COMPLEX_DATATYPE ck8 #define BYTESIZE 16 #define COMPLEXCASE 1 #include "redist_band.X90" -#undef DATATYPE +#undef COMPLEX_DATATYPE #undef BYTESIZE #undef COMPLEXCASE +#undef DOUBLE_PRECISION_COMPLEX -#else +#ifdef WANT_SINGLE_PRECISION_COMPLEX -#define DATATYPE COMPLEX(kind=ck) -#define BYTESIZE 8 +#undef DOUBLE_PRECISION_COMPLEX +#undef DOUBLE_PRECISION_REAL +#define COMPLEX_DATATYPE ck4 #define COMPLEXCASE 1 #include "redist_band.X90" -#undef DATATYPE +#undef COMPLEX_DATATYPE #undef BYTESIZE #undef COMPLEXCASE -#endif - !--------------------------------------------------------------------------------------------------- - ! divide_band: sets the work distribution in band - ! Proc n works on blocks block_limits(n)+1 .. block_limits(n+1) - - subroutine divide_band(nblocks_total, n_pes, block_limits) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: nblocks_total ! total number of blocks in band - integer(kind=ik), intent(in) :: n_pes ! number of PEs for division - integer(kind=ik), intent(out) :: block_limits(0:n_pes) - - integer(kind=ik) :: n, nblocks, nblocks_left - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("divide_band") -#endif - - block_limits(0) = 0 - if (nblocks_total < n_pes) then - ! Not enough work for all: The first tasks get exactly 1 block - do n=1,n_pes - block_limits(n) = min(nblocks_total,n) - enddo - else - ! Enough work for all. 
If there is no exact loadbalance, - ! the LAST tasks get more work since they are finishing earlier! - nblocks = nblocks_total/n_pes - nblocks_left = nblocks_total - n_pes*nblocks - do n=1,n_pes - if (n<=n_pes-nblocks_left) then - block_limits(n) = block_limits(n-1) + nblocks - else - block_limits(n) = block_limits(n-1) + nblocks + 1 - endif - enddo - endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("divide_band") -#endif - - end subroutine - - subroutine band_band_real(na, nb, nbCol, nb2, nb2Col, ab, ab2, d, e, mpi_comm) - - !------------------------------------------------------------------------------- - ! band_band_real: - ! Reduces a real symmetric banded matrix to a real symmetric matrix with smaller bandwidth. Householder transformations are not stored. - ! Matrix size na and original bandwidth nb have to be a multiple of the target bandwidth nb2. (Hint: expand your matrix with - ! zero entries, if this - ! requirement doesn't hold) - ! - ! na Order of matrix - ! - ! nb Semi bandwidth of original matrix - ! - ! nb2 Semi bandwidth of target matrix - ! - ! ab Input matrix with bandwidth nb. The leading dimension of the banded matrix has to be 2*nb. The parallel data layout - ! has to be accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb+1 to min(na, block_limits(n+1)*nb) - ! are located on rank n. - ! - ! ab2 Output matrix with bandwidth nb2. The leading dimension of the banded matrix is 2*nb2. The parallel data layout is - ! accordant to divide_band(), i.e. the matrix columns block_limits(n)*nb2+1 to min(na, block_limits(n+1)*nb2) are located - ! on rank n. - ! - ! d(na) Diagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) - ! - ! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0, set only if ab2 = 1 (output) - ! - ! mpi_comm - ! MPI-Communicator for the total processor set - !------------------------------------------------------------------------------- -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - - integer(kind=ik), intent(in) :: na, nb, nbCol, nb2, nb2Col, mpi_comm - real(kind=rk), intent(inout) :: ab(2*nb,nbCol) ! removed assumed size - real(kind=rk), intent(inout) :: ab2(2*nb2,nb2Col) ! removed assumed size - real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 - - real(kind=rk) :: hv(nb,nb2), w(nb,nb2), w_new(nb,nb2), tau(nb2), hv_new(nb,nb2), & - tau_new(nb2), ab_s(1+nb,nb2), ab_r(1+nb,nb2), ab_s2(2*nb2,nb2), hv_s(nb,nb2) - - real(kind=rk) :: work(nb*nb2), work2(nb2*nb2) - integer(kind=ik) :: lwork, info - - integer(kind=ik) :: istep, i, n, dest - integer(kind=ik) :: n_off, na_s - integer(kind=ik) :: my_pe, n_pes, mpierr - integer(kind=ik) :: nblocks_total, nblocks - integer(kind=ik) :: nblocks_total2, nblocks2 - integer(kind=ik) :: ireq_ab, ireq_hv -#ifdef WITH_MPI - integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE) -#endif - integer(kind=ik), allocatable :: mpi_statuses(:,:) - integer(kind=ik), allocatable :: block_limits(:), block_limits2(:), ireq_ab2(:) - - integer(kind=ik) :: j, nc, nr, ns, ne, iblk - integer(kind=ik) :: istat - character(200) :: errorMessage - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("band_band_real") -#endif - if (na .lt. 2*nb) then - print *,"na lt 2*nb ",na,2*nb - stop - endif - if (na .lt. 2*nb2) then - print *,"na lt 2*nb2 ",na,2*nb2 - stop - endif - if (na .lt. nbCol) then - print *,"na lt nbCol ",na,nbCol - stop - endif - if (na .lt. 
nb2Col) then - print *,"na lt nb2Col ",na,nb2Col - stop - endif - - call mpi_comm_rank(mpi_comm,my_pe,mpierr) - call mpi_comm_size(mpi_comm,n_pes,mpierr) - ! Total number of blocks in the band: - nblocks_total = (na-1)/nb + 1 - nblocks_total2 = (na-1)/nb2 + 1 - - ! Set work distribution - allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error allocating block_limits "//errorMessage - stop - endif - call divide_band(nblocks_total, n_pes, block_limits) - - allocate(block_limits2(0:n_pes), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error allocating block_limits2 "//errorMessage - stop - endif - - call divide_band(nblocks_total2, n_pes, block_limits2) - - ! nblocks: the number of blocks for my task - nblocks = block_limits(my_pe+1) - block_limits(my_pe) - nblocks2 = block_limits2(my_pe+1) - block_limits2(my_pe) - - allocate(ireq_ab2(1:nblocks2), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error allocating ireq_ab2 "//errorMessage - stop - endif - -#ifdef WITH_MPI - ireq_ab2 = MPI_REQUEST_NULL - - if (nb2>1) then - do i=0,nblocks2-1 - -#ifdef DOUBLE_PRECISION_REAL - call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, mpi_real8, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr) -#else - call mpi_irecv(ab2(1,i*nb2+1), 2*nb2*nb2, mpi_real4, 0, 3, mpi_comm, ireq_ab2(i+1), mpierr) -#endif - enddo - endif -#else /* WITH_MPI */ - ! carefull the "recieve" has to be done at the corresponding send or wait -! if (nb2>1) then -! do i=0,nblocks2-1 -! ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,i*nb2+1:nb2) -! enddo -! endif - -#endif /* WITH_MPI */ - ! n_off: Offset of ab within band - n_off = block_limits(my_pe)*nb - lwork = nb*nb2 - dest = 0 -#ifdef WITH_MPI - ireq_ab = MPI_REQUEST_NULL - ireq_hv = MPI_REQUEST_NULL -#endif - ! --------------------------------------------------------------------------- - ! Start of calculations - - na_s = block_limits(my_pe)*nb + 1 - - if (my_pe>0 .and. na_s<=na) then - ! send first nb2 columns to previous PE - ! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also) - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1) - enddo -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(ab_s, (nb+1)*nb2, mpi_real8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s, (nb+1)*nb2, mpi_real4, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif - -#endif /* WITH_MPI */ - endif - - do istep=1,na/nb2 - - if (my_pe==0) then - - n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced - hv(:,:) = 0 - tau(:) = 0 - - ! The last step (istep=na-1) is only needed for sending the last HH vectors. - ! We don't want the sign of the last element flipped (analogous to the other sweeps) - if (istep < na/nb2) then - - ! 
Transform first block column of remaining matrix
-#ifdef DOUBLE_PRECISION_REAL
- call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
-#else
- call sgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
-#endif
-
- do i=1,nb2
- hv(i,i) = 1.0_rk
- hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
- ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0
- enddo
-
- endif
+

- if (nb2==1) then
- d(istep) = ab(1,na_s-n_off)
- e(istep) = ab(2,na_s-n_off)
- if (istep == na) then
- e(na) = 0
- endif
- else
- ab_s2 = 0
- ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
- if (block_limits2(dest+1)<istep) dest = dest+1
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
- call mpi_send(ab_s2, 2*nb2*nb2, mpi_real8, dest, 3, mpi_comm, mpierr)
-#else
- call mpi_send(ab_s2, 2*nb2*nb2, mpi_real4, dest, 3, mpi_comm, mpierr)
-#endif
-
-#else /* WITH_MPI */
- if (nb2>1) then
- do i= 0,nblocks2-1
- ab2(1:2*nb2*nb2,i*nb2+1:i*nb2+1+nb2-1) = ab_s2(1:2*nb2,1:nb2)
- enddo
- endif
-
-#endif /* WITH_MPI */
-
- endif
-
- else
- if (na>na_s+nb2-1) then
- ! Receive Householder vectors from previous task, from PE owning subdiagonal
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
- call mpi_recv(hv, nb*nb2, mpi_real8, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
-#else
- call mpi_recv(hv, nb*nb2, mpi_real4, my_pe-1, 2, mpi_comm, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
- hv(1:nb,1:nb2) = hv_s(1:nb,1:nb2)
-#endif /* WITH_MPI */
-
- do i=1,nb2
- tau(i) = hv(i,i)
- hv(i,i) = 1.
- enddo
- endif
- endif
-
- na_s = na_s+nb2
- if (na_s-n_off > nb) then
- ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
- ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
- n_off = n_off + nb
- endif
-
- do iblk=1,nblocks
- ns = na_s + (iblk-1)*nb - n_off ! first column in block
- ne = ns+nb-nb2 ! last column in block
-
- if (ns+n_off>na) exit
-
- nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
- nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
- ! Note that nr>=0 implies that diagonal block is full (nc==nb)!
-
- call wy_gen(nc,nb2,w,hv,tau,work,nb)
-
- if (iblk==nblocks .and. nc==nb) then
- !request last nb2 columns
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
- call mpi_recv(ab_r,(nb+1)*nb2, mpi_real8, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
-#else
- call mpi_recv(ab_r,(nb+1)*nb2, mpi_real4, my_pe+1, 1, mpi_comm, mpi_status, mpierr)
-#endif
-
-#else /* WITH_MPI */
- ab_r(1:nb+1,1:nb2) = ab_s(1:nb+1,1:nb2)
-#endif /* WITH_MPI */
- do i=1,nb2
- ab(1:nb+1,ne+i-1) = ab_r(:,i)
- enddo
- endif
-
- hv_new(:,:) = 0 ! Needed, last rows must be 0 for nr < nb
- tau_new(:) = 0
-
- if (nr>0) then
- call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
-#ifdef DOUBLE_PRECISION_REAL
- call dgeqrf(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
-#else
- call sgeqrf(nr, nb2, ab(nb+1,ns), 2*nb-1, tau_new, work, lwork, info)
-#endif
- do i=1,nb2
- hv_new(i,i) = 1.0_rk
- hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
- ab(nb+2:,ns+i-1) = 0
- enddo
-
- !send hh-vector
- if (iblk==nblocks) then
-#ifdef WITH_MPI
- call mpi_wait(ireq_hv,mpi_status,mpierr)
-#endif
- hv_s = hv_new
- do i=1,nb2
- hv_s(i,i) = tau_new(i)
- enddo
-#ifdef WITH_MPI
-
-#ifdef DOUBLE_PRECISION_REAL
- call mpi_isend(hv_s,nb*nb2, mpi_real8, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
-#else
- call mpi_isend(hv_s,nb*nb2, mpi_real4, my_pe+1, 2, mpi_comm, ireq_hv, mpierr)
-#endif
-
-#else /* WITH_MPI */
-
-#endif /* WITH_MPI */
- endif
- endif
-
- call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
-
- if (my_pe>0 .and. 
iblk==1) then - !send first nb2 columns to previous PE -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) -#endif - do i=1,nb2 - ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1) - enddo -#ifdef WITH_MPI - -#ifdef DOUBLE_PRECISION_REAL - call mpi_isend(ab_s,(nb+1)*nb2, mpi_real8, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#else - call mpi_isend(ab_s,(nb+1)*nb2, mpi_real4, my_pe-1, 1, mpi_comm, ireq_ab, mpierr) -#endif -#else /* WITH_MPI */ +! real double precision +#define DOUBLE_PRECISION_REAL 1 +#define REAL_DATATYPE rk8 -#endif /* WITH_MPI */ - endif +#include "elpa2_compute_real_template.X90" - if (nr>0) then - call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb) - call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb) - endif +#undef DOUBLE_PRECISION_REAL +#undef REAL_DATATYPE - ! Use new HH vector for the next block - hv(:,:) = hv_new(:,:) - tau = tau_new - enddo - enddo +! real single precision +#if defined(WANT_SINGLE_PRECISION_REAL) - ! Finish the last outstanding requests -#ifdef WITH_MPI - call mpi_wait(ireq_ab,mpi_status,mpierr) - call mpi_wait(ireq_hv,mpi_status,mpierr) - allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error allocating mpi_statuses "//errorMessage - stop - endif +#undef DOUBLE_PRECISION_REAL +#define REAL_DATATYPE rk4 - call mpi_waitall(nblocks2,ireq_ab2,mpi_statuses,mpierr) - deallocate(mpi_statuses, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error deallocating mpi_statuses "//errorMessage - stop - endif +#include "elpa2_compute_real_template.X90" - call mpi_barrier(mpi_comm,mpierr) -#endif /* WITH_MPI */ +#undef DOUBLE_PRECISION_REAL +#undef REAL_DATATYPE - deallocate(block_limits, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error deallocating block_limits "//errorMessage - stop - endif +#endif /* WANT_SINGLE_PRECISION_REAL */ - deallocate(block_limits2, stat=istat, errmsg=errorMessage) - if (istat .ne. 0) then - print *,"error deallocating block_limits2 "//errorMessage - stop - endif +! complex double precision +#define DOUBLE_PRECISION_COMPLEX 1 +#define REAL_DATATYPE rk8 +#define COMPLEX_DATATYPE ck8 +#include "elpa2_compute_complex_template.X90" - deallocate(ireq_ab2, stat=istat, errmsg=errorMessage) - if (istat .ne. 
0) then - print *,"error deallocating ireq_ab2 "//errorMessage - stop - endif +#undef DOUBLE_PRECISION_COMPLEX +#undef REAL_DATATYPE +#undef COMPLEX_DATATYPE -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("band_band_real") -#endif - - end subroutine - - subroutine wy_gen(n, nb, W, Y, tau, mem, lda) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !length of householder-vectors - integer(kind=ik), intent(in) :: nb !number of householder-vectors - integer(kind=ik), intent(in) :: lda !leading dimension of Y and W - real(kind=rk), intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length b - real(kind=rk), intent(in) :: tau(nb) !tau values - real(kind=rk), intent(out) :: W(lda,nb) !output matrix W - real(kind=rk), intent(in) :: mem(nb) !memory for a temporary matrix of size nb - - integer(kind=ik) :: i - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_gen") -#endif - - W(1:n,1) = tau(1)*Y(1:n,1) - do i=2,nb - W(1:n,i) = tau(i)*Y(1:n,i) -#ifdef DOUBLE_PRECISION_REAL - call DGEMV('T', n, i-1, 1.0_rk, Y, lda, W(1,i), 1, 0.0_rk, mem,1) - call DGEMV('N', n, i-1, -1.0_rk, W, lda, mem, 1, 1.0_rk, W(1,i),1) -#else - call SGEMV('T', n, i-1, 1.0_rk, Y, lda, W(1,i), 1, 0.0_rk, mem,1) - call SGEMV('N', n, i-1, -1.0_rk, W, lda, mem, 1, 1.0_rk, W(1,i),1) -#endif - enddo - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_gen") -#endif - - end subroutine - - subroutine wy_left(n, m, nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !width of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size - real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W - real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y - real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb - -#ifdef HAVE_DETAILED_TIMINGS - call timer%start("wy_left") -#endif -#ifdef DOUBLE_PRECISION_REAL - call DGEMM('T', 'N', nb, n, m, 1.0_rk, W, lda2, A, lda, 0.0_rk, mem, nb) - call DGEMM('N', 'N', m, n, nb, -1.0_rk, Y, lda2, mem, nb, 1.0_rk, A, lda) -#else - call SGEMM('T', 'N', nb, n, m, 1.0_rk, W, lda2, A, lda, 0.0_rk, mem, nb) - call SGEMM('N', 'N', m, n, nb, -1.0_rk, Y, lda2, mem, nb, 1.0_rk, A, lda) -#endif - -#ifdef HAVE_DETAILED_TIMINGS - call timer%stop("wy_left") -#endif - - end subroutine - - subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2) -#ifdef HAVE_DETAILED_TIMINGS - use timings -#endif - use precision - implicit none - integer(kind=ik), intent(in) :: n !height of the matrix A - integer(kind=ik), intent(in) :: m !length of matrix W and Y - integer(kind=ik), intent(in) :: nb !width of matrix W and Y - integer(kind=ik), intent(in) :: lda !leading dimension of A - integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y - real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! 
remove assumed size
- real(kind=rk), intent(in) :: W(m,nb) !blocked transformation matrix W
- real(kind=rk), intent(in) :: Y(m,nb) !blocked transformation matrix Y
- real(kind=rk), intent(inout) :: mem(n,nb) !memory for a temporary matrix of size n x nb
-
-#ifdef HAVE_DETAILED_TIMINGS
- call timer%start("wy_right")
-#endif
+! complex single precision
+#if defined(WANT_SINGLE_PRECISION_COMPLEX)

-#ifdef DOUBLE_PRECISION_REAL
- call DGEMM('N', 'N', n, nb, m, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
- call DGEMM('N', 'T', n, m, nb, -1.0_rk, mem, n, Y, lda2, 1.0_rk, A, lda)
-#else
- call SGEMM('N', 'N', n, nb, m, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
- call SGEMM('N', 'T', n, m, nb, -1.0_rk, mem, n, Y, lda2, 1.0_rk, A, lda)
-#endif
-
-#ifdef HAVE_DETAILED_TIMINGS
- call timer%stop("wy_right")
-#endif
-
- end subroutine
+#undef DOUBLE_PRECISION_COMPLEX
+#define REAL_DATATYPE rk4
+#define COMPLEX_DATATYPE ck4

- subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2)
-#ifdef HAVE_DETAILED_TIMINGS
- use timings
-#endif
- use precision
- implicit none
- integer(kind=ik), intent(in) :: n !width/height of the matrix A; length of matrix W and Y
- integer(kind=ik), intent(in) :: nb !width of matrix W and Y
- integer(kind=ik), intent(in) :: lda !leading dimension of A
- integer(kind=ik), intent(in) :: lda2 !leading dimension of W and Y
- real(kind=rk), intent(inout) :: A(lda,*) !matrix to be transformed ! remove assumed size
- real(kind=rk), intent(in) :: W(n,nb) !blocked transformation matrix W
- real(kind=rk), intent(in) :: Y(n,nb) !blocked transformation matrix Y
- real(kind=rk) :: mem(n,nb) !memory for a temporary matrix of size n x nb
- real(kind=rk) :: mem2(nb,nb) !memory for a temporary matrix of size nb x nb
+#include "elpa2_compute_complex_template.X90"

-#ifdef HAVE_DETAILED_TIMINGS
- call timer%start("wy_symm")
-#endif
+#undef DOUBLE_PRECISION_COMPLEX
+#undef COMPLEX_DATATYPE
+#undef REAL_DATATYPE

-#ifdef DOUBLE_PRECISION_REAL
- call DSYMM('L', 'L', n, nb, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
- call DGEMM('T', 'N', nb, nb, n, 1.0_rk, mem, n, W, lda2, 0.0_rk, mem2, nb)
- call DGEMM('N', 'N', n, nb, nb, -0.5_rk, Y, lda2, mem2, nb, 1.0_rk, mem, n)
- call DSYR2K('L', 'N', n, nb, -1.0_rk, Y, lda2, mem, n, 1.0_rk, A, lda)
-#else
- call SSYMM('L', 'L', n, nb, 1.0_rk, A, lda, W, lda2, 0.0_rk, mem, n)
- call SGEMM('T', 'N', nb, nb, n, 1.0_rk, mem, n, W, lda2, 0.0_rk, mem2, nb)
- call SGEMM('N', 'N', n, nb, nb, -0.5_rk, Y, lda2, mem2, nb, 1.0_rk, mem, n)
- call SSYR2K('L', 'N', n, nb, -1.0_rk, Y, lda2, mem, n, 1.0_rk, A, lda)
-#endif
+#endif /* WANT_SINGLE_PRECISION_COMPLEX */

-#ifdef HAVE_DETAILED_TIMINGS
- call timer%stop("wy_symm")
-#endif
- end subroutine
 end module ELPA2_compute
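The #define/#include blocks above are the core of this change: each precision variant is now generated from a single template file by redefining the kind macros before including it. As a minimal sketch of the mechanism (hypothetical template file scale_template.X90, not part of this patch; rk8/rk4 are the kind constants from the precision module):

    ! scale_template.X90 -- body shared by all precision variants
    subroutine scale_vector(x, factor)
      use precision
      implicit none
      real(kind=REAL_DATATYPE), intent(inout) :: x(:)
      real(kind=REAL_DATATYPE), intent(in)    :: factor
      x(:) = factor * x(:)   ! same code; the kind is chosen by the including file
    end subroutine

    ! including file: one instantiation per precision
    #define REAL_DATATYPE rk8
    #include "scale_template.X90"
    #undef REAL_DATATYPE
    #define REAL_DATATYPE rk4
    #include "scale_template.X90"
    #undef REAL_DATATYPE

This sketch omits the per-precision renaming; in the real templates the _double/_single suffix is selected by the surrounding DOUBLE_PRECISION_* conditionals, as in the new template file below.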
diff --git a/src/elpa2_compute_complex_template.X90 b/src/elpa2_compute_complex_template.X90
new file mode 100644
index 0000000000000000000000000000000000000000..92445dc709664ae32dabb457fc9557715489709a
--- /dev/null
+++ b/src/elpa2_compute_complex_template.X90
@@ -0,0 +1,5710 @@
+#if 0
+! This file is part of ELPA.
+!
+! The ELPA library was originally created by the ELPA consortium,
+! consisting of the following organizations:
+!
+! - Max Planck Computing and Data Facility (MPCDF), formerly known as
+! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
+! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
+! Informatik,
+! - Technische Universität München, Lehrstuhl für Informatik mit
+! Schwerpunkt Wissenschaftliches Rechnen,
+! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
+! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
+! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
+! and
+! - IBM Deutschland GmbH
+!
+! This particular source code file contains additions, changes and
+! enhancements authored by Intel Corporation which is not part of
+! the ELPA consortium.
+!
+! More information can be found here:
+! http://elpa.mpcdf.mpg.de/
+!
+! ELPA is free software: you can redistribute it and/or modify
+! it under the terms of the version 3 of the license of the
+! GNU Lesser General Public License as published by the Free
+! Software Foundation.
+!
+! ELPA is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU Lesser General Public License for more details.
+!
+! You should have received a copy of the GNU Lesser General Public License
+! along with ELPA. If not, see <http://www.gnu.org/licenses/>
+!
+! ELPA reflects a substantial effort on the part of the original
+! ELPA consortium, and we ask you to respect the spirit of the
+! license that we chose: i.e., please contribute any changes you
+! may have back to the original ELPA library distribution, and keep
+! any derivatives of ELPA under the same license that we chose for
+! the original distribution, the GNU Lesser General Public License.
+!
+!
+! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
+!
+! Copyright of the original code rests with the authors inside the ELPA
+! consortium. The copyright of any additional modifications shall rest
+! with their original authors, but shall adhere to the licensing terms
+! distributed along with the original code in the file "COPYING".
+
+
+
+! ELPA2 -- 2-stage solver for ELPA
+!
+! Copyright of the original code rests with the authors inside the ELPA
+! consortium. The copyright of any additional modifications shall rest
+! with their original authors, but shall adhere to the licensing terms
+! distributed along with the original code in the file "COPYING".
+#endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ subroutine bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
+ useGPU, success)
+#else
+ subroutine bandred_complex_single(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
+ useGPU, success)
+#endif
+ !-------------------------------------------------------------------------------
+ ! bandred_complex: Reduces a distributed Hermitian matrix to band form
+ !
+ ! Parameters
+ !
+ ! na Order of matrix
+ !
+ ! a(lda,matrixCols) Distributed matrix which should be reduced.
+ ! Distribution is like in Scalapack.
+ ! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
+ ! a(:,:) is overwritten on exit with the band and the Householder vectors
+ ! in the upper half.
+ !
+ ! lda Leading dimension of a
+ ! matrixCols local columns of matrix a
+ !
+ ! nblk blocksize of cyclic distribution, must be the same in both directions!
+ !
+ ! nbw semi bandwidth of output matrix
+ !
+ ! mpi_comm_rows
+ ! mpi_comm_cols
+ ! MPI-Communicators for rows/columns
+ !
+ ! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
+ ! Factors for the Householder vectors (returned), needed for back transformation
+ !
+ !-------------------------------------------------------------------------------
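+ ! For orientation only, an illustrative call of the double-precision variant
+ ! (all names as declared above; a and tmat follow the 2D block-cyclic layout
+ ! described in the parameter list; this example is not part of ELPA itself):
+ !
+ !   logical :: success
+ !   complex(kind=ck8), allocatable :: a(:,:), tmat(:,:,:)
+ !   allocate(a(lda,matrixCols), tmat(nbw,nbw,(na-1)/nbw+1))
+ !   call bandred_complex_double(na, a, lda, nblk, nbw, matrixCols, (na-1)/nbw+1, &
+ !                               mpi_comm_rows, mpi_comm_cols, tmat, .false., .false., success)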
+#ifdef HAVE_DETAILED_TIMINGS
+ use timings
+#endif
+ use precision
+ use cuda_functions
+ use iso_c_binding
+
+ implicit none
+
+ logical, intent(in) :: useGPU
+
+ integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
+#ifdef DESPERATELY_WANT_ASSUMED_SIZE
+ complex(kind=COMPLEX_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*)
+#else
+ complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
+#endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk8, 0.0_rk8), CONE = (1.0_rk8, 0.0_rk8)
+#else
+ complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4, 0.0_rk4), CONE = (1.0_rk4, 0.0_rk4)
+#endif
+
+ integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+ integer(kind=ik) :: l_cols, l_rows
+ integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
+ integer(kind=ik) :: istep, ncol, lch, lcx, nlc
+ integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
+
+ real(kind=REAL_DATATYPE) :: vnorm2
+ complex(kind=COMPLEX_DATATYPE) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
+
+ complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
+ integer(kind=c_intptr_t) :: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev
+ integer(kind=ik) :: cur_l_rows, cur_l_cols,vmr_size ,umc_size
+ integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1
+ integer(kind=ik) :: na_rows, na_cols
+#ifdef WITH_MPI
+ integer(kind=ik), external :: numroc
+#endif
+
+ logical, intent(in) :: wantDebug
+ logical, intent(out) :: success
+ character(200) :: errorMessage
+ integer(kind=ik) :: istat
+ logical :: successCUDA
+#ifdef HAVE_DETAILED_TIMINGS
+#ifdef DOUBLE_PRECISION_COMPLEX
+ call timer%start("bandred_complex_double")
+#else
+ call timer%start("bandred_complex_single")
+#endif
+#endif
+ call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
+ call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
+ call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
+ call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
+ success = .true.
+
+ ! Semibandwidth nbw must be a multiple of blocksize nblk
+
+ if (mod(nbw,nblk)/=0) then
+ if (my_prow==0 .and. my_pcol==0) then
+ if (wantDebug) then
+ write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
+ write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
+ endif
+ success = .false.
+ return
+ endif
+ endif
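+ ! (Illustration, not original ELPA text: with nblk = 16 the admissible
+ !  semibandwidths are nbw = 16, 32, 48, ...; e.g. nbw = 24 gives
+ !  mod(24,16) = 8 /= 0 and fails the check above with success = .false.)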
+ if (useGPU) then
+#ifdef WITH_MPI
+ na_rows = numroc(na, nblk, my_prow, 0, np_rows)
+ na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
+#else
+ na_rows = na
+ na_cols = na
+#endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_complex_datatype)
+#else
+ successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_complex_datatype)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda malloc failed tmat_dev ", istat
+ stop
+ endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_double_complex_datatype)
+#else
+ successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_single_complex_datatype)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda malloc failed vav_dev ", istat
+ stop
+ endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_double_complex_datatype)
+#else
+ successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_single_complex_datatype)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda malloc failed a_dev ", istat
+ stop
+ endif
+ endif ! useGPU
+
+ ! Matrix is split into tiles; work is done only for tiles on the diagonal or above
+
+ tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
+ tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
+
+ l_rows_tile = tile_size/np_rows ! local rows of a tile
+ l_cols_tile = tile_size/np_cols ! local cols of a tile
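+ ! (Worked example, not original ELPA text: for nblk = 16, np_rows = 4, np_cols = 6
+ !  the least common multiple of 4 and 6 is 12, so the minimum tile size is
+ !  16*12 = 192; widening until every process holds at least 128 local columns
+ !  gives ((128*6-1)/192+1)*192 = 768, hence l_rows_tile = 192, l_cols_tile = 128.)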
+
+ if (useGPU) then
+ if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
+ print *,"bandred_complex: sizes of a wrong ? ",lda,size(a,dim=1),na_cols,size(a,dim=2)
+ endif
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
+#else
+ successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda memcpy failed a_dev ", istat
+ stop
+ endif
+ endif
+
+ do istep = (na-1)/nbw, 1, -1
+
+ n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
+
+ ! Number of local columns/rows of remaining matrix
+ l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
+ l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
+
+ ! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
+
+ if (useGPU) then
+ cur_l_rows = max(l_rows, 1)
+ cur_l_cols = max(l_cols, 1)
+
+ vmr_size = cur_l_rows * 2 * n_cols
+ umc_size = cur_l_cols * 2 * n_cols
+
+ if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then
+ if (allocated(umc)) then
+ deallocate(umc, stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when deallocating umc "//errorMessage
+ stop
+ endif
+ successCUDA = cuda_free(umc_dev)
+ if (.not.(successCUDA))then
+ print *,"bandred_complex: error in cudaFree"
+ stop
+ endif
+ endif
+
+ allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating umc "//errorMessage
+ stop
+ endif
+
+ if (max(l_cols,1) * 2*n_cols .gt. umc_size) then
+ print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size
+ endif
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_malloc(umc_dev, umc_size*size_of_double_complex_datatype)
+#else
+ successCUDA = cuda_malloc(umc_dev, umc_size*size_of_single_complex_datatype)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda malloc failed umc_dev ", istat
+ stop
+ endif
+ endif
+
+ if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then
+ if (allocated(vmr)) then
+ deallocate(vmr, stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when deallocating vmr "//errorMessage
+ stop
+ endif
+ successCUDA = cuda_free(vmr_dev)
+ if (.not.(successCUDA))then
+ print *,"bandred_complex: error in cudaFree"
+ stop
+ endif
+ endif
+
+ allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating vmr "//errorMessage
+ stop
+ endif
+
+ if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then
+ print *,"bandred_complex: vmr_size ",max(l_rows,1) * 2*n_cols,vmr_size
+ endif
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_double_complex_datatype)
+#else
+ successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_single_complex_datatype)
+#endif
+ if (.not.(successCUDA)) then
+ print *, "bandred_complex: cuda malloc failed vmr_dev ", istat
+ stop
+ endif
+
+ endif
+
+ if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
+ if (allocated(vr)) then
+ deallocate(vr, stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when deallocating vr "//errorMessage
+ stop
+ endif
+ endif
+
+ allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating vr "//errorMessage
+ stop
+ endif
+ endif
+
+ else ! GPU not used
+ allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating vmr "//errorMessage
+ stop
+ endif
+
+ allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating umc "//errorMessage
+ stop
+ endif
+
+ allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
+ if (istat .ne. 0) then
+ print *,"bandred_complex: error when allocating vr "//errorMessage
+ stop
+ endif
+ endif ! useGPU
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+ vmr(1:l_rows,1:n_cols) = 0._ck8
+ vr(:) = 0._ck8
+ tmat(:,:,istep) = 0._ck8
+#else
+ vmr(1:l_rows,1:n_cols) = 0._ck4
+ vr(:) = 0._ck4
+ tmat(:,:,istep) = 0._ck4
+#endif
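+ ! (Illustrative note, not original ELPA text: vmr and umc are each laid out in
+ !  two halves of n_cols columns. The first half of vmr holds the Householder
+ !  vectors of the current block; the second halves serve as transpose/receive
+ !  buffers, cf. "Transpose vmr -> vmc (stored in umc, second half)" below.)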
+          lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
+          lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
+          lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
+
+          if (lc_start .le. 0) lc_start = 1
+          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
+          if (my_pcol == cur_pcol) then
+#ifdef DOUBLE_PRECISION_COMPLEX
+            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_double_complex_datatype,kind=c_size_t), &
+                                        (a_dev + int( ( (lc_start-1) * lda*size_of_double_complex_datatype),kind=c_size_t )), &
+                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), &
+                                        int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
+                                        int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
+#else
+            successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_single_complex_datatype,kind=c_size_t), &
+                                        (a_dev + int( ( (lc_start-1) * lda*size_of_single_complex_datatype),kind=c_size_t )), &
+                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), &
+                                        int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
+                                        int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
+#endif
+            if (.not.(successCUDA)) then
+              print *, "bandred_complex: error in cudaMemcpy2d"
+              stop
+            endif
+          endif
+        endif
+
+        ! Reduce current block to lower triangular form
+
+        do lc = n_cols, 1, -1
+
+          ncol = istep*nbw + lc ! absolute column number of householder vector
+          nrow = ncol - nbw     ! absolute number of pivot row
+
+          lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
+          lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
+
+          tau = 0
+
+          if (nrow == 1) exit ! Nothing to do
+
+          cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
+
+          if (my_pcol==cur_pcol) then
+
+            ! Get vector to be transformed; distribute last element and norm of
+            ! remaining elements to all procs in current column
+
+            vr(1:lr) = a(1:lr,lch) ! vector to be transformed
+
+            if (my_prow==prow(nrow, nblk, np_rows)) then
+              aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
+              aux1(2) = vr(lr)
+            else
+              aux1(1) = dot_product(vr(1:lr),vr(1:lr))
+#ifdef DOUBLE_PRECISION_COMPLEX
+              aux1(2) = 0._ck8
+#else
+              aux1(2) = 0._ck4
+#endif
+            endif
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+            call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
+#else
+            call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
+#endif
+
+#else /* WITH_MPI */
+            aux2 = aux1
+#endif /* WITH_MPI */
+            vnorm2 = aux2(1)
+            vrl = aux2(2)
+
+            ! Householder transformation
+#ifdef DOUBLE_PRECISION_COMPLEX
+            call hh_transform_complex_double(vrl, vnorm2, xf, tau)
+#else
+            call hh_transform_complex_single(vrl, vnorm2, xf, tau)
+#endif
+            ! Scale vr and store Householder vector for back transformation
+
+            vr(1:lr) = vr(1:lr) * xf
+            if (my_prow==prow(nrow, nblk, np_rows)) then
+              a(1:lr-1,lch) = vr(1:lr-1)
+              a(lr,lch) = vrl
+#ifdef DOUBLE_PRECISION_COMPLEX
+              vr(lr) = 1._ck8
+#else
+              vr(lr) = 1._ck4
+#endif
+            else
+              a(1:lr,lch) = vr(1:lr)
+            endif
+
+          endif
+
+          ! Broadcast Householder vector and tau along columns
+
+          vr(lr+1) = tau
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+          call MPI_Bcast(vr, lr+1, MPI_DOUBLE_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
+#else
+          call MPI_Bcast(vr, lr+1, MPI_COMPLEX, cur_pcol, mpi_comm_cols, mpierr)
+#endif
+
+#endif /* WITH_MPI */
+          vmr(1:lr,lc) = vr(1:lr)
+          tau = vr(lr+1)
+          tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
+
+          ! Transform remaining columns in current block with Householder vector
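+          ! Each remaining column a_j of the block is updated as
+          !    a_j <- a_j - conjg(tau) * (v**H * a_j) * v
+          ! where v is the Householder vector just broadcast; the local parts
+          ! of the dot products v**H * a_j are collected in aux1 and summed
+          ! across the process row below.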
+
+          ! Local dot product
+#ifdef DOUBLE_PRECISION_COMPLEX
+          aux1 = 0._ck8
+#else
+          aux1 = 0._ck4
+#endif
+
+          nlc = 0 ! number of local columns
+          do j=1,lc-1
+            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+            if (lcx>0) then
+              nlc = nlc+1
+              aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
+            endif
+          enddo
+
+          ! Get global dot products
+#ifdef WITH_MPI
+
+#ifdef DOUBLE_PRECISION_COMPLEX
+          if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
+#else
+          if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
+#endif
+
+#else /* WITH_MPI */
+          if (nlc>0) aux2=aux1
+#endif /* WITH_MPI */
+          ! Transform
+
+          nlc = 0
+          do j=1,lc-1
+            lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+            if (lcx>0) then
+              nlc = nlc+1
+              a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+            endif
+          enddo
+
+        enddo
+
+        ! Calculate scalar products of stored Householder vectors.
+        ! This can be done in different ways, we use zherk
+
+        if (useGPU) then
+          cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
+          if (my_pcol == cur_pcol) then
+#ifdef DOUBLE_PRECISION_COMPLEX
+            successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_double_complex_datatype),kind=c_size_t)), &
+                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
+                                        int(lda*size_of_double_complex_datatype,kind=c_size_t), &
+                                        int(lr_end*size_of_double_complex_datatype,kind=c_size_t), &
+                                        int((lc_end - lc_start+1),kind=c_size_t), &
+                                        int(cudaMemcpyHostToDevice,kind=c_int))
+#else
+            successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_single_complex_datatype),kind=c_size_t)), &
+                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
+                                        int(lda*size_of_single_complex_datatype,kind=c_size_t), &
+                                        int(lr_end*size_of_single_complex_datatype,kind=c_size_t), &
+                                        int((lc_end - lc_start+1),kind=c_size_t), &
+                                        int(cudaMemcpyHostToDevice,kind=c_int))
+#endif
+            if (.not.(successCUDA)) then
+              print *, "bandred_complex: cuda memcpy a_dev failed ", istat
+              stop
+            endif
+          endif
+        endif
+
+        vav = 0
+        if (l_rows>0) &
+#ifdef DOUBLE_PRECISION_COMPLEX
+          call zherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
+        call herm_matrix_allreduce_double(n_cols,vav, nbw,nbw,mpi_comm_rows)
+
+#else
+          call cherk('U', 'C', n_cols, l_rows, CONE, vmr, ubound(vmr,dim=1), CZERO, vav, ubound(vav,dim=1))
+        call herm_matrix_allreduce_single(n_cols,vav, nbw,nbw,mpi_comm_rows)
+#endif
+
+        ! Calculate triangular matrix T for block Householder Transformation
+
+        do lc=n_cols,1,-1
+          tau = tmat(lc,lc,istep)
+          if (lc .lt. n_cols) then
+#ifdef DOUBLE_PRECISION_COMPLEX
+            call ztrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
+#else
+            call ctrmv('U', 'C', 'N', n_cols-lc, tmat(lc+1,lc+1,istep), ubound(tmat,dim=1), vav(lc+1,lc), 1)
+#endif
+            tmat(lc,lc+1:n_cols,istep) = -tau * conjg(vav(lc+1:n_cols,lc))
+          endif
+        enddo
+
+        ! Transpose vmr -> vmc (stored in umc, second half)
+#ifdef DOUBLE_PRECISION_COMPLEX
+        call elpa_transpose_vectors_complex_double (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
+                                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
+                                                    1, istep*nbw, n_cols, nblk)
+#else
+        call elpa_transpose_vectors_complex_single (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
+                                                    umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
+                                                    1, istep*nbw, n_cols, nblk)
+#endif
+
+        ! Calculate umc = A**T * vmr
+        ! Note that the distributed A has to be transposed
+        ! As opposed to direct tridiagonalization there is no need to use the cache locality
+        ! of the tiles, so we can use strips of the matrix
+#ifdef DOUBLE_PRECISION_COMPLEX
+        umc(1:l_cols,1:n_cols) = 0.0_ck8
+        vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck8
+#else
+        umc(1:l_cols,1:n_cols) = 0.0_ck4
+        vmr(1:l_rows,n_cols+1:2*n_cols) = 0._ck4
+#endif
+        if (l_cols>0 .and. l_rows>0) then
+          if (useGPU) then
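+            ! Stage the data for the tiled products on the device: copy the
+            ! freshly computed Householder panel vmr and the zeroed
+            ! accumulator umc to vmr_dev and umc_dev.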
+            if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
+              print *,"bandred_complex: vmr size 2 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
+              stop
+            endif
+#ifdef DOUBLE_PRECISION_COMPLEX
+            successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)), vmr_size*size_of_double_complex_datatype, cudaMemcpyHostToDevice)
+#else
+            successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)), vmr_size*size_of_single_complex_datatype, cudaMemcpyHostToDevice)
+#endif
+
+            if (.not.(successCUDA)) then
+              print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
+              stop
+            endif
+            if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
+              print *,"bandred_complex: umc size 2 :",size(umc,dim=1)*size(umc,dim=2),umc_size
+              stop
+            endif
+#ifdef DOUBLE_PRECISION_COMPLEX
+            successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)), umc_size*size_of_double_complex_datatype, cudaMemcpyHostToDevice)
+#else
+            successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)), umc_size*size_of_single_complex_datatype, cudaMemcpyHostToDevice)
+#endif
+            if (.not.(successCUDA)) then
+              print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
+              stop
+            endif
+          endif
+          do i=0,(istep*nbw-1)/tile_size
+
+            lcs = i*l_cols_tile+1
+            lce = min(l_cols,(i+1)*l_cols_tile)
+            if (lce