Commit 48e712ef authored by Andreas Marek's avatar Andreas Marek
Browse files

Introducing OpenMP functionality in ELPA_development_version_OpenMP

This commit introduces OpenMP functionality in the
ELPA_development_version_OpenMP branch.

It contains several bugfixes to the OpenMP functionality in the
branch "ELPA_development_version", the later will soon be deleted
since the new branch is the new reference implementation.

The current branch contains the following features/bugfixes:
- building of the OpenMP version of ELPA via configure and the
  "--with-openmp" flag. The build library contains a "_mt"
  (multi-threaded) in its name.
  The configure procedure should (hopefully) determine for each
  compiler the neccessary OpenMP flags.
  If the "--with-openmp" flag is ommitted exactly the same code
  as in the ELPA 2013.08.001 release is used and build in the
  same way
- The example test cases print which kernels have been used and
  how many OpenMP threads are used at runtime
- correct handling of OpenMP stack arrays: the previous implementation
  caused compiler dependent segmentation faults
- OpenMP capability with all available kernels: the correctness of
  the computations have been checked for all kernels except the
  Bluegene (P/Q) versions
parent bc9a3d07
...@@ -7,73 +7,138 @@ AM_LDFLAGS = @AM_LDFLAGS@ @BLACS_LDFLAGS@ ...@@ -7,73 +7,138 @@ AM_LDFLAGS = @AM_LDFLAGS@ @BLACS_LDFLAGS@
BLACS_LDFLAGS = @BLACS_LDFLAGS@ BLACS_LDFLAGS = @BLACS_LDFLAGS@
# libelpa # libelpa
if WITH_OPENMP
lib_LTLIBRARIES = libelpa_mt.la
else
lib_LTLIBRARIES = libelpa.la lib_LTLIBRARIES = libelpa.la
endif
##rule to produce fortran config file: ##rule to produce fortran config file:
#config_f90.h: ./config.h #config_f90.h: ./config.h
# grep "^#define" ./config.h > $@ # grep "^#define" ./config.h > $@
if WITH_OPENMP
libelpa_la_SOURCES = src/elpa1.f90 src/elpa2.F90 libelpa_mt_la_SOURCES = src/elpa1.F90 src/elpa2.F90
else
libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90
endif
if WITH_GENERIC_SIMPLE if WITH_GENERIC_SIMPLE
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90 src/elpa2_kernels/elpa2_kernels_real_simple.f90
endif
endif endif
if WITH_GENERIC if WITH_GENERIC
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90 src/elpa2_kernels/elpa2_kernels_real.f90
endif
endif endif
if WITH_BGP if WITH_BGP
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90 src/elpa2_kernels/elpa2_kernels_complex.f90
endif
endif endif
if WITH_BGQ if WITH_BGQ
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90 src/elpa2_kernels/elpa2_kernels_complex.f90
endif
endif endif
if WITH_SSE_AS if WITH_SSE_AS
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif endif
endif
if WITH_AVX_SANDYBRIDGE if WITH_AVX_SANDYBRIDGE
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif endif
if WITH_AMD_BULLDOZER if WITH_AMD_BULLDOZER
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif endif
if WITH_AVX_COMPLEX_BLOCK1 if WITH_AVX_COMPLEX_BLOCK1
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif endif
endif
if WITH_AVX_COMPLEX_BLOCK2 if WITH_AVX_COMPLEX_BLOCK2
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \ libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif endif
if WITH_AVX_REAL_BLOCK2 if WITH_AVX_REAL_BLOCK2
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif endif
endif
if WITH_AVX_REAL_BLOCK4 if WITH_AVX_REAL_BLOCK4
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
endif endif
endif
if WITH_AVX_REAL_BLOCK6 if WITH_AVX_REAL_BLOCK6
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
endif endif
endif
if WITH_OPENMP
libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION) libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
else
libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
endif
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa elpa_includedir = $(includedir)/elpa
...@@ -84,11 +149,11 @@ filesdir = $(datarootdir) ...@@ -84,11 +149,11 @@ filesdir = $(datarootdir)
files_DATA = \ files_DATA = \
test/read_real.f90 \ test/read_real.f90 \
test/read_real_gen.f90 \ test/read_real_gen.f90 \
test/test_complex2.f90 \ test/test_complex2.F90 \
test/test_complex.f90 \ test/test_complex.F90 \
test/test_complex_gen.f90 \ test/test_complex_gen.f90 \
test/test_real2.f90 \ test/test_real2.F90 \
test/test_real.f90 \ test/test_real.F90 \
test/test_real_gen.f90 test/test_real_gen.f90
# pkg-config stuff # pkg-config stuff
...@@ -96,20 +161,25 @@ pkgconfigdir = $(libdir)/pkgconfig ...@@ -96,20 +161,25 @@ pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = elpa.pc pkgconfig_DATA = elpa.pc
# test programs # test programs
if WITH_OPENMP
build_lib = libelpa_mt.la
else
build_lib = libelpa.la
endif
noinst_bindir = $(abs_top_builddir) noinst_bindir = $(abs_top_builddir)
noinst_bin_PROGRAMS = test_real test_real2 test_complex test_complex2 noinst_bin_PROGRAMS = test_real test_real2 test_complex test_complex2
test_real_SOURCES = test/test_real.f90 test_real_SOURCES = test/test_real.F90
test_real_LDADD = libelpa.la test_real_LDADD = $(build_lib)
test_real2_SOURCES = test/test_real2.f90 test_real2_SOURCES = test/test_real2.F90
test_real2_LDADD = libelpa.la test_real2_LDADD = $(build_lib)
test_complex_SOURCES = test/test_complex.f90 test_complex_SOURCES = test/test_complex.F90
test_complex_LDADD = libelpa.la test_complex_LDADD = $(build_lib)
test_complex2_SOURCES = test/test_complex2.f90 test_complex2_SOURCES = test/test_complex2.F90
test_complex2_LDADD = libelpa.la test_complex2_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh
......
This diff is collapsed.
...@@ -101,5 +101,8 @@ ...@@ -101,5 +101,8 @@
optimizations) */ optimizations) */
#undef WITH_GENERIC_SIMPLE #undef WITH_GENERIC_SIMPLE
/* use OpenMP threading */
#undef WITH_OPENMP
/* use kernel tuned for SSE (written in gcc assembler) */ /* use kernel tuned for SSE (written in gcc assembler) */
#undef WITH_SSE_AS #undef WITH_SSE_AS
...@@ -595,7 +595,7 @@ PACKAGE_STRING='elpa 2013.08.001' ...@@ -595,7 +595,7 @@ PACKAGE_STRING='elpa 2013.08.001'
PACKAGE_BUGREPORT='elpa-library@rzg.mpg.de' PACKAGE_BUGREPORT='elpa-library@rzg.mpg.de'
PACKAGE_URL='' PACKAGE_URL=''
   
ac_unique_file="src/elpa1.f90" ac_unique_file="src/elpa1.F90"
# Factoring default headers for most tests. # Factoring default headers for most tests.
ac_includes_default="\ ac_includes_default="\
#include <stdio.h> #include <stdio.h>
...@@ -672,6 +672,9 @@ build_vendor ...@@ -672,6 +672,9 @@ build_vendor
build_cpu build_cpu
build build
LIBTOOL LIBTOOL
OPENMP_FCFLAGS
WITH_OPENMP_FALSE
WITH_OPENMP_TRUE
WITH_AVX_REAL_BLOCK6_FALSE WITH_AVX_REAL_BLOCK6_FALSE
WITH_AVX_REAL_BLOCK6_TRUE WITH_AVX_REAL_BLOCK6_TRUE
WITH_AVX_REAL_BLOCK4_FALSE WITH_AVX_REAL_BLOCK4_FALSE
...@@ -811,6 +814,8 @@ with_avx_complex_block2 ...@@ -811,6 +814,8 @@ with_avx_complex_block2
with_avx_real_block2 with_avx_real_block2
with_avx_real_block4 with_avx_real_block4
with_avx_real_block6 with_avx_real_block6
with_openmp
enable_openmp
enable_shared enable_shared
enable_static enable_static
with_pic with_pic
...@@ -1458,6 +1463,7 @@ Optional Features: ...@@ -1458,6 +1463,7 @@ Optional Features:
do not reject slow dependency extractors do not reject slow dependency extractors
--disable-dependency-tracking --disable-dependency-tracking
speeds up one-time build speeds up one-time build
--disable-openmp do not use OpenMP
--enable-shared[=PKGS] build shared libraries [default=yes] --enable-shared[=PKGS] build shared libraries [default=yes]
--enable-static[=PKGS] build static libraries [default=yes] --enable-static[=PKGS] build static libraries [default=yes]
--enable-fast-install[=PKGS] --enable-fast-install[=PKGS]
...@@ -1491,6 +1497,7 @@ Optional Packages: ...@@ -1491,6 +1497,7 @@ Optional Packages:
(written in gcc assembler), default no. (written in gcc assembler), default no.
--with-avx-real-block6 use AVX optimized real kernel with blocking 6 --with-avx-real-block6 use AVX optimized real kernel with blocking 6
(written in gcc assembler), default no. (written in gcc assembler), default no.
--with-openmp use OpenMP threading, default no.
--with-pic[=PKGS] try to use only PIC/non-PIC objects [default=use --with-pic[=PKGS] try to use only PIC/non-PIC objects [default=use
both] both]
--with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-gnu-ld assume the C compiler uses GNU ld [default=no]
...@@ -5794,6 +5801,101 @@ $as_echo "#define WITH_AVX_REAL_BLOCK6 1" >>confdefs.h ...@@ -5794,6 +5801,101 @@ $as_echo "#define WITH_AVX_REAL_BLOCK6 1" >>confdefs.h
   
   
   
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP usage is specified" >&5
$as_echo_n "checking whether OpenMP usage is specified... " >&6; }
# Check whether --with-openmp was given.
if test "${with_openmp+set}" = set; then :
withval=$with_openmp; with_openmp=yes
else
with_openmp=no
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_openmp}" >&5
$as_echo "${with_openmp}" >&6; }
if test x"$with_openmp" = x"yes"; then
WITH_OPENMP_TRUE=
WITH_OPENMP_FALSE='#'
else
WITH_OPENMP_TRUE='#'
WITH_OPENMP_FALSE=
fi
if test "x${with_openmp}" = xyes; then
$as_echo "#define WITH_OPENMP 1" >>confdefs.h
OPENMP_FCFLAGS=
# Check whether --enable-openmp was given.
if test "${enable_openmp+set}" = set; then :
enableval=$enable_openmp;
fi
if test "$enable_openmp" != no; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $FC option to support OpenMP" >&5
$as_echo_n "checking for $FC option to support OpenMP... " >&6; }
if ${ac_cv_prog_fc_openmp+:} false; then :
$as_echo_n "(cached) " >&6
else
cat > conftest.$ac_ext <<_ACEOF
program main
implicit none
!$ integer tid
tid = 42
call omp_set_num_threads(2)
end
_ACEOF
if ac_fn_fc_try_link "$LINENO"; then :
ac_cv_prog_fc_openmp='none needed'
else
ac_cv_prog_fc_openmp='unsupported'
for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \
-Popenmp --openmp; do
ac_save_FCFLAGS=$FCFLAGS
FCFLAGS="$FCFLAGS $ac_option"
cat > conftest.$ac_ext <<_ACEOF
program main
implicit none
!$ integer tid
tid = 42
call omp_set_num_threads(2)
end
_ACEOF
if ac_fn_fc_try_link "$LINENO"; then :
ac_cv_prog_fc_openmp=$ac_option
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
FCFLAGS=$ac_save_FCFLAGS
if test "$ac_cv_prog_fc_openmp" != unsupported; then
break
fi
done
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5
$as_echo "$ac_cv_prog_fc_openmp" >&6; }
case $ac_cv_prog_fc_openmp in #(
"none needed" | unsupported)
;; #(
*)
OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;;
esac
fi
fi
FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
   
save_FCFLAGS=$FCFLAGS save_FCFLAGS=$FCFLAGS
save_LDFLAGS=$LDFLAGS save_LDFLAGS=$LDFLAGS
...@@ -19759,6 +19861,8 @@ ac_compiler_gnu=$ac_cv_fc_compiler_gnu ...@@ -19759,6 +19861,8 @@ ac_compiler_gnu=$ac_cv_fc_compiler_gnu
   
   
   
#AC_SUBST(OPT_FCFLAGS)
mkdir modules mkdir modules
   
#gl_VISIBILITY #gl_VISIBILITY
...@@ -19967,6 +20071,10 @@ if test -z "${WITH_AVX_REAL_BLOCK6_TRUE}" && test -z "${WITH_AVX_REAL_BLOCK6_FAL ...@@ -19967,6 +20071,10 @@ if test -z "${WITH_AVX_REAL_BLOCK6_TRUE}" && test -z "${WITH_AVX_REAL_BLOCK6_FAL
as_fn_error $? "conditional \"WITH_AVX_REAL_BLOCK6\" was never defined. as_fn_error $? "conditional \"WITH_AVX_REAL_BLOCK6\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5 Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi fi
if test -z "${WITH_OPENMP_TRUE}" && test -z "${WITH_OPENMP_FALSE}"; then
as_fn_error $? "conditional \"WITH_OPENMP\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
   
: "${CONFIG_STATUS=./config.status}" : "${CONFIG_STATUS=./config.status}"
ac_write_fail=0 ac_write_fail=0
......
AC_PREREQ([2.69]) AC_PREREQ([2.69])
AC_INIT([elpa],[2013.08.001], elpa-library@rzg.mpg.de) AC_INIT([elpa],[2013.08.001], elpa-library@rzg.mpg.de)
AC_CONFIG_SRCDIR([src/elpa1.f90]) AC_CONFIG_SRCDIR([src/elpa1.F90])
AM_INIT_AUTOMAKE([foreign -Wall subdir-objects]) AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_MACRO_DIR([m4])
...@@ -102,6 +102,22 @@ DEFINE_OPTION([WITH_AVX_REAL_BLOCK6], [avx-real-block6], ...@@ -102,6 +102,22 @@ DEFINE_OPTION([WITH_AVX_REAL_BLOCK6], [avx-real-block6],
[no],[]) [no],[])
AC_MSG_CHECKING(whether OpenMP usage is specified)
AC_ARG_WITH([openmp],
AS_HELP_STRING([--with-openmp],
[use OpenMP threading, default no.]),
[with_openmp=yes],
[with_openmp=no])
AC_MSG_RESULT([${with_openmp}])
AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"])
if test "x${with_openmp}" = xyes; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
AC_OPENMP
fi
FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
save_FCFLAGS=$FCFLAGS save_FCFLAGS=$FCFLAGS
save_LDFLAGS=$LDFLAGS save_LDFLAGS=$LDFLAGS
...@@ -231,6 +247,8 @@ AC_SUBST([FC_MODOUT]) ...@@ -231,6 +247,8 @@ AC_SUBST([FC_MODOUT])
AC_SUBST(BLACS_LDFLAGS) AC_SUBST(BLACS_LDFLAGS)
AC_SUBST(BLACS_FCFLAGS) AC_SUBST(BLACS_FCFLAGS)
#AC_SUBST(OPT_FCFLAGS)
mkdir modules mkdir modules
#gl_VISIBILITY #gl_VISIBILITY
......
...@@ -46,6 +46,8 @@ ...@@ -46,6 +46,8 @@
! with their original authors, but shall adhere to the licensing terms ! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING". ! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module ELPA1 module ELPA1
! Version 1.1.2, 2011-02-21 ! Version 1.1.2, 2011-02-21
...@@ -351,10 +353,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ...@@ -351,10 +353,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
integer istep, i, j, lcs, lce, lrs, lre integer istep, i, j, lcs, lce, lrs, lre
integer tile_size, l_rows_tile, l_cols_tile integer tile_size, l_rows_tile, l_cols_tile
#ifdef WITH_OPENMP
integer my_thread, n_threads, max_threads, n_iter
integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real*8 vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf real*8 vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
real*8, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:) real*8, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
#ifdef WITH_OPENMP
real*8, allocatable:: ur_p(:,:), uc_p(:,:)
#endif
integer pcol, prow integer pcol, prow
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
...@@ -387,6 +396,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ...@@ -387,6 +396,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
allocate(vc(max_local_cols)) allocate(vc(max_local_cols))
allocate(uc(max_local_cols)) allocate(uc(max_local_cols))
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1))
allocate(uc_p(max_local_cols,0:max_threads-1))
#endif
tmp = 0 tmp = 0
vr = 0 vr = 0
ur = 0 ur = 0
...@@ -478,6 +494,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ...@@ -478,6 +494,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
ur(1:l_rows) = 0 ur(1:l_rows) = 0
if(l_rows>0 .and. l_cols>0) then if(l_rows>0 .and. l_cols>0) then
#ifdef WITH_OPENMP
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif
do i=0,(istep-2)/tile_size do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1 lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile) lce = min(l_cols,(i+1)*l_cols_tile)
...@@ -486,11 +513,27 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta ...@@ -486,11 +513,27 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
lrs = j*l_rows_tile+1 lrs = j*l_rows_tile+1
lre = min(l_rows,(j+1)*l_rows_tile) lre = min(l_rows,(j+1)*l_rows_tile)
if(lre<lrs) cycle if(lre<lrs) cycle
#ifdef WITH_OPENMP
if(mod(n_iter,n_threads) == my_thread) then
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1)
if(i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1)
endif
n_iter = n_iter+1
#else