Commit 48e712ef authored by Andreas Marek's avatar Andreas Marek
Browse files

Introducing OpenMP functionality in ELPA_development_version_OpenMP

This commit introduces OpenMP functionality in the
ELPA_development_version_OpenMP branch.

It contains several bugfixes to the OpenMP functionality in the
branch "ELPA_development_version", the later will soon be deleted
since the new branch is the new reference implementation.

The current branch contains the following features/bugfixes:
- building of the OpenMP version of ELPA via configure and the
  "--with-openmp" flag. The build library contains a "_mt"
  (multi-threaded) in its name.
  The configure procedure should (hopefully) determine for each
  compiler the neccessary OpenMP flags.
  If the "--with-openmp" flag is ommitted exactly the same code
  as in the ELPA 2013.08.001 release is used and build in the
  same way
- The example test cases print which kernels have been used and
  how many OpenMP threads are used at runtime
- correct handling of OpenMP stack arrays: the previous implementation
  caused compiler dependent segmentation faults
- OpenMP capability with all available kernels: the correctness of
  the computations have been checked for all kernels except the
  Bluegene (P/Q) versions
parent bc9a3d07
......@@ -7,73 +7,138 @@ AM_LDFLAGS = @AM_LDFLAGS@ @BLACS_LDFLAGS@
BLACS_LDFLAGS = @BLACS_LDFLAGS@
# libelpa
if WITH_OPENMP
lib_LTLIBRARIES = libelpa_mt.la
else
lib_LTLIBRARIES = libelpa.la
endif
##rule to produce fortran config file:
#config_f90.h: ./config.h
# grep "^#define" ./config.h > $@
libelpa_la_SOURCES = src/elpa1.f90 src/elpa2.F90
if WITH_OPENMP
libelpa_mt_la_SOURCES = src/elpa1.F90 src/elpa2.F90
else
libelpa_la_SOURCES = src/elpa1.F90 src/elpa2.F90
endif
if WITH_GENERIC_SIMPLE
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_simple.f90 \
src/elpa2_kernels/elpa2_kernels_real_simple.f90
endif
endif
if WITH_GENERIC
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex.f90 \
src/elpa2_kernels/elpa2_kernels_real.f90
endif
endif
if WITH_BGP
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgp.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
endif
endif
if WITH_BGQ
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_bgq.f90 \
src/elpa2_kernels/elpa2_kernels_complex.f90
endif
endif
if WITH_SSE_AS
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_asm_x86_64.s
endif
endif
if WITH_AVX_SANDYBRIDGE
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif
if WITH_AMD_BULLDOZER
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c \
src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif
if WITH_AVX_COMPLEX_BLOCK1
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif
if WITH_AVX_COMPLEX_BLOCK2
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp \
src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
endif
endif
if WITH_AVX_REAL_BLOCK2
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c
endif
endif
if WITH_AVX_REAL_BLOCK4
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_4hv.c
endif
endif
if WITH_AVX_REAL_BLOCK6
if WITH_OPENMP
libelpa_mt_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
else
libelpa_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_6hv.c
endif
endif
if WITH_OPENMP
libelpa_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
else
libelpa_mt_la_LDFLAGS = -version-info $(ELPA_SO_VERSION)
endif
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa
......@@ -84,11 +149,11 @@ filesdir = $(datarootdir)
files_DATA = \
test/read_real.f90 \
test/read_real_gen.f90 \
test/test_complex2.f90 \
test/test_complex.f90 \
test/test_complex2.F90 \
test/test_complex.F90 \
test/test_complex_gen.f90 \
test/test_real2.f90 \
test/test_real.f90 \
test/test_real2.F90 \
test/test_real.F90 \
test/test_real_gen.f90
# pkg-config stuff
......@@ -96,20 +161,25 @@ pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = elpa.pc
# test programs
if WITH_OPENMP
build_lib = libelpa_mt.la
else
build_lib = libelpa.la
endif
noinst_bindir = $(abs_top_builddir)
noinst_bin_PROGRAMS = test_real test_real2 test_complex test_complex2
test_real_SOURCES = test/test_real.f90
test_real_LDADD = libelpa.la
test_real_SOURCES = test/test_real.F90
test_real_LDADD = $(build_lib)
test_real2_SOURCES = test/test_real2.f90
test_real2_LDADD = libelpa.la
test_real2_SOURCES = test/test_real2.F90
test_real2_LDADD = $(build_lib)
test_complex_SOURCES = test/test_complex.f90
test_complex_LDADD = libelpa.la
test_complex_SOURCES = test/test_complex.F90
test_complex_LDADD = $(build_lib)
test_complex2_SOURCES = test/test_complex2.f90
test_complex2_LDADD = libelpa.la
test_complex2_SOURCES = test/test_complex2.F90
test_complex2_LDADD = $(build_lib)
check_SCRIPTS = test_real.sh test_real2.sh test_complex.sh test_complex2.sh
......
This diff is collapsed.
......@@ -101,5 +101,8 @@
optimizations) */
#undef WITH_GENERIC_SIMPLE
/* use OpenMP threading */
#undef WITH_OPENMP
/* use kernel tuned for SSE (written in gcc assembler) */
#undef WITH_SSE_AS
......@@ -595,7 +595,7 @@ PACKAGE_STRING='elpa 2013.08.001'
PACKAGE_BUGREPORT='elpa-library@rzg.mpg.de'
PACKAGE_URL=''
 
ac_unique_file="src/elpa1.f90"
ac_unique_file="src/elpa1.F90"
# Factoring default headers for most tests.
ac_includes_default="\
#include <stdio.h>
......@@ -672,6 +672,9 @@ build_vendor
build_cpu
build
LIBTOOL
OPENMP_FCFLAGS
WITH_OPENMP_FALSE
WITH_OPENMP_TRUE
WITH_AVX_REAL_BLOCK6_FALSE
WITH_AVX_REAL_BLOCK6_TRUE
WITH_AVX_REAL_BLOCK4_FALSE
......@@ -811,6 +814,8 @@ with_avx_complex_block2
with_avx_real_block2
with_avx_real_block4
with_avx_real_block6
with_openmp
enable_openmp
enable_shared
enable_static
with_pic
......@@ -1458,6 +1463,7 @@ Optional Features:
do not reject slow dependency extractors
--disable-dependency-tracking
speeds up one-time build
--disable-openmp do not use OpenMP
--enable-shared[=PKGS] build shared libraries [default=yes]
--enable-static[=PKGS] build static libraries [default=yes]
--enable-fast-install[=PKGS]
......@@ -1491,6 +1497,7 @@ Optional Packages:
(written in gcc assembler), default no.
--with-avx-real-block6 use AVX optimized real kernel with blocking 6
(written in gcc assembler), default no.
--with-openmp use OpenMP threading, default no.
--with-pic[=PKGS] try to use only PIC/non-PIC objects [default=use
both]
--with-gnu-ld assume the C compiler uses GNU ld [default=no]
......@@ -5794,6 +5801,101 @@ $as_echo "#define WITH_AVX_REAL_BLOCK6 1" >>confdefs.h
 
 
 
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether OpenMP usage is specified" >&5
$as_echo_n "checking whether OpenMP usage is specified... " >&6; }
# Check whether --with-openmp was given.
if test "${with_openmp+set}" = set; then :
withval=$with_openmp; with_openmp=yes
else
with_openmp=no
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_openmp}" >&5
$as_echo "${with_openmp}" >&6; }
if test x"$with_openmp" = x"yes"; then
WITH_OPENMP_TRUE=
WITH_OPENMP_FALSE='#'
else
WITH_OPENMP_TRUE='#'
WITH_OPENMP_FALSE=
fi
if test "x${with_openmp}" = xyes; then
$as_echo "#define WITH_OPENMP 1" >>confdefs.h
OPENMP_FCFLAGS=
# Check whether --enable-openmp was given.
if test "${enable_openmp+set}" = set; then :
enableval=$enable_openmp;
fi
if test "$enable_openmp" != no; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $FC option to support OpenMP" >&5
$as_echo_n "checking for $FC option to support OpenMP... " >&6; }
if ${ac_cv_prog_fc_openmp+:} false; then :
$as_echo_n "(cached) " >&6
else
cat > conftest.$ac_ext <<_ACEOF
program main
implicit none
!$ integer tid
tid = 42
call omp_set_num_threads(2)
end
_ACEOF
if ac_fn_fc_try_link "$LINENO"; then :
ac_cv_prog_fc_openmp='none needed'
else
ac_cv_prog_fc_openmp='unsupported'
for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \
-Popenmp --openmp; do
ac_save_FCFLAGS=$FCFLAGS
FCFLAGS="$FCFLAGS $ac_option"
cat > conftest.$ac_ext <<_ACEOF
program main
implicit none
!$ integer tid
tid = 42
call omp_set_num_threads(2)
end
_ACEOF
if ac_fn_fc_try_link "$LINENO"; then :
ac_cv_prog_fc_openmp=$ac_option
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
FCFLAGS=$ac_save_FCFLAGS
if test "$ac_cv_prog_fc_openmp" != unsupported; then
break
fi
done
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_openmp" >&5
$as_echo "$ac_cv_prog_fc_openmp" >&6; }
case $ac_cv_prog_fc_openmp in #(
"none needed" | unsupported)
;; #(
*)
OPENMP_FCFLAGS=$ac_cv_prog_fc_openmp ;;
esac
fi
fi
FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
 
save_FCFLAGS=$FCFLAGS
save_LDFLAGS=$LDFLAGS
......@@ -19759,6 +19861,8 @@ ac_compiler_gnu=$ac_cv_fc_compiler_gnu
 
 
 
#AC_SUBST(OPT_FCFLAGS)
mkdir modules
 
#gl_VISIBILITY
......@@ -19967,6 +20071,10 @@ if test -z "${WITH_AVX_REAL_BLOCK6_TRUE}" && test -z "${WITH_AVX_REAL_BLOCK6_FAL
as_fn_error $? "conditional \"WITH_AVX_REAL_BLOCK6\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
if test -z "${WITH_OPENMP_TRUE}" && test -z "${WITH_OPENMP_FALSE}"; then
as_fn_error $? "conditional \"WITH_OPENMP\" was never defined.
Usually this means the macro was only invoked conditionally." "$LINENO" 5
fi
 
: "${CONFIG_STATUS=./config.status}"
ac_write_fail=0
......
AC_PREREQ([2.69])
AC_INIT([elpa],[2013.08.001], elpa-library@rzg.mpg.de)
AC_CONFIG_SRCDIR([src/elpa1.f90])
AC_CONFIG_SRCDIR([src/elpa1.F90])
AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
AC_CONFIG_MACRO_DIR([m4])
......@@ -102,6 +102,22 @@ DEFINE_OPTION([WITH_AVX_REAL_BLOCK6], [avx-real-block6],
[no],[])
AC_MSG_CHECKING(whether OpenMP usage is specified)
AC_ARG_WITH([openmp],
AS_HELP_STRING([--with-openmp],
[use OpenMP threading, default no.]),
[with_openmp=yes],
[with_openmp=no])
AC_MSG_RESULT([${with_openmp}])
AM_CONDITIONAL([WITH_OPENMP],[test x"$with_openmp" = x"yes"])
if test "x${with_openmp}" = xyes; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
AC_OPENMP
fi
FCFLAGS="$FCFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
#LDFLAGS="$LDFLAGS $OPENMP_FCFLAGS $OPENMP_FFFLAGS"
save_FCFLAGS=$FCFLAGS
save_LDFLAGS=$LDFLAGS
......@@ -231,6 +247,8 @@ AC_SUBST([FC_MODOUT])
AC_SUBST(BLACS_LDFLAGS)
AC_SUBST(BLACS_FCFLAGS)
#AC_SUBST(OPT_FCFLAGS)
mkdir modules
#gl_VISIBILITY
......
......@@ -46,6 +46,8 @@
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module ELPA1
! Version 1.1.2, 2011-02-21
......@@ -351,10 +353,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
integer istep, i, j, lcs, lce, lrs, lre
integer tile_size, l_rows_tile, l_cols_tile
#ifdef WITH_OPENMP
integer my_thread, n_threads, max_threads, n_iter
integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real*8 vav, vnorm2, x, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
real*8, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
#ifdef WITH_OPENMP
real*8, allocatable:: ur_p(:,:), uc_p(:,:)
#endif
integer pcol, prow
pcol(i) = MOD((i-1)/nblk,np_cols) !Processor col for global col number
prow(i) = MOD((i-1)/nblk,np_rows) !Processor row for global row number
......@@ -387,6 +396,13 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
allocate(vc(max_local_cols))
allocate(uc(max_local_cols))
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1))
allocate(uc_p(max_local_cols,0:max_threads-1))
#endif
tmp = 0
vr = 0
ur = 0
......@@ -478,6 +494,17 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
ur(1:l_rows) = 0
if(l_rows>0 .and. l_cols>0) then
#ifdef WITH_OPENMP
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif
do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
......@@ -486,11 +513,27 @@ subroutine tridiag_real(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e, ta
lrs = j*l_rows_tile+1
lre = min(l_rows,(j+1)*l_rows_tile)
if(lre<lrs) cycle
#ifdef WITH_OPENMP
if(mod(n_iter,n_threads) == my_thread) then
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc_p(lcs,my_thread),1)
if(i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur_p(lrs,my_thread),1)
endif
n_iter = n_iter+1
#else
call DGEMV('T',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vr(lrs),1,1.d0,uc(lcs),1)
if(i/=j) call DGEMV('N',lre-lrs+1,lce-lcs+1,1.d0,a(lrs,lcs),lda,vc(lcs),1,1.d0,ur(lrs),1)
#endif
enddo
enddo
#ifdef WITH_OPENMP
!$OMP END PARALLEL
do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo
#endif
if(nstor>0) then
call DGEMV('T',l_rows,2*nstor,1.d0,vur,ubound(vur,1),vr,1,0.d0,aux,1)
call DGEMV('N',l_cols,2*nstor,1.d0,uvc,ubound(uvc,1),aux,1,1.d0,uc,1)
......@@ -1027,10 +1070,18 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
integer istep, i, j, lcs, lce, lrs, lre
integer tile_size, l_rows_tile, l_cols_tile
#ifdef WITH_OPENMP
integer my_thread, n_threads, max_threads, n_iter
integer omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real*8 vnorm2
complex*16 vav, xc, aux(2*max_stored_rows), aux1(2), aux2(2), vrl, xf
complex*16, allocatable:: tmp(:), vr(:), vc(:), ur(:), uc(:), vur(:,:), uvc(:,:)
#ifdef WITH_OPENMP
complex*16, allocatable:: ur_p(:,:), uc_p(:,:)
#endif
real*8, allocatable:: tmpr(:)
integer pcol, prow
......@@ -1065,6 +1116,13 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
allocate(vc(max_local_cols))
allocate(uc(max_local_cols))
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1))
allocate(uc_p(max_local_cols,0:max_threads-1))
#endif
tmp = 0
vr = 0
ur = 0
......@@ -1155,6 +1213,17 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
uc(1:l_cols) = 0
ur(1:l_rows) = 0
if(l_rows>0 .and. l_cols>0) then
#ifdef WITH_OPENMP
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,lcs,lce,j,lrs,lre)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif
do i=0,(istep-2)/tile_size
lcs = i*l_cols_tile+1
......@@ -1164,10 +1233,26 @@ subroutine tridiag_complex(na, a, lda, nblk, mpi_comm_rows, mpi_comm_cols, d, e,
lrs = j*l_rows_tile+1
lre = min(l_rows,(j+1)*l_rows_tile)
if(lre<lrs) cycle
#ifdef WITH_OPENMP
if(mod(n_iter,n_threads) == my_thread) then
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc_p(lcs,my_thread),1)
if(i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur_p(lrs,my_thread),1)
endif
n_iter = n_iter+1
#else
call ZGEMV('C',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vr(lrs),1,CONE,uc(lcs),1)
if(i/=j) call ZGEMV('N',lre-lrs+1,lce-lcs+1,CONE,a(lrs,lcs),lda,vc(lcs),1,CONE,ur(lrs),1)
#endif
enddo
enddo
#ifdef WITH_OPENMP
!$OMP END PARALLEL
do i=0,max_threads-1
uc(1:l_cols) = uc(1:l_cols) + uc_p(1:l_cols,i)
ur(1:l_rows) = ur(1:l_rows) + ur_p(1:l_rows,i)
enddo
#endif
if(nstor>0) then
call ZGEMV('C',l_rows,2*nstor,CONE,vur,ubound(vur,1),vr,1,CZERO,aux,1)
......@@ -2124,6 +2209,9 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
real*8 z(na), d1(na), d2(na), z1(na), delta(na), dbase(na), ddiff(na), ev_scale(na), tmp(na)
real*8 d1u(na), zu(na), d1l(na), zl(na)
real*8, allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP
real*8, allocatable :: z_p(:,:)
#endif
integer i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, l_rqm, ns, info
integer l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, l_cols_qreorg, np, l_idx, nqcols1, nqcols2
......@@ -2132,6 +2220,14 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
integer idx(na), idx1(na), idx2(na)
integer coltyp(na), idxq1(na), idxq2(na)
#ifdef WITH_OPENMP
integer max_threads, my_thread
integer omp_get_max_threads, omp_get_thread_num
max_threads = omp_get_max_threads()
allocate(z_p(na,0:max_threads-1))
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
......@@ -2405,11 +2501,18 @@ subroutine merge_systems( na, nm, d, e, q, ldq, nqoff, nblk, mpi_comm_rows, mpi_
! Solve secular equation
z(1:na1) = 1
#ifdef WITH_OPENMP
z_p(1:na1,:) = 1
#endif
dbase(1:na1) = 0
ddiff(1:na1) = 0
info = 0
#ifdef WITH_OPENMP
!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,j)
my_thread = omp_get_thread_num()
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors