Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
1d20472e
Commit
1d20472e
authored
May 19, 2016
by
Andreas Marek
Browse files
Merge branch 'master' into ELPA_GPU
parents
ed5d94d0
b31f44c9
Changes
15
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
1d20472e
before_script
:
-
export LANG=C
-
module load impi/5.1.3 intel/16.0 gcc/4.9 mkl/11.3 autotools
-
module load impi/5.1.3 intel/16.0 gcc/4.9 mkl/11.3 autotools
pkg-config
-
module list
-
export MKL_INTEL_SCALAPACK_MPI_NO_OMP_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm"
-
export MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP="$MKL_INTEL_SCALAPACK_MPI_NO_OMP_BASELINE -I$MKL_HOME/include/intel64/lp64"
...
...
@@ -1225,3 +1225,25 @@ distcheck:
-
./autogen.sh
-
./configure --with-mpi=no
-
make distcheck DISTCHECK_CONFIGURE_FLAGS="--with-mpi=no" TEST_FLAGS='1500 50 16'
test_project
:
tags
:
-
cpu
script
:
-
mkdir build
-
pushd build
-
../autogen.sh
-
../configure SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" FC=mpif90 --prefix=$PWD/installdest
-
make -j
8
-
make install
-
popd
-
mkdir test_project/build
-
pushd test_project/build
-
../autogen.sh
-
../configure PKG_CONFIG_PATH=../../build/installdest/lib64/pkgconfig
-
make -j
8
-
./test_real
-
make distclean
-
popd
-
make distclean
-
rm -rf installdest
Makefile.am
View file @
1d20472e
...
...
@@ -501,15 +501,16 @@ TESTS = $(check_SCRIPTS)
echo
'
$(wrapper)
./$^ $$TEST_FLAGS'
>
$@
chmod
+x
$@
# this one does not want any arguments
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh
:
echo
'
$(wrapper)
./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@'
>
$@
chmod
+x
$@
## this one does not want any arguments
#elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh:
# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@
# chmod +x $@
if
WANT_SINGLE_PRECISION_REAL
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh
:
echo
'
$(wrapper)
./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@'
>
$@
chmod
+x
$@
#
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh:
#
echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@
#
chmod +x $@
endif
# Preprocessed files (just used for manual inspection)
mod_precision.i
:
$(top_srcdir)/src/mod_precision.F90
...
...
generated_headers.am
View file @
1d20472e
define extract_interface
@echo "Generating $@...";
@grep -h "^ *$1" $^ | sed 's/^ *$1//;' > $@ || { rm $@; exit 1; }
@grep -h "^ *$1" $^ | sed 's/^ *$1//;' >
>
$@ || { rm $@; exit 1; }
endef
elpa test:
...
...
@@ -19,8 +19,9 @@ elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa
test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources
$(call extract_interface,!c>)
elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) | elpa
elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c)
$(wildcard $(top_srcdir)/src/elpa2_kernels/*.s)
| elpa
$(call extract_interface,!f>)
$(call extract_interface,#!f>)
generated_headers= config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h
generated-headers: $(generated_headers)
src/elpa2.F90
View file @
1d20472e
...
...
@@ -239,7 +239,7 @@ contains
endif
if
(
useQRActual
)
then
if
(
mod
(
na
,
nblk
)
.ne.
0
)
then
if
(
mod
(
na
,
2
)
.ne.
0
)
then
if
(
wantDebug
)
then
write
(
error_unit
,
*
)
"solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
...
...
src/elpa2_kernels/elpa2_kernels_asm_x86_64_double_precision.s
View file @
1d20472e
...
...
@@ -54,8 +54,8 @@
#
distributed
along
with
the
original
code
in
the
file
"COPYING"
.
#
#
--------------------------------------------------------------------------------------------------
.
globl
double_hh_trafo_double
_
.
globl
single_hh_trafo_complex_double
_
.
globl
double_hh_trafo_double
.
globl
single_hh_trafo_complex_double
.
text
#-------------------------------------------------------------------------------
...
...
@@ -382,8 +382,19 @@
#
parameter
6
:
%
r9
:
ldh
#
#-------------------------------------------------------------------------------
#!
f
>#
ifdef
WITH_REAL_SSE_ASSEMBLY_KERNEL
#!
f
>
interface
#!
f
>
subroutine
double_hh_trafo_double
(
q
,
hh
,
nb
,
nq
,
ldq
,
ldh
)
bind
(
C
,
name
=
"double_hh_trafo_double"
)
#!
f
>
use
,
intrinsic
::
iso_c_binding
#!
f
>
integer
(
kind
=
c_int
)
::
nb
,
nq
,
ldq
,
ldh
#!
f
>
type
(
c_ptr
),
value
::
q
#!
f
>
real
(
kind
=
c_double
)
::
hh
(
nb
,
6
)
#!
f
>
end
subroutine
#!
f
>
end
interface
#!
f
>#
endif
.
align
16
,
0x90
double_hh_trafo_double
_
:
double_hh_trafo_double
:
#
Get
integer
parameters
into
corresponding
registers
...
...
@@ -698,8 +709,18 @@ return1:
#
parameter
5
:
%
r8
:
ldq
#
#-------------------------------------------------------------------------------
#!
f
>#
ifdef
WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#!
f
>
interface
#!
f
>
subroutine
single_hh_trafo_complex_double
(
q
,
hh
,
nb
,
nq
,
ldq
)
bind
(
C
,
name
=
"single_hh_trafo_complex_double"
)
#!
f
>
use
,
intrinsic
::
iso_c_binding
#!
f
>
integer
(
kind
=
c_int
)
::
nb
,
nq
,
ldq
#!
f
>
complex
(
kind
=
c_double
)
::
q
(*)
#!
f
>
complex
(
kind
=
c_double
)
::
hh
(
nb
,
2
)
#!
f
>
end
subroutine
#!
f
>
end
interface
#!
f
>#
endif
.
align
16
,
0x90
single_hh_trafo_complex_double
_
:
single_hh_trafo_complex_double
:
#
Get
integer
parameters
into
corresponding
registers
...
...
src/elpa2_kernels/elpa2_kernels_asm_x86_64_single_precision.s
View file @
1d20472e
...
...
@@ -41,8 +41,8 @@
#
#
Author
:
Andreas
Marek
,
MPCDF
.
globl
double_hh_trafo_single
_
.
globl
single_hh_trafo_complex_single
_
.
globl
double_hh_trafo_single
.
globl
single_hh_trafo_complex_single
.
text
#-------------------------------------------------------------------------------
...
...
@@ -391,8 +391,20 @@
#
parameter
6
:
%
r9
:
ldh
#
#-------------------------------------------------------------------------------
#!
f
>#
ifdef
WITH_REAL_SSE_ASSEMBLY_KERNEL
#!
f
>#
ifdef
WANT_SINGLE_PRECISION_REAL
#!
f
>
interface
#!
f
>
subroutine
double_hh_trafo_single
(
q
,
hh
,
nb
,
nq
,
ldq
,
ldh
)
bind
(
C
,
name
=
"double_hh_trafo_single"
)
#!
f
>
use
,
intrinsic
::
iso_c_binding
#!
f
>
integer
(
kind
=
c_int
)
::
nb
,
nq
,
ldq
,
ldh
#!
f
>
type
(
c_ptr
),
value
::
q
#!
f
>
real
(
kind
=
c_float
)
::
hh
(
nb
,
6
)
#!
f
>
end
subroutine
#!
f
>
end
interface
#!
f
>#
endif
#!
f
>#
endif
.
align
16
,
0x90
double_hh_trafo_single
_
:
double_hh_trafo_single
:
#
Get
integer
parameters
into
corresponding
registers
...
...
@@ -716,8 +728,21 @@ return1:
#
parameter
5
:
%
r8
:
ldq
#
#-------------------------------------------------------------------------------
#!
f
>#
ifdef
WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#!
f
>#
ifdef
WANT_SINGLE_PRECISION_COMPLEX
#!
f
>
interface
#!
f
>
subroutine
single_hh_trafo_complex_single
(
q
,
hh
,
nb
,
nq
,
ldq
)
bind
(
C
,
name
=
"single_hh_trafo_complex_single"
)
#!
f
>
use
,
intrinsic
::
iso_c_binding
#!
f
>
integer
(
kind
=
c_int
)
::
nb
,
nq
,
ldq
#!
f
>
complex
(
kind
=
c_float
)
::
q
(*)
#!
f
>
complex
(
kind
=
c_float
)
::
hh
(
nb
,
2
)
#!
f
>
end
subroutine
#!
f
>
end
interface
#!
f
>#
endif
#!
f
>#
endif
.
align
16
,
0x90
single_hh_trafo_complex_single
_
:
single_hh_trafo_complex_single
:
#
Get
integer
parameters
into
corresponding
registers
...
...
src/elpa_qr/elpa_pdlarfb.F90
View file @
1d20472e
...
...
@@ -748,6 +748,7 @@ subroutine qr_tmerge_pdlarfb_1dcomm_double(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,basei
else
! do not calculate parts for T merge as there is nothing to merge
mergeoffset
=
0
updateoffset
=
0
tgenoffset
=
updateoffset
+
updatesize
...
...
src/mod_compute_hh_trafo_complex.F90
View file @
1d20472e
...
...
@@ -88,7 +88,7 @@ module compute_hh_trafo_complex
#endif
use
iso_c_binding
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) ||
defined(HAVE_AVX2) ||
defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use
kernel_interfaces
#endif
implicit
none
...
...
@@ -399,7 +399,7 @@ module compute_hh_trafo_complex
use
timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) ||
defined(HAVE_AVX2) ||
defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use
kernel_interfaces
#endif
use
iso_c_binding
...
...
src/mod_compute_hh_trafo_real.F90
View file @
1d20472e
...
...
@@ -103,7 +103,7 @@ module compute_hh_trafo_real
use
timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) ||
defined(HAVE_AVX2) ||
defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use
kernel_interfaces
#endif
implicit
none
...
...
@@ -282,10 +282,10 @@ module compute_hh_trafo_real
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_double
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
&
call
double_hh_trafo_double
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)
)
,
w
,
nbw
,
nl
,
&
stripe_width
,
nbw
)
#else
call
double_hh_trafo_double
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
&
call
double_hh_trafo_double
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)
)
,
w
,
nbw
,
nl
,
&
stripe_width
,
nbw
)
#endif
enddo
...
...
@@ -682,7 +682,7 @@ module compute_hh_trafo_real
use
timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) ||
defined(HAVE_AVX2) ||
defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use
kernel_interfaces
#endif
implicit
none
...
...
@@ -860,10 +860,10 @@ module compute_hh_trafo_real
w
(:,
1
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
)
w
(:,
2
)
=
bcast_buffer
(
1
:
nbw
,
j
+
off
-1
)
#ifdef WITH_OPENMP
call
double_hh_trafo_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
),
w
,
nbw
,
nl
,
&
call
double_hh_trafo_single
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
,
my_thread
)
)
,
w
,
nbw
,
nl
,
&
stripe_width
,
nbw
)
#else
call
double_hh_trafo_single
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
),
w
,
nbw
,
nl
,
&
call
double_hh_trafo_single
(
c_loc
(
a
(
1
,
j
+
off
+
a_off
-1
,
istripe
)
)
,
w
,
nbw
,
nl
,
&
stripe_width
,
nbw
)
#endif
enddo
...
...
src/mod_mpi.F90
View file @
1d20472e
...
...
@@ -44,7 +44,6 @@
#include "config-f90.h"
module
elpa_mpi
use
precision
#ifndef WITH_MPI
use
elpa_mpi_stubs
#else
...
...
test/fortran_test_programs/test_elpa2_real_qr_default_kernel.F90
View file @
1d20472e
...
...
@@ -142,15 +142,31 @@ program test_real2_default_kernel_qr_decomposition_double_precision
successELPA
=
.true.
gpuAvailable
=
.false.
!write_to_file = .false.
call
read_input_parameters
(
na
,
nev
,
nblk
,
write_to_file
)
if
(
COMMAND_ARGUMENT_COUNT
()
/
=
0
)
then
write
(
error_unit
,
*
)
"This program does not support any command-line arguments"
stop
1
! if (COMMAND_ARGUMENT_COUNT() /= 0) then
! write(error_unit,*) "This program does not support any command-line arguments"
! stop 1
! endif
! nblk = 2
! na = 4000
! nev = 1500
! make sure na and nblk are even
if
(
mod
(
nblk
,
2
)
.ne.
0
)
then
nblk
=
nblk
-
1
endif
nblk
=
2
na
=
4000
nev
=
1500
! make sure na is even
if
(
mod
(
na
,
2
)
.ne.
0
)
then
na
=
na
-
1
endif
! make sure na is at least 34
if
(
na
.lt.
34
)
then
na
=
34
endif
!-------------------------------------------------------------------------------
! MPI Initialization
...
...
test/fortran_test_programs/test_elpa2_real_qr_default_kernel_single.F90
View file @
1d20472e
...
...
@@ -143,14 +143,32 @@ program test_real2_default_kernel_qr_decomposition_single_precision
gpuAvailable
=
.false.
if
(
COMMAND_ARGUMENT_COUNT
()
/
=
0
)
then
write
(
error_unit
,
*
)
"This program does not support any command-line arguments"
stop
1
!write_to_file = .false.
call
read_input_parameters
(
na
,
nev
,
nblk
,
write_to_file
)
!if (COMMAND_ARGUMENT_COUNT() /= 0) then
! write(error_unit,*) "This program does not support any command-line arguments"
! stop 1
!endif
! override nblk
! nblk = 2
! na = 4000
! nev = 1500
! make sure na and nblk are even
if
(
mod
(
nblk
,
2
)
.ne.
0
)
then
nblk
=
nblk
-
1
endif
nblk
=
2
na
=
4000
nev
=
1500
! make sure na is even
if
(
mod
(
na
,
2
)
.ne.
0
)
then
na
=
na
-
1
endif
! make sure na is at least 34
if
(
na
.lt.
34
)
then
na
=
34
endif
!-------------------------------------------------------------------------------
! MPI Initialization
...
...
test_project/Makefile.am
View file @
1d20472e
...
...
@@ -8,3 +8,6 @@ AM_LDFLAGS = $(ELPA_LIBS)
#bindir = $(abs_top_builddir)
bin_PROGRAMS
=
test_real
test_real_SOURCES
=
src/test_real.F90
distclean-local
:
-
rm
config-f90.h
test_project/configure.ac
View file @
1d20472e
AC_PREREQ([2.69])
AC_INIT([elpa_test_project],[2014.06.000], elpa-library@rzg.mpg.de)
AC_INIT([elpa_test_project],[2016.05.001], elpa-library@rzg.mpg.de)
elpaversion="2016.05.001"
AC_CONFIG_SRCDIR([src/test_real.F90])
AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
...
...
@@ -67,9 +68,9 @@ AC_ARG_WITH([openmp],
if test x"${with_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
AX_ELPA_OPENMP
elpa="elpa_openmp-
2014.06.001
"
elpa="elpa_openmp-
$elpaversion
"
else
elpa="elpa-
2014.06.001
"
elpa="elpa-
$elpaversion
"
fi
# Here comes the ELPA specific part
...
...
test_project/src/test_real.F90
View file @
1d20472e
...
...
@@ -3,7 +3,8 @@
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
...
...
@@ -16,7 +17,7 @@
!
!
! More information can be found here:
! http://elpa.
rzg
.mpg.de/
! http://elpa.
mpcdf
.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
...
...
@@ -39,7 +40,6 @@
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
!>
!> Fortran test program to demonstrate the use of
!> ELPA 1 real case library.
...
...
@@ -58,7 +58,7 @@
!> "output", which specifies that the EV's are written to
!> an ascii file.
!>
program
test_real
program
test_real
_example
!-------------------------------------------------------------------------------
! Standard eigenvalue problem - REAL version
...
...
@@ -75,25 +75,9 @@ program test_real
use
ELPA1
use
elpa_utilities
,
only
:
error_unit
#ifdef WITH_OPENMP
use
test_util
#endif
use
mod_read_input_parameters
use
mod_check_correctness
use
mod_setup_mpi
use
mod_blacs_infrastructure
use
mod_prepare_matrix
#ifdef HAVE_REDIRECT
use
redirect
#endif
#ifdef HAVE_DETAILED_TIMINGS
use
timings
#endif
use
elpa_mpi
use
iso_c_binding
implicit
none
include
'mpif.h'
!-------------------------------------------------------------------------------
! Please set system size parameters below!
...
...
@@ -101,172 +85,113 @@ program test_real
! nev: Number of eigenvectors to be calculated
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer
::
nblk
integer
na
,
nev
!-------------------------------------------------------------------------------
! Local Variables
integer
::
np_rows
,
np_cols
,
na_rows
,
na_cols
integer
::
myid
,
nprocs
,
my_prow
,
my_pcol
,
mpi_comm_rows
,
mpi_comm_cols
integer
::
i
,
mpierr
,
my_blacs_ctxt
,
sc_desc
(
9
),
info
,
nprow
,
npcol
integer
,
parameter
::
ik
=
C_INT32_T
integer
,
parameter
::
rk
=
C_DOUBLE
integer
,
external
::
numroc
integer
(
kind
=
ik
)
::
nblk
integer
(
kind
=
ik
)
::
na
,
nev
real
*
8
,
allocatable
::
a
(:,:),
z
(:,:),
tmp1
(:,:),
tmp2
(:,:),
as
(:,:),
ev
(:)
integer
(
kind
=
ik
)
::
np_rows
,
np_cols
,
na_rows
,
na_cols
integer
::
iseed
(
4096
)
! Random seed, size should be sufficient for every generator
integer
(
kind
=
ik
)
::
myid
,
nprocs
,
my_prow
,
my_pcol
,
mpi_comm_rows
,
mpi_comm_cols
integer
(
kind
=
ik
)
::
i
,
mpierr
,
my_blacs_ctxt
,
sc_desc
(
9
),
info
,
nprow
,
npcol
integer
,
external
::
numroc
integer
::
STATUS
#ifdef WITH_OPENMP
integer
::
omp_get_max_threads
,
required_mpi_thread_level
,
&
provided_mpi_thread_level
#endif
logical
::
write_to_file
logical
::
success
!-------------------------------------------------------------------------------
real
(
kind
=
rk
),
allocatable
::
a
(:,:),
z
(:,:),
ev
(:)
success
=
.true.
integer
(
kind
=
ik
)
::
iseed
(
4096
)
! Random seed, size should be sufficient for every generator
call
read_input_parameters
(
na
,
nev
,
nblk
,
write_to_file
)
integer
(
kind
=
ik
)
::
STATUS
logical
::
success
character
(
len
=
8
)
::
task_suffix
integer
(
kind
=
ik
)
::
j
!-------------------------------------------------------------------------------
! MPI Initialization
call
setup_mpi
(
myid
,
nprocs
)
if
(
write_to_file
)
then
if
(
myid
.eq.
0
)
print
*
,
"Writing output files"
endif
#ifdef HAVE_DETAILED_TIMINGS
! initialise the timing functionality
#ifdef HAVE_LIBPAPI
call
timer
%
measure_flops
(
.true.
)
#endif
call
timer
%
measure_allocated_memory
(
.true.
)
call
timer
%
measure_virtual_memory
(
.true.
)
call
timer
%
measure_max_allocated_memory
(
.true.
)
call
timer
%
set_print_options
(&
#ifdef HAVE_LIBPAPI
print_flop_count
=
.true.
,
&
print_flop_rate
=
.true.
,
&
#endif
print_allocated_memory
=
.true.
,
&
print_virtual_memory
=
.true.
,
&
print_max_allocated_memory
=
.true.
)
success
=
.true.
call
timer
%
enable
()
! default parameters
na
=
4000
nev
=
1500
nblk
=
16
call
timer
%
start
(
"program"
)
#endif
!-------------------------------------------------------------------------------
! Selection of number of processor rows/columns
! We try to set up the grid square-like, i.e. start the search for possible
! divisors of nprocs with a number next to the square root of nprocs
! and decrement it until a divisor is found.
STATUS
=
0
#ifdef WITH_OPENMP
if
(
myid
.eq.
0
)
then
print
*
,
"Threaded version of test program"
print
*
,
"Using "
,
omp_get_max_threads
(),
" threads"
print
*
,
" "
endif
#endif
call
MPI_BARRIER
(
MPI_COMM_WORLD
,
mpierr
)
#ifdef HAVE_REDIRECT
if
(
check_redirect_environment_variable
())
then
if
(
myid
.eq.
0
)
then
print
*
,
" "
print
*
,
"Redirection of mpi processes is used"
print
*
,
" "
if
(
create_directories
()
.ne.
1
)
then
write
(
error_unit
,
*
)
"Unable to create directory for stdout and stderr!"
stop
endif
endif
call
MPI_BARRIER
(
MPI_COMM_WORLD
,
mpierr
)
call
redirect_stdout
(
myid
)
endif
#endif
call
mpi_init
(
mpierr
)
call
mpi_comm_rank
(
mpi_comm_world
,
myid
,
mpierr
)
call
mpi_comm_size
(
mpi_comm_world
,
nprocs
,
mpierr
)
do
np_cols
=
NINT
(
SQRT
(
REAL
(
nprocs
))),
2
,
-1
if
(
mod
(
nprocs
,
np_cols
)
==
0
)
exit
if
(
mod
(
nprocs
,
np_cols
)
==
0
)
exit
enddo
! at the end of the above loop, nprocs is always divisible by np_cols
np_rows
=
nprocs
/
np_cols
if
(
myid
==
0
)
then
print
*
print
'(a)'
,
'Standard eigenvalue problem - REAL version'
print
*
print
'(3(a,i0))'
,
'Matrix size='
,
na
,
', Number of eigenvectors='
,
nev
,
', Block size='
,
nblk
print
'(3(a,i0))'
,
'Number of processor rows='
,
np_rows
,
', cols='
,
np_cols
,
', total='
,
nprocs
print
*
if
(
myid
==
0
)
then
print
*
print
'(a)'
,
'Standard eigenvalue problem - REAL version'
print
*
print
'(3(a,i0))'
,
'Matrix size='
,
na
,
', Number of eigenvectors='
,
nev
,
', Block size='
,
nblk
print
'(3(a,i0))'
,
'Number of processor rows='
,
np_rows
,
', cols='
,
np_cols
,
', total='
,
nprocs
print
*
endif
!-------------------------------------------------------------------------------
! Set up BLACS context and MPI communicators
!
! The BLACS context is only necessary for using Scalapack.
!
! For ELPA, the MPI communicators along rows/cols are sufficient,
! and the grid setup may be done in an arbitrary way as long as it is
! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
! process has a unique (my_prow,my_pcol) pair).