Commit 1d20472e authored by Andreas Marek

Merge branch 'master' into ELPA_GPU

parents ed5d94d0 b31f44c9
before_script:
- export LANG=C
- module load impi/5.1.3 intel/16.0 gcc/4.9 mkl/11.3 autotools
- module load impi/5.1.3 intel/16.0 gcc/4.9 mkl/11.3 autotools pkg-config
- module list
- export MKL_INTEL_SCALAPACK_MPI_NO_OMP_BASELINE="-L$MKL_HOME/lib/intel64 -lmkl_scalapack_lp64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lmkl_blacs_intelmpi_lp64 -lpthread -lm"
- export MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP="$MKL_INTEL_SCALAPACK_MPI_NO_OMP_BASELINE -I$MKL_HOME/include/intel64/lp64"
......@@ -1225,3 +1225,25 @@ distcheck:
- ./autogen.sh
- ./configure --with-mpi=no
- make distcheck DISTCHECK_CONFIGURE_FLAGS="--with-mpi=no" TEST_FLAGS='1500 50 16'
# CI job: build and install the library out-of-tree, then configure, build and
# run the stand-alone test_project against that installation via pkg-config.
test_project:
  tags:
    - cpu
  script:
    # Out-of-tree library build. $PWD is expanded while inside build/, so the
    # install prefix is build/installdest.
    - mkdir build
    - pushd build
    - ../autogen.sh
    - ../configure SCALAPACK_LDFLAGS="$MKL_GFORTRAN_SCALAPACK_LDFLAGS_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_GFORTRAN_SCALAPACK_FCFLAGS_MPI_NO_OMP" FC=mpif90 --prefix=$PWD/installdest
    - make -j 8
    - make install
    - popd
    # Build the example project against the library installed above; the
    # pkg-config path is relative to test_project/build.
    - mkdir test_project/build
    - pushd test_project/build
    - ../autogen.sh
    - ../configure PKG_CONFIG_PATH=../../build/installdest/lib64/pkgconfig
    - make -j 8
    - ./test_real
    - make distclean
    - popd
    # Clean up inside build/: configure was run there, so that is where the
    # generated Makefile and the installdest directory live.
    - pushd build
    - make distclean
    - rm -rf installdest
    - popd
......@@ -501,15 +501,16 @@ TESTS = $(check_SCRIPTS)
echo '$(wrapper)./$^ $$TEST_FLAGS' > $@
chmod +x $@
# this one does not want any arguments
elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh:
echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@
chmod +x $@
## this one does not want any arguments
#elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@.sh:
# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition@SUFFIX@' > $@
# chmod +x $@
if WANT_SINGLE_PRECISION_REAL
elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh:
echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@
chmod +x $@
#elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@.sh:
# echo '$(wrapper)./elpa2_test_real_default_kernel_qr_decomposition_single_precision@SUFFIX@' > $@
# chmod +x $@
endif
# Preprocessed files (just used for manual inspection)
mod_precision.i: $(top_srcdir)/src/mod_precision.F90
......
define extract_interface
@echo "Generating $@...";
@grep -h "^ *$1" $^ | sed 's/^ *$1//;' > $@ || { rm $@; exit 1; }
@grep -h "^ *$1" $^ | sed 's/^ *$1//;' >> $@ || { rm $@; exit 1; }
endef
elpa test:
......@@ -19,8 +19,9 @@ elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa
test/shared_sources/generated.h: $(wildcard $(top_srcdir)/test/shared_sources/*.F90) | test/shared_sources
$(call extract_interface,!c>)
elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) | elpa
elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa
$(call extract_interface,!f>)
$(call extract_interface,#!f>)
generated_headers= config-f90.h elpa/elpa_generated.h test/shared_sources/generated.h elpa/elpa_generated_fortran_interfaces.h
generated-headers: $(generated_headers)
......@@ -239,7 +239,7 @@ contains
endif
if (useQRActual) then
if (mod(na,nblk) .ne. 0) then
if (mod(na,2) .ne. 0) then
if (wantDebug) then
write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
......
......@@ -54,8 +54,8 @@
# distributed along with the original code in the file "COPYING".
#
# --------------------------------------------------------------------------------------------------
.globl double_hh_trafo_double_
.globl single_hh_trafo_complex_double_
.globl double_hh_trafo_double
.globl single_hh_trafo_complex_double
.text
#-------------------------------------------------------------------------------
......@@ -382,8 +382,19 @@
# parameter 6: %r9 : ldh
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#!f> interface
#!f> subroutine double_hh_trafo_double(q, hh, nb, nq, ldq, ldh) bind(C,name="double_hh_trafo_double")
#!f> use, intrinsic :: iso_c_binding
#!f> integer(kind=c_int) :: nb, nq, ldq, ldh
#!f> type(c_ptr), value :: q
#!f> real(kind=c_double) :: hh(nb,6)
#!f> end subroutine
#!f> end interface
#!f>#endif
.align 16,0x90
double_hh_trafo_double_:
double_hh_trafo_double:
# Get integer parameters into corresponding registers
......@@ -698,8 +709,18 @@ return1:
# parameter 5: %r8 : ldq
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#!f> interface
#!f> subroutine single_hh_trafo_complex_double(q, hh, nb, nq, ldq) bind(C,name="single_hh_trafo_complex_double")
#!f> use, intrinsic :: iso_c_binding
#!f> integer(kind=c_int) :: nb, nq, ldq
#!f> complex(kind=c_double) :: q(*)
#!f> complex(kind=c_double) :: hh(nb,2)
#!f> end subroutine
#!f> end interface
#!f>#endif
.align 16,0x90
single_hh_trafo_complex_double_:
single_hh_trafo_complex_double:
# Get integer parameters into corresponding registers
......
......@@ -41,8 +41,8 @@
#
# Author: Andreas Marek, MPCDF
.globl double_hh_trafo_single_
.globl single_hh_trafo_complex_single_
.globl double_hh_trafo_single
.globl single_hh_trafo_complex_single
.text
#-------------------------------------------------------------------------------
......@@ -391,8 +391,20 @@
# parameter 6: %r9 : ldh
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_REAL_SSE_ASSEMBLY_KERNEL
#!f>#ifdef WANT_SINGLE_PRECISION_REAL
#!f> interface
#!f> subroutine double_hh_trafo_single(q, hh, nb, nq, ldq, ldh) bind(C,name="double_hh_trafo_single")
#!f> use, intrinsic :: iso_c_binding
#!f> integer(kind=c_int) :: nb, nq, ldq, ldh
#!f> type(c_ptr), value :: q
#!f> real(kind=c_float) :: hh(nb,6)
#!f> end subroutine
#!f> end interface
#!f>#endif
#!f>#endif
.align 16,0x90
double_hh_trafo_single_:
double_hh_trafo_single:
# Get integer parameters into corresponding registers
......@@ -716,8 +728,21 @@ return1:
# parameter 5: %r8 : ldq
#
#-------------------------------------------------------------------------------
#!f>#ifdef WITH_COMPLEX_SSE_ASSEMBLY_KERNEL
#!f>#ifdef WANT_SINGLE_PRECISION_COMPLEX
#!f> interface
#!f> subroutine single_hh_trafo_complex_single(q, hh, nb, nq, ldq) bind(C,name="single_hh_trafo_complex_single")
#!f> use, intrinsic :: iso_c_binding
#!f> integer(kind=c_int) :: nb, nq, ldq
#!f> complex(kind=c_float) :: q(*)
#!f> complex(kind=c_float) :: hh(nb,2)
#!f> end subroutine
#!f> end interface
#!f>#endif
#!f>#endif
.align 16,0x90
single_hh_trafo_complex_single_:
single_hh_trafo_complex_single:
# Get integer parameters into corresponding registers
......
......@@ -748,6 +748,7 @@ subroutine qr_tmerge_pdlarfb_1dcomm_double(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,basei
else
! do not calculate parts for T merge as there is nothing to merge
mergeoffset = 0
updateoffset = 0
tgenoffset = updateoffset + updatesize
......
......@@ -88,7 +88,7 @@ module compute_hh_trafo_complex
#endif
use iso_c_binding
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -399,7 +399,7 @@ module compute_hh_trafo_complex
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
use iso_c_binding
......
......@@ -103,7 +103,7 @@ module compute_hh_trafo_real
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -282,10 +282,10 @@ module compute_hh_trafo_real
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_double(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
call double_hh_trafo_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, &
stripe_width, nbw)
#else
call double_hh_trafo_double(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
call double_hh_trafo_double(c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, &
stripe_width, nbw)
#endif
enddo
......@@ -682,7 +682,7 @@ module compute_hh_trafo_real
use timings
#endif
#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
use kernel_interfaces
#endif
implicit none
......@@ -860,10 +860,10 @@ module compute_hh_trafo_real
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_single(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
call double_hh_trafo_single(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, &
stripe_width, nbw)
#else
call double_hh_trafo_single(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
call double_hh_trafo_single(c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, &
stripe_width, nbw)
#endif
enddo
......
......@@ -44,7 +44,6 @@
#include "config-f90.h"
module elpa_mpi
use precision
#ifndef WITH_MPI
use elpa_mpi_stubs
#else
......
......@@ -142,15 +142,31 @@ program test_real2_default_kernel_qr_decomposition_double_precision
successELPA = .true.
gpuAvailable = .false.
!write_to_file = .false.
call read_input_parameters(na, nev, nblk, write_to_file)
if (COMMAND_ARGUMENT_COUNT() /= 0) then
write(error_unit,*) "This program does not support any command-line arguments"
stop 1
! if (COMMAND_ARGUMENT_COUNT() /= 0) then
! write(error_unit,*) "This program does not support any command-line arguments"
! stop 1
! endif
! nblk = 2
! na = 4000
! nev = 1500
! make sure nblk is even
if (mod(nblk,2 ) .ne. 0) then
nblk = nblk - 1
endif
nblk = 2
na = 4000
nev = 1500
! make sure na is even
if (mod(na,2) .ne. 0) then
na = na - 1
endif
! make sure na is at least 34
if (na .lt. 34) then
na = 34
endif
!-------------------------------------------------------------------------------
! MPI Initialization
......
......@@ -143,14 +143,32 @@ program test_real2_default_kernel_qr_decomposition_single_precision
gpuAvailable = .false.
if (COMMAND_ARGUMENT_COUNT() /= 0) then
write(error_unit,*) "This program does not support any command-line arguments"
stop 1
!write_to_file = .false.
call read_input_parameters(na, nev, nblk, write_to_file)
!if (COMMAND_ARGUMENT_COUNT() /= 0) then
! write(error_unit,*) "This program does not support any command-line arguments"
! stop 1
!endif
! override nblk
! nblk = 2
! na = 4000
! nev = 1500
! make sure nblk is even
if (mod(nblk,2 ) .ne. 0) then
nblk = nblk - 1
endif
nblk = 2
na = 4000
nev = 1500
! make sure na is even
if (mod(na,2) .ne. 0) then
na = na - 1
endif
! make sure na is at least 34
if (na .lt. 34) then
na = 34
endif
!-------------------------------------------------------------------------------
! MPI Initialization
......
......@@ -8,3 +8,6 @@ AM_LDFLAGS = $(ELPA_LIBS)
#bindir = $(abs_top_builddir)
bin_PROGRAMS = test_real
test_real_SOURCES = src/test_real.F90
# Extra distclean step: remove config-f90.h from the build directory.
# The leading '-' keeps this best-effort (a failure does not abort distclean);
# '-f' avoids a spurious error message when the file does not exist.
distclean-local:
	-rm -f config-f90.h
AC_PREREQ([2.69])
AC_INIT([elpa_test_project],[2014.06.000], elpa-library@rzg.mpg.de)
AC_INIT([elpa_test_project],[2016.05.001], elpa-library@rzg.mpg.de)
elpaversion="2016.05.001"
AC_CONFIG_SRCDIR([src/test_real.F90])
AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
......@@ -67,9 +68,9 @@ AC_ARG_WITH([openmp],
if test x"${with_openmp}" = x"yes"; then
AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
AX_ELPA_OPENMP
elpa="elpa_openmp-2014.06.001"
elpa="elpa_openmp-$elpaversion"
else
elpa="elpa-2014.06.001"
elpa="elpa-$elpaversion"
fi
# Here comes the ELPA specific part
......
......@@ -3,7 +3,8 @@
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
......@@ -16,7 +17,7 @@
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
......@@ -39,7 +40,6 @@
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
!>
!> Fortran test program that demonstrates the use of
!> ELPA 1 real case library.
......@@ -58,7 +58,7 @@
!> "output", which specifies that the EV's are written to
!> an ascii file.
!>
program test_real
program test_real_example
!-------------------------------------------------------------------------------
! Standard eigenvalue problem - REAL version
......@@ -75,25 +75,9 @@ program test_real
use ELPA1
use elpa_utilities, only : error_unit
#ifdef WITH_OPENMP
use test_util
#endif
use mod_read_input_parameters
use mod_check_correctness
use mod_setup_mpi
use mod_blacs_infrastructure
use mod_prepare_matrix
#ifdef HAVE_REDIRECT
use redirect
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use elpa_mpi
use iso_c_binding
implicit none
include 'mpif.h'
!-------------------------------------------------------------------------------
! Please set system size parameters below!
......@@ -101,172 +85,113 @@ program test_real
! nev: Number of eigenvectors to be calculated
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer :: np_rows, np_cols, na_rows, na_cols
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, parameter :: ik = C_INT32_T
integer, parameter :: rk = C_DOUBLE
integer, external :: numroc
integer(kind=ik) :: nblk
integer(kind=ik) :: na, nev
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: success
!-------------------------------------------------------------------------------
real(kind=rk), allocatable :: a(:,:), z(:,:), ev(:)
success = .true.
integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator
call read_input_parameters(na, nev, nblk, write_to_file)
integer(kind=ik) :: STATUS
logical :: success
character(len=8) :: task_suffix
integer(kind=ik) :: j
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
if (write_to_file) then
if (myid .eq. 0) print *,"Writing output files"
endif
#ifdef HAVE_DETAILED_TIMINGS
! initialise the timing functionality
#ifdef HAVE_LIBPAPI
call timer%measure_flops(.true.)
#endif
call timer%measure_allocated_memory(.true.)
call timer%measure_virtual_memory(.true.)
call timer%measure_max_allocated_memory(.true.)
call timer%set_print_options(&
#ifdef HAVE_LIBPAPI
print_flop_count=.true., &
print_flop_rate=.true., &
#endif
print_allocated_memory = .true. , &
print_virtual_memory=.true., &
print_max_allocated_memory=.true.)
success = .true.
call timer%enable()
! default parameters
na = 4000
nev = 1500
nblk = 16
call timer%start("program")
#endif
!-------------------------------------------------------------------------------
! Selection of number of processor rows/columns
! We try to set up the grid square-like, i.e. start the search for possible
! divisors of nprocs with a number next to the square root of nprocs
! and decrement it until a divisor is found.
STATUS = 0
#ifdef WITH_OPENMP
if (myid .eq. 0) then
print *,"Threaded version of test program"
print *,"Using ",omp_get_max_threads()," threads"
print *," "
endif
#endif
call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
#ifdef HAVE_REDIRECT
if (check_redirect_environment_variable()) then
if (myid .eq. 0) then
print *," "
print *,"Redirection of mpi processes is used"
print *," "
if (create_directories() .ne. 1) then
write(error_unit,*) "Unable to create directory for stdout and stderr!"
stop
endif
endif
call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
call redirect_stdout(myid)
endif
#endif
call mpi_init(mpierr)
call mpi_comm_rank(mpi_comm_world,myid,mpierr)
call mpi_comm_size(mpi_comm_world,nprocs,mpierr)
do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
if(mod(nprocs,np_cols) == 0 ) exit
if(mod(nprocs,np_cols) == 0 ) exit
enddo
! at the end of the above loop, nprocs is always divisible by np_cols
np_rows = nprocs/np_cols
if(myid==0) then
print *
print '(a)','Standard eigenvalue problem - REAL version'
print *
print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
print *
if (myid==0) then
print *
print '(a)','Standard eigenvalue problem - REAL version'
print *
print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
print *
endif
!-------------------------------------------------------------------------------
! Set up BLACS context and MPI communicators
!
! The BLACS context is only necessary for using Scalapack.
!
! For ELPA, the MPI communicators along rows/cols are sufficient,
! and the grid setup may be done in an arbitrary way as long as it is
! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
! process has a unique (my_prow,my_pcol) pair).
call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, &
nprow, npcol, my_prow, my_pcol)
! initialise BLACS
my_blacs_ctxt = mpi_comm_world
call BLACS_Gridinit(my_blacs_ctxt, 'C', np_rows, np_cols)
call BLACS_Gridinfo(my_blacs_ctxt, nprow, npcol, my_prow, my_pcol)
if (myid==0) then
print '(a)','| Past BLACS_Gridinfo.'
end if
! All ELPA routines need MPI communicators for communicating within
! rows or columns of processes, these are set in get_elpa_row_col_comms.
mpierr = get_elpa_row_col_comms(mpi_comm_world, my_prow, my_pcol, &
mpi_comm_rows, mpi_comm_cols)
if (myid==0) then
print '(a)','| Past split communicator setup for rows and columns.'
end if