Commit 5afd4d63 authored by Andreas Marek

Merge branch 'master' into ELPA_KNL

parents a6235503 2fb06c49
......@@ -58,7 +58,8 @@ intel-double-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --disable-assumed-size
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-double-precision-nompi-noomp-cuda-jobs:
......@@ -72,7 +73,8 @@ intel-double-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
#gfortran-double-precision-mpi-noomp-jobs:
# tags:
......@@ -125,7 +127,22 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-mpi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
......@@ -170,7 +187,22 @@ intel-single-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-nompi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
#gfortran-single-precision-mpi-noomp-jobs:
......@@ -213,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
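# NB: TEST_FLAGS passes three arguments to the test programs: matrix size,
# number of eigenvectors (nev), and block size of the block-cyclic
# distribution; the "larger" jobs below simply use a bigger matrix.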
intel-double-precision-nompi-noomp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP" --with-mpi=no FC=ifort
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 50 32' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-assumed-size-jobs:
tags:
- cpu
......@@ -376,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 500 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-assumed-size-jobs:
tags:
- cpu
......
......@@ -26,6 +26,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_redist_band.F90 \
src/mod_pack_unpack_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
......@@ -63,6 +64,11 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -347,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_complex2.F90 \
test/Fortran/test_complex2_default.F90 \
test/Fortran/test_complex2_api.F90 \
test/Fortran/test_complex2_banded.F90 \
test/Fortran/test_complex.F90 \
test/Fortran/test_real2.F90 \
test/Fortran/test_real2_default.F90 \
......@@ -388,6 +395,7 @@ noinst_PROGRAMS = \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_banded@SUFFIX@ \
elpa_driver_real@SUFFIX@ \
elpa_driver_complex@SUFFIX@ \
elpa1_real_toeplitz@SUFFIX@ \
......@@ -553,6 +561,7 @@ EXTRA_elpa1_complex_invert_trm@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_he
elpa2_test_real@SUFFIX@_SOURCES = test/Fortran/test_real2.F90
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
......@@ -596,6 +605,12 @@ elpa2_test_complex_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_complex_banded@SUFFIX@_SOURCES = test/Fortran/test_complex2_banded.F90
elpa2_test_complex_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa_driver_real@SUFFIX@_SOURCES = test/Fortran/test_driver_real.F90
elpa_driver_real@SUFFIX@_LDADD = $(build_lib)
elpa_driver_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
......@@ -777,6 +792,7 @@ check_SCRIPTS = \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa2_test_complex_banded@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
elpa1_real_toeplitz@SUFFIX@.sh \
......@@ -913,7 +929,7 @@ CLEANFILES = \
elpa2_test* \
elpa2_real* \
elpa1_real* \
*.sh \
elpa*.sh \
*.i
clean-local:
......@@ -958,6 +974,11 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
......
......@@ -113,6 +113,7 @@ if test x"$with_mpi" = x"yes"; then
AC_MSG_ERROR([Could not compile an MPI Fortran program])
fi
fi
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
......@@ -250,7 +251,7 @@ if test x"${want_single_precision}" = x"yes" ; then
fi
dnl check whether one can compile with SSE gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
......@@ -655,7 +656,7 @@ if test x"${have_mkl}" = x"yes" ; then
else
dnl first check blas
AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no])
AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no])
AC_MSG_CHECKING([whether we can link a program with a blas lib])
AC_MSG_RESULT([${have_blas}])
......@@ -756,6 +757,23 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
dnl check whether BAND_TO_FULL_BLOCKING is set
use_band_to_full_blocking=yes
AC_MSG_CHECKING(whether BAND_TO_FULL_BLOCKING is requested)
AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--enable-band-to-full-blocking],
[build ELPA2 with blocking in band_to_full (default: enabled)])],
want_band_to_full_blocking="$enableval", want_band_to_full_blocking="yes")
AC_MSG_RESULT([${want_band_to_full_blocking}])
if test x"${want_band_to_full_blocking}" = x"no" ; then
use_band_to_full_blocking=no
fi
AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$use_band_to_full_blocking" = x"yes"])
if test x"${use_band_to_full_blocking}" = x"yes"; then
AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full])
fi
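The BAND_TO_FULL_BLOCKING symbol defined above is only a compile-time switch; the consuming side branches on it roughly as in this sketch (hypothetical routine name; per the AC_DEFINE comment, ELPA's real consumer is the Fortran trans_ev_band_to_full code path):

    #include "config.h"           /* may define BAND_TO_FULL_BLOCKING */

    static void back_transform(void)
    {
    #ifdef BAND_TO_FULL_BLOCKING
        /* blocked variant: apply groups of Householder vectors together */
    #else
        /* unblocked variant: apply the Householder vectors one at a time */
    #endif
    }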
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
......@@ -775,9 +793,28 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
dnl setup nvcc flags and use them in later tests
user_sets_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=value],
[use compute capability "value" for GPU version (default sm_35)])],
[user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
dnl sanity check whether compute capability setting by user is reasonable
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
dnl the user must set a value which starts with "sm_"
value=$(echo $withval | cut -c1-3)
if test x"${value}" = x"sm_" ; then
cuda_compute_capability=$withval
else
AC_MSG_ERROR([Unknown GPU compute capability set: ${withval}])
fi
fi
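dnl (e.g. --with-GPU-compute-capability=sm_60 passes this check; a bare "60" aborts configure)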
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -O2 -I$CUDA_INSTALL_PATH/include"
CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
......@@ -1245,6 +1282,20 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in
])
AC_MSG_CHECKING([if workaround for Intel's broken preprocessor is needed])
if test x"$FC" = x"mpiifort" ; then
need_manual_cpp=yes
fi
if test x"$FC" = x"ifort" ; then
need_manual_cpp=yes
fi
if test x"$need_manual_cpp" = x"yes" ; then
AC_MSG_RESULT([yes])
FC="\$(top_srcdir)/manual_cpp $FC"
else
AC_MSG_RESULT([no])
fi
AC_OUTPUT
if test "${can_compile_avx}" = "no" ; then
......
.TH "elpa_solve_evp_complex_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_complex_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
......@@ -55,7 +57,7 @@ use elpa2
#include <complex.h>
.br
.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -89,6 +91,9 @@ use elpa2
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
.TH "elpa_solve_evp_complex_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_complex_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
......@@ -55,41 +57,44 @@ use elpa2
#include <complex.h>
.br
.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
.RI "With the definintions of the input and output variables:"
.br
.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
.RI "int \fBna\fP: global dimension of quadratic matrix \fBa\fP to solve"
.br
.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated"
.br
.RI "complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
.br
.RI "int \fBnev\fP: number of eigenvalues to be computed; the first \fBnev\fP eigenvalules are calculated"
.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
.br
.RI "complex *\fBa\fP: pointer to locally distributed part of the matrix \fBa\fP. The local dimensions are \fBlda\fP x \fBmatrixCols\fP"
.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
.br
.RI "int \fBlda\fP: leading dimension of locally distributed matrix \fBa\fP"
.RI "complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
.br
.RI "float *\fBev\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvalues"
.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
.br
.RI "complex *\fBq\fP: pointer to memory containing on output the first \fBnev\fP computed eigenvectors"
.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions"
.br
.RI "int \fBldq\fP: leading dimension of matrix \fBq\fP which stores the eigenvectors"
.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
.br
.RI "int \fBnblk\fP: blocksize of block cyclic distributin, must be the same in both directions"
.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBelpa_get_communicators\fP(3)"
.br
.RI "int \fBmatrixCols\fP: number of columns of locally distributed matrices \fBa\fP and \fBq\fP"
.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBelpa_get_communicators\fP(3)"
.br
.RI "int \fBmpi_comm_rows\fP: communicator for communication in rows. Constructed with \fBelpa_get_communicators\fP(3)"
.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
.br
.RI "int \fBmpi_comm_cols\fP: communicator for communication in colums. Constructed with \fBelpa_get_communicators\fP(3)"
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br
.RI "int \fBmpi_comm_all\fP: communicator for all processes in the processor set involved in ELPA"
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
Solve the complex eigenvalue problem with the 2-stage solver. The ELPA communicators \fBmpi_comm_rows\fP and \fBmpi_comm_cols\fP are obtained with the \fBelpa_get_communicators\fP(3) function. The distributed quadratic matrix \fBa\fP has global dimensions \fBna\fP x \fBna\fP, and a local size \fBlda\fP x \fBmatrixCols\fP. The solver will compute the first \fBnev\fP eigenvalues, which will be stored on exit in \fBev\fP. The eigenvectors corresponding to the eigenvalues will be stored in \fBq\fP. All memory of the arguments must be allocated outside the call to the solver.
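A minimal C call sketch for the extended interface (hypothetical wrapper, sizes and kernel choice; it assumes the block-cyclic distribution and the row/column communicators from elpa_get_communicators(3) have already been set up):

    /* Sketch only: solve the first nev eigenpairs of a single-precision
     * complex matrix that is already banded.  All arguments are the
     * caller's responsibility, as described above. */
    #include <complex.h>
    #include "elpa.h"

    static int solve_banded(float complex *a, float *ev, float complex *q,
                            int na, int nev, int lda, int ldq, int nblk,
                            int matrixCols, int mpi_comm_rows,
                            int mpi_comm_cols, int mpi_comm_all, int kernel)
    {
        int useGPU    = 0;    /* do not use GPU kernels */
        int bandwidth = nblk; /* placeholder; pass -1 for a full matrix */

        /* returns 1 on success, 0 on failure */
        return elpa_solve_evp_complex_2stage_single(na, nev, a, lda, ev, q,
                                                    ldq, nblk, matrixCols,
                                                    mpi_comm_rows,
                                                    mpi_comm_cols,
                                                    mpi_comm_all, kernel,
                                                    useGPU, bandwidth);
    }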
......
.TH "elpa_solve_evp_real_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_real_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -49,13 +49,15 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
#include "elpa.h"
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -92,6 +94,8 @@ use elpa2
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
.TH "elpa_solve_evp_real_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_real_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -49,13 +49,15 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix "
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
#include "elpa.h"
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -92,6 +94,8 @@ use elpa2
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
#!/usr/bin/python
from __future__ import print_function
import os
import sys
import subprocess
# Wrapper around the Fortran compiler: runs cpp by hand on a single .F90
# file before invoking the real compiler (the workaround for Intel's broken
# preprocessor selected in configure.ac).  Set V=1 to echo the commands run.


def cpp_arg(arg):
    return arg.startswith("-I") or \
        arg.startswith("-D") or \
        arg.startswith("-U")


def check_call(args, **kwargs):
    if os.getenv("V") == "1":
        print(" ".join(args))
    return subprocess.check_call(args, **kwargs)


def check_call_redirect(args, filename=None, **kwargs):
    if os.getenv("V") == "1":
        print(" ".join(args), ">", filename)
    with open(filename, "wb") as fd:
        try:
            return subprocess.check_call(args, stdout=fd, **kwargs)
        except subprocess.CalledProcessError as e:
            os.remove(filename)
            raise SystemExit(e.returncode)


args = sys.argv[1:]

# list() so that len() and indexing below also work under Python 3
cpp_args = list(filter(cpp_arg, args))
files = list(filter(lambda q: q.endswith(".F90"), args))
args = list(filter(lambda q: not q.endswith(".F90"), args))

if len(files) > 1:
    raise Exception("Specify exactly one .F90 file")
elif len(files) == 0:
    # No .F90 file specified, execute program as-is
    os.execvp(args[0], args[0:])
elif len(files) == 1:
    file, = files
    tmp_filename = "manually_preprocessed_" + file.replace("/", "__")
    try:
        output = args.index("-o")
        outputname = args[output + 1]
        tmp_filename += "-" + outputname.replace("/", "__") + ".F90"
    except ValueError:
        pass

    # preprocess
    check_call_redirect(["cpp", "-P", "-traditional", "-Wall", "-Werror"]
                        + cpp_args + [file], filename=tmp_filename)
    # compile
    check_call(args + [tmp_filename])
    # cleanup
    os.remove(tmp_filename)
......@@ -77,7 +77,7 @@ module mod_check_for_gpu
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop
stop 1
endif
! make sure that all nodes have the same number of GPUs, otherwise
......@@ -108,7 +108,7 @@ module mod_check_for_gpu
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
......
......@@ -69,6 +69,17 @@
#ifdef WITH_GPU_VERSION
extern "C" {
int cudaThreadSynchronizeFromC() {
cudaError_t cuerr = cudaThreadSynchronize();
if (cuerr != cudaSuccess) {
errormessage("Error in cudaThreadSynchronize: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaSetDeviceFromC(int n) {
cudaError_t cuerr = cudaSetDevice(n);
......
......@@ -155,147 +155,92 @@ module ELPA1_COMPUTE
! real double precision first
#define DOUBLE_PRECISION_REAL 1
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef REALCASE
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#undef REALCASE
#endif
! double precision
#define DOUBLE_PRECISION_COMPLEX 1
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
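The block above instantiates the same template sources once per data type by setting a few preprocessor symbols, including the .X90 file, and clearing the symbols again. A self-contained C analogue of the idiom (hypothetical names; ELPA uses #include of template files such as elpa_transpose_vectors.X90 rather than a function-like macro):

    #include <stdio.h>

    /* Instantiate one summation routine per DATATYPE, mirroring how the
     * .X90 templates are compiled once per precision with DATATYPE /
     * REALCASE / DOUBLE_PRECISION defined beforehand and #undef'd after. */
    #define DEFINE_SUM(DATATYPE, NAME)                  \
        static DATATYPE NAME(const DATATYPE *v, int n)  \
        {                                               \
            DATATYPE s = 0;                             \
            for (int i = 0; i < n; i++)                 \
                s += v[i];                              \
            return s;                                   \
        }

    DEFINE_SUM(double, sum_real_double)  /* like DATATYPE REAL(kind=rk8) */
    DEFINE_SUM(float,  sum_real_single)  /* like DATATYPE REAL(kind=rk4) */

    int main(void)
    {
        double d[] = { 1.0, 2.5 };
        float  f[] = { 1.0f, 2.5f };
        printf("%g %g\n", sum_real_double(d, 2), (double)sum_real_single(f, 2));
        return 0;
    }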