Commit 5afd4d63 authored by Andreas Marek

Merge branch 'master' into ELPA_KNL

parents a6235503 2fb06c49
......@@ -58,7 +58,8 @@ intel-double-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --disable-assumed-size
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-double-precision-nompi-noomp-cuda-jobs:
......@@ -72,7 +73,8 @@ intel-double-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
#gfortran-double-precision-mpi-noomp-jobs:
# tags:
......@@ -125,7 +127,22 @@ intel-single-precision-mpi-noomp-cuda-jobs:
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-mpi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
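# (the three TEST_FLAGS values are handed to the test programs as matrix size,
# number of eigenvectors, and block size -- here a larger 1500x1500 problem)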
#intel-single-precision-mpi-noomp-cuda-runtime-choice-jobs:
# tags:
......@@ -170,7 +187,22 @@ intel-single-precision-nompi-noomp-cuda-jobs:
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128' || { cat test-suite.log; exit 1; }
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='150 50 128'
- cat test-suite.log
intel-single-precision-nompi-noomp-cuda-larger-jobs:
tags:
- gpu
script:
- module unload gcc
- module load gcc/4.9 cuda
- module list
- ./autogen.sh
- ./configure FC=ifort SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP -L$CUDA_HOME/lib64 -lcublas -I$CUDA_HOME/include" CFLAGS="-O2" CXXFLAGS="-O2" FCFLAGS="-O1" --enable-gpu-support --with-cuda-path=$CUDA_HOME/ --enable-single-precision --with-mpi=0
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- /home/elpa/bin/reserve_timeslot make check TEST_FLAGS='1500 500 128'
- cat test-suite.log
#gfortran-single-precision-mpi-noomp-jobs:
......@@ -213,6 +245,16 @@ intel-double-precision-nompi-noomp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_NO_MPI_NO_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_NO_MPI_NO_OMP" --with-mpi=no FC=ifort
- make -j 8
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1000 50 32' || { cat test-suite.log; exit 1; }
intel-double-precision-nompi-noomp-assumed-size-jobs:
tags:
- cpu
......@@ -376,6 +418,17 @@ intel-single-precision-mpi-openmp-jobs:
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='150 50 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-larger-jobs:
tags:
- cpu
script:
- ./autogen.sh
- ./configure CFLAGS="-O3 -mavx" CXXFLAGS="-O3 -mavx" FCFLAGS="-O3 -mavx" SCALAPACK_LDFLAGS="$MKL_INTEL_SCALAPACK_LDFLAGS_MPI_OMP" SCALAPACK_FCFLAGS="$MKL_INTEL_SCALAPACK_FCFLAGS_MPI_OMP" --enable-openmp --enable-single-precision
- make -j 8
- export OMP_NUM_THREADS=2
- export LD_LIBRARY_PATH=$MKL_HOME/lib/intel64:$LD_LIBRARY_PATH
- make check TEST_FLAGS='1500 500 16' || { cat test-suite.log; exit 1; }
intel-single-precision-mpi-openmp-assumed-size-jobs:
tags:
- cpu
......
......@@ -26,6 +26,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/mod_precision.F90 \
src/mod_mpi.F90 \
src/mod_mpi_stubs.F90 \
src/mod_redist_band.F90 \
src/mod_pack_unpack_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
......@@ -63,6 +64,11 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/precision_macros.h
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -347,6 +353,7 @@ dist_files_DATA = \
test/Fortran/test_complex2.F90 \
test/Fortran/test_complex2_default.F90 \
test/Fortran/test_complex2_api.F90 \
test/Fortran/test_complex2_banded.F90 \
test/Fortran/test_complex.F90 \
test/Fortran/test_real2.F90 \
test/Fortran/test_real2_default.F90 \
......@@ -388,6 +395,7 @@ noinst_PROGRAMS = \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
elpa2_test_complex_banded@SUFFIX@ \
elpa_driver_real@SUFFIX@ \
elpa_driver_complex@SUFFIX@ \
elpa1_real_toeplitz@SUFFIX@ \
......@@ -553,6 +561,7 @@ EXTRA_elpa1_complex_invert_trm@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_he
elpa2_test_real@SUFFIX@_SOURCES = test/Fortran/test_real2.F90
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
......@@ -596,6 +605,12 @@ elpa2_test_complex_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_complex_banded@SUFFIX@_SOURCES = test/Fortran/test_complex2_banded.F90
elpa2_test_complex_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_complex_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa_driver_real@SUFFIX@_SOURCES = test/Fortran/test_driver_real.F90
elpa_driver_real@SUFFIX@_LDADD = $(build_lib)
elpa_driver_real@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
......@@ -777,6 +792,7 @@ check_SCRIPTS = \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa2_test_complex_banded@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
elpa1_real_toeplitz@SUFFIX@.sh \
......@@ -913,7 +929,7 @@ CLEANFILES = \
elpa2_test*\
elpa2_real* \
elpa1_real* \
*.sh \
elpa*.sh \
*.i
clean-local:
......@@ -958,6 +974,11 @@ EXTRA_DIST = \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/sanity.X90 \
src/elpa_cholesky_template.X90 \
src/elpa_invert_trm.X90 \
src/elpa_multiply_a_b.X90 \
src/elpa_solve_tridi.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
src/ev_tridi_band_gpu_c_v2_real_template.Xcu \
......
......@@ -113,6 +113,7 @@ if test x"$with_mpi" = x"yes"; then
AC_MSG_ERROR([Could not compile an MPI Fortran program])
fi
fi
if test x"${enable_openmp}" = x"yes"; then
AX_ELPA_OPENMP
if test "$ac_cv_prog_fc_openmp" = unsupported; then
......@@ -250,7 +251,7 @@ if test x"${want_single_precision}" = x"yes" ; then
fi
dnl check whether one can compile with sse-gcc intrinsics
AC_MSG_CHECKING(whether we can compile SSE with gcc intrinsics in C)
AC_MSG_CHECKING(whether we can compile SSE3 with gcc intrinsics in C)
AC_COMPILE_IFELSE([AC_LANG_SOURCE([
#include <x86intrin.h>
int main(int argc, char **argv){
......@@ -655,7 +656,7 @@ if test x"${have_mkl}" = x"yes" ; then
else
dnl first check blas
AC_SEARCH_LIBS([dgemm],[blas],[have_blas=yes],[have_blas=no])
AC_SEARCH_LIBS([dgemm],[openblas satlas blas],[have_blas=yes],[have_blas=no])
AC_MSG_CHECKING([whether we can link a program with a blas lib])
AC_MSG_RESULT([${have_blas}])
......@@ -756,6 +757,23 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can query environment variables])
fi
dnl check whether BAND_TO_FULL_BLOCKING is set
use_band_to_full_blocking=yes
AC_MSG_CHECKING(whether BAND_TO_FULL_BLOCKING is requested)
AC_ARG_ENABLE(band-to-full-blocking,[AS_HELP_STRING([--enable-band-to-full-blocking],
[build ELPA2 with blocking in band_to_full (default: enabled)])],
want_band_to_full_blocking="yes", want_to_full_blocking="no")
AC_MSG_RESULT([${want_band_to_full_blocking}])
if test x"${enableval}" = x"no" ; then
use_band_to_full_blocking=no
fi
AM_CONDITIONAL([BAND_TO_FULL_BLOCKING],[test x"$use_band_to_full_blocking" = x"yes"])
if test x"${use_band_to_full_blocking}" = x"yes"; then
AC_DEFINE([BAND_TO_FULL_BLOCKING], [1], [use blocking in trans_ev_band_to_full])
fi
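For illustration, a minimal self-contained sketch of how the resulting config.h symbol would typically be consumed; the manual #define and the messages are placeholders for this demo, not the actual ELPA2 code paths:

    #include <stdio.h>

    /* BAND_TO_FULL_BLOCKING normally comes from config.h via the AC_DEFINE
       above; it is defined by hand here only to keep the demo self-contained */
    #define BAND_TO_FULL_BLOCKING 1

    int main(void) {
    #ifdef BAND_TO_FULL_BLOCKING
        puts("trans_ev_band_to_full: blocked variant selected");
    #else
        puts("trans_ev_band_to_full: unblocked variant selected");
    #endif
        return 0;
    }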
dnl check whether GPU version is requested
#CUDA_INSTALL_PATH="/usr/local/cuda/"
......@@ -775,9 +793,28 @@ AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
dnl setup nvcc flags and use them in later tests
user_sets_gpu_compute_capability="no"
AC_MSG_CHECKING(whether a GPU compute capability is specified)
AC_ARG_WITH([GPU-compute-capability],
[AS_HELP_STRING([--with-GPU-compute-capability=value],
[use compute capability "value" for GPU version (default sm_35)])],
[user_sets_gpu_compute_capability="yes"],[cuda_compute_capability="sm_35"])
AC_MSG_RESULT([${user_sets_gpu_compute_capability}])
dnl sanity check whether the compute capability set by the user is reasonable
if test x"${user_sets_gpu_compute_capability}" = x"yes" ; then
dnl the user must set a value which starts with "sm_"
value=$(echo $withval | cut -c1-3)
if test x"${value}" = x"sm_" ; then
cuda_compute_capability=$withval
else
AC_MSG_ERROR([Unknown GPU compute capability set: ${withval}])
fi
fi
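dnl (e.g. "./configure --with-GPU-compute-capability=sm_60" passes this check;
dnl the value is then inserted into CUDA_CFLAGS as "-arch sm_60" below)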
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -O2 -I$CUDA_INSTALL_PATH/include"
CUDA_CFLAGS="$CUDA_CFLAGS -arch $cuda_compute_capability -O2 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
......@@ -1245,6 +1282,20 @@ AC_CONFIG_FILES([
${PKG_CONFIG_FILE}:elpa.pc.in
])
AC_MSG_CHECKING([if workaround for Intel's broken preprocessor is needed])
if test x"$FC" = x"mpiifort" ; then
need_manual_cpp=yes
fi
if test x"$FC" = x"ifort" ; then
need_manual_cpp=yes
fi
if test x"$need_manual_cpp" = x"yes" ; then
AC_MSG_RESULT([yes])
FC="\$(top_srcdir)/manual_cpp $FC"
else
AC_MSG_RESULT([no])
fi
AC_OUTPUT
if test "${can_compile_avx}" = "no" ; then
......
.TH "elpa_solve_evp_complex_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_complex_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
......@@ -55,7 +57,7 @@ use elpa2
#include <complex.h>
.br
.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -89,6 +91,9 @@ use elpa2
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
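For orientation, a hedged sketch of a call against the C interface documented above, exercising the new bandwidth argument. The surrounding MPI/ScaLAPACK setup is assumed to exist already, and the kernel constant ELPA2_COMPLEX_KERNEL_GENERIC is an assumption for illustration:

    #include <complex.h>
    #include "elpa.h"

    /* hypothetical call site: all distribution parameters are assumed to be
       set up elsewhere (block-cyclic layout, ELPA communicators, ...) */
    int solve_banded_complex(int na, int nev, double complex *a, int lda,
                             double *ev, double complex *q, int ldq,
                             int nblk, int matrixCols, int mpi_comm_rows,
                             int mpi_comm_cols, int mpi_comm_all,
                             int bandwidth /* -1 if 'a' is not banded */)
    {
        int success = elpa_solve_evp_complex_2stage_double(
            na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
            mpi_comm_rows, mpi_comm_cols, mpi_comm_all,
            ELPA2_COMPLEX_KERNEL_GENERIC /* assumed kernel constant */,
            0 /* useGPU */, bandwidth);
        return success; /* 1 on success, 0 on failure */
    }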
.TH "elpa_solve_evp_complex_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_complex_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -47,6 +47,8 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
......@@ -55,7 +57,7 @@ use elpa2
#include <complex.h>
.br
.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -89,6 +91,9 @@ use elpa2
.RI "int \fBTHIS_ELPA_COMPLEX_KERNEL\fp: choose the compute kernel for 2-stage solver"
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
.TH "elpa_solve_evp_real_2stage_double" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_real_2stage_double" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -49,13 +49,15 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
#include "elpa.h"
.br
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -92,6 +94,8 @@ use elpa2
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
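Analogously, a hedged sketch for the real double-precision routine above, passing bandwidth = -1 to state that the matrix is not banded; ELPA2_REAL_KERNEL_GENERIC is again an assumed constant, not taken from this diff:

    #include "elpa.h"

    /* hypothetical call site; the distributed matrix a and the communicators
       are assumed to be prepared elsewhere */
    int solve_full_real(int na, int nev, double *a, int lda, double *ev,
                        double *q, int ldq, int nblk, int matrixCols,
                        int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all)
    {
        return elpa_solve_evp_real_2stage_double(
            na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
            mpi_comm_rows, mpi_comm_cols, mpi_comm_all,
            ELPA2_REAL_KERNEL_GENERIC /* assumed kernel constant */,
            0 /* useQR */, 0 /* useGPU */,
            -1 /* bandwidth: matrix is not banded */);
    }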
.TH "elpa_solve_evp_real_2stage_single" 3 "Tue Oct 18 2016" "ELPA" \" -*- nroff -*-
.TH "elpa_solve_evp_real_2stage_single" 3 "Wed Jan 15 2017" "ELPA" \" -*- nroff -*-
.ad l
.nh
.SH NAME
......@@ -12,7 +12,7 @@ use elpa1
use elpa2
.br
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
.br
.RI " "
.br
......@@ -49,13 +49,15 @@ use elpa2
.br
.RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix "
.br
.RI "logical \fBsuccess\fP: return value indicating success or failure"
.br
.SS C INTERFACE
#include "elpa.h"
.br
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
.br
.RI " "
.br
......@@ -92,6 +94,8 @@ use elpa2
.br
.RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
.br
.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
.br
.RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
.SH DESCRIPTION
......
#!/usr/bin/python
# Wrapper that manually runs cpp over a Fortran source before handing it to the
# real compiler, to work around Intel's broken Fortran preprocessor.
# Invoked as: manual_cpp <compiler> [flags...] file.F90 [-o output]
from __future__ import print_function
import os
import sys
import subprocess
def cpp_arg(arg):
return arg.startswith("-I") or \
arg.startswith("-D") or \
arg.startswith("-U")
def check_call(args, **kwargs):
if os.getenv("V") == "1":
print(" ".join(args))
return subprocess.check_call(args, **kwargs)
def check_call_redirect(args, filename=None, **kwargs):
if os.getenv("V") == "1":
print(" ".join(args), ">", filename)
with open(filename, "wb") as fd:
try:
return subprocess.check_call(args, stdout=fd, **kwargs)
except subprocess.CalledProcessError as e:
os.remove(filename)
raise SystemExit(e.returncode)
args = sys.argv[1:]
# materialize as lists (not lazy filter objects) so len() works on Python 3 too
cpp_args = [arg for arg in args if cpp_arg(arg)]
files = [arg for arg in args if arg.endswith(".F90")]
args = [arg for arg in args if not arg.endswith(".F90")]
if len(files) > 1:
raise Exception("Specify exactly one .F90 file")
elif len(files) == 0:
# No .F90 file specified, execute program as-is
os.execvp(args[0], args[0:])
elif len(files) == 1:
file, = files
tmp_filename = "manually_preprocessed_" + file.replace("/", "__")
try:
output = args.index("-o")
outputname = args[output + 1]
tmp_filename += "-" + outputname.replace("/", "__") + ".F90"
except ValueError:
pass
# preprocess
check_call_redirect(["cpp","-P", "-traditional", "-Wall", "-Werror"] + cpp_args + [file], filename=tmp_filename)
# compile
check_call(args + [tmp_filename])
# cleanup
os.remove(tmp_filename)
......@@ -77,7 +77,7 @@ module mod_check_for_gpu
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop
stop 1
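! (a nonzero STOP code yields a nonzero process exit status, so the
! failure is detectable by the test harness)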
endif
! make sure that all nodes have the same number of GPUs, otherwise
......@@ -108,7 +108,7 @@ module mod_check_for_gpu
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop
stop 1
endif
if (wantDebugMessage) then
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
......
......@@ -69,6 +69,17 @@
#ifdef WITH_GPU_VERSION
extern "C" {
int cudaThreadSynchronizeFromC() {
cudaError_t cuerr = cudaThreadSynchronize();
if (cuerr != cudaSuccess) {
errormessage("Error in cudaThreadSynchronize: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
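  /* note: these wrappers translate cudaError_t into 1 (success) / 0 (failure),
     so a caller can treat the return value like a logical */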
int cudaSetDeviceFromC(int n) {
cudaError_t cuerr = cudaSetDevice(n);
......
......@@ -155,147 +155,92 @@ module ELPA1_COMPUTE
! real double precision first
#define DOUBLE_PRECISION_REAL 1
#define DATATYPE REAL(kind=rk8)
#define BYTESIZE 8
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DOUBLE_PRECISION_REAL
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef DOUBLE_PRECISION
#undef REALCASE
! single precision
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#define DATATYPE REAL(kind=rk4)
#define BYTESIZE 4
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef REALCASE
#undef SINGLE_PRECISION
#undef REALCASE
#endif
! double precision
#define DOUBLE_PRECISION_COMPLEX 1
#define DATATYPE COMPLEX(kind=ck8)
#define BYTESIZE 16
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
#undef DOUBLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_REAL
#define DATATYPE COMPLEX(kind=ck4)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "elpa_transpose_vectors.X90"
#include "elpa_reduce_add_vectors.X90"
#undef DATATYPE
#undef BYTESIZE
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif /* WANT_SINGLE_PRECISION_COMPLEX */
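! Each template include above is instantiated once per number type: the
! preceding #define block selects case and precision, precision_macros.h
! renames the generic symbols accordingly, and the matching #undef block
! resets the state for the next instantiation.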
! real double precision
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#define REALCASE 1
#define DOUBLE_PRECISION 1
! remove? :
#undef COMPLEXCASE
#include "precision_macros.h"
#include "elpa1_compute_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef REALCASE
#undef DOUBLE_PRECISION