Commit 3ea32c22 authored by Andreas Marek

Merge NVIDIA GPU sources by hand

parent 9ad68bd0
......@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if WITH_GPU_VERSION
# Sources that are only added to the library when the GPU build is enabled:
# the Fortran interface modules plus the CUDA (.cu) translation units.
  libelpa@SUFFIX@_la_SOURCES += \
    src/interface_cuda.F90 \
    src/interface_c_kernel.F90 \
    src/ev_tridi_band_gpu_c_v2.cu \
    src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif
......@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
# Suffix rule: build a libtool object (.lo) from a CUDA source (.cu).
# The compile is driven through the package's own libtool script, $(LIBTOOL),
# rather than whatever "libtool" happens to be first in PATH (which may be
# missing, a different version, or an unrelated tool on some systems).
# nvcc_wrap is handed the compiler to use through the NVCC environment variable.
.cu.lo:
	NVCC="$(NVCC)" $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
......@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
......@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = \
elpa-generated.h \
elpa/elpa-generated.h \
elpa1_test_real.sh \
elpa1_test_complex.sh \
elpa2_test_real.sh \
......
......@@ -83,6 +83,7 @@ AC_PROG_CXX
dnl variables needed for the tests
N="0"
dnl these test will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs,
......@@ -343,7 +344,7 @@ else
AC_MSG_RESULT([${have_blas}])
if test x"${have_blas}" = x"no" ; then
AC_MSG_ERROR([could not link with blas: specify path])
AC_MSG_ERROR([could not link with blas: specify path])
fi
dnl now lapack
AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no])
......@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for real case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_real_kernel}" = x"no" ; then
......@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
if test x"${install_real_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block4}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block6}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
......@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for complex case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_complex_kernel}" = x"no" ; then
......@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
if test x"${install_complex_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block1}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
dnl Report the kernel that was selected as the only one to compile.
dnl This notice belongs to the complex kernel option so it must say complex and not real.
AC_MSG_NOTICE([$1 will be the only compiled kernel for complex case])
......@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
dnl check whether GPU version is requested
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"
dnl --enable-gpu-support: request the GPU build of ELPA2.
dnl The result is taken from enableval so that --disable-gpu-support and
dnl --enable-gpu-support=no turn the feature off; when the option is not
dnl given at all the default is "no". Later checks compare want_gpu to "yes".
AC_MSG_CHECKING([whether GPU support is requested])
AC_ARG_ENABLE([gpu-support],
              [AS_HELP_STRING([--enable-gpu-support],
                              [build ELPA2 with GPU-support ( no CPU version available)])],
              [want_gpu="${enableval}"],
              [want_gpu="no"])
#AC_ARG_WITH([GPU-SUPPORT], [AS_HELP_STRING([--with-GPU-SUPPORT],
# [build ELPA2 with GPU-support ( no CPU version available)])],
# [with_gpu=yes],[with_gpu=no])
AC_MSG_RESULT([${want_gpu}])
dnl Optional installation prefixes for the CUDA toolkit and the CUDA SDK.
dnl A value given on the command line is stored in the matching
dnl CUDA_INSTALL_PATH / CUDA_SDK_INSTALL_PATH shell variable; when an option
dnl is absent only the with_cuda / with_cuda_sdk marker is set to auto.
AC_ARG_WITH([cuda-path],
            [AS_HELP_STRING([--with-cuda-path=PATH],
                            [prefix where CUDA is installed @<:@default=auto@:>@])],
            [CUDA_INSTALL_PATH=$withval],
            [with_cuda=auto])
AC_ARG_WITH([cuda-sdk-path],
            [AS_HELP_STRING([--with-cuda-sdk-path=PATH],
                            [prefix where CUDA SDK is installed @<:@default=auto@:>@])],
            [CUDA_SDK_INSTALL_PATH=$withval],
            [with_cuda_sdk=auto])
#AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack])
#AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack])
#FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS"
#LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS"
dnl Set up the nvcc compiler, its flags and the CUDA libraries.
dnl Everything in this block only runs when GPU support was requested.
if test x"${want_gpu}" = x"yes" ; then
dnl the library checks below are done in the C language
AC_LANG_PUSH([C])
dnl target architecture sm_35 is hard-coded; headers are taken from the CUDA install prefix
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
dnl add the CUDA lib64 directory to the link search path used by the checks below
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
dnl NOTE(review): CUDA_LDFLAGS is not assigned anywhere in the visible part of this file - confirm it is meant to come from the environment
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
dnl the compiler name is fixed to nvcc; NVCC and NVCCFLAGS are substituted into the generated Makefiles
NVCC="nvcc"
AC_SUBST(NVCC)
AC_SUBST(NVCCFLAGS)
dnl check whether nvcc compiler is found
dnl (looks for a program called nvcc in PATH and aborts configure if there is none)
AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
if test x"${nvcc_found}" = x"no" ; then
AC_MSG_ERROR([nvcc not found])
fi
dnl check whether we find cublas
dnl (cublasDgemm must be linkable, otherwise configure aborts)
AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
if test x"${have_cublas}" = x"no"; then
AC_MSG_ERROR([Could not link cublas])
fi
dnl same check for the CUDA runtime library, probed via cudaMemcpy
AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
if test x"${have_cudart}" = x"no"; then
AC_MSG_ERROR([Could not link cudart])
fi
AC_LANG_POP([C])
dnl make the GPU build visible to the sources as preprocessor symbol WITH_GPU_VERSION
AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then
......@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
dnl Emit a configure warning when can_compile_avx has been set to no.
AS_IF([test x"${can_compile_avx}" = x"no"],
      [AC_MSG_WARN([Could not compile AVX instructions])])
......@@ -6,8 +6,9 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 8
#define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
......@@ -17,5 +18,6 @@
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
This diff is collapsed.
#ifndef UTILS_H
#define UTILS_H
void * allocateDeviceBuffer(int N);
int sendBufferToDevice(void *d_buf, void *h_buf, int N);
int getBufferFromDevice(void *h_buf, void *d_buf, int N);
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -79,6 +79,10 @@ program test_complex2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
#ifdef WITH_OPENMP
use test_util
#endif
......@@ -115,35 +119,38 @@ program test_complex2
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -157,6 +164,15 @@ program test_complex2
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
! GPU build only: select the CUDA device used by this process.
! The device number is hard-coded to 0.
! NOTE(review): every MPI rank therefore asks for device 0 - confirm this is intended on nodes with several GPUs.
devnum = 0
istat = cuda_setdevice(devnum)
! A nonzero return value is treated as failure.
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
! NOTE(review): plain stop after setup_mpi has been called; presumably the other ranks are not notified - confirm whether an MPI abort is wanted here.
stop
endif
#endif
STATUS = 0
......@@ -186,6 +202,10 @@ program test_complex2
if (myid .eq. 0) then
print *," "
print *,"This ELPA2 is build with"
#ifdef WITH_GPU_VERSION
print *,"GPU support"
#else
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for complex matrices"
#endif
......@@ -201,6 +221,8 @@ program test_complex2
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices"
#endif
#endif
endif
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment