Commit 3ea32c22 authored by Andreas Marek
Browse files

Merge NVIDIA GPU sources by hand

parent 9ad68bd0
...@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS ...@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c src/ftimings/papi.c
endif endif
# GPU build only: add the GPU-related Fortran (.F90) and CUDA (.cu) sources to the library
if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90 libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif endif
...@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL ...@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif endif
#if WITH_AVX_SANDYBRIDGE .cu.lo:
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
...@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) ...@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources) elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib) elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
...@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90 ...@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = \ CLEANFILES = \
elpa-generated.h \ elpa/elpa-generated.h \
elpa1_test_real.sh \ elpa1_test_real.sh \
elpa1_test_complex.sh \ elpa1_test_complex.sh \
elpa2_test_real.sh \ elpa2_test_real.sh \
......
...@@ -83,6 +83,7 @@ AC_PROG_CXX ...@@ -83,6 +83,7 @@ AC_PROG_CXX
dnl variables needed for the tests dnl variables needed for the tests
N="0"
dnl these test will cause an abort of configure if not dnl these test will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs, dnl successful. However, if MKL is found then the blas, blacs,
...@@ -343,7 +344,7 @@ else ...@@ -343,7 +344,7 @@ else
AC_MSG_RESULT([${have_blas}]) AC_MSG_RESULT([${have_blas}])
if test x"${have_blas}" = x"no" ; then if test x"${have_blas}" = x"no" ; then
AC_MSG_ERROR([could not link with blas: specify path]) AC_MSG_ERROR([could not link with blas: specify path])
fi fi
dnl now lapack dnl now lapack
AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no]) AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no])
...@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[ ...@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
AS_HELP_STRING([--with-$1], AS_HELP_STRING([--with-$1],
[only compile $2 for real case]), [only compile $2 for real case]),
[],[with_option=no]) [],[with_option=no])
if test x"${with_option}" = x"yes" ; then if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_real_kernel}" = x"no" ; then if test x"${use_specific_real_kernel}" = x"no" ; then
...@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[ ...@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
if test x"${install_real_sse}" = x"yes" ; then if test x"${install_real_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
if test x"${install_real_avx_block2}" = x"yes" ; then if test x"${install_real_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
if test x"${install_real_avx_block4}" = x"yes" ; then if test x"${install_real_avx_block4}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
if test x"${install_real_avx_block6}" = x"yes" ; then if test x"${install_real_avx_block6}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
...@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[ ...@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
AS_HELP_STRING([--with-$1], AS_HELP_STRING([--with-$1],
[only compile $2 for complex case]), [only compile $2 for complex case]),
[],[with_option=no]) [],[with_option=no])
if test x"${with_option}" = x"yes" ; then if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_complex_kernel}" = x"no" ; then if test x"${use_specific_complex_kernel}" = x"no" ; then
...@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[ ...@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
if test x"${install_complex_sse}" = x"yes" ; then if test x"${install_complex_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
if test x"${install_complex_avx_block1}" = x"yes" ; then if test x"${install_complex_avx_block1}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
if test x"${install_complex_avx_block2}" = x"yes" ; then if test x"${install_complex_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!]) AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi fi
fi fi
AC_MSG_NOTICE([$1 will be the only compiled kernel for real case]) AC_MSG_NOTICE([$1 will be the only compiled kernel for real case])
...@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then ...@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi fi
dnl check whether GPU version is requested
dnl
dnl Sets want_gpu to "yes" or "no", and for "yes" prepares NVCC/NVCCFLAGS,
dnl verifies that nvcc, cublas and cudart are usable, and defines
dnl WITH_GPU_VERSION (both as a preprocessor macro and as an automake
dnl conditional).

dnl default CUDA toolkit location; overridden by --with-cuda-path=PATH
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"

AC_MSG_CHECKING(whether GPU support is requested)
dnl The "action-if-given" branch of AC_ARG_ENABLE also runs for
dnl --disable-gpu-support and --enable-gpu-support=no (enableval is then
dnl "no"), so the decision must be taken from $enableval rather than by
dnl unconditionally selecting "yes".
AC_ARG_ENABLE([gpu-support],
  [AS_HELP_STRING([--enable-gpu-support],
                  [build ELPA2 with GPU-support ( no CPU version available)])],
  [if test x"${enableval}" = x"no" ; then
     want_gpu="no"
   else
     want_gpu="yes"
   fi],
  [want_gpu="no"])
#AC_ARG_WITH([GPU-SUPPORT], [AS_HELP_STRING([--with-GPU-SUPPORT],
# [build ELPA2 with GPU-support ( no CPU version available)])],
# [with_gpu=yes],[with_gpu=no])
AC_MSG_RESULT([${want_gpu}])

dnl optional installation prefixes of the CUDA toolkit and the CUDA SDK
AC_ARG_WITH([cuda-path],[AS_HELP_STRING([--with-cuda-path=PATH],[prefix where CUDA is installed @<:@default=auto@:>@])],
  [CUDA_INSTALL_PATH=$withval], [with_cuda=auto])
AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix where CUDA SDK is installed @<:@default=auto@:>@])],
  [CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])

#AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack])
#AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack])
#FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS"
#LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS"

dnl setup nvcc flags
if test x"${want_gpu}" = x"yes" ; then
  dnl the library link tests below are done with the C compiler
  AC_LANG_PUSH([C])

  dnl NOTE(review): the target architecture is fixed to sm_35 here, and
  dnl CUDA_LDFLAGS is not assigned anywhere in this block - it is taken
  dnl as-is from the environment (possibly empty).
  CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
  LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
  NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
  NVCC="nvcc"
  AC_SUBST(NVCC)
  AC_SUBST(NVCCFLAGS)

  dnl check whether nvcc compiler is found
  AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
  if test x"${nvcc_found}" = x"no" ; then
    AC_MSG_ERROR([nvcc not found])
  fi

  dnl check whether we find cublas
  AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
  if test x"${have_cublas}" = x"no"; then
    AC_MSG_ERROR([Could not link cublas])
  fi

  dnl check whether we find the CUDA runtime library
  AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
  if test x"${have_cudart}" = x"no"; then
    AC_MSG_ERROR([Could not link cudart])
  fi

  AC_LANG_POP([C])
  AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"]) AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then if test x"${install_real_generic}" = x"yes" ; then
...@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g ...@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
if test "${can_compile_avx}" = "no" ; then if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi fi
...@@ -6,8 +6,9 @@ ...@@ -6,8 +6,9 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 #define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 #define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 #define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 8 #define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_COMPLEX_KERNEL_GENERIC 1 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
...@@ -17,5 +18,6 @@ ...@@ -17,5 +18,6 @@
#define ELPA2_COMPLEX_KERNEL_SSE 5 #define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7 #define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
This diff is collapsed.
#ifndef UTILS_H
#define UTILS_H
void * allocateDeviceBuffer(int N);
int sendBufferToDevice(void *d_buf, void *h_buf, int N);
int getBufferFromDevice(void *h_buf, void *d_buf, int N);
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -79,6 +79,10 @@ program test_complex2 ...@@ -79,6 +79,10 @@ program test_complex2
use ELPA1 use ELPA1
use ELPA2 use ELPA2
#ifdef WITH_GPU_VERSION
! GPU builds only; presumably supplies cuda_setdevice, which this program calls - TODO confirm
use cuda_routines
#endif
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
use test_util use test_util
#endif #endif
...@@ -115,35 +119,38 @@ program test_complex2 ...@@ -115,35 +119,38 @@ program test_complex2
!------------------------------------------------------------------------------- !-------------------------------------------------------------------------------
! Local Variables ! Local Variables
integer np_rows, np_cols, na_rows, na_cols integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc integer, external :: numroc
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0) complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
real*8, allocatable :: ev(:), xr(:,:) real*8, allocatable :: ev(:), xr(:,:)
complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:) complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS integer :: STATUS
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
#endif #endif
logical :: write_to_file logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV #ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6 integer, parameter :: error_unit = 6
#endif #endif
logical :: success logical :: success
#ifdef WITH_GPU_VERSION
! Locals that exist only in the GPU build.
! NOTE(review): envname is not referenced in the visible part of this program - confirm it is needed.
character(len=1024) :: envname
! istat receives the return value of cuda_setdevice; devnum is the device number passed to it
integer :: istat, devnum
#endif
write_to_file = .false. write_to_file = .false.
success = .true. success = .true.
...@@ -157,6 +164,15 @@ program test_complex2 ...@@ -157,6 +164,15 @@ program test_complex2
!------------------------------------------------------------------------------- !-------------------------------------------------------------------------------
! MPI Initialization ! MPI Initialization
call setup_mpi(myid, nprocs) call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
   ! Select CUDA device number 0 for this process. A non-zero return value
   ! of cuda_setdevice is treated as a failure.
   devnum = 0
   istat = cuda_setdevice(devnum)
   if (istat .ne. 0) then
      print *,"Cannot set CudaDevice"
      ! A bare "stop" signals normal termination; use a non-zero stop code so
      ! the failure can be seen in the exit status by scripts running this test.
      stop 1
   endif
#endif
STATUS = 0 STATUS = 0
...@@ -186,6 +202,10 @@ program test_complex2 ...@@ -186,6 +202,10 @@ program test_complex2
if (myid .eq. 0) then if (myid .eq. 0) then
print *," " print *," "
print *,"This ELPA2 is build with" print *,"This ELPA2 is build with"
#ifdef WITH_GPU_VERSION
print *,"GPU support"
#else
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL #ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for complex matrices" print *,"AVX optimized kernel (2 blocking) for complex matrices"
#endif #endif
...@@ -201,6 +221,8 @@ program test_complex2 ...@@ -201,6 +221,8 @@ program test_complex2
#endif #endif
#ifdef WITH_COMPLEX_SSE_KERNEL #ifdef WITH_COMPLEX_SSE_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices" print *,"SSE ASSEMBLER kernel for complex matrices"
#endif
#endif #endif
endif endif
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment