Commit 3ea32c22 authored by Andreas Marek's avatar Andreas Marek

Merge NVIDIA GPU sources by hand

parent 9ad68bd0
......@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif
......@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
# Suffix rule: compile a CUDA source (.cu) into a libtool object (.lo).
# The project-configured $(LIBTOOL) is used instead of whatever "libtool"
# happens to be first in PATH, so the objects are built with the same
# libtool that links the library. nvcc_wrap is the compiler driver handed to
# libtool; it receives the nvcc binary to use through the NVCC environment
# variable.
.cu.lo:
	NVCC="$(NVCC)" $(LIBTOOL) --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
......@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
......@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = \
elpa-generated.h \
elpa/elpa-generated.h \
elpa1_test_real.sh \
elpa1_test_complex.sh \
elpa2_test_real.sh \
......
......@@ -83,6 +83,7 @@ AC_PROG_CXX
dnl variables needed for the tests
N="0"
dnl these test will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs,
......@@ -343,7 +344,7 @@ else
AC_MSG_RESULT([${have_blas}])
if test x"${have_blas}" = x"no" ; then
AC_MSG_ERROR([could not link with blas: specify path])
AC_MSG_ERROR([could not link with blas: specify path])
fi
dnl now lapack
AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no])
......@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for real case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_real_kernel}" = x"no" ; then
......@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
if test x"${install_real_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block4}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block6}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
......@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for complex case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_complex_kernel}" = x"no" ; then
......@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
if test x"${install_complex_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block1}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
AC_MSG_NOTICE([$1 will be the only compiled kernel for real case])
......@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
dnl ---------------------------------------------------------------------------
dnl GPU (CUDA) support
dnl
dnl   --enable-gpu-support      request the GPU build (default: disabled)
dnl   --with-cuda-path=PATH     CUDA installation prefix
dnl   --with-cuda-sdk-path=PATH CUDA SDK installation prefix
dnl
dnl When the GPU build is requested, nvcc, cublas and cudart must be usable,
dnl otherwise configure aborts. On success WITH_GPU_VERSION is defined for the
dnl preprocessor; the automake conditional of the same name is always set.
dnl ---------------------------------------------------------------------------
CUDA_INSTALL_PATH="/usr/local/cuda/"

dnl Take over the value supplied by the user, so that --disable-gpu-support
dnl and --enable-gpu-support=no really switch the GPU build off. The third
dnl argument of AC_ARG_ENABLE runs whenever the option is present, whatever
dnl its value, so it must not hard-code "yes".
AC_ARG_ENABLE([gpu-support],
              [AS_HELP_STRING([--enable-gpu-support],
                              [build ELPA2 with GPU-support ( no CPU version available)])],
              [want_gpu="${enableval}"],
              [want_gpu="no"])
AC_MSG_CHECKING([whether GPU support is requested])
AC_MSG_RESULT([${want_gpu}])

AC_ARG_WITH([cuda-path],
            [AS_HELP_STRING([--with-cuda-path=PATH],
                            [prefix where CUDA is installed @<:@default=/usr/local/cuda/@:>@])],
            [CUDA_INSTALL_PATH=$withval],
            [with_cuda=auto])

AC_ARG_WITH([cuda-sdk-path],
            [AS_HELP_STRING([--with-cuda-sdk-path=PATH],
                            [prefix where CUDA SDK is installed @<:@default=auto@:>@])],
            [CUDA_SDK_INSTALL_PATH=$withval],
            [with_cuda_sdk=auto])

dnl setup nvcc flags
if test x"${want_gpu}" = x"yes" ; then
  AC_LANG_PUSH([C])

  CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
  dnl the library path goes into LDFLAGS so that the AC_SEARCH_LIBS checks
  dnl below (and the final link) can find cublas and cudart
  LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
  NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
  NVCC="nvcc"
  AC_SUBST([NVCC])
  AC_SUBST([NVCCFLAGS])

  dnl check whether nvcc compiler is found
  AC_CHECK_PROG([nvcc_found],[nvcc],[yes],[no])
  if test x"${nvcc_found}" = x"no" ; then
    AC_MSG_ERROR([nvcc not found])
  fi

  dnl check whether we find cublas
  AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
  if test x"${have_cublas}" = x"no"; then
    AC_MSG_ERROR([Could not link cublas])
  fi

  dnl check whether we find the CUDA runtime
  AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
  if test x"${have_cudart}" = x"no"; then
    AC_MSG_ERROR([Could not link cudart])
  fi

  AC_LANG_POP([C])
  AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then
......@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
......@@ -6,8 +6,9 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 8
#define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
......@@ -17,5 +18,6 @@
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
This diff is collapsed.
#ifndef UTILS_H
#define UTILS_H
void * allocateDeviceBuffer(int N);
int sendBufferToDevice(void *d_buf, void *h_buf, int N);
int getBufferFromDevice(void *h_buf, void *d_buf, int N);
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -76,12 +76,14 @@ module ELPA2_utilities
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_GPU
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK2
COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_GPU
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
......@@ -108,6 +110,8 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_GPU = ELPA2_REAL_KERNEL_GPU
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
......@@ -122,7 +126,8 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_SSE ", &
"REAL_ELPA_KERNEL_AVX_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK6 "/)
"REAL_ELPA_KERNEL_AVX_BLOCK6 ", &
"REAL_ELPA_KERNEL_GPU "/)
integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC
......@@ -132,6 +137,8 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA2_COMPLEX_KERNEL_GPU
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
......@@ -145,7 +152,8 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_BGQ ", &
"COMPLEX_ELPA_KERNEL_SSE ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/)
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_GPU "/)
integer, parameter :: &
AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = &
......@@ -190,7 +198,12 @@ module ELPA2_utilities
#else
,0 &
#endif
/)
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif
/)
integer, parameter :: &
AVAILABLE_COMPLEX_ELPA_KERNELS(number_of_complex_kernels) = &
......@@ -230,7 +243,12 @@ module ELPA2_utilities
#else
,0 &
#endif
/)
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif
/)
!******
contains
......@@ -306,11 +324,21 @@ module ELPA2_utilities
! check whether set by environment variable
actual_kernel = real_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = REAL_ELPA_KERNEL_GPU
#endif
if (actual_kernel .eq. 0) then
! if not then set default kernel
actual_kernel = DEFAULT_REAL_ELPA_KERNEL
endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. REAL_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another real kernel"
stop
endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("get_actual_real_kernel")
#endif
......@@ -355,11 +383,22 @@ module ELPA2_utilities
! check whether set by environment variable
actual_kernel = complex_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = COMPLEX_ELPA_KERNEL_GPU
#endif
if (actual_kernel .eq. 0) then
! if not then set default kernel
actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. COMPLEX_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another complex kernel"
stop
endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("get_actual_complex_kernel")
#endif
......@@ -463,7 +502,6 @@ module ELPA2_utilities
end function qr_decomposition_via_environment_variable
function real_kernel_via_environment_variable() result(kernel)
#ifdef HAVE_DETAILED_TIMINGS
use timings
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
! This module contains the interfaces to all CUDA C calls.
! It was provided by NVIDIA with their ELPA GPU port and
! adapted for an ELPA release by A. Marek, RZG.
#include "config-f90.h"
#ifdef WITH_GPU_VERSION
module cuda_c_kernel
  ! Explicit Fortran interfaces for the C launcher routines of the GPU kernels.
  !
  ! Every routine is declared bind(c) and takes all of its arguments by value.
  ! All dummies therefore use the interoperable kinds from ISO_C_BINDING:
  !   integer(C_INT)            - counts, sizes, offsets and flags
  !   integer(C_SIZE_T)         - address-sized integers (presumably device
  !                               addresses; confirm against the C side)
  !   complex(C_DOUBLE_COMPLEX) - double precision complex scalar
  ! The address-sized arguments consistently use C_SIZE_T, the kind that
  ! part of these interfaces already used.
  !
  ! No binding name is given, so each routine binds to the C symbol that is
  ! the lower-case form of its Fortran name.
  implicit none

  interface

    subroutine launch_dot_product_kernel(hs_dev, hv_new_dev, tau_new, x_dev, h_dev, hv_dev, nr) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T, C_DOUBLE_COMPLEX
      implicit none
      integer(C_INT), value            :: nr
      integer(C_SIZE_T), value         :: hs_dev, hv_new_dev, x_dev, h_dev, hv_dev
      complex(C_DOUBLE_COMPLEX), value :: tau_new
    end subroutine launch_dot_product_kernel

    subroutine launch_dot_product_kernel_1(ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nb, nr, ns
      integer(C_SIZE_T), value :: x_dev, h_dev, hv_dev, ab_dev, hs_dev, hv_new_dev
    end subroutine launch_dot_product_kernel_1

    subroutine launch_dot_product_kernel_2(ab_dev, hs_dev, hv_dev, hd_dev, nb, nr, ne) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nb, nr, ne
      integer(C_SIZE_T), value :: hd_dev, hv_dev, hs_dev, ab_dev
    end subroutine launch_dot_product_kernel_2

    subroutine launch_double_hh_transform_1(ab_dev, hs_dev, hv_dev, nb, ns) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nb, ns
      integer(C_SIZE_T), value :: hv_dev, ab_dev, hs_dev
    end subroutine launch_double_hh_transform_1

    subroutine launch_double_hh_transform_2(ab_dev, hd_dev, hv_dev, nc, ns, nb) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nc, ns, nb
      integer(C_SIZE_T), value :: hv_dev, ab_dev, hd_dev
    end subroutine launch_double_hh_transform_2

    subroutine launch_compute_kernel_reduce(a_dev, lda, n, nbw, h1_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: n, lda, nbw
      integer(C_SIZE_T), value :: h1_dev, a_dev
    end subroutine launch_compute_kernel_reduce

    subroutine launch_compute_kernel_reduce_1(a_dev, lda, n, h1_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: n, lda
      integer(C_SIZE_T), value :: h1_dev, a_dev
    end subroutine launch_compute_kernel_reduce_1

    subroutine launch_compute_hh_trafo_c_kernel(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nev, nb, ldq, off, ncols
      integer(C_SIZE_T), value :: q
      integer(C_SIZE_T), value :: hh_dot
      integer(C_SIZE_T), value :: hh_tau, hh
    end subroutine launch_compute_hh_trafo_c_kernel

    subroutine launch_compute_hh_trafo_c_kernel_complex(q, hh, hh_tau, nev, nb, ldq, off, ncols) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nev, nb, ldq, off, ncols
      integer(C_SIZE_T), value :: q
      integer(C_SIZE_T), value :: hh_tau, hh
    end subroutine launch_compute_hh_trafo_c_kernel_complex

    subroutine launch_compute_hh_trafo_c_kernel_complex_1(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: nev, nb, ldq, off, ncols
      integer(C_SIZE_T), value :: q
      integer(C_SIZE_T), value :: hh_tau, hh, hh_dot
    end subroutine launch_compute_hh_trafo_c_kernel_complex_1

    ! Note the argument order: row_group_dev comes before a_dev here, while
    ! the matching pack routine takes a_dev first.
    subroutine launch_my_unpack_c_kernel(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
                                         row_group_dev, a_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: row_count
      integer(C_INT), value    :: n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
      integer(C_SIZE_T), value :: a_dev, row_group_dev
    end subroutine launch_my_unpack_c_kernel

    subroutine launch_my_pack_c_kernel(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
                                       a_dev, row_group_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
      integer(C_SIZE_T), value :: a_dev
      integer(C_SIZE_T), value :: row_group_dev
    end subroutine launch_my_pack_c_kernel

    subroutine launch_compute_hh_dotp_c_kernel(bcast_buffer_dev, hh_dot_dev, nbw, n) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_SIZE_T), value :: bcast_buffer_dev
      integer(C_SIZE_T), value :: hh_dot_dev
      integer(C_INT), value    :: nbw, n
    end subroutine launch_compute_hh_dotp_c_kernel

    subroutine launch_extract_hh_tau_c_kernel(hh, hh_tau, nb, n, is_zero) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_SIZE_T), value :: hh
      integer(C_SIZE_T), value :: hh_tau
      integer(C_INT), value    :: nb, n
      integer(C_INT), value    :: is_zero
    end subroutine launch_extract_hh_tau_c_kernel

    ! Same argument order as the real unpack routine: row_group_dev, a_dev.
    subroutine launch_my_unpack_c_kernel_complex(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
                                                 l_nev, row_group_dev, a_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: row_count
      integer(C_INT), value    :: n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
      integer(C_SIZE_T), value :: a_dev, row_group_dev
    end subroutine launch_my_unpack_c_kernel_complex

    subroutine launch_my_pack_c_kernel_complex(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
                                               l_nev, a_dev, row_group_dev) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_INT), value    :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
      integer(C_SIZE_T), value :: a_dev
      integer(C_SIZE_T), value :: row_group_dev
    end subroutine launch_my_pack_c_kernel_complex

    subroutine launch_compute_hh_dotp_c_kernel_complex(bcast_buffer_dev, hh_dot_dev, nbw, n) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_SIZE_T), value :: bcast_buffer_dev
      integer(C_SIZE_T), value :: hh_dot_dev
      integer(C_INT), value    :: nbw, n
    end subroutine launch_compute_hh_dotp_c_kernel_complex

    subroutine launch_extract_hh_tau_c_kernel_complex(hh, hh_tau, nb, n, is_zero) bind(c)
      use, intrinsic :: iso_c_binding, only : C_INT, C_SIZE_T
      implicit none
      integer(C_SIZE_T), value :: hh
      integer(C_SIZE_T), value :: hh_tau
      integer(C_INT), value    :: nb, n
      integer(C_INT), value    :: is_zero
    end subroutine launch_extract_hh_tau_c_kernel_complex

  end interface

end module cuda_c_kernel
#endif /* WITH_GPU_VERSION */
This diff is collapsed.
......@@ -79,6 +79,10 @@ program test_complex2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
#ifdef WITH_OPENMP
use test_util
#endif
......@@ -115,35 +119,38 @@ program test_complex2
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -157,6 +164,15 @@ program test_complex2
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
......@@ -186,6 +202,10 @@ program test_complex2
if (myid .eq. 0) then
print *," "
print *,"This ELPA2 is build with"
#ifdef WITH_GPU_VERSION
print *,"GPU support"
#else
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for complex matrices"
#endif
......@@ -201,6 +221,8 @@ program test_complex2
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices"
#endif
#endif
endif
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment