Commit c3b698f2 authored by Andreas Marek's avatar Andreas Marek

Make GPU usage a run-time option

parent 88118861
......@@ -12,6 +12,9 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO
libelpa@SUFFIX@_la_SOURCES = src/elpa_utilities.F90 \
src/elpa1.F90 \
src/elpa2_utilities.F90 \
src/check_for_gpu.F90 \
src/mod_cuda.F90 \
src/interface_c_kernel.F90 \
src/elpa2.F90 \
src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \
......@@ -31,7 +34,8 @@ if HAVE_DETAILED_TIMINGS
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
libelpa@SUFFIX@_la_SOURCES += src/cudaFunctions.cu src/cuUtils.cu src/ev_tridi_band_gpu_c_v2.cu
#src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL
......@@ -87,7 +91,7 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
endif
.cu.lo:
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
......@@ -142,8 +146,12 @@ else
redirect_sources =
endif
shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \
test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90
shared_sources = test/shared_sources/util.F90 \
test/shared_sources/read_input_parameters.F90 \
test/shared_sources/check_correctnes.F90 \
test/shared_sources/setup_mpi.F90 \
test/shared_sources/blacs_infrastructure.F90 \
test/shared_sources/prepare_matrix.F90
elpa1_test_real_c_version@SUFFIX@_SOURCES = test/test_real_c_version.c $(shared_sources) $(redirect_sources)
elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib)
......
......@@ -532,6 +532,10 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables])
fi
dnl default value
use_specific_real_kernel=no
use_specific_complex_kernel=no
dnl GPU version only
m4_include([m4/ax_elpa_gpu_version_only.m4])
DEFINE_OPTION_GPU_SUPPORT_ONLY([gpu-version-only],[gpu-support],[install_gpu])
......@@ -540,65 +544,58 @@ dnl last check whether user wants to compile only a specific kernel
dnl
m4_include([m4/ax_elpa_specific_kernels.m4])
dnl only do this if GPU support only has not been requested
if test x"${build_with_gpu_support_only}" = x"no" ; then
dnl real kernels
dnl do not remove this variable it is needed in the macros
use_specific_real_kernel=no
dnl do not remove this variable it is needed in the macros
dnl generic kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic])
dnl generic kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic])
dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])
dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse])
dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])
dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
dnl real-avx-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4])
dnl real-avx-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4])
dnl real-avx-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6])
dnl real-avx-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6])
dnl complex kernels
dnl do not remove this variable it is needed in the macros
use_specific_complex_kernel=no
dnl generic kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic])
dnl generic kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic])
dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])
dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse])
dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse])
dnl complex-bqp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])
dnl complex-bqp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])
dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
fi
dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
......@@ -673,12 +670,12 @@ if test x"${install_real_bgq}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel])
fi
if test x"${use_specific_complex_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)])
if test x"${use_specific_complex_kernel}" = x"yes" ; then
AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi
if test x"${use_specific_real_kernel}" = x"no" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)])
if test x"${use_specific_real_kernel}" = x"yes" ; then
AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi
LT_INIT
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
module mod_check_for_gpu
contains
function check_for_gpu(myid, numberOfDevices) result(gpuAvailable)
use cuda_functions
implicit none
include 'mpif.h'
integer, intent(in) :: myid
logical :: success
integer, intent(out) :: numberOfDevices
integer :: deviceNumber, mpierr, maxNumberOfDevices
logical :: gpuAvailable
character(len=1024) :: envname
gpuAvailable = .false.
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop
endif
! make sure that all nodes have the same number of GPU's, otherwise
! we run into loadbalancing trouble
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
print *,"Different number of GPU devices on MPI tasks!"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
if (myid==0) then
print *
print '(3(a,i0))','Found ', numberOfDevices, ' GPUs'
endif
deviceNumber = mod(myid, numberOfDevices)
success = cuda_setdevice(deviceNumber)
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop
endif
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
end function
end module
#include <stdio.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <alloca.h>
#include <stdint.h>
#include "config-f90.h"
#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
#ifdef DEBUG_CUDA
#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
#else
#define debugmessage(x, ...)
#endif
#ifdef WITH_GPU_VERSION
extern "C" {
int cudaSetDeviceFromC(int n) {
cudaError_t cuerr = cudaSetDevice(n);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaSetDevice: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaGetDeviceCountFromC(int *count) {
cudaError_t cuerr = cudaGetDeviceCount(count);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaDeviceSynchronizeFromC() {
cudaError_t cuerr = cudaDeviceSynchronize();
if (cuerr != cudaSuccess) {
errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMallocFromC(intptr_t *a, size_t width_height) {
cudaError_t cuerr = cudaMalloc((void **) a, width_height);
#ifdef DEBUG_CUDA
printf("Malloc pointer address: %p \n", *a);
#endif
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMalloc: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaFreeFromC(intptr_t *a) {
#ifdef DEBUG_CUDA
printf("Free pointer address: %p \n", a);
#endif
cudaError_t cuerr = cudaFree(a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaFree: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemsetFromC(intptr_t *a, int value, size_t count) {
cudaError_t cuerr = cudaMemset( a, value, count);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemset: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) {
cudaError_t cuerr = cudaMemcpy( dest, src, count, (cudaMemcpyKind)dir);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemcpy: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpy2dFromC(intptr_t *dest, size_t dpitch, intptr_t *src, size_t spitch, size_t width, size_t height, int dir) {
cudaError_t cuerr = cudaMemcpy2D( dest, dpitch, src, spitch, width, height, (cudaMemcpyKind)dir);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemcpy2d: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyDeviceToDeviceFromC(void) {
int val = cudaMemcpyDeviceToDevice;
return val;
}
int cudaMemcpyHostToDeviceFromC(void) {
int val = cudaMemcpyHostToDevice;
return val;
}
int cudaMemcpyDeviceToHostFromC(void) {
int val = cudaMemcpyDeviceToHost;
return val;
}
int cudaHostRegisterPortableFromC(void) {
int val = cudaHostRegisterPortable;
return val;
}
int cudaHostRegisterMappedFromC(void) {
int val = cudaHostRegisterMapped;
return val;
}
}
#endif /* WITH_GPU_VERSION */
This diff is collapsed.
......@@ -105,10 +105,79 @@ module ELPA2_utilities
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#endif
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else / * WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#endif
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif / * WITH_REAL_AVX_BLOCK2_KERNEL */
character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GENERIC_SIMPLE ", &
......@@ -132,10 +201,65 @@ module ELPA2_utilities
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
......@@ -315,20 +439,20 @@ module ELPA2_utilities
! check whether set by environment variable
actual_kernel = real_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = REAL_ELPA_KERNEL_GPU
#endif
!#ifdef WITH_GPU_VERSION
! actual_kernel = REAL_ELPA_KERNEL_GPU
!#endif
if (actual_kernel .eq. 0) then
! if not then set default kernel
actual_kernel = DEFAULT_REAL_ELPA_KERNEL
endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. REAL_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another real kernel"
stop
endif
#endif
!#ifdef WITH_GPU_VERSION
! if (actual_kernel .ne. REAL_ELPA_KERNEL_GPU) then
! print *,"if build with GPU you cannot choose another real kernel"
! stop
! endif
!#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("get_actual_real_kernel")
......@@ -374,20 +498,20 @@ module ELPA2_utilities
! check whether set by environment variable
actual_kernel = complex_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = COMPLEX_ELPA_KERNEL_GPU
#endif
!#ifdef WITH_GPU_VERSION
! actual_kernel = COMPLEX_ELPA_KERNEL_GPU
!#endif
if (actual_kernel .eq. 0) then
! if not then set default kernel
actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. COMPLEX_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another complex kernel"
stop
endif
#endif
!#ifdef WITH_GPU_VERSION
! if (actual_kernel .ne. COMPLEX_ELPA_KERNEL_GPU) then
! print *,"if build with GPU you cannot choose another complex kernel"
! stop
! endif
!#endif
#ifdef HAVE_DETAILED_TIMINGS
......
......@@ -138,8 +138,8 @@
useQRFortran = .true.
endif
successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API, useQRFortran)
successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_REAL_ELPA_KERNEL_API, useQRFortran)
if (successFortran) then
success = 1
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -173,7 +173,7 @@ module mod_check_correctness
if (myid==0) print *
if (myid==0) print *,'Error Residual :',errmax
if (errmax .gt. 5e-12) then
if (errmax .gt. 9e-9) then
status = 1
endif
......@@ -196,7 +196,7 @@ module mod_check_correctness
call mpi_allreduce(err,errmax,1,MPI_REAL8,MPI_MAX,MPI_COMM_WORLD,mpierr)
if (myid==0) print *,'Error Orthogonality:',errmax
if (errmax .gt. 5e-12) then
if (errmax .gt. 9e-9) then
status = 1
endif
end function
......
......@@ -79,10 +79,7 @@ program test_complex2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use mod_check_for_gpu, only : check_for_gpu
use elpa_utilities, only : error_unit
#ifdef WITH_OPENMP
......@@ -139,38 +136,20 @@ program test_complex2
logical :: write_to_file
logical :: success
logical :: successELPA, success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum, numdevs
#endif
integer :: numberOfDevices
logical :: gpuAvailable
success = .true.
successELPA = .true.
gpuAvailable = .false.
call read_input_parameters(na, nev, nblk, write_to_file)
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
istat = cuda_getdevicecount(numdevs)
if (istat .ne. 0) then
print *,"Error in cuda_getdevicecount"
stop
endif
if(myid==0) then
print *
print '(3(a,i0))','Found ', numdevs, ' GPUs'
endif
devnum = mod(myid, numdevs)
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', devnum
#endif
gpuAvailable = check_for_gpu(myid, numberOfDevices)
STATUS = 0
......@@ -200,9 +179,9 @@ program test_complex2
if (myid .eq. 0) then
print *," "
print *,"This ELPA2 is build with"
#ifdef WITH_GPU_VERSION
print *,"GPU support"
#else
if (gpuAvailable) then
print *,"GPU support"
endif
#ifdef WITH_COMPLEX_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for complex matrices"
......@@ -220,9 +199,6 @@ program test_complex2
#ifdef WITH_COMPLEX_SSE_KERNEL
print *,"SSE ASSEMBLER kernel for complex matrices"
#endif
#endif
endif
if (write_to_file) then
......@@ -344,10 +320,10 @@ program test_complex2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
successELPA = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
if (.not.(success)) then
if (.not.(successELPA)) then
write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..."
call MPI_ABORT(mpi_comm_world, 1, mpierr)
endif
......
......@@ -79,12 +79,10 @@ program test_complex2
use ELPA1