Commit c3b698f2 authored by Andreas Marek's avatar Andreas Marek
Browse files

Make GPU usage a run-time option

parent 88118861
...@@ -12,6 +12,9 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO ...@@ -12,6 +12,9 @@ libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSIO
libelpa@SUFFIX@_la_SOURCES = src/elpa_utilities.F90 \ libelpa@SUFFIX@_la_SOURCES = src/elpa_utilities.F90 \
src/elpa1.F90 \ src/elpa1.F90 \
src/elpa2_utilities.F90 \ src/elpa2_utilities.F90 \
src/check_for_gpu.F90 \
src/mod_cuda.F90 \
src/interface_c_kernel.F90 \
src/elpa2.F90 \ src/elpa2.F90 \
src/elpa_c_interface.F90 \ src/elpa_c_interface.F90 \
src/elpa_qr/qr_utils.f90 \ src/elpa_qr/qr_utils.f90 \
...@@ -31,7 +34,8 @@ if HAVE_DETAILED_TIMINGS ...@@ -31,7 +34,8 @@ if HAVE_DETAILED_TIMINGS
endif endif
if WITH_GPU_VERSION if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu libelpa@SUFFIX@_la_SOURCES += src/cudaFunctions.cu src/cuUtils.cu src/ev_tridi_band_gpu_c_v2.cu
#src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif endif
if WITH_REAL_GENERIC_KERNEL if WITH_REAL_GENERIC_KERNEL
...@@ -87,7 +91,7 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL ...@@ -87,7 +91,7 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
endif endif
.cu.lo: .cu.lo:
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@ NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $< -o $@
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
...@@ -142,8 +146,12 @@ else ...@@ -142,8 +146,12 @@ else
redirect_sources = redirect_sources =
endif endif
shared_sources = test/shared_sources/util.F90 test/shared_sources/read_input_parameters.F90 test/shared_sources/check_correctnes.F90 test/shared_sources/setup_mpi.F90 \ shared_sources = test/shared_sources/util.F90 \
test/shared_sources/blacs_infrastructure.F90 test/shared_sources/prepare_matrix.F90 test/shared_sources/read_input_parameters.F90 \
test/shared_sources/check_correctnes.F90 \
test/shared_sources/setup_mpi.F90 \
test/shared_sources/blacs_infrastructure.F90 \
test/shared_sources/prepare_matrix.F90
elpa1_test_real_c_version@SUFFIX@_SOURCES = test/test_real_c_version.c $(shared_sources) $(redirect_sources) elpa1_test_real_c_version@SUFFIX@_SOURCES = test/test_real_c_version.c $(shared_sources) $(redirect_sources)
elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib) elpa1_test_real_c_version@SUFFIX@_LDADD = $(build_lib)
......
...@@ -532,6 +532,10 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then ...@@ -532,6 +532,10 @@ if test x"${fortran_can_check_environment}" = x"yes" ; then
AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables]) AC_DEFINE([HAVE_ENVIRONMENT_CHECKING],[1],[Fortran can querry environment variables])
fi fi
dnl default value
use_specific_real_kernel=no
use_specific_complex_kernel=no
dnl GPU version only dnl GPU version only
m4_include([m4/ax_elpa_gpu_version_only.m4]) m4_include([m4/ax_elpa_gpu_version_only.m4])
DEFINE_OPTION_GPU_SUPPORT_ONLY([gpu-version-only],[gpu-support],[install_gpu]) DEFINE_OPTION_GPU_SUPPORT_ONLY([gpu-version-only],[gpu-support],[install_gpu])
...@@ -540,65 +544,58 @@ dnl last check whether user wants to compile only a specific kernel ...@@ -540,65 +544,58 @@ dnl last check whether user wants to compile only a specific kernel
dnl dnl
m4_include([m4/ax_elpa_specific_kernels.m4]) m4_include([m4/ax_elpa_specific_kernels.m4])
dnl only do this if GPU support only has not been requested
if test x"${build_with_gpu_support_only}" = x"no" ; then
dnl real kernels dnl real kernels
dnl do not remove this variable it is needed in the macros dnl do not remove this variable it is needed in the macros
use_specific_real_kernel=no
dnl generic kernel dnl generic kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-kernel-only],[generic-kernel],[install_real_generic])
dnl generic-simple kernel dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-generic-simple-kernel-only],[generic-simple-kernel],[install_real_generic_simple])
dnl sse kernel dnl sse kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-sse-kernel-only],[sse-kernel],[install_real_sse])
dnl bgp kernel dnl bgp kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgp-kernel-only],[bgp-kernel],[install_real_bgp])
dnl bgq kernel dnl bgq kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-bgq-kernel-only],[bgq-kernel],[install_real_bgq])
dnl real-avx-block2 kernel dnl real-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block2-kernel-only],[real-avx-block2-kernel],[install_real_avx_block2])
dnl real-avx-block4 kernel dnl real-avx-block4 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block4-kernel]-only,[real-avx-block4-kernel],[install_real_avx_block4])
dnl real-avx-block6 kernel dnl real-avx-block6 kernel
DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6]) DEFINE_OPTION_SPECIFIC_REAL_KERNEL([real-avx-block6-kernel-only],[real-avx-block6-kernel],[install_real_avx_block6])
dnl complex kernels dnl complex kernels
dnl do not remove this variable it is needed in the macros dnl generic kernel
use_specific_complex_kernel=no DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic])
dnl generic kernel dnl generic-simple kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-kernel-only],[generic-kernel],[install_complex_generic]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple])
dnl generic-simple kernel dnl sse kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-generic-simple-kernel-only],[generic-simple-kernel],[install_complex_generic_simple]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse])
dnl sse kernel dnl complex-bqp kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-sse-kernel-only],[sse-kernel],[install_complex_sse]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp])
dnl complex-bqp kernel dnl complex-bqq kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgp-kernel-only],[bgp-kernel],[install_complex_bgp]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq])
dnl complex-bqq kernel dnl complex-avx-block1 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-bgq-kernel-only],[bgq-kernel],[install_complex_bgq]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1])
dnl complex-avx-block1 kernel dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block1-kernel-only],[complex-avx-block1-kernel],[install_complex_avx_block1]) DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
dnl complex-avx-block2 kernel
DEFINE_OPTION_SPECIFIC_COMPLEX_KERNEL([complex-avx-block2-kernel-only],[complex-avx-block2-kernel],[install_complex_avx_block2])
fi
dnl set the conditionals according to the previous tests dnl set the conditionals according to the previous tests
if test x"${can_use_iso_fortran_env}" = x"yes" ; then if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi fi
...@@ -673,12 +670,12 @@ if test x"${install_real_bgq}" = x"yes" ; then ...@@ -673,12 +670,12 @@ if test x"${install_real_bgq}" = x"yes" ; then
AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel]) AC_DEFINE([WITH_REAL_BGQ_KERNEL],[1],[can use real BGQ kernel])
fi fi
if test x"${use_specific_complex_kernel}" = x"no" ; then if test x"${use_specific_complex_kernel}" = x"yes" ; then
AC_DEFINE([WITH_NO_SPECIFIC_COMPLEX_KERNEL],[1],[do not use only one specific complex kernel (set at compile time)]) AC_DEFINE([WITH_ONE_SPECIFIC_COMPLEX_KERNEL],[1],[use only one specific complex kernel (set at compile time)])
fi fi
if test x"${use_specific_real_kernel}" = x"no" ; then if test x"${use_specific_real_kernel}" = x"yes" ; then
AC_DEFINE([WITH_NO_SPECIFIC_REAL_KERNEL],[1],[do not use only one specific real kernel (set at compile time)]) AC_DEFINE([WITH_ONE_SPECIFIC_REAL_KERNEL],[1],[use only one specific real kernel (set at compile time)])
fi fi
LT_INIT LT_INIT
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
module mod_check_for_gpu
contains
function check_for_gpu(myid, numberOfDevices) result(gpuAvailable)
use cuda_functions
implicit none
include 'mpif.h'
integer, intent(in) :: myid
logical :: success
integer, intent(out) :: numberOfDevices
integer :: deviceNumber, mpierr, maxNumberOfDevices
logical :: gpuAvailable
character(len=1024) :: envname
gpuAvailable = .false.
! call getenv("CUDA_PROXY_PIPE_DIRECTORY", envname)
success = cuda_getdevicecount(numberOfDevices)
if (.not.(success)) then
print *,"error in cuda_getdevicecount"
stop
endif
! make sure that all nodes have the same number of GPU's, otherwise
! we run into loadbalancing trouble
call mpi_allreduce(numberOfDevices, maxNumberOfDevices, 1, MPI_INTEGER, MPI_MAX, MPI_COMM_WORLD, mpierr)
if (maxNumberOfDevices .ne. numberOfDevices) then
print *,"Different number of GPU devices on MPI tasks!"
print *,"GPUs will NOT be used!"
gpuAvailable = .false.
return
endif
if (numberOfDevices .ne. 0) then
gpuAvailable = .true.
! Usage of GPU is possible since devices have been detected
if (myid==0) then
print *
print '(3(a,i0))','Found ', numberOfDevices, ' GPUs'
endif
deviceNumber = mod(myid, numberOfDevices)
success = cuda_setdevice(deviceNumber)
if (.not.(success)) then
print *,"Cannot set CudaDevice"
stop
endif
print '(3(a,i0))', 'MPI rank ', myid, ' uses GPU #', deviceNumber
endif
end function
end module
#include <stdio.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <alloca.h>
#include <stdint.h>
#include "config-f90.h"
#define errormessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
#ifdef DEBUG_CUDA
#define debugmessage(x, ...) do { fprintf(stderr, "%s:%d " x, __FILE__, __LINE__, __VA_ARGS__ ); } while (0)
#else
#define debugmessage(x, ...)
#endif
#ifdef WITH_GPU_VERSION
extern "C" {
int cudaSetDeviceFromC(int n) {
cudaError_t cuerr = cudaSetDevice(n);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaSetDevice: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaGetDeviceCountFromC(int *count) {
cudaError_t cuerr = cudaGetDeviceCount(count);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaDeviceSynchronizeFromC() {
cudaError_t cuerr = cudaDeviceSynchronize();
if (cuerr != cudaSuccess) {
errormessage("Error in cudaGetDeviceCount: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMallocFromC(intptr_t *a, size_t width_height) {
cudaError_t cuerr = cudaMalloc((void **) a, width_height);
#ifdef DEBUG_CUDA
printf("Malloc pointer address: %p \n", *a);
#endif
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMalloc: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaFreeFromC(intptr_t *a) {
#ifdef DEBUG_CUDA
printf("Free pointer address: %p \n", a);
#endif
cudaError_t cuerr = cudaFree(a);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaFree: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemsetFromC(intptr_t *a, int value, size_t count) {
cudaError_t cuerr = cudaMemset( a, value, count);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemset: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyFromC(intptr_t *dest, intptr_t *src, size_t count, int dir) {
cudaError_t cuerr = cudaMemcpy( dest, src, count, (cudaMemcpyKind)dir);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemcpy: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpy2dFromC(intptr_t *dest, size_t dpitch, intptr_t *src, size_t spitch, size_t width, size_t height, int dir) {
cudaError_t cuerr = cudaMemcpy2D( dest, dpitch, src, spitch, width, height, (cudaMemcpyKind)dir);
if (cuerr != cudaSuccess) {
errormessage("Error in cudaMemcpy2d: %s\n",cudaGetErrorString(cuerr));
return 0;
}
return 1;
}
int cudaMemcpyDeviceToDeviceFromC(void) {
int val = cudaMemcpyDeviceToDevice;
return val;
}
int cudaMemcpyHostToDeviceFromC(void) {
int val = cudaMemcpyHostToDevice;
return val;
}
int cudaMemcpyDeviceToHostFromC(void) {
int val = cudaMemcpyDeviceToHost;
return val;
}
int cudaHostRegisterPortableFromC(void) {
int val = cudaHostRegisterPortable;
return val;
}
int cudaHostRegisterMappedFromC(void) {
int val = cudaHostRegisterMapped;
return val;
}
}
#endif /* WITH_GPU_VERSION */
This diff is collapsed.
...@@ -105,10 +105,79 @@ module ELPA2_utilities ...@@ -105,10 +105,79 @@ module ELPA2_utilities
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) #if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else #else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#endif
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#else / * WITH_REAL_AVX_BLOCK2_KERNEL */
#ifndef WITH_ONE_SPECIFIC_REAL_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#ifdef WITH_REAL_GENERIC_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_REAL_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_REAL_SSE_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_SSE
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_REAL_AVX_BLOCK4_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK4
#endif
#ifdef WITH_REAL_AVX_BLOCK6_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BLOCK6
#endif
#ifdef WITH_REAL_BGP_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGP
#endif
#ifdef WITH_REAL_BGQ_KERNEL
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_AVX_BGQ
#endif #endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_REAL_KERNEL */
#endif / * WITH_REAL_AVX_BLOCK2_KERNEL */
character(35), parameter, dimension(number_of_real_kernels) :: & character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", & REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GENERIC_SIMPLE ", & "REAL_ELPA_KERNEL_GENERIC_SIMPLE ", &
...@@ -132,10 +201,65 @@ module ELPA2_utilities ...@@ -132,10 +201,65 @@ module ELPA2_utilities
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif #endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#else /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
#ifndef WITH_ONE_SPECIFIC_COMPLEX_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#else /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
! go through all kernels and set them
#ifdef WITH_COMPLEX_GENERIC_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
#endif
#ifdef WITH_COMPLEX_GENERIC_SIMPLE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE
#endif
#ifdef WITH_COMPLEX_SSE_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_SSE
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK1_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK1
#endif
#ifdef WITH_COMPLEX_AVX1_BLOCK2_KERNEL
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_AVX_BLOCK2
#endif
#ifdef WITH_GPU_VERSION
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
#endif
#endif /* WITH_ONE_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */
character(35), parameter, dimension(number_of_complex_kernels) :: & character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", & COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", & "COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE ", &
...@@ -315,20 +439,20 @@ module ELPA2_utilities ...@@ -315,20 +439,20 @@ module ELPA2_utilities
! check whether set by environment variable ! check whether set by environment variable
actual_kernel = real_kernel_via_environment_variable() actual_kernel = real_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION !#ifdef WITH_GPU_VERSION
actual_kernel = REAL_ELPA_KERNEL_GPU ! actual_kernel = REAL_ELPA_KERNEL_GPU
#endif !#endif
if (actual_kernel .eq. 0) then if (actual_kernel .eq. 0) then
! if not then set default kernel ! if not then set default kernel
actual_kernel = DEFAULT_REAL_ELPA_KERNEL