Commit 3ea32c22 authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge NVIDIA GPU sources by hand

parent 9ad68bd0
...@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS ...@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c src/ftimings/papi.c
endif endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90 libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif endif
...@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL ...@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif endif
#if WITH_AVX_SANDYBRIDGE .cu.lo:
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \ NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
# install any .mod files in the include/ dir # install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@ elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
...@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib) ...@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources) elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib) elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources) elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
...@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90 ...@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = \ CLEANFILES = \
elpa-generated.h \ elpa/elpa-generated.h \
elpa1_test_real.sh \ elpa1_test_real.sh \
elpa1_test_complex.sh \ elpa1_test_complex.sh \
elpa2_test_real.sh \ elpa2_test_real.sh \
......
...@@ -83,6 +83,7 @@ AC_PROG_CXX ...@@ -83,6 +83,7 @@ AC_PROG_CXX
dnl variables needed for the tests dnl variables needed for the tests
N="0"
dnl these test will cause an abort of configure if not dnl these test will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs, dnl successful. However, if MKL is found then the blas, blacs,
...@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then ...@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env]) AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi fi
dnl check whether GPU version is requested
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"
AC_MSG_CHECKING(whether GPU support is requested)
AC_ARG_ENABLE(gpu-support,[AS_HELP_STRING([--enable-gpu-support],
[build ELPA2 with GPU-support ( no CPU version available)])],
want_gpu="yes", want_gpu="no")
#AC_ARG_WITH([GPU-SUPPORT], [AS_HELP_STRING([--with-GPU-SUPPORT],
# [build ELPA2 with GPU-support ( no CPU version available)])],
# [with_gpu=yes],[with_gpu=no])
AC_MSG_RESULT([${want_gpu}])
AC_ARG_WITH([cuda-path],[AS_HELP_STRING([--with-cuda-path=PATH],[prefix where CUDA is installed @<:@default=auto@:>@])],
[CUDA_INSTALL_PATH=$withval], [with_cuda=auto])
AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix where CUDA SDK is installed @<:@default=auto@:>@])],
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
#AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack])
#AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack])
#FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS"
#LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS"
dnl setup nvcc flags
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
AC_SUBST(NVCC)
AC_SUBST(NVCCFLAGS)
dnl check whether nvcc compiler is found
AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
if test x"${nvcc_found}" = x"no" ; then
AC_MSG_ERROR([nvcc not found])
fi
dnl check whether we find cublas
AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
if test x"${have_cublas}" = x"no"; then
AC_MSG_ERROR([Could not link cublas])
fi
AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
if test x"${have_cudart}" = x"no"; then
AC_MSG_ERROR([Could not link cudart])
fi
AC_LANG_POP([C])
AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"]) AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then if test x"${install_real_generic}" = x"yes" ; then
...@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g ...@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
if test "${can_compile_avx}" = "no" ; then if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions]) AC_MSG_WARN([Could not compile AVX instructions])
fi fi
...@@ -6,8 +6,9 @@ ...@@ -6,8 +6,9 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6 #define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7 #define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8 #define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 8 #define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_COMPLEX_KERNEL_GENERIC 1 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
...@@ -17,5 +18,6 @@ ...@@ -17,5 +18,6 @@
#define ELPA2_COMPLEX_KERNEL_SSE 5 #define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7 #define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
This diff is collapsed.
#ifndef UTILS_H
#define UTILS_H
void * allocateDeviceBuffer(int N);
int sendBufferToDevice(void *d_buf, void *h_buf, int N);
int getBufferFromDevice(void *h_buf, void *d_buf, int N);
This diff is collapsed.
...@@ -76,12 +76,14 @@ module ELPA2_utilities ...@@ -76,12 +76,14 @@ module ELPA2_utilities
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, & public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GENERIC_SIMPLE, &
REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, & REAL_ELPA_KERNEL_BGP, REAL_ELPA_KERNEL_BGQ, &
REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, & REAL_ELPA_KERNEL_SSE, REAL_ELPA_KERNEL_AVX_BLOCK2, &
REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6 REAL_ELPA_KERNEL_AVX_BLOCK4, REAL_ELPA_KERNEL_AVX_BLOCK6, &
REAL_ELPA_KERNEL_GPU
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, & public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE, &
COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, & COMPLEX_ELPA_KERNEL_BGP, COMPLEX_ELPA_KERNEL_BGQ, &
COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, & COMPLEX_ELPA_KERNEL_SSE, COMPLEX_ELPA_KERNEL_AVX_BLOCK1, &
COMPLEX_ELPA_KERNEL_AVX_BLOCK2 COMPLEX_ELPA_KERNEL_AVX_BLOCK2, &
COMPLEX_ELPA_KERNEL_GPU
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
...@@ -108,6 +110,8 @@ module ELPA2_utilities ...@@ -108,6 +110,8 @@ module ELPA2_utilities
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_REAL_KERNEL_AVX_BLOCK2
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK4 = ELPA2_REAL_KERNEL_AVX_BLOCK4
integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6 integer, parameter :: REAL_ELPA_KERNEL_AVX_BLOCK6 = ELPA2_REAL_KERNEL_AVX_BLOCK6
integer, parameter :: REAL_ELPA_KERNEL_GPU = ELPA2_REAL_KERNEL_GPU
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) #if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
...@@ -122,7 +126,8 @@ module ELPA2_utilities ...@@ -122,7 +126,8 @@ module ELPA2_utilities
"REAL_ELPA_KERNEL_SSE ", & "REAL_ELPA_KERNEL_SSE ", &
"REAL_ELPA_KERNEL_AVX_BLOCK2 ", & "REAL_ELPA_KERNEL_AVX_BLOCK2 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK4 ", & "REAL_ELPA_KERNEL_AVX_BLOCK4 ", &
"REAL_ELPA_KERNEL_AVX_BLOCK6 "/) "REAL_ELPA_KERNEL_AVX_BLOCK6 ", &
"REAL_ELPA_KERNEL_GPU "/)
integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS integer, parameter :: number_of_complex_kernels = ELPA2_NUMBER_OF_COMPLEX_KERNELS
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA2_COMPLEX_KERNEL_GENERIC
...@@ -132,6 +137,8 @@ module ELPA2_utilities ...@@ -132,6 +137,8 @@ module ELPA2_utilities
integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE integer, parameter :: COMPLEX_ELPA_KERNEL_SSE = ELPA2_COMPLEX_KERNEL_SSE
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK1 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK1
integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 integer, parameter :: COMPLEX_ELPA_KERNEL_AVX_BLOCK2 = ELPA2_COMPLEX_KERNEL_AVX_BLOCK2
integer, parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA2_COMPLEX_KERNEL_GPU
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) #if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC integer, parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
...@@ -145,7 +152,8 @@ module ELPA2_utilities ...@@ -145,7 +152,8 @@ module ELPA2_utilities
"COMPLEX_ELPA_KERNEL_BGQ ", & "COMPLEX_ELPA_KERNEL_BGQ ", &
"COMPLEX_ELPA_KERNEL_SSE ", & "COMPLEX_ELPA_KERNEL_SSE ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", & "COMPLEX_ELPA_KERNEL_AVX_BLOCK1 ", &
"COMPLEX_ELPA_KERNEL_AVX_BLOCK2 "/) "COMPLEX_ELPA_KERNEL_AVX_BLOCK2 ", &
"COMPLEX_ELPA_KERNEL_GPU "/)
integer, parameter :: & integer, parameter :: &
AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = & AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = &
...@@ -189,6 +197,11 @@ module ELPA2_utilities ...@@ -189,6 +197,11 @@ module ELPA2_utilities
,1 & ,1 &
#else #else
,0 & ,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif #endif
/) /)
...@@ -229,6 +242,11 @@ module ELPA2_utilities ...@@ -229,6 +242,11 @@ module ELPA2_utilities
,1 & ,1 &
#else #else
,0 & ,0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &
#endif #endif
/) /)
...@@ -306,11 +324,21 @@ module ELPA2_utilities ...@@ -306,11 +324,21 @@ module ELPA2_utilities
! check whether set by environment variable ! check whether set by environment variable
actual_kernel = real_kernel_via_environment_variable() actual_kernel = real_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = REAL_ELPA_KERNEL_GPU
#endif
if (actual_kernel .eq. 0) then if (actual_kernel .eq. 0) then
! if not then set default kernel ! if not then set default kernel
actual_kernel = DEFAULT_REAL_ELPA_KERNEL actual_kernel = DEFAULT_REAL_ELPA_KERNEL
endif endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. REAL_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another real kernel"
stop
endif
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("get_actual_real_kernel") call timer%stop("get_actual_real_kernel")
#endif #endif
...@@ -355,11 +383,22 @@ module ELPA2_utilities ...@@ -355,11 +383,22 @@ module ELPA2_utilities
! check whether set by environment variable ! check whether set by environment variable
actual_kernel = complex_kernel_via_environment_variable() actual_kernel = complex_kernel_via_environment_variable()
#ifdef WITH_GPU_VERSION
actual_kernel = COMPLEX_ELPA_KERNEL_GPU
#endif
if (actual_kernel .eq. 0) then if (actual_kernel .eq. 0) then
! if not then set default kernel ! if not then set default kernel
actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL actual_kernel = DEFAULT_COMPLEX_ELPA_KERNEL
endif endif
#ifdef WITH_GPU_VERSION
if (actual_kernel .ne. COMPLEX_ELPA_KERNEL_GPU) then
print *,"if build with GPU you cannot choose another complex kernel"
stop
endif
#endif
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
call timer%stop("get_actual_complex_kernel") call timer%stop("get_actual_complex_kernel")
#endif #endif
...@@ -463,7 +502,6 @@ module ELPA2_utilities ...@@ -463,7 +502,6 @@ module ELPA2_utilities
end function qr_decomposition_via_environment_variable end function qr_decomposition_via_environment_variable
function real_kernel_via_environment_variable() result(kernel) function real_kernel_via_environment_variable() result(kernel)
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
use timings use timings
......
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
! This module contains the interfaces to all CUDA C calls.
! It was provided by NVIDIA with their ELPA GPU port and
! adapted for an ELPA release by A. Marek, RZG.
#include "config-f90.h"
#ifdef WITH_GPU_VERSION
module cuda_c_kernel
implicit none
interface
! Explicit interface to the C-bound routine launch_dot_product_kernel.
! bind(c) carries no name=, so the binding label is the lower-case name.
! Every argument is passed by value:
!   hs_dev, hv_new_dev, x_dev, h_dev, hv_dev - size_t-wide integers;
!       presumably device addresses (TODO confirm against the CUDA source)
!   tau_new - double-precision complex scalar
!   nr      - C int
subroutine launch_dot_product_kernel(hs_dev, hv_new_dev, tau_new, x_dev, h_dev, hv_dev, nr) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t, c_double_complex
  implicit none
  integer(c_size_t), value, intent(in)         :: hs_dev, hv_new_dev, x_dev, h_dev, hv_dev
  ! c_double_complex replaces the non-standard complex*16 (same 16-byte storage)
  complex(c_double_complex), value, intent(in) :: tau_new
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)            :: nr
end subroutine launch_dot_product_kernel
! Explicit interface to the C-bound routine launch_dot_product_kernel_1.
! Every argument is passed by value:
!   ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev - size_t-wide integers;
!       presumably device addresses (TODO confirm against the CUDA source)
!   nb, nr, ns - C ints
subroutine launch_dot_product_kernel_1(ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev, nb, nr, ns) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: ab_dev, hs_dev, hv_new_dev, x_dev, h_dev, hv_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: nb, nr, ns
end subroutine launch_dot_product_kernel_1
! Explicit interface to the C-bound routine launch_dot_product_kernel_2.
! Every argument is passed by value:
!   ab_dev, hs_dev, hv_dev, hd_dev - size_t-wide integers; presumably
!       device addresses (TODO confirm against the CUDA source)
!   nb, nr, ne - C ints
subroutine launch_dot_product_kernel_2(ab_dev, hs_dev, hv_dev, hd_dev, nb, nr, ne) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: ab_dev, hs_dev, hv_dev, hd_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: nb, nr, ne
end subroutine launch_dot_product_kernel_2
! Explicit interface to the C-bound routine launch_double_hh_transform_1.
! Every argument is passed by value:
!   ab_dev, hs_dev, hv_dev - size_t-wide integers; presumably device
!       addresses (TODO confirm against the CUDA source)
!   nb, ns - C ints
subroutine launch_double_hh_transform_1(ab_dev, hs_dev, hv_dev, nb, ns) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: ab_dev, hs_dev, hv_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: nb, ns
end subroutine launch_double_hh_transform_1
! Explicit interface to the C-bound routine launch_double_hh_transform_2.
! Every argument is passed by value:
!   ab_dev, hd_dev, hv_dev - size_t-wide integers; presumably device
!       addresses (TODO confirm against the CUDA source)
!   nc, ns, nb - C ints
subroutine launch_double_hh_transform_2(ab_dev, hd_dev, hv_dev, nc, ns, nb) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: ab_dev, hd_dev, hv_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: nc, ns, nb
end subroutine launch_double_hh_transform_2
! Explicit interface to the C-bound routine launch_compute_kernel_reduce.
! Every argument is passed by value:
!   a_dev, h1_dev - size_t-wide integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   lda, n, nbw   - C ints
subroutine launch_compute_kernel_reduce(a_dev, lda, n, nbw, h1_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: a_dev, h1_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: lda, n, nbw
end subroutine launch_compute_kernel_reduce
! Explicit interface to the C-bound routine launch_compute_kernel_reduce_1.
! Every argument is passed by value:
!   a_dev, h1_dev - size_t-wide integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   lda, n        - C ints
subroutine launch_compute_kernel_reduce_1(a_dev, lda, n, h1_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_size_t
  implicit none
  integer(c_size_t), value, intent(in) :: a_dev, h1_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)    :: lda, n
end subroutine launch_compute_kernel_reduce_1
! Explicit interface to the C-bound routine launch_compute_hh_trafo_c_kernel.
! Every argument is passed by value:
!   q, hh_dot  - 64-bit integers
!   hh, hh_tau - size_t-wide integers
!       (all four presumably device addresses - TODO confirm against the
!        CUDA source, and whether the two widths are meant to differ)
!   nev, nb, ldq, off, ncols - C ints
subroutine launch_compute_hh_trafo_c_kernel(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t, c_size_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nev, nb, ldq, off, ncols
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: q
  integer(c_int64_t), value, intent(in) :: hh_dot
  integer(c_size_t), value, intent(in)  :: hh_tau, hh
end subroutine launch_compute_hh_trafo_c_kernel
! Explicit interface to the C-bound routine
! launch_compute_hh_trafo_c_kernel_complex.
! Every argument is passed by value:
!   q, hh, hh_tau - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   nev, nb, ldq, off, ncols - C ints
subroutine launch_compute_hh_trafo_c_kernel_complex(q, hh, hh_tau, nev, nb, ldq, off, ncols) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nev, nb, ldq, off, ncols
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: q
  integer(c_int64_t), value, intent(in) :: hh_tau, hh
end subroutine launch_compute_hh_trafo_c_kernel_complex
! Explicit interface to the C-bound routine
! launch_compute_hh_trafo_c_kernel_complex_1.
! Every argument is passed by value:
!   q, hh, hh_dot, hh_tau - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   nev, nb, ldq, off, ncols - C ints
subroutine launch_compute_hh_trafo_c_kernel_complex_1(q, hh, hh_dot, hh_tau, nev, nb, ldq, off, ncols) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nev, nb, ldq, off, ncols
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: q
  integer(c_int64_t), value, intent(in) :: hh_tau, hh, hh_dot
end subroutine launch_compute_hh_trafo_c_kernel_complex_1
! Explicit interface to the C-bound routine launch_my_unpack_c_kernel.
! Every argument is passed by value:
!   row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count,
!   l_nev                 - C ints
!   row_group_dev, a_dev  - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
! Note the handle order here is (row_group_dev, a_dev).
subroutine launch_my_unpack_c_kernel(row_count, n_offset, max_idx, stripe_width, a_dim2, &
                                     stripe_count, l_nev, row_group_dev, a_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: row_count
  integer(c_int), value, intent(in)     :: n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: a_dev, row_group_dev
end subroutine launch_my_unpack_c_kernel
! Explicit interface to the C-bound routine launch_my_pack_c_kernel.
! Every argument is passed by value:
!   row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count,
!   l_nev                 - C ints
!   a_dev, row_group_dev  - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
! Note the handle order here is (a_dev, row_group_dev).
subroutine launch_my_pack_c_kernel(row_count, n_offset, max_idx, stripe_width, a_dim2, &
                                   stripe_count, l_nev, a_dev, row_group_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: a_dev
  integer(c_int64_t), value, intent(in) :: row_group_dev
end subroutine launch_my_pack_c_kernel
! Explicit interface to the C-bound routine launch_compute_hh_dotp_c_kernel.
! Every argument is passed by value:
!   bcast_buffer_dev, hh_dot_dev - 64-bit integers; presumably device
!       addresses (TODO confirm against the CUDA source)
!   nbw, n - C ints
subroutine launch_compute_hh_dotp_c_kernel(bcast_buffer_dev, hh_dot_dev, nbw, n) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: bcast_buffer_dev
  integer(c_int64_t), value, intent(in) :: hh_dot_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nbw, n
end subroutine launch_compute_hh_dotp_c_kernel
! Explicit interface to the C-bound routine launch_extract_hh_tau_c_kernel.
! Every argument is passed by value:
!   hh, hh_tau - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   nb, n      - C ints
!   is_zero    - C int; presumably used as a flag (TODO confirm)
subroutine launch_extract_hh_tau_c_kernel(hh, hh_tau, nb, n, is_zero) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: hh
  integer(c_int64_t), value, intent(in) :: hh_tau
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nb, n
  integer(c_int), value, intent(in)     :: is_zero
end subroutine launch_extract_hh_tau_c_kernel
! Explicit interface to the C-bound routine launch_my_unpack_c_kernel_complex.
! Every argument is passed by value:
!   row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count,
!   l_nev                 - C ints
!   row_group_dev, a_dev  - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
! Note the handle order here is (row_group_dev, a_dev).
subroutine launch_my_unpack_c_kernel_complex(row_count, n_offset, max_idx, stripe_width, a_dim2, &
                                             stripe_count, l_nev, row_group_dev, a_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: row_count
  integer(c_int), value, intent(in)     :: n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: a_dev, row_group_dev
end subroutine launch_my_unpack_c_kernel_complex
! Explicit interface to the C-bound routine launch_my_pack_c_kernel_complex.
! Every argument is passed by value:
!   row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count,
!   l_nev                 - C ints
!   a_dev, row_group_dev  - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
! Note the handle order here is (a_dev, row_group_dev).
subroutine launch_my_pack_c_kernel_complex(row_count, n_offset, max_idx, stripe_width, a_dim2, &
                                           stripe_count, l_nev, a_dev, row_group_dev) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: a_dev
  integer(c_int64_t), value, intent(in) :: row_group_dev
end subroutine launch_my_pack_c_kernel_complex
! Explicit interface to the C-bound routine
! launch_compute_hh_dotp_c_kernel_complex.
! Every argument is passed by value:
!   bcast_buffer_dev, hh_dot_dev - 64-bit integers; presumably device
!       addresses (TODO confirm against the CUDA source)
!   nbw, n - C ints
subroutine launch_compute_hh_dotp_c_kernel_complex(bcast_buffer_dev, hh_dot_dev, nbw, n) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: bcast_buffer_dev
  integer(c_int64_t), value, intent(in) :: hh_dot_dev
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nbw, n
end subroutine launch_compute_hh_dotp_c_kernel_complex
! Explicit interface to the C-bound routine
! launch_extract_hh_tau_c_kernel_complex.
! Every argument is passed by value:
!   hh, hh_tau - 64-bit integers; presumably device addresses
!       (TODO confirm against the CUDA source)
!   nb, n      - C ints
!   is_zero    - C int; presumably used as a flag (TODO confirm)
subroutine launch_extract_hh_tau_c_kernel_complex(hh, hh_tau, nb, n, is_zero) bind(c)
  use, intrinsic :: iso_c_binding, only: c_int, c_int64_t
  implicit none
  ! c_int64_t replaces the non-standard integer*8 with the same 8-byte width
  integer(c_int64_t), value, intent(in) :: hh
  integer(c_int64_t), value, intent(in) :: hh_tau
  ! c_int instead of default integer: only c_int is guaranteed to match C int
  integer(c_int), value, intent(in)     :: nb, n
  integer(c_int), value, intent(in)     :: is_zero
end subroutine launch_extract_hh_tau_c_kernel_complex
end interface