Commit 3ea32c22 authored by Andreas Marek

Merge NVIDIA GPU sources by hand

parent 9ad68bd0
......@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
src/ftimings/papi.c
endif
if WITH_GPU_VERSION
libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif
if WITH_REAL_GENERIC_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif
......@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif
#if WITH_AVX_SANDYBRIDGE
# libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
# src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif
.cu.lo:
NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@
# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
......@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
......@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
CLEANFILES = \
elpa-generated.h \
elpa/elpa-generated.h \
elpa1_test_real.sh \
elpa1_test_complex.sh \
elpa2_test_real.sh \
......
......@@ -83,6 +83,7 @@ AC_PROG_CXX
dnl variables needed for the tests
N="0"
dnl these tests will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs,
......@@ -343,7 +344,7 @@ else
AC_MSG_RESULT([${have_blas}])
if test x"${have_blas}" = x"no" ; then
AC_MSG_ERROR([could not link with blas: specify path])
fi
dnl now lapack
AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no])
......@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for real case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_real_kernel}" = x"no" ; then
......@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
if test x"${install_real_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block4}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_real_avx_block6}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
......@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
AS_HELP_STRING([--with-$1],
[only compile $2 for complex case]),
[],[with_option=no])
if test x"${with_option}" = x"yes" ; then
if test x"${use_specific_complex_kernel}" = x"no" ; then
......@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
if test x"${install_complex_sse}" = x"yes" ; then
if test x"${can_compile_sse}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block1}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
if test x"${install_complex_avx_block2}" = x"yes" ; then
if test x"${can_compile_avx}" = x"no" ; then
AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
fi
fi
fi
AC_MSG_NOTICE([$1 will be the only compiled kernel for real case])
......@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi
dnl check whether GPU version is requested
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"
AC_MSG_CHECKING(whether GPU support is requested)
AC_ARG_ENABLE(gpu-support,[AS_HELP_STRING([--enable-gpu-support],
[build ELPA2 with GPU support (no CPU version available)])],
want_gpu="yes", want_gpu="no")
#AC_ARG_WITH([GPU-SUPPORT], [AS_HELP_STRING([--with-GPU-SUPPORT],
# [build ELPA2 with GPU-support ( no CPU version available)])],
# [with_gpu=yes],[with_gpu=no])
AC_MSG_RESULT([${want_gpu}])
AC_ARG_WITH([cuda-path],[AS_HELP_STRING([--with-cuda-path=PATH],[prefix where CUDA is installed @<:@default=auto@:>@])],
[CUDA_INSTALL_PATH=$withval], [with_cuda=auto])
AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix where CUDA SDK is installed @<:@default=auto@:>@])],
[CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])
#AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack])
#AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack])
#FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS"
#LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS"
dnl setup nvcc flags
if test x"${want_gpu}" = x"yes" ; then
AC_LANG_PUSH([C])
CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
NVCC="nvcc"
AC_SUBST(NVCC)
AC_SUBST(NVCCFLAGS)
dnl check whether nvcc compiler is found
AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
if test x"${nvcc_found}" = x"no" ; then
AC_MSG_ERROR([nvcc not found])
fi
dnl check whether we find cublas
AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
if test x"${have_cublas}" = x"no"; then
AC_MSG_ERROR([Could not link cublas])
fi
AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
if test x"${have_cudart}" = x"no"; then
AC_MSG_ERROR([Could not link cudart])
fi
AC_LANG_POP([C])
AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])
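The AC_SEARCH_LIBS checks above only verify that the cublas and cudart symbols resolve at link time; they do not exercise the GPU. The conftest program autoconf generates for such a check has roughly the following shape (illustration only, not part of this commit):

/* Rough shape of autoconf's AC_SEARCH_LIBS link test: declare each symbol
 * with a dummy prototype and reference it, so success only proves that
 * -lcublas / -lcudart can satisfy the link. */
char cublasDgemm (void);
char cudaMemcpy (void);

int main (void)
{
  return (int) (cublasDgemm () + cudaMemcpy ());
}

If either link fails, configure aborts with the corresponding AC_MSG_ERROR.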
AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then
......@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
if test "${can_compile_avx}" = "no" ; then
AC_MSG_WARN([Could not compile AVX instructions])
fi
......@@ -6,8 +6,9 @@
#define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
#define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
#define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
#define ELPA2_REAL_KERNEL_GPU 9
#define ELPA2_NUMBER_OF_REAL_KERNELS 8
#define ELPA2_NUMBER_OF_REAL_KERNELS 9
#define ELPA2_COMPLEX_KERNEL_GENERIC 1
......@@ -17,5 +18,6 @@
#define ELPA2_COMPLEX_KERNEL_SSE 5
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
#define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
#define ELPA2_COMPLEX_KERNEL_GPU 8
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
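With the GPU kernels appended as IDs 9 (real) and 8 (complex), the two kernel-count macros are bumped accordingly. A minimal sketch of a range check against these macros (hypothetical helpers, not part of this commit; kernel IDs are assumed to be 1-based, as for ELPA2_COMPLEX_KERNEL_GENERIC):

/* Hypothetical range checks using the macros above; IDs are 1-based. */
static int real_kernel_id_is_valid (int kernel)
{
  return kernel >= 1 && kernel <= ELPA2_NUMBER_OF_REAL_KERNELS;     /* ELPA2_REAL_KERNEL_GPU == 9 is now in range */
}

static int complex_kernel_id_is_valid (int kernel)
{
  return kernel >= 1 && kernel <= ELPA2_NUMBER_OF_COMPLEX_KERNELS;  /* ELPA2_COMPLEX_KERNEL_GPU == 8 is now in range */
}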
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <cuComplex.h>
// Reset a reduction block
// Limitation: the thread-block size must be a divisor of the reduction block's size
__device__ void reset_shared_block_c ( double * s_block, int b_size)
{
int i, t_idx, s_chunk ;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++)
s_block[i] = 0.0 ;
__syncthreads();
}
// Reset 2 reduction blocks without an explicit synchronization at the end
// Limitation: the thread-block size must be a divisor of the reduction block's size
__device__ void reset_shared_block_pair_c( double *s_block_1, double *s_block_2, int b_size)
{
int i, t_idx, s_chunk;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
{ s_block_1[i] = 0.0 ;
s_block_2[i] = 0.0 ;
}
}
// Reset a reduction block
// Limitation: the thread-block size must be a divisor of the reduction block's size
__device__ void reset_shared_block_c_complex ( cuDoubleComplex * s_block, int b_size)
{
int i, t_idx, s_chunk ;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1) ; i < (t_idx * s_chunk); i++)
{ s_block[i].x = 0.0 ;
s_block[i].y = 0.0 ;}
__syncthreads();
}
// Reset 2 reduction blocks without an explicit synchronization at the end
// Limitation: the thread-block size must be a divisor of the reduction block's size
__device__ void reset_shared_block_pair_c_complex( cuDoubleComplex *s_block_1, cuDoubleComplex *s_block_2, int b_size)
{
int i, t_idx, s_chunk;
t_idx = threadIdx.x;
s_chunk = b_size / blockDim.x;
for(i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
{ s_block_1[i].x = 0.0 ;
s_block_2[i].x= 0.0 ;
s_block_1[i].y = 0.0 ;
s_block_2[i].y= 0.0 ;
}
}
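// Tree-reduce the 128 cuDoubleComplex entries of s_block into s_block[0].
// Only the first 32 threads do the work; the steps after the initial fold rely
// on warp-synchronous execution, so no __syncthreads() is issued between them.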
__device__ void warp_reduce_complex( cuDoubleComplex *s_block)
{
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
if (t_idx < 32)
{
s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 32]) , cuCadd( s_block[t_idx + 64], s_block[t_idx + 96]) );
if (t_idx < 8)
{
s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 8] ) , cuCadd( s_block[t_idx + 16] , s_block[t_idx + 24] ) );
}
if (t_idx < 4)
{
s_block[t_idx] = cuCadd(s_block[t_idx] , s_block[t_idx + 4]) ;
}
if (t_idx < 1)
{
s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx],s_block[t_idx + 1] ) , cuCadd( s_block[t_idx +2] , s_block[t_idx + 3] ) );
}
}
}
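A minimal usage sketch for the reduction above (hypothetical kernel, not part of this commit): the shared block holds exactly 128 entries and the kernel is launched with 128 threads per block, so unused tail entries are zero-filled before reducing.

__global__ void block_sum_complex_sketch (const cuDoubleComplex *in, cuDoubleComplex *out, int n)
{
  __shared__ cuDoubleComplex s_block[128];
  int t_idx = threadIdx.x;                           // launched with 128 threads per block
  int g_idx = blockIdx.x * 128 + t_idx;

  s_block[t_idx] = make_cuDoubleComplex (0.0, 0.0);  // pad the tail of the last block
  if (g_idx < n)
    s_block[t_idx] = in[g_idx];
  __syncthreads ();

  warp_reduce_complex (s_block);                     // block sum ends up in s_block[0]

  if (t_idx == 0)
    out[blockIdx.x] = s_block[0];
}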
__global__ void my_pack_c_kernel_complex(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst)
{
int b_id, t_id ;
int dst_ind ;
b_id = blockIdx.y;
t_id = threadIdx.x;
dst_ind = b_id * stripe_width + t_id;
if (dst_ind < max_idx)
{
// dimension of dst - l_nev, nblk
// dimension of src - stripe_width,a_dim2,stripe_count
dst[dst_ind + (l_nev*blockIdx.x)].x = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].x;
dst[dst_ind + (l_nev*blockIdx.x)].y = src[t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2)].y;
}
}
__global__ void my_unpack_c_kernel_complex( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* src, cuDoubleComplex* dst)
{
int b_id, t_id ;
int src_ind;
b_id = blockIdx.y;
t_id = threadIdx.x;
src_ind = b_id * stripe_width + t_id;
if (src_ind < max_idx)
{ dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].x = src[ src_ind + (blockIdx.x) *l_nev ].x;
dst[ t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 )].y = src[ src_ind + (blockIdx.x) *l_nev ].y;
}
}
__global__ void extract_hh_tau_c_kernel_complex(cuDoubleComplex* hh, cuDoubleComplex* hh_tau, const int nbw, const int n, int val)
{
int h_idx ;
h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;
if (h_idx < n)
{
//dimension of hh - (nbw, max_blk_size)
//dimension of hh_tau - max_blk_size
hh_tau[h_idx] = hh[h_idx * nbw] ;
// Replace the first element in the HH reflector with 1.0 or 0.0
if( val == 0)
{
hh[(h_idx * nbw)].x = 1.0;
hh[h_idx *nbw].y= 0.0;
}
else
{
hh[(h_idx * nbw)].x = 0.0;
hh[h_idx*nbw].y =0.0;
}
}
}
__global__ void compute_hh_dotp_c_kernel_complex(cuDoubleComplex* hh, cuDoubleComplex* v_dot, const int nbw, const int n)
{
__shared__ cuDoubleComplex hh_s[128] ;
int t_idx, v_idx;
// The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
v_idx = blockIdx.x ;
// The thread index indicates the position within the two HH reflectors
t_idx = threadIdx.x ;
if (t_idx > 0)
{
hh_s[t_idx] = cuCmul(cuConj(hh[t_idx + v_idx * nbw]), hh[ (t_idx - 1) + (v_idx +1)* nbw]) ;
}
else
{
hh_s[t_idx].x = 0.0 ;
hh_s[t_idx].y = 0.0;
}
// Compute the dot product using a fast reduction
warp_reduce_complex(hh_s);
__syncthreads();
if(t_idx == 0)
{
v_dot[v_idx] = hh_s[0] ;
}
}
extern "C" void launch_my_pack_c_kernel_complex(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* a_dev, cuDoubleComplex* row_group_dev)
{
dim3 grid_size;
grid_size = dim3(row_count, stripe_count, 1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to mypack kernel: %s, %d\n",cudaGetErrorString(err), err);
my_pack_c_kernel_complex<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n my pack_kernel failed %s \n",cudaGetErrorString(err) );
}
}
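Each launch wrapper in this file repeats the same synchronize / cudaGetLastError / printf pattern before and after the kernel launch. For illustration only (a hypothetical helper, not introduced by this commit), the pattern could be factored out like this:

/* Hypothetical helper capturing the repeated check-after-launch pattern
 * of the launch_* wrappers in this file. */
#define CHECK_LAST_CUDA_ERROR(msg)                                      \
  do {                                                                  \
    cudaError_t e_ = cudaGetLastError ();                               \
    if (e_ != cudaSuccess)                                              \
      printf ("%s: %s, %d\n", (msg), cudaGetErrorString (e_), e_);      \
  } while (0)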
extern "C" void launch_compute_hh_dotp_c_kernel_complex(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_dot_dev,const int nbw,const int n)
{
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to compute_hh kernel: %s, %d\n",cudaGetErrorString(err), err);
compute_hh_dotp_c_kernel_complex<<< n-1, nbw >>>(bcast_buffer_dev, hh_dot_dev, nbw, n);
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n compute _kernel failed %s \n",cudaGetErrorString(err) );
}
}
extern "C" void launch_extract_hh_tau_c_kernel_complex(cuDoubleComplex* bcast_buffer_dev, cuDoubleComplex* hh_tau_dev, const int nbw, const int n , const int is_zero)
{
int grid_size;
grid_size = 1 + (n - 1) / 256;
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to extract kernel: %s, %d\n",cudaGetErrorString(err), err);
extract_hh_tau_c_kernel_complex<<<grid_size,256>>>(bcast_buffer_dev,hh_tau_dev, nbw, n, is_zero);
err = cudaGetLastError();
if ( err!= cudaSuccess)
{
printf("\n extract _kernel failed %s \n",cudaGetErrorString(err) );
}
}
extern "C" void launch_my_unpack_c_kernel_complex( const int row_count, const int n_offset, const int max_idx, const int stripe_width,const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex* row_group_dev, cuDoubleComplex* a_dev)
{
dim3 grid_size;
grid_size = dim3(row_count, stripe_count, 1);
cudaDeviceSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) printf("error prior to unpack kernel: %s, %d\n",cudaGetErrorString(err), err);
my_unpack_c_kernel_complex<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev , a_dev);
err = cudaGetLastError();
if ( err != cudaSuccess)
{
printf("\n my_unpack_c_kernel failed %s \n",cudaGetErrorString(err) );
}
}
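// Real-valued counterpart of warp_reduce_complex above: tree-reduce the 128
// double entries of s_block into s_block[0] with the same warp-synchronous pattern.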
__device__ void warp_reduce_c( double *s_block)
{
int t_idx ;
t_idx = threadIdx.x;
__syncthreads();
if (t_idx < 32)
{
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96] ;
if (t_idx < 8)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24];
if (t_idx < 4)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4];
if (t_idx < 1)
s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
}
}
__global__ void my_pack_c_kernel(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
{
int b_id, t_id ;
int dst_ind ;
b_id = blockIdx.y;
t_id = threadIdx.x;
dst_ind = b_id * stripe_width + t_id;
if (dst_ind < max_idx)
{
// dimension of dst - l_nev, nblk
// dimension of src - stripe_width,a_dim2,stripe_count
*(dst + dst_ind + (l_nev*blockIdx.x) ) = *(src + t_id + (stripe_width*(n_offset + blockIdx.x)) + ( b_id *stripe_width*a_dim2 ));
}
}
__global__ void my_unpack_c_kernel( const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double* src, double* dst)
{
int b_id, t_id ;
int src_ind;
b_id = blockIdx.y;
t_id = threadIdx.x;
src_ind = b_id * stripe_width + t_id;
if (src_ind < max_idx)
*(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2 ))) = *(src + src_ind + (blockIdx.x) *l_nev );
}
__global__ void compute_kernel_reduce( cuDoubleComplex* a_dev, int lda , int n ,int nbw , cuDoubleComplex *h1_dev )
{
int t_id ;
int st_ind;
t_id = threadIdx.x;
st_ind = (t_id*(t_id+1))/2;
if(t_id< n)
{
for(int i =0;i<=t_id;i++)
{
h1_dev[st_ind + i] = a_dev[t_id *lda + i ] ;
}
}
__syncthreads();
}
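compute_kernel_reduce copies, for each thread t_id < n, the first t_id+1 entries of column t_id of a_dev into a packed triangular buffer h1_dev, with column t starting at offset t(t+1)/2. A host-side reference of the same packing (hypothetical sketch, not part of this commit):

/* Hypothetical CPU reference for the packed-triangular layout used by
 * compute_kernel_reduce: column t contributes entries 0..t and starts at
 * offset t*(t+1)/2 in the packed buffer. */
static void pack_triangle_host (const cuDoubleComplex *a, int lda, int n, cuDoubleComplex *packed)
{
  for (int t = 0; t < n; t++)
  {
    int st_ind = (t * (t + 1)) / 2;
    for (int i = 0; i <= t; i++)
      packed[st_ind + i] = a[t * lda + i];
  }
}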
__global__ void compute_kernel_reduce_1( cuDoubleComplex* a_dev, int lda , int n, cuDoubleComplex *h1_dev )
{
int t_id ;
int st_ind;
t_id = threadIdx.x;
st_ind = (t_id*(t_id+1))/2;
if(t_id< n)
{
for(int i =0;i<=t_id;i++)
{
a_dev[t_id *lda + i ] = h1_dev[st_ind + i];
a_dev[ (i-1)*lda + t_id ] = cuConj(a_dev[ t_id *lda + i-1]) ;
}
}
__syncthreads();
}
__global__ void dot_product_c_kernel( cuDoubleComplex* hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr)
{
int t_id ;
__shared__ cuDoubleComplex x_dev_temp[128];
__shared__ cuDoubleComplex x_val;
//b_id = blockIdx.y;
t_id = threadIdx.x;
if(t_id<nr)
x_dev_temp[t_id] = cuCmul( cuConj(hs_dev[t_id]), hv_new_dev[t_id]) ;
__syncthreads();
if(t_id==0)
{
for(int i=1;i<nr;i++)
x_dev_temp[t_id] = cuCadd(x_dev_temp[t_id],x_dev_temp[t_id +i]);
}
__syncthreads();
if(t_id ==0)
{
x_val = cuCmul(x_dev_temp[t_id], tau_new_dev);
x_dev[0] = x_val;
}
__syncthreads();
}
__global__ void dot_product_c_kernel_1( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex* hv_new_dev, cuDoubleComplex* x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nb, int nr , int ns )
{
int t_id = threadIdx.x;
int i;
if((t_id>0 )&& (t_id < nb))
{
h_dev[t_id] = cuCsub(h_dev[t_id], cuCmul(x_dev[0],hv_dev[t_id]));
for(i=0;i<nr;i++)
{
ab_dev[ i+nb-t_id + (t_id+ns-1)*2*nb ] = cuCsub(cuCsub(ab_dev[ i+nb-t_id + (t_id+ns-1)*2*nb],cuCmul(hv_new_dev[i],cuConj(h_dev[t_id])) ),cuCmul(hs_dev[i], cuConj(hv_dev[t_id])));
}
}
__syncthreads();
}
__global__ void double_hh_transform_kernel( cuDoubleComplex* ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex *hv_dev, int nb, int ns )
{
int t_id = threadIdx.x;
if((t_id>0 )&& (t_id < nb))
{
ab_dev[ nb-t_id + (t_id+ns-1)*2*nb ] = cuCsub(ab_dev[ nb-t_id + (t_id+ns-1)*2*nb],cuCmul(hs_dev[0], cuConj(hv_dev[t_id])));
}
__syncthreads();
}
__global__ void double_hh_transform_kernel_2( cuDoubleComplex* ab_dev, cuDoubleComplex *hd_dev, cuDoubleComplex *hv_dev, int nc, int ns , int nb )
{
int t_id = threadIdx.x;
if(t_id < nc)
{
ab_dev[ t_id + (ns-1)*2*nb ] = cuCsub(cuCsub(ab_dev[ t_id + (ns-1)*2*nb],cuCmul(hd_dev[ t_id], cuConj(hv_dev[0]))) , cuCmul(hv_dev[ t_id], cuConj(hd_dev[0])));
}
__syncthreads();
}
__global__ void extract_hh_tau_c_kernel(double* hh, double* hh_tau, const int nbw, const int n, int val)
{
int h_idx ;
h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;
if (h_idx < n)
{
//dimension of hh - (nbw, max_blk_size)