Commit 3ea32c22

Merge NVIDIA GPU sources by hand

authored Mar 16, 2015 by Andreas Marek
parent 9ad68bd0

Changes: 10 files
Makefile.am
@@ -30,6 +30,10 @@ if HAVE_DETAILED_TIMINGS
  src/ftimings/papi.c
endif

if WITH_GPU_VERSION
  libelpa@SUFFIX@_la_SOURCES += src/interface_cuda.F90 src/interface_c_kernel.F90 src/ev_tridi_band_gpu_c_v2.cu src/cuUtils.cu
endif

if WITH_REAL_GENERIC_KERNEL
  libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real.f90
endif
@@ -82,10 +86,8 @@ if WITH_COMPLEX_AVX_BLOCK2_KERNEL
  libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_complex_sse-avx_2hv.cpp
endif

#if WITH_AVX_SANDYBRIDGE
#  libelpa@SUFFIX@_la_SOURCES += src/elpa2_kernels/elpa2_kernels_real_sse-avx_2hv.c \
#                                src/elpa2_kernels/elpa2_kernels_complex_sse-avx_1hv.cpp
#endif

.cu.lo:
	NVCC="$(NVCC)" libtool --mode=compile --tag=CC $(top_srcdir)/nvcc_wrap $(NVCCFLAGS) $(LDFLAGS) -c $< -o $@

# install any .mod files in the include/ dir
elpa_includedir = $(includedir)/elpa@SUFFIX@-@PACKAGE_VERSION@
@@ -155,6 +157,7 @@ elpa1_test_real_with_c@SUFFIX@_LDADD = $(build_lib)

elpa2_test_real@SUFFIX@_SOURCES = test/test_real2.F90 $(shared_sources) $(redirect_sources)
elpa2_test_real@SUFFIX@_LDFLAGS = -static
elpa2_test_real@SUFFIX@_LDADD = $(build_lib)

elpa2_test_real_default_kernel@SUFFIX@_SOURCES = test/test_real2_default_kernel.F90 $(shared_sources) $(redirect_sources)
@@ -258,7 +261,7 @@ elpa1.i: $(top_srcdir)/src/elpa1.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@

CLEANFILES = \
  elpa-generated.h \
  elpa/elpa-generated.h \
  elpa1_test_real.sh \
  elpa1_test_complex.sh \
  elpa2_test_real.sh \
configure.ac
@@ -83,6 +83,7 @@ AC_PROG_CXX

dnl variables needed for the tests
N="0"

dnl these test will cause an abort of configure if not
dnl successful. However, if MKL is found then the blas, blacs,
@@ -343,7 +344,7 @@ else
AC_MSG_RESULT([${have_blas}])
if test x"${have_blas}" = x"no" ; then
  AC_MSG_ERROR([could not link with blas: specify path])
fi

dnl now lapack
AC_SEARCH_LIBS([dlarrv],[lapack],[have_lapack=yes],[have_lapack=no])
@@ -494,7 +495,7 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
    AS_HELP_STRING([--with-$1],
                   [only compile $2 for real case]),
    [],[with_option=no])
  if test x"${with_option}" = x"yes" ; then
    if test x"${use_specific_real_kernel}" = x"no" ; then
@@ -515,25 +516,25 @@ AC_DEFUN([DEFINE_OPTION_REAL_KERNEL],[
      if test x"${install_real_sse}" = x"yes" ; then
        if test x"${can_compile_sse}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      if test x"${install_real_avx_block2}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      if test x"${install_real_avx_block4}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      if test x"${install_real_avx_block6}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
@@ -583,7 +584,7 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
    AS_HELP_STRING([--with-$1],
                   [only compile $2 for complex case]),
    [],[with_option=no])
  if test x"${with_option}" = x"yes" ; then
    if test x"${use_specific_complex_kernel}" = x"no" ; then
@@ -604,19 +605,19 @@ AC_DEFUN([DEFINE_OPTION_COMPLEX_KERNEL],[
      if test x"${install_complex_sse}" = x"yes" ; then
        if test x"${can_compile_sse}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      if test x"${install_complex_avx_block1}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      if test x"${install_complex_avx_block2}" = x"yes" ; then
        if test x"${can_compile_avx}" = x"no" ; then
          AC_MSG_ERROR([$2 kernel was set, but cannot be compiled!])
        fi
      fi
      AC_MSG_NOTICE([$1 will be the only compiled kernel for real case])
@@ -656,6 +657,65 @@ if test x"${can_use_iso_fortran_env}" = x"yes" ; then
  AC_DEFINE([HAVE_ISO_FORTRAN_ENV],[1],[can use module iso_fortran_env])
fi

dnl check whether GPU version is requested
CUDA_INSTALL_PATH="/usr/local/cuda/"
#CUDA_SDK_INSTALL_PATH="/usr/local/NVIDIA_GPU_Computing_SDK"

AC_MSG_CHECKING(whether GPU support is requested)
AC_ARG_ENABLE(gpu-support,[AS_HELP_STRING([--enable-gpu-support],
              [build ELPA2 with GPU-support ( no CPU version available)])],
              want_gpu="yes", want_gpu="no")
#AC_ARG_WITH([GPU-SUPPORT], [AS_HELP_STRING([--with-GPU-SUPPORT],
#            [build ELPA2 with GPU-support ( no CPU version available)])],
#            [with_gpu=yes],[with_gpu=no])
AC_MSG_RESULT([${want_gpu}])

AC_ARG_WITH([cuda-path],[AS_HELP_STRING([--with-cuda-path=PATH],[prefix where CUDA is installed @<:@default=auto@:>@])],
            [CUDA_INSTALL_PATH=$withval], [with_cuda=auto])
AC_ARG_WITH([cuda-sdk-path],[AS_HELP_STRING([--with-cuda-sdk-path=PATH],[prefix where CUDA SDK is installed @<:@default=auto@:>@])],
            [CUDA_SDK_INSTALL_PATH=$withval],[with_cuda_sdk=auto])

#AC_ARG_VAR([SCALAPACK_LDFLAGS],[Extra LDFLAGS necessary to link a program with Scalapack])
#AC_ARG_VAR([SCALAPACK_FCFLAGS],[Extra FCFLAGS necessary to compile a Fortran program with Scalapack])
#FCFLAGS="$FCFLAGS $SCALAPACK_FCFLAGS"
#LDFLAGS="$LDFLAGS $SCALAPACK_LDFLAGS"

dnl setup nvcc flags
if test x"${want_gpu}" = x"yes" ; then
  AC_LANG_PUSH([C])
  CUDA_CFLAGS="$CUDA_CFLAGS -arch sm_35 -I$CUDA_INSTALL_PATH/include"
  LDFLAGS="$LDFLAGS -L$CUDA_INSTALL_PATH/lib64"
  NVCCFLAGS="$NVCCFLAGS $CUDA_CFLAGS $CUDA_LDFLAGS"
  NVCC="nvcc"
  AC_SUBST(NVCC)
  AC_SUBST(NVCCFLAGS)

  dnl check whether nvcc compiler is found
  AC_CHECK_PROG(nvcc_found,nvcc,yes,no)
  if test x"${nvcc_found}" = x"no" ; then
    AC_MSG_ERROR([nvcc not found])
  fi

  dnl check whether we find cublas
  AC_SEARCH_LIBS([cublasDgemm],[cublas],[have_cublas=yes],[have_cublas=no])
  if test x"${have_cublas}" = x"no"; then
    AC_MSG_ERROR([Could not link cublas])
  fi
  AC_SEARCH_LIBS([cudaMemcpy],[cudart],[have_cudart=yes],[have_cudart=no])
  if test x"${have_cudart}" = x"no"; then
    AC_MSG_ERROR([Could not link cudart])
  fi

  AC_LANG_POP([C])
  AC_DEFINE([WITH_GPU_VERSION],[1],[build with GPU support])
fi
AM_CONDITIONAL([WITH_GPU_VERSION],[test x"$want_gpu" = x"yes"])

AM_CONDITIONAL([WITH_REAL_GENERIC_KERNEL],[test x"$install_real_generic" = x"yes"])
if test x"${install_real_generic}" = x"yes" ; then
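For reference, each AC_SEARCH_LIBS check added above (e.g. AC_SEARCH_LIBS([cublasDgemm],[cublas],...)) succeeds or fails by linking a tiny generated C program that merely references the symbol. A hand-written equivalent of that link test looks roughly like the sketch below (an illustration of what autoconf generates, not part of this commit):

/* conftest-style probe: declare cublasDgemm with a dummy prototype and
   call it; linking succeeds only if some library in the search list
   (here -lcublas) actually provides the symbol. */
char cublasDgemm();

int main(void)
{
    return cublasDgemm();
}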
@@ -794,4 +854,3 @@ grep "^ *!c>" $srcdir/src/elpa_c_interface.F90 | sed 's/^ *!c>//;' > elpa/elpa_g
if test "${can_compile_avx}" = "no" ; then
  AC_MSG_WARN([Could not compile AVX instructions])
fi
elpa/elpa_kernel_constants.h
@@ -6,8 +6,9 @@
 #define ELPA2_REAL_KERNEL_AVX_BLOCK2 6
 #define ELPA2_REAL_KERNEL_AVX_BLOCK4 7
 #define ELPA2_REAL_KERNEL_AVX_BLOCK6 8
+#define ELPA2_REAL_KERNEL_GPU 9
-#define ELPA2_NUMBER_OF_REAL_KERNELS 8
+#define ELPA2_NUMBER_OF_REAL_KERNELS 9
 #define ELPA2_COMPLEX_KERNEL_GENERIC 1
@@ -17,5 +18,6 @@
 #define ELPA2_COMPLEX_KERNEL_SSE 5
 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK1 6
 #define ELPA2_COMPLEX_KERNEL_AVX_BLOCK2 7
+#define ELPA2_COMPLEX_KERNEL_GPU 8
-#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 7
+#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 8
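The count bumps matter because code that selects a kernel at run time can range-check a requested kernel ID against ELPA2_NUMBER_OF_*_KERNELS. A minimal sketch of such a check is shown below; the function check_real_kernel is hypothetical and not part of this commit:

#include "elpa/elpa_kernel_constants.h"

/* Hypothetical validity check: kernel IDs are 1-based, so the new
   ELPA2_REAL_KERNEL_GPU (9) only passes because
   ELPA2_NUMBER_OF_REAL_KERNELS was bumped from 8 to 9. */
static int check_real_kernel(int kernel)
{
    return kernel >= 1 && kernel <= ELPA2_NUMBER_OF_REAL_KERNELS;
}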
src/cuUtils.cu (new file, mode 100644)
#include <cuda_runtime.h>
#include <stdlib.h>
#include <stdio.h>
#include <cuComplex.h>

// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
__device__ void reset_shared_block_c(double *s_block, int b_size)
{
    int i, t_idx, s_chunk;
    t_idx = threadIdx.x;
    s_chunk = b_size / blockDim.x;
    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
        s_block[i] = 0.0;
    __syncthreads();
}

// Reset 2 reduction blocks without an explicit synchronization at the end
// Limitation: the thread-block size must be a divider of the reduction block's size
__device__ void reset_shared_block_pair_c(double *s_block_1, double *s_block_2, int b_size)
{
    int i, t_idx, s_chunk;
    t_idx = threadIdx.x;
    s_chunk = b_size / blockDim.x;
    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
    {
        s_block_1[i] = 0.0;
        s_block_2[i] = 0.0;
    }
}

// Reset a reduction block
// Limitation: the thread-block size must be a divider of the reduction block's size
__device__ void reset_shared_block_c_complex(cuDoubleComplex *s_block, int b_size)
{
    int i, t_idx, s_chunk;
    t_idx = threadIdx.x;
    s_chunk = b_size / blockDim.x;
    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
    {
        s_block[i].x = 0.0;
        s_block[i].y = 0.0;
    }
    __syncthreads();
}

// Reset 2 reduction blocks without an explicit synchronization at the end
// Limitation: the thread-block size must be a divider of the reduction block's size
__device__ void reset_shared_block_pair_c_complex(cuDoubleComplex *s_block_1, cuDoubleComplex *s_block_2, int b_size)
{
    int i, t_idx, s_chunk;
    t_idx = threadIdx.x;
    s_chunk = b_size / blockDim.x;
    for (i = ((t_idx - 1) * s_chunk + 1); i < (t_idx * s_chunk); i++)
    {
        s_block_1[i].x = 0.0;
        s_block_2[i].x = 0.0;
        s_block_1[i].y = 0.0;
        s_block_2[i].y = 0.0;
    }
}

__device__ void warp_reduce_complex(cuDoubleComplex *s_block)
{
    int t_idx;
    t_idx = threadIdx.x;
    __syncthreads();

    if (t_idx < 32)
    {
        s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx], s_block[t_idx + 32]),
                                cuCadd(s_block[t_idx + 64], s_block[t_idx + 96]));
        if (t_idx < 8)
        {
            s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx], s_block[t_idx + 8]),
                                    cuCadd(s_block[t_idx + 16], s_block[t_idx + 24]));
        }
        if (t_idx < 4)
        {
            s_block[t_idx] = cuCadd(s_block[t_idx], s_block[t_idx + 4]);
        }
        if (t_idx < 1)
        {
            s_block[t_idx] = cuCadd(cuCadd(s_block[t_idx], s_block[t_idx + 1]),
                                    cuCadd(s_block[t_idx + 2], s_block[t_idx + 3]));
        }
    }
}
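
// Note on warp_reduce_complex above: it collapses 128 partial sums held in
// shared memory down to s_block[0] (128 -> 32 -> 8 -> 4 -> 1). After the
// initial __syncthreads() only threads 0..31 of a single warp participate,
// so the code appears to rely on implicit warp-synchronous execution rather
// than further barriers -- safe on the sm_35-class GPUs configure targets,
// but not on later architectures with independent thread scheduling.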

__global__ void my_pack_c_kernel_complex(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex *src, cuDoubleComplex *dst)
{
    int b_id, t_id;
    int dst_ind;
    b_id = blockIdx.y;
    t_id = threadIdx.x;
    dst_ind = b_id * stripe_width + t_id;

    if (dst_ind < max_idx)
    {
        // dimension of dst - lnev, nblk
        // dimension of src - stripe_width,a_dim2,stripe_count
        dst[dst_ind + (l_nev * blockIdx.x)].x = src[t_id + (stripe_width * (n_offset + blockIdx.x)) + (b_id * stripe_width * a_dim2)].x;
        dst[dst_ind + (l_nev * blockIdx.x)].y = src[t_id + (stripe_width * (n_offset + blockIdx.x)) + (b_id * stripe_width * a_dim2)].y;
    }
}

__global__ void my_unpack_c_kernel_complex(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex *src, cuDoubleComplex *dst)
{
    int b_id, t_id;
    int src_ind;
    b_id = blockIdx.y;
    t_id = threadIdx.x;
    src_ind = b_id * stripe_width + t_id;

    if (src_ind < max_idx)
    {
        dst[t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2)].x = src[src_ind + (blockIdx.x) * l_nev].x;
        dst[t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2)].y = src[src_ind + (blockIdx.x) * l_nev].y;
    }
}

__global__ void extract_hh_tau_c_kernel_complex(cuDoubleComplex *hh, cuDoubleComplex *hh_tau, const int nbw, const int n, int val)
{
    int h_idx;
    h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;

    if (h_idx < n)
    {
        // dimension of hh - (nbw, max_blk_size)
        // dimension of hh_tau - max_blk_size
        hh_tau[h_idx] = hh[h_idx * nbw];
        // Replace the first element in the HH reflector with 1.0 or 0.0
        if (val == 0)
        {
            hh[(h_idx * nbw)].x = 1.0;
            hh[h_idx * nbw].y = 0.0;
        }
        else
        {
            hh[(h_idx * nbw)].x = 0.0;
            hh[h_idx * nbw].y = 0.0;
        }
    }
}

__global__ void compute_hh_dotp_c_kernel_complex(cuDoubleComplex *hh, cuDoubleComplex *v_dot, const int nbw, const int n)
{
    __shared__ cuDoubleComplex hh_s[128];
    int t_idx, v_idx;

    // The vector index (v_idx) identifies the pair of HH reflectors from which the dot product is computed
    v_idx = blockIdx.x;
    // The thread index indicates the position within the two HH reflectors
    t_idx = threadIdx.x;

    if (t_idx > 0)
    {
        hh_s[t_idx] = cuCmul(cuConj(hh[t_idx + v_idx * nbw]), hh[(t_idx - 1) + (v_idx + 1) * nbw]);
    }
    else
    {
        hh_s[t_idx].x = 0.0;
        hh_s[t_idx].y = 0.0;
    }

    // Compute the dot product using a fast reduction
    warp_reduce_complex(hh_s);
    __syncthreads();

    if (t_idx == 0)
    {
        v_dot[v_idx] = hh_s[0];
    }
}

extern "C" void launch_my_pack_c_kernel_complex(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex *a_dev, cuDoubleComplex *row_group_dev)
{
    dim3 grid_size;
    grid_size = dim3(row_count, stripe_count, 1);

    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error prior to mypack kernel: %s, %d\n", cudaGetErrorString(err), err);

    my_pack_c_kernel_complex<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev);

    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("\n my pack_kernel failed %s \n", cudaGetErrorString(err));
    }
}
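
// All four launch_* wrappers in this file follow the same pattern: a
// cudaDeviceSynchronize() followed by cudaGetLastError() drains and reports
// any error still pending from earlier asynchronous work, the kernel is
// launched, and a second cudaGetLastError() immediately after the <<<...>>>
// catches launch-configuration failures (e.g. an invalid grid or block
// size).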

extern "C" void launch_compute_hh_dotp_c_kernel_complex(cuDoubleComplex *bcast_buffer_dev, cuDoubleComplex *hh_dot_dev, const int nbw, const int n)
{
    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error prior to compute_hh kernel: %s, %d\n", cudaGetErrorString(err), err);

    compute_hh_dotp_c_kernel_complex<<<n - 1, nbw>>>(bcast_buffer_dev, hh_dot_dev, nbw, n);

    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("\n compute _kernel failed %s \n", cudaGetErrorString(err));
    }
}

extern "C" void launch_extract_hh_tau_c_kernel_complex(cuDoubleComplex *bcast_buffer_dev, cuDoubleComplex *hh_tau_dev, const int nbw, const int n, const int is_zero)
{
    int grid_size;
    grid_size = 1 + (n - 1) / 256;

    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error prior to extract kernel: %s, %d\n", cudaGetErrorString(err), err);

    extract_hh_tau_c_kernel_complex<<<grid_size, 256>>>(bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero);

    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("\n extract _kernel failed %s \n", cudaGetErrorString(err));
    }
}
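
// Grid sizing above: with a fixed block of 256 threads,
// grid_size = 1 + (n - 1) / 256 is the usual integer ceil(n / 256),
// e.g. n = 1 -> 1 block, n = 256 -> 1 block, n = 257 -> 2 blocks, so
// together with the h_idx < n guard in the kernel every one of the n
// Householder vectors is handled exactly once.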

extern "C" void launch_my_unpack_c_kernel_complex(const int row_count, const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, cuDoubleComplex *row_group_dev, cuDoubleComplex *a_dev)
{
    dim3 grid_size;
    grid_size = dim3(row_count, stripe_count, 1);

    cudaDeviceSynchronize();
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        printf("error prior to unpack kernel: %s, %d\n", cudaGetErrorString(err), err);

    my_unpack_c_kernel_complex<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, row_group_dev, a_dev);

    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("\n my_unpack_c_kernel failed %s \n", cudaGetErrorString(err));
    }
}

__device__ void warp_reduce_c(double *s_block)
{
    int t_idx;
    t_idx = threadIdx.x;
    __syncthreads();

    if (t_idx < 32)
    {
        s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 32] + s_block[t_idx + 64] + s_block[t_idx + 96];
        if (t_idx < 8)
            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 8] + s_block[t_idx + 16] + s_block[t_idx + 24];
        if (t_idx < 4)
            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 4];
        if (t_idx < 1)
            s_block[t_idx] = s_block[t_idx] + s_block[t_idx + 1] + s_block[t_idx + 2] + s_block[t_idx + 3];
    }
}

__global__ void my_pack_c_kernel(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double *src, double *dst)
{
    int b_id, t_id;
    int dst_ind;
    b_id = blockIdx.y;
    t_id = threadIdx.x;
    dst_ind = b_id * stripe_width + t_id;

    if (dst_ind < max_idx)
    {
        // dimension of dst - lnev, nblk
        // dimension of src - stripe_width,a_dim2,stripe_count
        *(dst + dst_ind + (l_nev * blockIdx.x)) = *(src + t_id + (stripe_width * (n_offset + blockIdx.x)) + (b_id * stripe_width * a_dim2));
    }
}
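
// Index mapping in my_pack_c_kernel above (the real-valued twin of
// my_pack_c_kernel_complex): thread (blockIdx.x, b_id = blockIdx.y, t_id)
// reads element (t_id, n_offset + blockIdx.x, b_id) of src, whose index
// arithmetic treats src as a column-major stripe_width x a_dim2 x
// stripe_count array, and writes it to row dst_ind = b_id * stripe_width
// + t_id of column blockIdx.x in the packed l_nev x nblk destination.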

__global__ void my_unpack_c_kernel(const int n_offset, const int max_idx, const int stripe_width, const int a_dim2, const int stripe_count, const int l_nev, double *src, double *dst)
{
    int b_id, t_id;
    int src_ind;
    b_id = blockIdx.y;
    t_id = threadIdx.x;
    src_ind = b_id * stripe_width + t_id;

    if (src_ind < max_idx)
        *(dst + (t_id + ((n_offset + blockIdx.x) * stripe_width) + (b_id * stripe_width * a_dim2))) = *(src + src_ind + (blockIdx.x) * l_nev);
}

__global__ void compute_kernel_reduce(cuDoubleComplex *a_dev, int lda, int n, int nbw, cuDoubleComplex *h1_dev)
{
    int t_id;
    int st_ind;
    t_id = threadIdx.x;
    st_ind = (t_id * (t_id + 1)) / 2;

    if (t_id < n)
    {
        for (int i = 0; i <= t_id; i++)
        {
            h1_dev[st_ind + i] = a_dev[t_id * lda + i];
        }
    }
    __syncthreads();
}
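
// st_ind = t_id * (t_id + 1) / 2 in compute_kernel_reduce above is the
// standard start offset of the t_id-th row of a packed triangular layout:
// row 0 starts at 0, row 1 at 1, row 2 at 3, row 3 at 6, and so on.
// Thread t_id thus copies the t_id + 1 leading entries of the stride-1
// segment starting at a_dev[t_id * lda] into the contiguous packed buffer
// h1_dev, so h1_dev holds a triangle of the lda-strided matrix a_dev.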

__global__ void compute_kernel_reduce_1(cuDoubleComplex *a_dev, int lda, int n, cuDoubleComplex *h1_dev)
{
    int t_id;
    int st_ind;
    t_id = threadIdx.x;
    st_ind = (t_id * (t_id + 1)) / 2;

    if (t_id < n)
    {
        for (int i = 0; i <= t_id; i++)
        {
            a_dev[t_id * lda + i] = h1_dev[st_ind + i];
            a_dev[(i - 1) * lda + t_id] = cuConj(a_dev[t_id * lda + i - 1]);
        }
    }
    __syncthreads();
}

__global__ void dot_product_c_kernel(cuDoubleComplex *hs_dev, cuDoubleComplex *hv_new_dev, cuDoubleComplex tau_new_dev, cuDoubleComplex *x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nr)
{
    int t_id;
    __shared__ cuDoubleComplex x_dev_temp[128];
    __shared__ cuDoubleComplex x_val;

    //b_id = blockIdx.y;
    t_id = threadIdx.x;

    if (t_id < nr)
        x_dev_temp[t_id] = cuCmul(cuConj(hs_dev[t_id]), hv_new_dev[t_id]);
    __syncthreads();

    if (t_id == 0)
    {
        for (int i = 1; i < nr; i++)
            x_dev_temp[t_id] = cuCadd(x_dev_temp[t_id], x_dev_temp[t_id + i]);
    }
    __syncthreads();

    if (t_id == 0)
    {
        x_val = cuCmul(x_dev_temp[t_id], tau_new_dev);
        x_dev[0] = x_val;
    }
    __syncthreads();
}

__global__ void dot_product_c_kernel_1(cuDoubleComplex *ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex *hv_new_dev, cuDoubleComplex *x_dev, cuDoubleComplex *h_dev, cuDoubleComplex *hv_dev, int nb, int nr, int ns)
{
    int t_id = threadIdx.x;
    int i;

    if ((t_id > 0) && (t_id < nb))
    {
        h_dev[t_id] = cuCsub(h_dev[t_id], cuCmul(x_dev[0], hv_dev[t_id]));
        for (i = 0; i < nr; i++)
        {
            ab_dev[i + nb - t_id + (t_id + ns - 1) * 2 * nb] =
                cuCsub(cuCsub(ab_dev[i + nb - t_id + (t_id + ns - 1) * 2 * nb],
                              cuCmul(hv_new_dev[i], cuConj(h_dev[t_id]))),
                       cuCmul(hs_dev[i], cuConj(hv_dev[t_id])));
        }
    }
    __syncthreads();
}

__global__ void double_hh_transform_kernel(cuDoubleComplex *ab_dev, cuDoubleComplex *hs_dev, cuDoubleComplex *hv_dev, int nb, int ns)
{
    int t_id = threadIdx.x;

    if ((t_id > 0) && (t_id < nb))
    {
        ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb] =
            cuCsub(ab_dev[nb - t_id + (t_id + ns - 1) * 2 * nb],
                   cuCmul(hs_dev[0], cuConj(hv_dev[t_id])));
    }
    __syncthreads();
}

__global__ void double_hh_transform_kernel_2(cuDoubleComplex *ab_dev, cuDoubleComplex *hd_dev, cuDoubleComplex *hv_dev, int nc, int ns, int nb)
{
    int t_id = threadIdx.x;

    if (t_id < nc)
    {
        ab_dev[t_id + (ns - 1) * 2 * nb] =
            cuCsub(cuCsub(ab_dev[t_id + (ns - 1) * 2 * nb],
                          cuCmul(hd_dev[t_id], cuConj(hv_dev[0]))),
                   cuCmul(hv_dev[t_id], cuConj(hd_dev[0])));
    }
    __syncthreads();
}

__global__ void extract_hh_tau_c_kernel(double *hh, double *hh_tau, const int nbw, const int n, int val)
{
    int h_idx;
    h_idx = (blockIdx.x) * blockDim.x + threadIdx.x;

    if (h_idx < n)
    {
        // dimension of hh - (nbw, max_blk_size)