Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
afc5a14a
Commit
afc5a14a
authored
Dec 05, 2016
by
Pavel Kus
Browse files
more complex single/double templates
parent
48760118
Changes
4
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
generate/generate_precision.py
View file @
afc5a14a
...
...
@@ -45,9 +45,14 @@ simple_tokens = [
    "global_product_PRECISION",
    "add_tmp_PRECISION",
    "v_add_s_PRECISION",
    "launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION",
    "compute_hh_trafo_NUMBER_gpu_PRECISION",
    "launch_my_pack_c_kernel_NUMBER_PRECISION",
    "launch_my_unpack_c_kernel_NUMBER_PRECISION",
    "launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
    "launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
]
blas_tokens = [
    "PRECISION_GEMV",
    "PRECISION_TRMV",
...
...
@@ -75,6 +80,7 @@ blas_tokens = [
explicit_tokens_complex = [
    ("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
    ("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
    ("MPI_COMPLEX_EXPLICIT_PRECISION", "MPI_COMPLEX16", "MPI_COMPLEX8"),
    ("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
    ("KIND_PRECISION", "rk8", "rk4"),
    ("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
...
...
src/elpa2_compute_complex_template.X90
View file @
afc5a14a
...
...
@@ -67,29 +67,17 @@
#include "elpa2_tridiag_band_complex_template.X90"
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_dot_products_complex_gpu_double(nbw, n)
#else
subroutine compute_hh_dot_products_complex_gpu_single(nbw, n)
#endif
subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n)
#else
call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n)
#endif
call launch_compute_hh_dotp_c_kernel_complex_PRECISION( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine pack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
...
...
@@ -99,28 +87,18 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
call launch_my_pack_c_kernel_complex_PRECISION(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, &
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#else
call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine unpack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
...
...
@@ -131,31 +109,17 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , &
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_PRECISION_complex , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#else
call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
call launch_my_unpack_c_kernel_complex_PRECISION( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force)
#else
subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force)
#endif
subroutine unpack_and_prepare_row_group_complex_gpu_PRECISION(next_unpack_idx, force)
use precision
implicit none
...
...
@@ -168,11 +132,7 @@
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#else
call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#endif
call unpack_row_group_complex_gpu_PRECISION(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
...
...
@@ -184,11 +144,7 @@
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#else
subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#endif
subroutine compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
...
...
@@ -205,27 +161,15 @@
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype
dev_offset_2 =( off-1 )*size_of_double_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_double(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#else
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype
dev_offset_2 =( off-1 )*size_of_single_complex_datatype
size_of_PRECISION_complex
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_PRECISION_complex
dev_offset_2 =( off-1 )*size_of_PRECISION_complex
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_single(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
call launch_compute_hh_trafo_c_kernel_complex_PRECISION(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
...
...
src/elpa2_trans_ev_tridi_to_band_complex_template.X90
View file @
afc5a14a
This diff is collapsed.
Click to expand it.
src/precision_macros_complex.h
View file @
afc5a14a
...
...
@@ -40,6 +40,12 @@
#undef global_product_PRECISION
#undef add_tmp_PRECISION
#undef v_add_s_PRECISION
#undef launch_compute_hh_trafo_c_kernel_complex_PRECISION
#undef compute_hh_trafo_complex_gpu_PRECISION
#undef launch_my_pack_c_kernel_complex_PRECISION
#undef launch_my_unpack_c_kernel_complex_PRECISION
#undef launch_compute_hh_dotp_c_kernel_complex_PRECISION
#undef launch_extract_hh_tau_c_kernel_complex_PRECISION
#undef PRECISION_GEMV
#undef PRECISION_TRMV
#undef PRECISION_GEMM
...
...
@@ -63,6 +69,7 @@
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_COMPLEX_EXPLICIT_PRECISION
#undef MPI_REAL_PRECISION
#undef KIND_PRECISION
#undef PRECISION_CMPLX
...
...
@@ -113,6 +120,12 @@
#define global_product_PRECISION global_product_double
#define add_tmp_PRECISION add_tmp_double
#define v_add_s_PRECISION v_add_s_double
#define launch_compute_hh_trafo_c_kernel_complex_PRECISION launch_compute_hh_trafo_c_kernel_complex_double
#define compute_hh_trafo_complex_gpu_PRECISION compute_hh_trafo_complex_gpu_double
#define launch_my_pack_c_kernel_complex_PRECISION launch_my_pack_c_kernel_complex_double
#define launch_my_unpack_c_kernel_complex_PRECISION launch_my_unpack_c_kernel_complex_double
#define launch_compute_hh_dotp_c_kernel_complex_PRECISION launch_compute_hh_dotp_c_kernel_complex_double
#define launch_extract_hh_tau_c_kernel_complex_PRECISION launch_extract_hh_tau_c_kernel_complex_double
#define PRECISION_GEMV ZGEMV
#define PRECISION_TRMV ZTRMV
#define PRECISION_GEMM ZGEMM
...
...
@@ -136,6 +149,7 @@
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define PRECISION_SUFFIX "_double"
#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX
#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX16
#define MPI_REAL_PRECISION MPI_REAL8
#define KIND_PRECISION rk8
#define PRECISION_CMPLX DCMPLX
...
...
@@ -187,6 +201,12 @@
#undef global_product_PRECISION
#undef add_tmp_PRECISION
#undef v_add_s_PRECISION
#undef launch_compute_hh_trafo_c_kernel_complex_PRECISION
#undef compute_hh_trafo_complex_gpu_PRECISION
#undef launch_my_pack_c_kernel_complex_PRECISION
#undef launch_my_unpack_c_kernel_complex_PRECISION
#undef launch_compute_hh_dotp_c_kernel_complex_PRECISION
#undef launch_extract_hh_tau_c_kernel_complex_PRECISION
#undef PRECISION_GEMV
#undef PRECISION_TRMV
#undef PRECISION_GEMM
...
...
@@ -210,6 +230,7 @@
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_COMPLEX_EXPLICIT_PRECISION
#undef MPI_REAL_PRECISION
#undef KIND_PRECISION
#undef PRECISION_CMPLX
...
...
@@ -260,6 +281,12 @@
#define global_product_PRECISION global_product_single
#define add_tmp_PRECISION add_tmp_single
#define v_add_s_PRECISION v_add_s_single
#define launch_compute_hh_trafo_c_kernel_complex_PRECISION launch_compute_hh_trafo_c_kernel_complex_single
#define compute_hh_trafo_complex_gpu_PRECISION compute_hh_trafo_complex_gpu_single
#define launch_my_pack_c_kernel_complex_PRECISION launch_my_pack_c_kernel_complex_single
#define launch_my_unpack_c_kernel_complex_PRECISION launch_my_unpack_c_kernel_complex_single
#define launch_compute_hh_dotp_c_kernel_complex_PRECISION launch_compute_hh_dotp_c_kernel_complex_single
#define launch_extract_hh_tau_c_kernel_complex_PRECISION launch_extract_hh_tau_c_kernel_complex_single
#define PRECISION_GEMV CGEMV
#define PRECISION_TRMV CTRMV
#define PRECISION_GEMM CGEMM
...
...
@@ -283,6 +310,7 @@
#define cublas_PRECISION_GEMV cublas_CGEMV
#define PRECISION_SUFFIX "_single"
#define MPI_COMPLEX_PRECISION MPI_COMPLEX
#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX8
#define MPI_REAL_PRECISION MPI_REAL4
#define KIND_PRECISION rk4
#define PRECISION_CMPLX CMPLX
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment