Commit afc5a14a authored by Pavel Kus

more complex single/double templates

parent 48760118
......@@ -45,9 +45,14 @@ simple_tokens = [
"global_product_PRECISION",
"add_tmp_PRECISION",
"v_add_s_PRECISION",
"launch_compute_hh_trafo_c_kernel_NUMBER_PRECISION",
"compute_hh_trafo_NUMBER_gpu_PRECISION",
"launch_my_pack_c_kernel_NUMBER_PRECISION",
"launch_my_unpack_c_kernel_NUMBER_PRECISION",
"launch_compute_hh_dotp_c_kernel_NUMBER_PRECISION",
"launch_extract_hh_tau_c_kernel_NUMBER_PRECISION",
]
blas_tokens = [
"PRECISION_GEMV",
"PRECISION_TRMV",
......@@ -75,6 +80,7 @@ blas_tokens = [
explicit_tokens_complex = [
("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("MPI_COMPLEX_PRECISION", "MPI_DOUBLE_COMPLEX", "MPI_COMPLEX"),
("MPI_COMPLEX_EXPLICIT_PRECISION", "MPI_COMPLEX16", "MPI_COMPLEX8"),
("MPI_REAL_PRECISION", "MPI_REAL8", "MPI_REAL4"),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
......
......@@ -67,29 +67,17 @@
#include "elpa2_tridiag_band_complex_template.X90"
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_dot_products_complex_gpu_double(nbw, n)
#else
subroutine compute_hh_dot_products_complex_gpu_single(nbw, n)
#endif
subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
use cuda_c_kernel
use precision
implicit none
integer(kind=ik), value :: nbw, n
if (n .le. 1) return
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_compute_hh_dotp_c_kernel_complex_double( bcast_buffer_dev, hh_dot_dev, nbw,n)
#else
call launch_compute_hh_dotp_c_kernel_complex_single( bcast_buffer_dev, hh_dot_dev, nbw,n)
#endif
call launch_compute_hh_dotp_c_kernel_complex_PRECISION( bcast_buffer_dev, hh_dot_dev, nbw,n)
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine pack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine pack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine pack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
......@@ -99,28 +87,18 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_pack_c_kernel_complex_double(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
call launch_my_pack_c_kernel_complex_PRECISION(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_double_complex_datatype, &
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
#else
call launch_my_pack_c_kernel_complex_single(row_count, n_offset, max_idx, stripe_width,a_dim2, stripe_count, &
l_nev, aIntern_dev, row_group_dev)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev ,row_count * l_nev * size_of_single_complex_datatype, &
cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"pack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_row_group_complex_gpu_double(rows, n_offset, row_count)
#else
subroutine unpack_row_group_complex_gpu_single(rows, n_offset, row_count)
#endif
subroutine unpack_row_group_complex_gpu_PRECISION(rows, n_offset, row_count)
use cuda_c_kernel
use precision
implicit none
......@@ -131,31 +109,17 @@
logical :: successCUDA
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_double_complex_datatype , &
cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_single_complex_datatype , &
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev* size_of_PRECISION_complex , &
cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"unpack_row_group_complex_gpu: error in cudaMemcpy"
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call launch_my_unpack_c_kernel_complex_double( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#else
call launch_my_unpack_c_kernel_complex_single( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
call launch_my_unpack_c_kernel_complex_PRECISION( row_count, n_offset,max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,aIntern_dev)
#endif
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine unpack_and_prepare_row_group_complex_gpu_double(next_unpack_idx, force)
#else
subroutine unpack_and_prepare_row_group_complex_gpu_single(next_unpack_idx, force)
#endif
subroutine unpack_and_prepare_row_group_complex_gpu_PRECISION(next_unpack_idx, force)
use precision
implicit none
......@@ -168,11 +132,7 @@
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /=next_unpack_idx)) then
! A flush and a reset must be performed
#ifdef DOUBLE_PRECISION_COMPLEX
call unpack_row_group_complex_gpu_double(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#else
call unpack_row_group_complex_gpu_single(row_group(:, :), unpack_idx - row_group_size, row_group_size)
#endif
call unpack_row_group_complex_gpu_PRECISION(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
......@@ -184,11 +144,7 @@
end subroutine
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine compute_hh_trafo_complex_gpu_double(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#else
subroutine compute_hh_trafo_complex_gpu_single(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
#endif
subroutine compute_hh_trafo_complex_gpu_PRECISION(off, ncols, istripe, a_off, dev_offset, dev_offset_1, dev_offset_2)
use iso_c_binding
use cuda_c_kernel
......@@ -205,27 +161,15 @@
ttt = mpi_wtime()
nl = merge(stripe_width, last_stripe_width, istripe < stripe_count)
#ifdef DOUBLE_PRECISION_COMPLEX
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_double_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_double_complex_datatype
dev_offset_2 =( off-1 )*size_of_double_complex_datatype
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_double(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#else
dev_offset = (0 + ( ( a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * &
size_of_single_complex_datatype
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_single_complex_datatype
dev_offset_2 =( off-1 )*size_of_single_complex_datatype
size_of_PRECISION_complex
dev_offset_1 = (0 + ( off-1 )* nbw) *size_of_PRECISION_complex
dev_offset_2 =( off-1 )*size_of_PRECISION_complex
! t1_compute_kernel =MPI_Wtime()
call launch_compute_hh_trafo_c_kernel_complex_single(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
call launch_compute_hh_trafo_c_kernel_complex_PRECISION(aIntern_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)
#endif
! time0 = time0 + time1
! t2_compute_kernel =MPI_Wtime()
! t0_compute_kernel = t0_compute_kernel + t2_compute_kernel-t1_compute_kernel
......
......@@ -40,6 +40,12 @@
#undef global_product_PRECISION
#undef add_tmp_PRECISION
#undef v_add_s_PRECISION
#undef launch_compute_hh_trafo_c_kernel_complex_PRECISION
#undef compute_hh_trafo_complex_gpu_PRECISION
#undef launch_my_pack_c_kernel_complex_PRECISION
#undef launch_my_unpack_c_kernel_complex_PRECISION
#undef launch_compute_hh_dotp_c_kernel_complex_PRECISION
#undef launch_extract_hh_tau_c_kernel_complex_PRECISION
#undef PRECISION_GEMV
#undef PRECISION_TRMV
#undef PRECISION_GEMM
......@@ -63,6 +69,7 @@
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_COMPLEX_EXPLICIT_PRECISION
#undef MPI_REAL_PRECISION
#undef KIND_PRECISION
#undef PRECISION_CMPLX
......@@ -113,6 +120,12 @@
#define global_product_PRECISION global_product_double
#define add_tmp_PRECISION add_tmp_double
#define v_add_s_PRECISION v_add_s_double
#define launch_compute_hh_trafo_c_kernel_complex_PRECISION launch_compute_hh_trafo_c_kernel_complex_double
#define compute_hh_trafo_complex_gpu_PRECISION compute_hh_trafo_complex_gpu_double
#define launch_my_pack_c_kernel_complex_PRECISION launch_my_pack_c_kernel_complex_double
#define launch_my_unpack_c_kernel_complex_PRECISION launch_my_unpack_c_kernel_complex_double
#define launch_compute_hh_dotp_c_kernel_complex_PRECISION launch_compute_hh_dotp_c_kernel_complex_double
#define launch_extract_hh_tau_c_kernel_complex_PRECISION launch_extract_hh_tau_c_kernel_complex_double
#define PRECISION_GEMV ZGEMV
#define PRECISION_TRMV ZTRMV
#define PRECISION_GEMM ZGEMM
......@@ -136,6 +149,7 @@
#define cublas_PRECISION_GEMV cublas_ZGEMV
#define PRECISION_SUFFIX "_double"
#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX
#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX16
#define MPI_REAL_PRECISION MPI_REAL8
#define KIND_PRECISION rk8
#define PRECISION_CMPLX DCMPLX
......@@ -187,6 +201,12 @@
#undef global_product_PRECISION
#undef add_tmp_PRECISION
#undef v_add_s_PRECISION
#undef launch_compute_hh_trafo_c_kernel_complex_PRECISION
#undef compute_hh_trafo_complex_gpu_PRECISION
#undef launch_my_pack_c_kernel_complex_PRECISION
#undef launch_my_unpack_c_kernel_complex_PRECISION
#undef launch_compute_hh_dotp_c_kernel_complex_PRECISION
#undef launch_extract_hh_tau_c_kernel_complex_PRECISION
#undef PRECISION_GEMV
#undef PRECISION_TRMV
#undef PRECISION_GEMM
......@@ -210,6 +230,7 @@
#undef cublas_PRECISION_GEMV
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_COMPLEX_EXPLICIT_PRECISION
#undef MPI_REAL_PRECISION
#undef KIND_PRECISION
#undef PRECISION_CMPLX
......@@ -260,6 +281,12 @@
#define global_product_PRECISION global_product_single
#define add_tmp_PRECISION add_tmp_single
#define v_add_s_PRECISION v_add_s_single
#define launch_compute_hh_trafo_c_kernel_complex_PRECISION launch_compute_hh_trafo_c_kernel_complex_single
#define compute_hh_trafo_complex_gpu_PRECISION compute_hh_trafo_complex_gpu_single
#define launch_my_pack_c_kernel_complex_PRECISION launch_my_pack_c_kernel_complex_single
#define launch_my_unpack_c_kernel_complex_PRECISION launch_my_unpack_c_kernel_complex_single
#define launch_compute_hh_dotp_c_kernel_complex_PRECISION launch_compute_hh_dotp_c_kernel_complex_single
#define launch_extract_hh_tau_c_kernel_complex_PRECISION launch_extract_hh_tau_c_kernel_complex_single
#define PRECISION_GEMV CGEMV
#define PRECISION_TRMV CTRMV
#define PRECISION_GEMM CGEMM
......@@ -283,6 +310,7 @@
#define cublas_PRECISION_GEMV cublas_CGEMV
#define PRECISION_SUFFIX "_single"
#define MPI_COMPLEX_PRECISION MPI_COMPLEX
#define MPI_COMPLEX_EXPLICIT_PRECISION MPI_COMPLEX8
#define MPI_REAL_PRECISION MPI_REAL4
#define KIND_PRECISION rk4
#define PRECISION_CMPLX CMPLX
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment