Commit 7d5e595c authored by Pavel Kus

tridiag complex ported to GPU, but temporarily disabled, due to a bug (too large residual errors)

parent 7bbb47f6
......@@ -12,6 +12,8 @@ blas_tokens = ["PRECISION_GEMV",
"PRECISION_GEMM",
"PRECISION_TRMM",
"PRECISION_HERK",
"cublas_PRECISION_gemm",
"cublas_PRECISION_gemv",
]
explicit_tokens = [("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
......@@ -22,6 +24,7 @@ explicit_tokens = [("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("PRECISION_IMAG", "DIMAG", "AIMAG"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
]
print "#ifdef DOUBLE_PRECISION_COMPLEX"
......
......@@ -777,8 +777,8 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
print *,"Warning: using GPU with blocksize different from 128"
! error stop
endif
! set the necessary parameters
......@@ -959,8 +959,8 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
print *,"Warning: using GPU with blocksize different from 128"
! error stop
endif
! set the necessary parameters
......@@ -1151,8 +1151,8 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
print *,"Warning: using GPU with blocksize different from 128"
! error stop
endif
! set the necessary parameters
......
This diff is collapsed.
......@@ -172,13 +172,17 @@
#endif
! pkus: what is the difference between na_cols and matrixCols?
! pkus: probably matrixCols is not supplied when using
if (useGPU) then
#ifdef WITH_MPI
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_cols = na
#endif
endif ! useGPU
! pkus: I should be able to use matrixCols
! pkus: todo: remove na_cols completely
na_cols = matrixCols
! if (useGPU) then
! #ifdef WITH_MPI
! na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
! #else
! na_cols = na
! #endif
! endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
......@@ -624,9 +628,6 @@
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
max_local_cols * 2 * max_stored_uv * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag", successCUDA)
! vu_stored_rows_dev(:,:) = vu_stored_rows(:,:)
! uv_stored_cols_dev(:,:) = uv_stored_cols(:,:)
endif
do i=0,(istep-2)/tile_size
......@@ -642,7 +643,7 @@
M_CONST_1_0, vu_stored_rows_dev + (l_row_beg - 1) * M_size_of_PRECISSION_real, max_local_rows, &
uv_stored_cols_dev + (l_col_beg - 1) * M_size_of_PRECISSION_real, max_local_cols, &
M_CONST_1_0, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * M_size_of_PRECISSION_real, lda)
else
else !useGPU
call M_PRECISSION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
M_CONST_1_0, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
......@@ -680,24 +681,26 @@
enddo ! main cycle over istep=na,3,-1
! Store e(1) and d(1)
if (useGPU) then
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
! Store e(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(e(1)), a_dev + (lda * (l_cols - 1)) * M_size_of_PRECISSION_real, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag", successCUDA)
endif
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
successCUDA = cuda_memcpy(loc(d(1)), a_dev, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag", successCUDA)
endif
else
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) &
else !useGPU
e(1) = a(1,l_cols) ! use last l_cols value of loop above
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) &
endif !useGPU
endif
! Store d(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d(1)), a_dev, &
1 * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag", successCUDA)
else !useGPU
d(1) = a(1,1)
endif !useGPU
endif
deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
......@@ -730,9 +733,6 @@
check_dealloc_cuda("tridiag", successCUDA)
endif
! todo dealocate at the GPU
! distribute the arrays d and e to all processors
allocate(tmp(na), stat=istat, errmsg=errorMessage)
......
......@@ -373,6 +373,34 @@ module cuda_functions
end subroutine cublas_sgemv_c
end interface
! Explicit interface to the C symbol 'cublasZgemv' (double-precision complex
! matrix-vector product of cuBLAS).
!
! Every argument is passed by value:
!   cta              single character (presumably the transpose selector
!                    'N'/'T'/'C' -- confirm against the cuBLAS documentation)
!   m, n             C ints (presumably the matrix dimensions)
!   alpha, beta      double-precision complex scalars
!   a, x, y          addresses handed over as C_intptr_T integers
!                    (presumably device pointers -- verify against callers)
!   lda, incx, incy  C ints (presumably leading dimension and vector strides)
!
! The complex scalars use C_DOUBLE_COMPLEX, the kind that iso_c_binding
! designates for interoperable double-precision complex values; C_DOUBLE is
! a *real* kind and is not guaranteed to have the same kind value.
interface
  subroutine cublas_zgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) bind(C,name='cublasZgemv')
    use iso_c_binding
    implicit none
    character(1,C_CHAR), intent(in), value            :: cta
    integer(kind=C_INT), intent(in), value            :: m,n
    integer(kind=C_INT), intent(in), value            :: lda,incx,incy
    complex(kind=C_DOUBLE_COMPLEX), intent(in), value :: alpha,beta
    integer(kind=C_intptr_T), intent(in), value       :: a, x, y
  end subroutine cublas_zgemv_c
end interface
! Explicit interface to the C symbol 'cublasCgemv' (single-precision complex
! matrix-vector product of cuBLAS).
!
! Every argument is passed by value:
!   cta              single character (presumably the transpose selector
!                    'N'/'T'/'C' -- confirm against the cuBLAS documentation)
!   m, n             C ints (presumably the matrix dimensions)
!   alpha, beta      single-precision complex scalars
!   a, x, y          addresses handed over as C_intptr_T integers
!                    (presumably device pointers -- verify against callers)
!   lda, incx, incy  C ints (presumably leading dimension and vector strides)
!
! The complex scalars use C_FLOAT_COMPLEX, the kind that iso_c_binding
! designates for interoperable single-precision complex values; C_FLOAT is
! a *real* kind and is not guaranteed to have the same kind value.
interface
  subroutine cublas_cgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) bind(C,name='cublasCgemv')
    use iso_c_binding
    implicit none
    character(1,C_CHAR), intent(in), value           :: cta
    integer(kind=C_INT), intent(in), value           :: m,n
    integer(kind=C_INT), intent(in), value           :: lda,incx,incy
    complex(kind=C_FLOAT_COMPLEX), intent(in), value :: alpha,beta
    integer(kind=C_intptr_T), intent(in), value      :: a, x, y
  end subroutine cublas_cgemv_c
end interface
contains
......@@ -718,5 +746,33 @@ module cuda_functions
#endif
end subroutine cublas_sgemv
! Fortran-side wrapper for the double-precision complex cuBLAS gemv binding.
!
! Forwards all arguments, unchanged and in the same order, to cublas_zgemv_c.
! The forwarding call is compiled only when WITH_GPU_VERSION is defined;
! without it this subroutine is a no-op, so the module still builds and
! links when no GPU support is available.
!
!   cta              single character, forwarded as the first argument
!                    (presumably the transpose selector -- confirm)
!   m, n             C ints (presumably the matrix dimensions)
!   alpha, beta      double-precision complex scalars
!   a, x, y          addresses carried as C_intptr_T integers
!                    (presumably device pointers -- verify against callers)
!   lda, incx, incy  C ints (presumably leading dimension and vector strides)
!
! None of the arguments is modified here, hence intent(in) throughout.
subroutine cublas_zgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use iso_c_binding
  implicit none
  character(1,C_CHAR), intent(in), value     :: cta
  integer(kind=C_INT), intent(in)            :: m,n
  integer(kind=C_INT), intent(in)            :: lda,incx,incy
  ! C_DOUBLE_COMPLEX is the interoperable complex kind; C_DOUBLE is a real kind.
  complex(kind=C_DOUBLE_COMPLEX), intent(in) :: alpha,beta
  integer(kind=C_intptr_T), intent(in)       :: a, x, y
#ifdef WITH_GPU_VERSION
  call cublas_zgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
end subroutine cublas_zgemv
! Fortran-side wrapper for the single-precision complex cuBLAS gemv binding.
!
! Forwards all arguments, unchanged and in the same order, to cublas_cgemv_c.
! The forwarding call is compiled only when WITH_GPU_VERSION is defined;
! without it this subroutine is a no-op, so the module still builds and
! links when no GPU support is available.
!
!   cta              single character, forwarded as the first argument
!                    (presumably the transpose selector -- confirm)
!   m, n             C ints (presumably the matrix dimensions)
!   alpha, beta      single-precision complex scalars
!   a, x, y          addresses carried as C_intptr_T integers
!                    (presumably device pointers -- verify against callers)
!   lda, incx, incy  C ints (presumably leading dimension and vector strides)
!
! None of the arguments is modified here, hence intent(in) throughout.
subroutine cublas_cgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
  use iso_c_binding
  implicit none
  character(1,C_CHAR), intent(in), value    :: cta
  integer(kind=C_INT), intent(in)           :: m,n
  integer(kind=C_INT), intent(in)           :: lda,incx,incy
  ! C_FLOAT_COMPLEX is the interoperable complex kind; C_FLOAT is a real kind.
  complex(kind=C_FLOAT_COMPLEX), intent(in) :: alpha,beta
  integer(kind=C_intptr_T), intent(in)      :: a, x, y
#ifdef WITH_GPU_VERSION
  call cublas_cgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
end subroutine cublas_cgemv
end module cuda_functions
......@@ -10,6 +10,8 @@
#define PRECISION_GEMM ZGEMM
#define PRECISION_TRMM ZTRMM
#define PRECISION_HERK ZHERK
#define cublas_PRECISION_gemm cublas_Zgemm
#define cublas_PRECISION_gemv cublas_Zgemv
#define PRECISION_SUFFIX "_double"
#define MPI_COMPLEX_PRECISION MPI_DOUBLE_COMPLEX
#define MPI_REAL_PRECISION MPI_REAL8
......@@ -18,6 +20,7 @@
#define PRECISION_IMAG DIMAG
#define CONST_REAL_0_0 0.0_rk8
#define CONST_REAL_1_0 1.0_rk8
#define size_of_PRECISION_complex size_of_double_complex_datatype
#else
#undef tridiag_complex_PRECISION
#undef trans_ev_complex_PRECISION
......@@ -30,6 +33,8 @@
#undef PRECISION_GEMM
#undef PRECISION_TRMM
#undef PRECISION_HERK
#undef cublas_PRECISION_gemm
#undef cublas_PRECISION_gemv
#undef PRECISION_SUFFIX
#undef MPI_COMPLEX_PRECISION
#undef MPI_REAL_PRECISION
......@@ -38,6 +43,7 @@
#undef PRECISION_IMAG
#undef CONST_REAL_0_0
#undef CONST_REAL_1_0
#undef size_of_PRECISION_complex
#define tridiag_complex_PRECISION tridiag_complex_single
#define trans_ev_complex_PRECISION trans_ev_complex_single
#define solve_complex_PRECISION solve_complex_single
......@@ -49,6 +55,8 @@
#define PRECISION_GEMM CGEMM
#define PRECISION_TRMM CTRMM
#define PRECISION_HERK CHERK
#define cublas_PRECISION_gemm cublas_Cgemm
#define cublas_PRECISION_gemv cublas_Cgemv
#define PRECISION_SUFFIX "_single"
#define MPI_COMPLEX_PRECISION MPI_COMPLEX
#define MPI_REAL_PRECISION MPI_REAL4
......@@ -57,4 +65,5 @@
#define PRECISION_IMAG AIMAG
#define CONST_REAL_0_0 0.0_rk4
#define CONST_REAL_1_0 1.0_rk4
#define size_of_PRECISION_complex size_of_single_complex_datatype
#endif
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment