Commit 62b1d017 authored by Pavel Kus's avatar Pavel Kus

fixed complex GPU tridiagonalization error

corrected indentation to simplify diff of source files

Conflicts:
	src/elpa1_trans_ev_complex_template.X90
	src/elpa1_tridiag_complex_template.X90

Conflicts:
	src/elpa1_tridiag_complex_template.X90
	src/elpa1_tridiag_real_template.X90
parent 5b525415
......@@ -23,6 +23,7 @@ explicit_tokens = [("PRECISION_SUFFIX", "\"_double\"", "\"_single\""),
("KIND_PRECISION", "rk8", "rk4"),
("PRECISION_CMPLX", "DCMPLX", "CMPLX"),
("PRECISION_IMAG", "DIMAG", "AIMAG"),
("PRECISION_REAL", "DREAL", "REAL"),
("CONST_REAL_0_0", "0.0_rk8", "0.0_rk4"),
("CONST_REAL_1_0", "1.0_rk8", "1.0_rk4"),
("size_of_PRECISION_complex", "size_of_double_complex_datatype", "size_of_single_complex_datatype"),
......
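For context, a minimal sketch (hypothetical, not part of this commit) of what the new PRECISION_REAL token does once the generator has expanded the template: it resolves to DREAL in the double-precision variant and to REAL in the single-precision variant, so the real part of a complex entry can be assigned to a real variable. DREAL is a widely supported vendor extension; the sketch uses the standard REAL intrinsic, which returns the real part for a complex argument.

```fortran
! Hypothetical illustration of the PRECISION_REAL token after expansion:
!   template : d_vec(1) = PRECISION_REAL(a_mat(1,1))
!   double   : d_vec(1) = DREAL(a_mat(1,1))    ! DCMPLX/DIMAG/DREAL family
!   single   : d_vec(1) = REAL(a_mat(1,1))     ! CMPLX/AIMAG/REAL family
program token_expansion_demo
  implicit none
  complex(kind=kind(1.0d0)) :: zd
  complex(kind=kind(1.0e0)) :: zs
  zd = (1.0d0, -2.0d0)
  zs = (1.0e0, -2.0e0)
  print *, real(zd)   ! real part in double precision, portable spelling of DREAL(zd)
  print *, real(zs)   ! real part in single precision, what REAL(zs) yields
end program token_expansion_demo
```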
......@@ -202,8 +202,8 @@
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q_mat
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......
......@@ -81,7 +81,7 @@
!> \param useGPU If true, GPU version of the subroutine will be used
!>
subroutine tridiag_complex_PRECISION(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
d_vec, e_vec, tau, useGPU_param)
d_vec, e_vec, tau, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
......@@ -93,8 +93,7 @@
implicit none
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU_param
logical :: useGPU !todo: temporary
logical, intent(in) :: useGPU
complex(kind=COMPLEX_DATATYPE) :: tau(na)
#ifdef USE_ASSUMED_SIZE
......@@ -139,10 +138,10 @@
real(kind=REAL_DATATYPE), allocatable :: tmp_real(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
! todo: remove
complex(kind=COMPLEX_DATATYPE) :: tmp_print(10)
! todo: temporary
!useGPU = useGPU_param
useGPU = .false.
call timer%start("tridiag_complex" // PRECISION_SUFFIX)
......@@ -163,7 +162,6 @@
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
totalblocks = (na-1)/nblk + 1
max_loc_block_rows = (totalblocks-1)/np_rows + 1
max_loc_block_cols = (totalblocks-1)/np_cols + 1
......@@ -248,7 +246,8 @@
check_memcpy_cuda("tridiag", successCUDA)
endif
! main cycle of tridiagonalization
! in each step, 1 Householder vector is calculated
do istep=na,3,-1
! Calculate number of local rows and columns of the still remaining matrix
......@@ -297,22 +296,14 @@
call timer%start("mpi_communication")
#endif
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
vnorm2 = aux2(1)
vrl = aux2(2)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#else /* WITH_MPI */
! aux2 = aux1
vnorm2 = aux1(1)
vrl = aux1(2)
aux2 = aux1
#endif /* WITH_MPI */
! vnorm2 = aux2(1)
! vrl = aux2(2)
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call hh_transform_complex_PRECISION(vrl, vnorm2, xf, tau(istep))
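To make the buffer handling above easier to follow, here is a hedged, standalone sketch (plain `use mpi` and MPI_COMM_WORLD instead of ELPA's mpi_comm_rows): the local contribution to |v|**2 and the local value of the off-diagonal pivot vrl travel in one complex buffer, so a single allreduce delivers both; presumably vrl is zero on process rows that do not own that entry, so the SUM simply selects the owner's value.

```fortran
! Hedged sketch, not ELPA code: pack vnorm2 and vrl into one buffer so a
! single MPI_Allreduce produces both values on every rank.
program allreduce_pair_demo
  use mpi
  implicit none
  integer :: mpierr, my_rank
  complex(kind=kind(1.0d0)) :: aux1(2), aux2(2), vrl
  real(kind=kind(1.0d0))    :: vnorm2
  call mpi_init(mpierr)
  call mpi_comm_rank(MPI_COMM_WORLD, my_rank, mpierr)
  aux1(1) = cmplx(1.0d0 + my_rank, 0.0d0, kind=kind(1.0d0))      ! local part of |v|**2
  aux1(2) = merge((0.5d0, 0.25d0), (0.0d0, 0.0d0), my_rank == 0) ! pivot element only on its owner
  call mpi_allreduce(aux1, aux2, 2, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD, mpierr)
  vnorm2 = real(aux2(1))   ! squared norm: sum of all local contributions
  vrl    = aux2(2)         ! pivot element: the single nonzero contribution
  if (my_rank == 0) print *, 'vnorm2 =', vnorm2, ' vrl =', vrl
  call mpi_finalize(mpierr)
end program allreduce_pair_demo
```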
......@@ -321,35 +312,38 @@
v_row(1:l_rows) = v_row(1:l_rows) * xf
if (my_prow==prow(istep-1, nblk, np_rows)) then
v_row(l_rows) = 1.
! vrl is newly computed off-diagonal element of the final tridiagonal matrix
e_vec(istep-1) = vrl
endif
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows) ! store Householder vector for back transformation
endif
! store Householder vector for back transformation
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
! Broadcast the Householder vector (and tau) along columns
! add tau after the end of the actual v_row, to be broadcast with it
v_row(l_rows+1) = tau(istep)
endif !(my_pcol==pcol(istep, nblk, np_cols))
if (my_pcol==pcol(istep, nblk, np_cols)) v_row(l_rows+1) = tau(istep)
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(v_row, l_rows+1, MPI_COMPLEX_PRECISION, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
! Broadcast the Householder vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_COMPLEX_PRECISION, pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
#endif /* WITH_MPI */
! recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
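The broadcast above uses a small trick worth spelling out: tau(istep) is appended at position l_rows+1 of v_row on the owning process column, so one broadcast carries the Householder vector and tau together, and every rank then recovers tau from the extra slot. A hedged, standalone sketch (plain `use mpi`, rank 0 standing in for the owning column):

```fortran
! Hedged sketch, not ELPA code: piggyback tau on the Householder-vector broadcast.
program bcast_tau_demo
  use mpi
  implicit none
  integer, parameter :: l_rows = 4
  complex(kind=kind(1.0d0)) :: v_row(l_rows+1), tau
  integer :: mpierr, my_rank
  call mpi_init(mpierr)
  call mpi_comm_rank(MPI_COMM_WORLD, my_rank, mpierr)
  if (my_rank == 0) then
    v_row(1:l_rows) = (1.0d0, 0.0d0)    ! the Householder vector
    v_row(l_rows+1) = (0.3d0, -0.1d0)   ! tau, stored after the end of the vector
  end if
  call mpi_bcast(v_row, l_rows+1, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD, mpierr)
  tau = v_row(l_rows+1)                 ! recover tau on every rank
  if (my_rank == 0) print *, 'tau =', tau
  call mpi_finalize(mpierr)
end program bcast_tau_demo
```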
! Transpose Householder vector v_row -> v_col
! call elpa_transpose_vectors (v_row, 2*ubound(v_row,dim=1), mpi_comm_rows, &
! v_col, 2*ubound(v_col,dim=1), mpi_comm_cols, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex_PRECISION (v_row, ubound(v_row,dim=1), mpi_comm_rows, &
v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, (istep-1), 1, nblk)
v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
......@@ -358,20 +352,20 @@
u_col(1:l_cols) = 0
u_row(1:l_rows) = 0
if (l_rows>0 .and. l_cols>0) then
if(useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_PRECISION_complex)
if(useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_PRECISION_complex)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_PRECISION_complex)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_PRECISION_complex)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag", successCUDA)
endif
endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel" // PRECISION_SUFFIX)
call timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
......@@ -382,7 +376,7 @@
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif
#endif /* WITH_OPENMP */
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
......@@ -409,11 +403,13 @@
#else /* WITH_OPENMP */
if(useGPU) then
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_PRECISION_complex
call cublas_PRECISION_gemv('C',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONE,a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_PRECISION_complex, 1, &
CONE, u_col_dev + (l_col_beg - 1) * size_of_PRECISION_complex, 1)
if(i/=j) then
call cublas_PRECISION_gemv('N',l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
CONE, a_dev + a_offset, lda, &
......@@ -427,6 +423,7 @@
CONE, a_mat(l_row_beg,l_col_beg), lda, &
v_row(l_row_beg), 1, &
CONE, u_col(l_col_beg), 1)
if (i/=j) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
CONE, a_mat(l_row_beg,l_col_beg), lda, &
......@@ -436,8 +433,8 @@
endif ! useGPU
#endif /* WITH_OPENMP */
enddo
enddo
enddo ! j=0,i
enddo ! i=0,(istep-2)/tile_size
if(useGPU) then
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
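Since the tile loop above is the heart of the u = (A + VU**H + UV**H)*v step, here is a hedged, self-contained sketch (serial, matmul instead of cuBLAS/BLAS gemv, no MPI distribution) of the underlying idea: only tiles in the upper triangle of the Hermitian matrix are visited, each off-diagonal tile contributing once with its conjugate transpose ('C') and once as stored ('N'), and the diagonal tiles with 'C' only.

```fortran
! Hedged sketch, not ELPA code: tiled Hermitian matrix-vector product that
! touches only upper-triangle tiles, mirroring the per-tile gemv('C')/gemv('N')
! calls in the loop above.
program tiled_hermitian_mv
  implicit none
  integer, parameter :: n = 6, tile = 2
  complex(kind=kind(1.0d0)) :: a(n,n), v(n), u(n)
  integer :: i, j, r, c, ib, ie, jb, je
  do c = 1, n                      ! build a small Hermitian test matrix
    do r = 1, c
      a(r,c) = cmplx(r, c - r, kind=kind(1.0d0))
      a(c,r) = conjg(a(r,c))
    end do
  end do
  v = [(cmplx(r, 1, kind=kind(1.0d0)), r = 1, n)]
  u = (0.0d0, 0.0d0)
  do i = 0, (n-1)/tile             ! column tiles
    jb = i*tile + 1;  je = min(n, (i+1)*tile)
    do j = 0, i                    ! row tiles: upper triangle only (j <= i)
      ib = j*tile + 1;  ie = min(n, (j+1)*tile)
      ! gemv('C'): the column range accumulates A_tile**H * v(rows)
      u(jb:je) = u(jb:je) + matmul(conjg(transpose(a(ib:ie, jb:je))), v(ib:ie))
      ! gemv('N'): off-diagonal tiles also feed the row range
      if (i /= j) u(ib:ie) = u(ib:ie) + matmul(a(ib:ie, jb:je), v(jb:je))
    end do
  end do
  print *, maxval(abs(u - matmul(a, v)))   ! ~0: same result as the full product
end program tiled_hermitian_mv
```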
......@@ -449,13 +446,13 @@
#ifdef WITH_OPENMP
!$OMP END PARALLEL
call timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
call timer%stop("OpenMP parallel")
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
#endif
#endif /* WITH_OPENMP */
if (n_stored_vecs>0) then
call PRECISION_GEMV('C', l_rows, 2*n_stored_vecs, &
......@@ -468,7 +465,7 @@
CONE, u_col, 1)
endif
endif
endif ! (l_rows>0 .and. l_cols>0)
! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
! on the processors containing the diagonal
......@@ -477,8 +474,7 @@
if (tile_size < istep-1) then
call elpa_reduce_add_vectors_complex_PRECISION (u_row, ubound(u_row,dim=1), mpi_comm_rows, &
u_col, ubound(u_col,dim=1), mpi_comm_cols, &
(istep-1), 1, nblk)
u_col, ubound(u_col,dim=1), mpi_comm_cols, istep-1, 1, nblk)
endif
! Sum up all the u_col(:) parts, transpose u_col -> u_row
......@@ -502,6 +498,7 @@
! call elpa_transpose_vectors (u_col, 2*ubound(u_col,dim=1), mpi_comm_cols, &
! u_row, 2*ubound(u_row,dim=1), mpi_comm_rows, &
! 1, 2*(istep-1), 1, 2*nblk)
call elpa_transpose_vectors_complex_PRECISION (u_col, ubound(u_col,dim=1), mpi_comm_cols, &
u_row, ubound(u_row,dim=1), mpi_comm_rows, &
1, (istep-1), 1, nblk)
......@@ -509,7 +506,8 @@
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
xc = 0
if (l_cols>0) xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
if (l_cols>0) &
xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
......@@ -535,10 +533,10 @@
uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
enddo
! We have calculated another Householder vector, so the number of implicitly stored vectors increases
n_stored_vecs = n_stored_vecs+1
! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
if (n_stored_vecs==max_stored_uv .or. istep==3) then
if (useGPU) then
......@@ -551,7 +549,6 @@
check_memcpy_cuda("tridiag", successCUDA)
endif
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
......@@ -559,7 +556,7 @@
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
if (l_col_end<l_col_beg .or. l_row_end<l_row_beg) &
cycle
if (useGPU) then
call cublas_PRECISION_gemm('N', 'C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
CONE, vu_stored_rows_dev + (l_row_beg - 1) * size_of_PRECISION_complex, max_local_rows, &
......@@ -587,9 +584,9 @@
check_memcpy_cuda("tridiag", successCUDA)
endif
if (n_stored_vecs>0) then
if (n_stored_vecs>0) then
a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
+ dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
+ dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
end if
d_vec(istep-1) = a_mat(l_rows,l_cols)
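For readers new to this part of the algorithm, the following hedged, standalone sketch (serial, matmul instead of the cublas/BLAS gemm above, tau scaling omitted for clarity) illustrates the deferred-update idea behind n_stored_vecs and max_stored_uv: each Householder step appends a (v, u) pair to vu_stored_rows / uv_stored_cols, and only when max_stored_uv pairs have accumulated is the combined A + VU**H + UV**H correction applied with a single matrix-matrix product.

```fortran
! Hedged sketch, not ELPA code: accumulate (v,u) pairs and apply all stored
! rank-2 corrections with one matrix-matrix product.
program deferred_rank2_update
  implicit none
  integer, parameter :: n = 4, max_stored_uv = 2
  complex(kind=kind(1.0d0)) :: a(n,n), vu_rows(n,2*max_stored_uv), uv_cols(n,2*max_stored_uv)
  complex(kind=kind(1.0d0)) :: v(n), u(n)
  integer :: k, n_stored_vecs
  a = (0.0d0, 0.0d0)
  n_stored_vecs = 0
  do k = 1, max_stored_uv
    v = cmplx(k, 1, kind=kind(1.0d0))      ! stand-in for the step's Householder vector
    u = cmplx(1, -k, kind=kind(1.0d0))     ! stand-in for the corresponding u vector
    vu_rows(:, 2*n_stored_vecs+1) = v;  vu_rows(:, 2*n_stored_vecs+2) = u
    uv_cols(:, 2*n_stored_vecs+1) = u;  uv_cols(:, 2*n_stored_vecs+2) = v
    n_stored_vecs = n_stored_vecs + 1
  end do
  ! one gemm-like product applies V*U**H + U*V**H for all stored pairs at once
  a = a + matmul(vu_rows(:, 1:2*n_stored_vecs), conjg(transpose(uv_cols(:, 1:2*n_stored_vecs))))
  print *, a(1,1)
end program deferred_rank2_update
```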
......@@ -598,10 +595,10 @@
successCUDA = cuda_memcpy(a_dev + a_offset, loc(a_mat(l_rows, l_cols)), &
1 * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag", successCUDA)
endif
endif
endif
enddo ! istep
enddo ! main cycle over istep=na,3,-1
! Store e_vec(1) and d_vec(1)
......@@ -618,6 +615,7 @@
endif !useGPU
call hh_transform_complex_PRECISION(vrl, CONST_REAL_0_0, xf, tau(2))
e_vec(1) = vrl
a_mat(1,l_cols) = 1. ! for consistency only
endif
......@@ -646,19 +644,19 @@
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, &
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
1 * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
else !useGPU
d_vec(1) = a_mat(1,1)
d_vec(1) = PRECISION_REAL(a_mat(1,1))
endif !useGPU
endif
deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmp "//errorMessage
stop
print *,"tridiag_complex: error when deallocating tmp "//errorMessage
stop
endif
if (useGPU) then
......@@ -668,9 +666,11 @@
successCUDA = cuda_free(v_row_dev)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(u_row_dev)
check_dealloc_cuda("tridiag", successCUDA)
successCUDA = cuda_free(v_col_dev)
check_dealloc_cuda("tridiag", successCUDA)
......@@ -685,7 +685,6 @@
check_dealloc_cuda("tridiag", successCUDA)
endif
! distribute the arrays d_vec and e_vec to all processors
allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
......@@ -693,7 +692,6 @@
print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
stop
endif
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
......@@ -713,8 +711,8 @@
#endif /* WITH_MPI */
deallocate(tmp_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
stop
print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
stop
endif
call timer%stop("tridiag_complex" // PRECISION_SUFFIX)
......
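A functional fix visible in this hunk: previously the GPU path copied a complex-sized chunk of a_dev directly onto the real scalar d_vec(1); now the complex a(1,1) is fetched into the complex scratch aux3 and only its real part is stored, and the CPU path likewise takes PRECISION_REAL(a_mat(1,1)) instead of the raw complex value. A minimal, standalone sketch of the point (hypothetical values):

```fortran
! Hedged, self-contained illustration: the diagonal of a Hermitian matrix is
! real up to rounding, but the entry lives in a complex array, so the real
! workspace d_vec must receive only its real part, which is what
! d_vec(1) = PRECISION_REAL(aux3(1)) and PRECISION_REAL(a_mat(1,1)) now do.
program dvec_real_part_demo
  implicit none
  complex(kind=kind(1.0d0)) :: a11
  real(kind=kind(1.0d0))    :: d1
  a11 = (2.5d0, 1.0d-16)    ! tiny imaginary part left over from rounding
  d1  = real(a11)           ! keep the real part only
  print *, d1
end program dvec_real_part_demo
```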
......@@ -19,6 +19,7 @@
#define KIND_PRECISION rk8
#define PRECISION_CMPLX DCMPLX
#define PRECISION_IMAG DIMAG
#define PRECISION_REAL DREAL
#define CONST_REAL_0_0 0.0_rk8
#define CONST_REAL_1_0 1.0_rk8
#define size_of_PRECISION_complex size_of_double_complex_datatype
......@@ -43,6 +44,7 @@
#undef KIND_PRECISION
#undef PRECISION_CMPLX
#undef PRECISION_IMAG
#undef PRECISION_REAL
#undef CONST_REAL_0_0
#undef CONST_REAL_1_0
#undef size_of_PRECISION_complex
......@@ -66,6 +68,7 @@
#define KIND_PRECISION rk4
#define PRECISION_CMPLX CMPLX
#define PRECISION_IMAG AIMAG
#define PRECISION_REAL REAL
#define CONST_REAL_0_0 0.0_rk4
#define CONST_REAL_1_0 1.0_rk4
#define size_of_PRECISION_complex size_of_single_complex_datatype
......