Commit 5b525415 authored by Pavel Kus

one letter identifiers replaced

Conflicts:
	src/elpa1_trans_ev_complex_template.X90
	src/elpa1_trans_ev_real_template.X90
	src/elpa1_tridiag_complex_template.X90
	src/elpa1_tridiag_real_template.X90
parent 39d32d3e
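The commit renames single-letter dummy arguments throughout the ELPA1 templates: the matrices a and q become a_mat and q_mat, and the result vectors d and e become d_vec and e_vec. A minimal sketch of the resulting calling convention (illustrative only: plain real(8) stands in for the templates' PRECISION macros, and the toy body shows just the zero-initialisation the real routines perform):

    ! Hedged sketch, not ELPA code: the renamed tridiag-style interface.
    subroutine tridiag_sketch(na, a_mat, lda, d_vec, e_vec, tau)
      implicit none
      integer, intent(in)    :: na, lda
      real(8), intent(inout) :: a_mat(lda,na)  ! was: a
      real(8), intent(out)   :: d_vec(na)      ! was: d
      real(8), intent(out)   :: e_vec(na)      ! was: e
      real(8), intent(out)   :: tau(na)
      ! same zero-initialisation as the real routines
      d_vec(:) = 0; e_vec(:) = 0; tau(:) = 0
    end subroutine tridiag_sketch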
src/elpa1_trans_ev_complex_template.X90
@@ -59,26 +59,26 @@
 !>
 ! Parameters
 !
-!> \param na Order of matrix a, number of rows of matrix q
+!> \param na Order of matrix a_mat, number of rows of matrix q_mat
 !>
-!> \param nqc Number of columns of matrix q
+!> \param nqc Number of columns of matrix q_mat
 !>
-!> \param a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
+!> \param a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
 !> Distribution is like in Scalapack.
 !>
-!> \param lda Leading dimension of a
+!> \param lda Leading dimension of a_mat
 !>
 !> \param tau(na) Factors of the Householder vectors
 !>
-!> \param q On input: Eigenvectors of tridiagonal matrix
+!> \param q_mat On input: Eigenvectors of tridiagonal matrix
 !> On output: Transformed eigenvectors
 !> Distribution is like in Scalapack.
 !>
-!> \param ldq Leading dimension of q
+!> \param ldq Leading dimension of q_mat
 !>
 !> \param nblk blocksize of cyclic distribution, must be the same in both directions!
 !>
-!> \param matrixCols local columns of matrix a and q
+!> \param matrixCols local columns of matrix a_mat and q_mat
 !>
 !> \param mpi_comm_rows MPI-Communicator for rows
 !>
@@ -86,7 +86,7 @@
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine trans_ev_complex_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
+subroutine trans_ev_complex_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 use cuda_functions
 use iso_c_binding
 #ifdef HAVE_DETAILED_TIMINGS
@@ -100,9 +100,9 @@
 integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
 complex(kind=COMPLEX_DATATYPE) :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*)
+complex(kind=COMPLEX_DATATYPE) :: a_mat(lda,*), q_mat(ldq,*)
 #else
-complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
+complex(kind=COMPLEX_DATATYPE) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
 #endif
 logical, intent(in) :: useGPU
@@ -141,7 +141,7 @@
 totalblocks = (na-1)/nblk + 1
 max_blocks_row = (totalblocks-1)/np_rows + 1
-max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
+max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat!
 max_local_rows = max_blocks_row*nblk
 max_local_cols = max_blocks_col*nblk
@@ -172,7 +172,7 @@
 hvm = 0 ! Must be set to 0 !!!
 hvb = 0 ! Safety only
-l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
+l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
 nstor = 0
 if (useGPU) then
@@ -181,7 +181,7 @@
 ! In the complex case tau(2) /= 0
 if (my_prow == prow(1, nblk, np_rows)) then
-q(1,1:l_cols) = q(1,1:l_cols)*(CONE-tau(2))
+q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(CONE-tau(2))
 endif
 if (useGPU) then
@@ -201,8 +201,9 @@
 successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_PRECISION_complex)
 check_alloc_cuda("trans_ev", successCUDA)
-! q_dev = q
-successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
+! q_dev = q_mat
+successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_PRECISION_complex, &
+cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif
@@ -222,7 +223,7 @@
 if (my_pcol==cur_pcol) then
-hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
+hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
 if (my_prow==prow(ic-1, nblk, np_rows)) then
 hvb(nb+l_rows) = 1.
 endif
@@ -320,7 +321,7 @@
 else !useGPU
 call PRECISION_GEMM('C', 'N', nstor, l_cols, l_rows, &
 CONE, hvm, ubound(hvm,dim=1), &
-q, ldq, &
+q_mat, ldq, &
 CZERO, tmp1, nstor)
 endif !useGPU
 else !l_rows>0
@@ -374,11 +375,11 @@
 call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
 CONE, tmat, max_stored_rows, &
 tmp2, nstor)
-!q = q - hvm*tmp2
+!q_mat = q_mat - hvm*tmp2
 call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
 -CONE, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, &
-CONE, q, ldq)
+CONE, q_mat, ldq)
 #else
 call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
 CONE, tmat, max_stored_rows, &
@@ -386,7 +387,7 @@
 call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
 -CONE, hvm, ubound(hvm,dim=1), &
 tmp1, nstor, &
-CONE, q, ldq)
+CONE, q_mat, ldq)
 #endif
 endif ! useGPU
 endif ! l_rows>0
@@ -394,7 +395,7 @@
 ! if (l_rows>0) then
 ! call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
 ! call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
-! tmp2, nstor, CONE, q, ldq)
+! tmp2, nstor, CONE, q_mat, ldq)
 ! endif
 nstor = 0
@@ -409,8 +410,9 @@
 endif
 if (useGPU) then
-!q = q_dev
-successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
+!q_mat = q_dev
+successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_PRECISION_complex, &
+cudaMemcpyDeviceToHost)
 check_memcpy_cuda("trans_ev", successCUDA)
 deallocate(hvm1, stat=istat, errmsg=errorMessage)
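The hunks above only rename variables; the update trans_ev_complex_PRECISION applies to each panel is Q = Q - V*(T*(V^H*Q)), realized by the GEMM/TRMM/GEMM sequence visible in the diff. A minimal self-contained sketch of that sequence on one dense panel, assuming double-precision kinds and standard BLAS (zgemm/ztrmm); the routine name and the buffer tmp are illustrative, and the real code works on block-cyclic local pieces, optionally on the GPU:

    ! Hedged sketch of the blocked Householder back transformation.
    subroutine apply_block_reflector(l_rows, l_cols, nstor, ldq, hvm, tmat, q_mat)
      implicit none
      integer, intent(in)       :: l_rows, l_cols, nstor, ldq
      complex(8), intent(in)    :: hvm(l_rows, nstor)   ! stored Householder vectors V
      complex(8), intent(in)    :: tmat(nstor, nstor)   ! lower-triangular factor T
      complex(8), intent(inout) :: q_mat(ldq, l_cols)   ! eigenvector panel (l_rows <= ldq)
      complex(8) :: tmp(nstor, l_cols)
      complex(8), parameter :: CONE = (1.0d0, 0.0d0), CZERO = (0.0d0, 0.0d0)
      ! tmp = V^H * Q
      call zgemm('C', 'N', nstor, l_cols, l_rows, CONE, hvm, l_rows, q_mat, ldq, CZERO, tmp, nstor)
      ! tmp = T * tmp
      call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, nstor, tmp, nstor)
      ! Q = Q - V * tmp
      call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, l_rows, tmp, nstor, CONE, q_mat, ldq)
    end subroutine apply_block_reflector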
src/elpa1_trans_ev_real_template.X90
@@ -58,26 +58,26 @@
 !>
 ! Parameters
 !
-!> \param na Order of matrix a, number of rows of matrix q
+!> \param na Order of matrix a_mat, number of rows of matrix q_mat
 !>
-!> \param nqc Number of columns of matrix q
+!> \param nqc Number of columns of matrix q_mat
 !>
-!> \param a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
+!> \param a_mat(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
 !> Distribution is like in Scalapack.
 !>
-!> \param lda Leading dimension of a
+!> \param lda Leading dimension of a_mat
 !>
 !> \param tau(na) Factors of the Householder vectors
 !>
-!> \param q On input: Eigenvectors of tridiagonal matrix
+!> \param q_mat On input: Eigenvectors of tridiagonal matrix
 !> On output: Transformed eigenvectors
 !> Distribution is like in Scalapack.
 !>
-!> \param ldq Leading dimension of q
+!> \param ldq Leading dimension of q_mat
 !>
 !> \param nblk blocksize of cyclic distribution, must be the same in both directions!
 !>
-!> \param matrixCols local columns of matrix a and q
+!> \param matrixCols local columns of matrix a_mat and q_mat
 !>
 !> \param mpi_comm_rows MPI-Communicator for rows
 !>
@@ -85,7 +85,7 @@
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine M_trans_ev_real_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
+subroutine M_trans_ev_real_PRECISION(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 use cuda_functions
 use iso_c_binding
 #ifdef HAVE_DETAILED_TIMINGS
@@ -99,9 +99,9 @@
 integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
 real(kind=REAL_DATATYPE) :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-real(kind=REAL_DATATYPE) :: a(lda,*), q(ldq,*)
+real(kind=REAL_DATATYPE) :: a_mat(lda,*), q_mat(ldq,*)
 #else
-real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
+real(kind=REAL_DATATYPE) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
 #endif
 logical, intent(in) :: useGPU
@@ -138,7 +138,7 @@
 #endif
 totalblocks = (na-1)/nblk + 1
 max_blocks_row = (totalblocks-1)/np_rows + 1
-max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
+max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q_mat!
 max_local_rows = max_blocks_row*nblk
 max_local_cols = max_blocks_col*nblk
@@ -184,15 +184,15 @@
 successCUDA = cuda_malloc(q_dev, ldq * matrixCols * M_size_of_PRECISION_real)
 check_alloc_cuda("trans_ev", successCUDA)
-! q_dev = q
-successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
+! q_dev = q_mat
+successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
 check_memcpy_cuda("trans_ev", successCUDA)
 endif
 hvm = 0 ! Must be set to 0 !!!
 hvb = 0 ! Safety only
-l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
+l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat
 nstor = 0
 if (useGPU) then
@@ -214,7 +214,7 @@
 if (my_pcol==cur_pcol) then
-hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
+hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
 if (my_prow==prow(ic-1, nblk, np_rows)) then
 hvb(nb+l_rows) = 1.
 endif
@@ -312,7 +312,7 @@
 else
 call M_PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, &
 M_CONST_1_0, hvm, ubound(hvm,dim=1), &
-q, ldq, &
+q_mat, ldq, &
 M_CONST_0_0, tmp1, nstor)
 endif
@@ -368,11 +368,11 @@
 call M_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
 M_CONST_1_0, tmat, max_stored_rows, &
 tmp2, nstor)
-!q = q - hvm*tmp2
+!q_mat = q_mat - hvm*tmp2
 call M_PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
 -M_CONST_1_0, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, &
-M_CONST_1_0, q, ldq)
+M_CONST_1_0, q_mat, ldq)
 #else
 call M_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
@@ -381,7 +381,7 @@
 call M_PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
 -M_CONST_1_0, hvm, ubound(hvm,dim=1), &
 tmp1, nstor, &
-M_CONST_1_0, q, ldq)
+M_CONST_1_0, q_mat, ldq)
 #endif
 endif ! useGPU
 endif ! l_rows>0
@@ -397,8 +397,8 @@
 endif
 if (useGPU) then
-!q = q_dev
-successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyDeviceToHost)
+!q_mat = q_dev
+successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("trans_ev", successCUDA)
 deallocate(hvm1, stat=istat, errmsg=errorMessage)
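Both trans_ev templates size their local buffers through the block-cyclic helper local_index (the l_cols and l_rows assignments above). ELPA's real helper takes a fifth rounding flag (the -1 in those calls); the sketch below is an assumption, implementing only the counting behaviour that usage relies on, with 0-based process ranks:

    ! Hedged sketch: how many of the global indices 1..idx land on process
    ! my_proc of num_procs under 1D block-cyclic distribution with block nblk.
    integer function local_index_sketch(idx, my_proc, num_procs, nblk)
      implicit none
      integer, intent(in) :: idx, my_proc, num_procs, nblk
      integer :: full_sweeps, rest
      full_sweeps = idx / (num_procs * nblk)              ! complete round-robin sweeps
      rest        = idx - full_sweeps * num_procs * nblk  ! remainder of the last sweep
      local_index_sketch = full_sweeps * nblk + min(max(rest - my_proc*nblk, 0), nblk)
    end function local_index_sketch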
src/elpa1_tridiag_complex_template.X90
@@ -58,7 +58,7 @@
 !
 !> \param na Order of matrix
 !>
-!> \param a(lda,matrixCols) Distributed matrix which should be reduced.
+!> \param a_mat(lda,matrixCols) Distributed matrix which should be reduced.
 !> Distribution is like in Scalapack.
 !> In contrast to PDSYTRD, a(:,:) must be set completely (upper and lower half)
 !> a(:,:) is overwritten on exit with the Householder vectors
@@ -72,15 +72,16 @@
 !> \param mpi_comm_rows MPI-Communicator for rows
 !> \param mpi_comm_cols MPI-Communicator for columns
 !>
-!> \param d(na) Diagonal elements (returned), identical on all processors
+!> \param d_vec(na) Diagonal elements (returned), identical on all processors
 !>
-!> \param e(na) Off-Diagonal elements (returned), identical on all processors
+!> \param e_vec(na) Off-Diagonal elements (returned), identical on all processors
 !>
 !> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine tridiag_complex_PRECISION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU_param)
+subroutine tridiag_complex_PRECISION(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
+d_vec, e_vec, tau, useGPU_param)
 use cuda_functions
 use iso_c_binding
 #ifdef HAVE_DETAILED_TIMINGS
@@ -97,11 +98,11 @@
 complex(kind=COMPLEX_DATATYPE) :: tau(na)
 #ifdef USE_ASSUMED_SIZE
-complex(kind=COMPLEX_DATATYPE) :: a(lda,*)
+complex(kind=COMPLEX_DATATYPE) :: a_mat(lda,*)
 #else
-complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols)
+complex(kind=COMPLEX_DATATYPE) :: a_mat(lda,matrixCols)
 #endif
-real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na)
+real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
 integer(kind=ik), parameter :: max_stored_uv = 32
 #ifdef DOUBLE_PRECISION_COMPLEX
@@ -228,22 +229,22 @@
 endif
-d(:) = 0
-e(:) = 0
+d_vec(:) = 0
+e_vec(:) = 0
 tau(:) = 0
 n_stored_vecs = 0
-l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
-l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
+l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
+l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
 if (my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) &
-d(na) = a(l_rows,l_cols)
+d_vec(na) = a_mat(l_rows,l_cols)
 if (useGPU) then
 ! allocate memory for matrix A on the device and then copy the matrix
 successCUDA = cuda_malloc(a_dev, lda * na_cols * size_of_PRECISION_complex)
 check_alloc_cuda("tridiag", successCUDA)
-successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), lda * na_cols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * na_cols * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
 check_memcpy_cuda("tridiag", successCUDA)
 endif
@@ -273,7 +274,7 @@
 cudaMemcpyDeviceToHost)
 check_memcpy_cuda("tridiag", successCUDA)
 else
-v_row(1:l_rows) = a(1:l_rows,l_cols+1)
+v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
 endif
 if (n_stored_vecs>0 .and. l_rows>0) then
@@ -320,9 +321,9 @@
 v_row(1:l_rows) = v_row(1:l_rows) * xf
 if (my_prow==prow(istep-1, nblk, np_rows)) then
 v_row(l_rows) = 1.
-e(istep-1) = vrl
+e_vec(istep-1) = vrl
 endif
-a(1:l_rows,l_cols+1) = v_row(1:l_rows) ! store Householder vector for back transformation
+a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows) ! store Householder vector for back transformation
 endif
@@ -394,12 +395,12 @@
 #ifdef WITH_OPENMP
 if (mod(n_iter,n_threads) == my_thread) then
 call PRECISION_GEMV('C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-CONE, a(l_row_beg,l_col_beg), lda, &
+CONE, a_mat(l_row_beg,l_col_beg), lda, &
 v_row(l_row_beg), 1, &
 CONE, uc_p(l_col_beg,my_thread), 1)
 if (i/=j) then
 call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-CONE, a(l_row_beg,l_col_beg), lda, &
+CONE, a_mat(l_row_beg,l_col_beg), lda, &
 v_col(l_col_beg), 1, &
 CONE, ur_p(l_row_beg,my_thread), 1)
 endif
@@ -423,12 +424,12 @@
 else ! useGPU
 call PRECISION_GEMV('C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-CONE, a(l_row_beg,l_col_beg), lda, &
+CONE, a_mat(l_row_beg,l_col_beg), lda, &
 v_row(l_row_beg), 1, &
 CONE, u_col(l_col_beg), 1)
 if (i/=j) then
 call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-CONE, a(l_row_beg,l_col_beg), lda, &
+CONE, a_mat(l_row_beg,l_col_beg), lda, &
 v_col(l_col_beg), 1, &
 CONE, u_row(l_row_beg), 1)
 endif
@@ -569,7 +570,7 @@
 call PRECISION_GEMM('N', 'C', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
 CONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
 uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
-CONE, a(l_row_beg,l_col_beg), lda)
+CONE, a_mat(l_row_beg,l_col_beg), lda)
 endif !useGPU
 enddo
@@ -579,22 +580,22 @@
 if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then
 if (useGPU) then
-!a(l_rows,l_cols) = a_dev(l_rows,l_cols)
+!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
 a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_PRECISION_complex
-successCUDA = cuda_memcpy(loc(a(l_rows, l_cols)), a_dev + a_offset, &
+successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
 1 * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("tridiag", successCUDA)
 endif
 if (n_stored_vecs>0) then
-a(l_rows,l_cols) = a(l_rows,l_cols) &
+a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
 + dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
 end if
-d(istep-1) = a(l_rows,l_cols)
+d_vec(istep-1) = a_mat(l_rows,l_cols)
 if (useGPU) then
-!a_dev(l_rows,l_cols) = a(l_rows,l_cols)
-successCUDA = cuda_memcpy(a_dev + a_offset, loc(a(l_rows, l_cols)), &
+!a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
+successCUDA = cuda_memcpy(a_dev + a_offset, loc(a_mat(l_rows, l_cols)), &
 1 * size_of_PRECISION_complex, cudaMemcpyHostToDevice)
 check_memcpy_cuda("tridiag", successCUDA)
 endif
@@ -602,7 +603,7 @@
 enddo ! istep
-! Store e(1) and d(1)
+! Store e_vec(1) and d_vec(1)
 if (my_pcol==pcol(2, nblk, np_cols)) then
 if (my_prow==prow(1, nblk, np_rows)) then
@@ -613,11 +614,11 @@
 check_memcpy_cuda("tridiag", successCUDA)
 vrl = aux3(1)
 else !useGPU
-vrl = a(1,l_cols)
+vrl = a_mat(1,l_cols)
 endif !useGPU
 call hh_transform_complex_PRECISION(vrl, CONST_REAL_0_0, xf, tau(2))
-e(1) = vrl
-a(1,l_cols) = 1. ! for consistency only
+e_vec(1) = vrl
+a_mat(1,l_cols) = 1. ! for consistency only
 endif
 #ifdef WITH_MPI
@@ -645,11 +646,11 @@
 if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
 if(useGPU) then
-successCUDA = cuda_memcpy(loc(d(1)), a_dev, &
+successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, &
 1 * size_of_PRECISION_complex, cudaMemcpyDeviceToHost)
 check_memcpy_cuda("tridiag", successCUDA)
 else !useGPU
-d(1) = a(1,1)
+d_vec(1) = a_mat(1,1)
 endif !useGPU
 endif
@@ -661,7 +662,7 @@
 endif
 if (useGPU) then
-! todo: should we leave a on the device for further use?
+! todo: should we leave a_mat on the device for further use?
 successCUDA = cuda_free(a_dev)
 check_dealloc_cuda("tridiag", successCUDA)
@@ -685,7 +686,7 @@
 endif
-! distribute the arrays d and e to all processors
+! distribute the arrays d_vec and e_vec to all processors
 allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
 if (istat .ne. 0) then
@@ -697,14 +698,14 @@
 #ifdef HAVE_DETAILED_TIMINGS
 call timer%start("mpi_communication")
 #endif
-tmp_real = d
-call mpi_allreduce(tmp_real, d, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
-tmp_real = d
-call mpi_allreduce(tmp_real, d, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
-tmp_real = e
-call mpi_allreduce(tmp_real, e, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
-tmp_real = e
-call mpi_allreduce(tmp_real, e, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
+tmp_real = d_vec
+call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
+tmp_real = d_vec
+call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
+tmp_real = e_vec
+call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
+tmp_real = e_vec
+call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
 #ifdef HAVE_DETAILED_TIMINGS
 call timer%stop("mpi_communication")
 #endif
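The allreduce block above is unchanged except for the names: after the sweep, each process holds only the d_vec/e_vec entries it computed itself, the rest still zero from the initialisation, so summing over the row communicator and then the column communicator leaves the complete vectors identical everywhere. A minimal sketch of the pattern, assuming real(8) and MPI_REAL8 in place of the template's MPI_REAL_PRECISION:

    ! Hedged sketch of the gather-and-replicate reduction for one vector.
    subroutine distribute_vec(na, d_vec, mpi_comm_rows, mpi_comm_cols)
      use mpi
      implicit none
      integer, intent(in)    :: na, mpi_comm_rows, mpi_comm_cols
      real(8), intent(inout) :: d_vec(na)
      real(8) :: tmp_real(na)
      integer :: mpierr
      ! locally computed entries + zeros elsewhere: MPI_SUM assembles the full vector
      tmp_real = d_vec
      call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
      tmp_real = d_vec
      call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL8, MPI_SUM, mpi_comm_cols, mpierr)
    end subroutine distribute_vec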
src/elpa1_tridiag_real_template.X90
@@ -81,7 +81,7 @@
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine M_tridiag_real_PRECISION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU)
+subroutine M_tridiag_real_PRECISION(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
 use cuda_functions
 use iso_c_binding
@@ -96,11 +96,11 @@
 integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
 logical, intent(in) :: useGPU
-real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na), tau(na)
+real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na), tau(na)
 #ifdef USE_ASSUMED_SIZE
-real(kind=REAL_DATATYPE), intent(inout) :: a(lda,*)
+real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*)
 #else
-real(kind=REAL_DATATYPE), intent(inout) :: a(lda,matrixCols)
+real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
 #endif
 integer(kind=ik), parameter :: max_stored_uv = 32
@@ -283,22 +283,22 @@
 check_alloc_cuda("tridiag", successCUDA)
 endif
-d(:) = 0
-e(:) = 0
+d_vec(:) = 0
+e_vec(:) = 0
 tau(:) = 0
 n_stored_vecs = 0
-l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a
-l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a
+l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
+l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
 if(my_prow==prow(na, nblk, np_rows) .and. my_pcol==pcol(na, nblk, np_cols)) &
-d(na) = a(l_rows,l_cols)
+d_vec(na) = a_mat(l_rows,l_cols)
 if (useGPU) then
 ! allocate memory for matrix A on the device and then copy the matrix
 successCUDA = cuda_malloc(a_dev, lda * na_cols * M_size_of_PRECISION_real)
 check_alloc_cuda("tridiag", successCUDA)
-successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), lda * na_cols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
+successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * na_cols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
 check_memcpy_cuda("tridiag", successCUDA)
 endif
@@ -331,7 +331,7 @@
 cudaMemcpyDeviceToHost)
 check_memcpy_cuda("tridiag", successCUDA)
 else
-v_row(1:l_rows) = a(1:l_rows,l_cols+1)
+v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
 endif
 !if((my_prow == 1) .and. (my_pcol == 1)) write(*,'(2I2, A, 20G14.3)'), l_rows, l_cols, "v_row:", v_row(:)
@@ -376,11 +376,11 @@
 v_row(l_rows) = 1.
 ! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
-e(istep-1) = vrl
+e_vec(istep-1) = vrl
 endif
 ! store Householder vector for back transformation
-a(1:l_rows,l_cols+1) = v_row(1:l_rows)
+a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
 ! add tau after the end of the actual v_row, to be broadcast with it
 v_row(l_rows+1) = tau(istep)
@@ -440,10 +440,10 @@
 ! enddo
 !
 !--- for now, just use DSYMV!!!
-! a_dev -> a ?
-!write(*,*) "ubound ", ubound(a,1), "lda", lda, "lcols", l_cols
+! a_dev -> a_mat ?
+!write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
 ! call M_PRECISION_SYMV('U', l_cols, &
-! 1.d0, a, ubound(a,1), &
+! 1.d0, a_mat, ubound(a_mat,1), &
 ! v_row, 1, &
 ! 0.d0, u_col, 1)
@@ -475,12 +475,12 @@
 #ifdef WITH_OPENMP
 if (mod(n_iter,n_threads) == my_thread) then
 call M_PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-M_CONST_1_0, a(l_row_beg,l_col_beg), lda, &
+M_CONST_1_0, a_mat(l_row_beg,l_col_beg), lda, &
 v_row(l_row_beg), 1, &
 M_CONST_1_0, uc_p(l_col_beg,my_thread), 1)
 if (i/=j) then
 call M_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-M_CONST_1_0, a(l_row_beg,l_col_beg), lda, &
+M_CONST_1_0, a_mat(l_row_beg,l_col_beg), lda, &
 v_col(l_col_beg), 1, &
 M_CONST_1_0, ur_p(l_row_beg,my_thread), 1)
 endif
@@ -503,12 +503,12 @@
 else ! useGPU
 call M_PRECISION_GEMV('T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-M_CONST_1_0, a(l_row_beg, l_col_beg), lda, &
+M_CONST_1_0, a_mat(l_row_beg, l_col_beg), lda, &
 v_row(l_row_beg), 1, &
 M_CONST_1_0, u_col(l_col_beg), 1)
 if (i/=j) then
 call M_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-M_CONST_1_0, a(l_row_beg,l_col_beg), lda, &
+M_CONST_1_0, a_mat(l_row_beg,l_col_beg), lda, &
 v_col(l_col_beg), 1, &
 M_CONST_1_0, u_row(l_row_beg), 1)
 endif
@@ -526,7 +526,7 @@
 endif
 ! call M_PRECISION_SYMV('U', l_cols, &
-! 1.d0, a, ubound(a,1), &
+! 1.d0, a_mat, ubound(a_mat,1), &
 ! v_row, 1, &
 ! 0.d0, u_col, 1)
@@ -648,7 +648,7 @@
 call M_PRECISION_GEMM('N', 'T', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
 M_CONST_1_0, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
 uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
-M_CONST_1_0, a(l_row_beg,l_col_beg), lda)
+M_CONST_1_0, a_mat(l_row_beg,l_col_beg), lda)
 endif ! useGPU
 enddo
@@ -658,22 +658,22 @@
 if (my_prow==prow(istep-1, nblk, np_rows) .and. my_pcol==pcol(istep-1, nblk, np_cols)) then
 if (useGPU) then
-!a(l_rows,l_cols) = a_dev(l_rows,l_cols)
+!a_mat(l_rows,l_cols) = a_dev(l_rows,l_cols)
 a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * M_size_of_PRECISION_real
-successCUDA = cuda_memcpy(loc(a(l_rows, l_cols)), a_dev + a_offset, &
+successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &