Commit 80951583 authored by Andreas Marek

Merge branch 'master_pre_stage' of https://gitlab.mpcdf.mpg.de/elpa/elpa into master_pre_stage

parents a9f1fc4b 2f62f542
@@ -583,6 +583,16 @@ for cc, fc, m, o, p, a, b, g, instr, addr, na in product(
if (instr != "avx" and instr != "sse" and addr == "address-sanitize"):
MasterOnly=True
# make non-master tests even faster
# kicking out gpu is not good, but at the moment we have a real problem with gpu runners
# this should be reverted once that is solved
if (g == "with-gpu"):
MasterOnly=True
if (a == "no-assumed-size"):
MasterOnly=True
if (instr == "avx2" or instr == "avx512"):
MasterOnly=True
print("# " + cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" + a + "-" + b + "-" +g + "-" + cov + "-" + instr + "-" + addr)
print(cc + "-" + fc + "-" + m + "-" + o + "-" + p + "-" +a + "-" +b + "-" +g + "-" + cov + "-" + instr + "-" + addr + "-jobs:")
if (MasterOnly):
......
@@ -98,61 +98,28 @@
use precision
use elpa_abstract_impl
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(in) :: tau(na)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), intent(in) :: tau(na)
#endif
MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU
integer(kind=ik) :: max_stored_rows
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
real(kind=rk8), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
real(kind=rk4), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=ck8), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=ck4), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, nstor
integer(kind=ik) :: istep, n, nc, ic, ics, ice, nb, cur_pcol
integer(kind=ik) :: hvn_ubnd, hvm_ubnd
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
real(kind=REAL_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
MATH_DATATYPE(kind=rck), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
MATH_DATATYPE(kind=rck), allocatable :: tmat(:,:), h1(:), h2(:), hvm1(:)
integer(kind=ik) :: istat
character(200) :: errorMessage
character(20) :: gpuString
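The declarations above replace the duplicated REALCASE/COMPLEXCASE branches with the single-source macros MATH_DATATYPE and rck. As a minimal sketch of the assumed expansion (for illustration only; the actual definitions live in ELPA's generated precision/datatype headers, which are not part of this diff):

! Sketch only: assumed expansions, not the verbatim ELPA precision header.
#if REALCASE == 1
#define MATH_DATATYPE real
#ifdef DOUBLE_PRECISION_REAL
#define rck rk8
#else
#define rck rk4
#endif
#endif
#if COMPLEXCASE == 1
#define MATH_DATATYPE complex
#ifdef DOUBLE_PRECISION_COMPLEX
#define rck ck8
#else
#define rck ck4
#endif
#endif
! With these definitions a single declaration such as
!   MATH_DATATYPE(kind=rck), intent(in) :: tau(na)
! expands to real(kind=rk8) / complex(kind=ck8) etc., which is why the
! per-case #if blocks around the declarations can be dropped.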
@@ -294,15 +261,7 @@
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call MPI_Bcast(hvb, nb, MPI_MATH_DATATYPE_PRECISION , cur_pcol, mpi_comm_cols, mpierr)
call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
@@ -341,14 +300,7 @@
enddo
#ifdef WITH_MPI
call obj%timer%start("mpi_communication")
if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
if (nc>0) call mpi_allreduce( h1, h2, nc, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
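The same single-source idea applies to the MPI calls: the case-specific datatype macros MPI_REAL_PRECISION and MPI_COMPLEX_PRECISION, which the removed branches selected by hand, are folded into one macro. A minimal sketch of the assumed mapping:

! Sketch only: the assumed definition behind the unified MPI datatype macro.
#if REALCASE == 1
#define MPI_MATH_DATATYPE_PRECISION MPI_REAL_PRECISION
#endif
#if COMPLEXCASE == 1
#define MPI_MATH_DATATYPE_PRECISION MPI_COMPLEX_PRECISION
#endif
! so that, e.g.,
!   call MPI_Bcast(hvb, nb, MPI_MATH_DATATYPE_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
! picks the correct MPI datatype for whichever case is compiled, without any
! #if around the call itself.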
@@ -361,13 +313,7 @@
tmat(1,1) = tau(ice-nstor+1)
do n = 1, nstor-1
call obj%timer%start("blas")
#if REALCASE == 1
call PRECISION_TRMV('L', 'T', 'N', &
#endif
#if COMPLEXCASE == 1
call PRECISION_TRMV('L', 'C', 'N', &
#endif
n, tmat, max_stored_rows, h2(nc+1), 1)
call PRECISION_TRMV('L', BLAS_TRANS_OR_CONJ , 'N', n, tmat, max_stored_rows, h2(nc+1), 1)
call obj%timer%stop("blas")
tmat(n+1,1:n) = &
@@ -404,25 +350,15 @@
if (l_rows>0) then
if (useGPU) then
call obj%timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_GEMM('C', 'N', &
#endif
call cublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, ZERO, tmp_dev, nstor)
q_dev, ldq, ZERO, tmp_dev, nstor)
call obj%timer%stop("cublas")
else ! useGPU
call obj%timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('C', 'N', &
#endif
call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', &
nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
q_mat, ldq, ZERO, tmp1, nstor)
call obj%timer%stop("blas")
@@ -447,14 +383,7 @@
check_memcpy_cuda("trans_ev", successCUDA)
endif
call obj%timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call obj%timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
......
@@ -111,27 +111,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(out) :: tau(na)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), intent(out) :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
MATH_DATATYPE(kind=rck), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
real(kind=rk), intent(out) :: d_vec(na), e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
logical, parameter :: mat_vec_as_one_block = .true.
@@ -160,50 +146,31 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
integer(kind=ik) :: my_thread, n_threads, n_iter
#endif
real(kind=REAL_DATATYPE) :: vnorm2
#if REALCASE == 1
real(kind=REAL_DATATYPE) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE) :: vav, xc, aux(2*max_stored_uv), aux1(2), aux2(2), aux3(1), vrl, xf
complex(kind=rck) :: aux3(1)
#endif
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
MATH_DATATYPE(kind=rck), allocatable :: tmp(:), &
v_row(:), & ! used to store the calculated Householder vector
v_col(:), & ! the same vector, but transposed and distributed differently among the MPI tasks
u_row(:), &
u_col(:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored both in row and column form
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: it is a little bit confusing, I think, that variables _row actually store columns and vice versa
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: vu_stored_rows(:,:)
MATH_DATATYPE(kind=rck), allocatable :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
real(kind=REAL_DATATYPE), allocatable :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif
MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
#ifdef WITH_OPENMP
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp_real(:)
#endif
real(kind=rk), allocatable :: tmp_real(:)
integer(kind=ik) :: min_tile_size, error
integer(kind=ik) :: istat
character(200) :: errorMessage
@@ -738,34 +705,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
x = 0
if (l_cols>0) &
x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#if COMPLEXCASE == 1
xc = 0
if (l_cols>0) &
xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
#if REALCASE == 1
call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
#if REALCASE == 1
vav = x
#endif
#if COMPLEXCASE == 1
vav = xc
#endif
#endif /* WITH_MPI */
@@ -977,12 +927,7 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"tridiag_real: error when deallocating uv_stored_cols "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"tridiag_complex: error when deallocating tmp "//errorMessage
#endif
print *,"tridiag: error when deallocating "//errorMessage
stop 1
endif
@@ -1012,62 +957,31 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
! distribute the arrays d_vec and e_vec to all processors
#if REALCASE == 1
allocate(tmp(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop 1
endif
#endif
#if COMPLEXCASE == 1
allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
print *,"tridiag: error when allocating tmp_real "//errorMessage
stop 1
endif
#endif
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
#if REALCASE == 1
tmp = d_vec
call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d_vec
call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e_vec
call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e_vec
call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION ,MPI_SUM, mpi_comm_cols, mpierr)
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
if (wantDebug) call obj%timer%stop("mpi_communication")
#endif /* WITH_MPI */
#if REALCASE == 1
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when deallocating tmp "//errorMessage
stop 1
endif
#endif
#if COMPLEXCASE == 1
deallocate(tmp_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
print *,"tridiag: error when deallocating tmp_real "//errorMessage
stop 1
endif
#endif
call obj%timer%stop("tridiag_&
&MATH_DATATYPE&
......
@@ -64,8 +64,8 @@
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, na, a, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, useGPU, success, &
(obj, na, a_mat, a_dev, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, &
tmat_dev, wantDebug, useGPU, success, &
#if REALCASE == 1
useQR, &
#endif
@@ -78,14 +78,14 @@
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! a_mat(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! Opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! Opposed to Scalapack, a_mat(:,:) must be set completely (upper and lower half)
! a_mat(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
! lda Leading dimension of a_mat
! matrixCols local columns of matrix a_mat
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
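The header comment above refers to the ScaLAPACK-style block-cyclic layout that lda, matrixCols and nblk describe. As a rough, self-contained illustration (a hypothetical standalone helper, not part of ELPA or this diff), the number of locally stored rows or columns of such a distribution can be computed as follows, mirroring what ScaLAPACK's NUMROC does for a source process 0 and a 0-based process index:

! Hypothetical helper, for illustration only: how many entries of an
! na-sized dimension, distributed block-cyclically with block size nblk,
! land on process 'myproc' out of 'nprocs'.
integer function local_count(na, nblk, myproc, nprocs) result(nloc)
  implicit none
  integer, intent(in) :: na, nblk, myproc, nprocs
  integer :: nblocks, extra
  nblocks = na / nblk                               ! number of complete blocks
  nloc    = (nblocks / nprocs) * nblk               ! complete rounds over all processes
  extra   = mod(nblocks, nprocs)                    ! leftover complete blocks
  if (myproc <  extra) nloc = nloc + nblk           ! this process gets one more full block
  if (myproc == extra) nloc = nloc + mod(na, nblk)  ! plus the trailing partial block, if any
end function local_count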
@@ -114,15 +114,16 @@
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
MATH_DATATYPE(kind=rck) :: a(lda,*), tmat(nbw,nbw,*)
MATH_DATATYPE(kind=rck) :: a_mat(lda,*), tmat(nbw,nbw,*)
#else
MATH_DATATYPE(kind=rck) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
MATH_DATATYPE(kind=rck) :: a_mat(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
#if REALCASE == 1
real(kind=rk) :: eps
#endif
logical, intent(in) :: useGPU
character(20) :: gpuString
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows
@@ -187,12 +188,18 @@
logical :: useGPU_reduction_lower_block_to_tridiagonal
integer(kind=ik), intent(in) :: max_threads
if(useGPU) then
gpuString = "_gpu"
else
gpuString = ""
endif
call obj%timer%start("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
PRECISION_SUFFIX // &
gpuString )
useGPU_reduction_lower_block_to_tridiagonal = .false.
if (useGPU) then
@@ -324,14 +331,14 @@
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
&(obj, a_mat(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
@@ -360,7 +367,7 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), (lda)*(na_cols)* size_of_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_&
&MATH_DATATYPE&
@@ -538,7 +545,7 @@
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
successCUDA = cuda_memcpy2d(loc(a_mat(1, lc_start)), &
int((lda*size_of_datatype),kind=c_intptr_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_datatype),kind=c_intptr_t )), &
int(lda*size_of_datatype,kind=c_intptr_t), &
@@ -567,7 +574,7 @@
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
&(obj, a_mat, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
@@ -577,7 +584,7 @@
#else
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(obj, a(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
&(obj, a_mat(1:lda,1:matrixCols), lda, matrixCols, vmrCPU(1:max(l_rows,1),1:vmrCols) , &
max(l_rows,1), vmrCols, tauvector(1:na), na, &
tmat(1:nbw,1:nbw,istep), nbw, nbw, work_blocked(1:work_size), work_size, &
work_size, na, n_cols, nblk, nblk, &
@@ -608,7 +615,7 @@
! Get the vector to be transformed; distribute the last element and the norm of
! the remaining elements to all procs in the current column
vr(1:lr) = a(1:lr,lch) ! Vector to be transformed
vr(1:lr) = a_mat(1:lr,lch) ! Vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
@@ -646,11 +653,11 @@
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
a_mat(1:lr-1,lch) = vr(1:lr-1)
a_mat(lr,lch) = vrl
vr(lr) = 1.0_rck
else
a(1:lr,lch) = vr(1:lr)
a_mat(1:lr,lch) = vr(1:lr)
endif
endif
@@ -692,7 +699,7 @@
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
@@ -708,7 +715,7 @@
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
@@ -726,7 +733,7 @@
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
@@ -739,7 +746,7 @@
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
@@ -762,7 +769,7 @@
if (lcx>0 ) then
mynlc = mynlc+1
if ( mod((j-1), omp_get_num_threads()) .eq. omp_get_thread_num() ) then
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a(1:lr,lcx))
if (lr>0) aux1(mynlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
endif
enddo