Commit 1aaabf51 authored by Andreas Marek's avatar Andreas Marek
Browse files

Remove some unecessary copies if compiled without MPI

parent ad475148
......@@ -275,12 +275,20 @@
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
#else /* WITH_MPI */
! aux2 = aux1
vnorm2 = aux1(1)
vrl = aux1(2)
#endif /* WITH_MPI */
! vnorm2 = aux2(1)
! vrl = aux2(2)
! Householder transformation
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(vrl, vnorm2, xf, tau(istep))
......@@ -878,26 +886,58 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, CONE, q, ldq)
endif
#else
! tmp2 = tmp1
if (l_rows>0) then
call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, CONE, q, ldq)
endif
#endif
! if (l_rows>0) then
! call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
! call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
! tmp2, nstor, CONE, q, ldq)
! endif
#else /* DOUBLE_PRECISION_COMPLEX */
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, CONE, q, ldq)
endif
#else
! tmp2 = tmp1
if (l_rows>0) then
call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, CONE, q, ldq)
endif
#endif
!
! if (l_rows>0) then
! call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
! call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
! tmp2, nstor, CONE, q, ldq)
! endif
#endif /* DOUBLE_PRECISION_COMPLEX */
nstor = 0
endif
......@@ -1199,16 +1239,26 @@
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
tmp2 = tmp1
#endif /* WITH_MPI */
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
#else /* WITH_MPI */
! tmp2 = tmp1
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
#endif /* WITH_MPI */
! ! Put the result into C
! if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage
......@@ -1415,6 +1465,7 @@
#endif
#endif /* WITH_MPI */
nc = 0
do i=1,nblk
tmp2(1:i,i) = tmp1(nc+1:nc+i)
......@@ -1688,6 +1739,7 @@
#endif
#endif /* WITH_MPI */
if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8), &
......
......@@ -313,6 +313,7 @@
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
......@@ -921,14 +922,20 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
! tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
#ifdef WITH_MPI
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk8, q, ldq)
#else
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp1, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
tmp1, nstor, 1.0_rk8, q, ldq)
#endif
endif
#else /* DOUBLE_PRECISION_REAL */
......@@ -937,14 +944,20 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
! tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
#ifdef WITH_MPI
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk4, q, ldq)
#else
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp1, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
tmp1, nstor, 1.0_rk4, q, ldq)
#endif
endif
#endif /* DOUBLE_PRECISION_REAL */
nstor = 0
......@@ -1256,12 +1269,15 @@
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
#else /* WITH_MPI */
tmp2 = tmp1
#endif /* WITH_MPI */
! tmp2 = tmp1
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
#endif /* WITH_MPI */
deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -1833,14 +1849,27 @@
#endif
#else /* WITH_MPI */
qmat2 = qmat1 ! is this correct
! qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
do i=1,nlen
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call distribute_global_column_double(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else
call distribute_global_column_single(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
call distribute_global_column_double(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else
call distribute_global_column_single(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif
#endif /* WITH_MPI */
enddo
enddo
......
......@@ -506,9 +506,6 @@
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
......@@ -520,6 +517,35 @@
endif
enddo
#else /* WITH_MPI */
! if (nlc>0) aux2=aux1
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
enddo
! Calculate scalar products of stored Householder vectors.
......@@ -726,6 +752,7 @@
istep*nbw, n_cols, nblk)
#endif
endif
#ifdef WITH_MPI
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
......@@ -733,11 +760,13 @@
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -745,24 +774,27 @@
stop
endif
endif
#else /* WITH_MPI */
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmp "//errorMessage
stop
endif
endif
! if (l_cols>0) then
! allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when allocating tmp "//errorMessage
! stop
! endif
! tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
!
! umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
! deallocate(tmp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when deallocating tmp "//errorMessage
! stop
! endif
! endif
#endif /* WITH_MPI */
! U = U * Tmat**T
if (useGPU) then
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
......@@ -1138,9 +1170,6 @@
call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
#endif
#else /* WITH_MPI */
h2(1:nc) = h1(1:nc)
#endif /* WITH_MPI */
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
......@@ -1148,6 +1177,27 @@
nc = nc+i
enddo
#else /* WITH_MPI */
! h2(1:nc) = h1(1:nc)
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = conjg(a(1:i-1,i))
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("herm_matrix_allreduce_double")
......@@ -1491,16 +1541,30 @@
#endif
#else /* WITH_MPI */
tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
if (l_rows>0) then
if (useGPU) then
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
#endif /* WITH_MPI */
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
......@@ -1528,6 +1592,9 @@
tmp_dev, n_cols, CONE, q_dev, ldq)
#endif
else ! not useGPU
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
......@@ -1537,6 +1604,21 @@
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#else
call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#endif
#endif /* WITH_MPI */
endif
endif
......@@ -3403,7 +3485,9 @@
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
......@@ -3420,7 +3504,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
......@@ -3436,7 +3522,8 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
......@@ -3603,7 +3690,9 @@
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
......@@ -3620,7 +3709,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
......@@ -3635,7 +3726,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* DOUBLE_PRECISION_COMPLEX */
......
......@@ -1082,11 +1082,13 @@
#else
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, ierr)
#endif
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
#else /* WITH_MPI */
tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
#endif /* WITH_MPI */
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
......@@ -1273,10 +1275,11 @@
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
#else /* WITH_MPI */
tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
! tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
#endif /* WITH_MPI */
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -1588,9 +1591,6 @@
call mpi_allreduce(h1, h2, nc, MPI_REAL4, MPI_SUM, comm, mpierr)
#endif
#else /* WITH_MPI */
h2=h1
#endif /* WITH_MPI */
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
......@@ -1598,6 +1598,24 @@
nc = nc+i
enddo
#else /* WITH_MPI */
! h2=h1
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = a(1:i-1,i)
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
......@@ -1910,7 +1928,7 @@
tmp1(1:l_cols*n_cols) = 0
#else
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
#ifdef WITH_GPU_VERSION
#ifdef DOUBLE_PRECISION_REAL
......@@ -1944,7 +1962,9 @@
#endif
#else /* WITH_MPI */
tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
......@@ -1956,7 +1976,7 @@
if (l_rows>0) then
#ifdef WITH_MPI
! after the allreduce we have to copy back to the device
! after the mpi_allreduce we have to copy back to the device
#ifdef DOUBLE_PRECISION_REAL
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_double_real_datatype,cudaMemcpyHostToDevice)
......@@ -2122,26 +2142,59 @@
max_local_rows, 0.0_rk8, t_tmp, cwy_blocking)
#ifdef WITH_MPI
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
#endif
call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp2, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif
! call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
! call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
! t_tmp2, cwy_blocking)
#else /* DOUBLE_PRECISION_REAL */
call sgemm('T', 'N', t_rows, t_cols, l_rows, 1.0_rk4, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, 0.0_rk4, t_tmp, cwy_blocking)
#ifdef WITH_MPI
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
#endif
call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
cwy_blocking, t_tmp2, cwy_blocking)
#endif /* DOUBLE_PRECISION_REAL */
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
cwy_blocking, t_tmp, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif
! call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
! call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
! cwy_blocking, t_tmp2, cwy_blocking)