Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
1aaabf51
Commit
1aaabf51
authored
Jun 10, 2016
by
Andreas Marek
Browse files
Remove some unecessary copies if compiled without MPI
parent
ad475148
Changes
4
Hide whitespace changes
Inline
Side-by-side
src/elpa1_compute_complex_template.X90
View file @
1aaabf51
...
...
@@ -275,12 +275,20 @@
call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
#else /* WITH_MPI */
! aux2 = aux1
vnorm2 = aux1(1)
vrl = aux1(2)
#endif /* WITH_MPI */
! vnorm2 = aux2(1)
! vrl = aux2(2)
! Householder transformation
#ifdef DOUBLE_PRECISION_COMPLEX
call hh_transform_complex_double(vrl, vnorm2, xf, tau(istep))
...
...
@@ -878,26 +886,58 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, CONE, q, ldq)
endif
#else
! tmp2 = tmp1
if (l_rows>0) then
call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, CONE, q, ldq)
endif
#endif
! if (l_rows>0) then
! call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
! call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
! tmp2, nstor, CONE, q, ldq)
! endif
#else /* DOUBLE_PRECISION_COMPLEX */
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
#endif
if (l_rows>0) then
call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, nstor, CONE, q, ldq)
endif
#else
! tmp2 = tmp1
if (l_rows>0) then
call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, nstor, CONE, q, ldq)
endif
#endif
!
! if (l_rows>0) then
! call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
! call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
! tmp2, nstor, CONE, q, ldq)
! endif
#endif /* DOUBLE_PRECISION_COMPLEX */
nstor = 0
endif
...
...
@@ -1199,16 +1239,26 @@
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
tmp2 = tmp1
#endif /* WITH_MPI */
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
#else /* WITH_MPI */
! tmp2 = tmp1
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
#endif /* WITH_MPI */
! ! Put the result into C
! if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage
...
...
@@ -1415,6 +1465,7 @@
#endif
#endif /* WITH_MPI */
nc = 0
do i=1,nblk
tmp2(1:i,i) = tmp1(nc+1:nc+i)
...
...
@@ -1688,6 +1739,7 @@
#endif
#endif /* WITH_MPI */
if (l_row1>1 .and. l_cols-l_col1+1>0) &
#ifdef DOUBLE_PRECISION_COMPLEX
call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8), &
...
...
src/elpa1_compute_real_template.X90
View file @
1aaabf51
...
...
@@ -313,6 +313,7 @@
! Broadcast the Householder vector (and tau) along columns
if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -921,14 +922,20 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
!
tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
#ifdef WITH_MPI
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk8, q, ldq)
#else
call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp1, nstor)
call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
tmp1, nstor, 1.0_rk8, q, ldq)
#endif
endif
#else /* DOUBLE_PRECISION_REAL */
...
...
@@ -937,14 +944,20 @@
#ifdef WITH_MPI
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
tmp2 = tmp1
!
tmp2 = tmp1
#endif
! endif ! useGPU
if (l_rows>0) then
#ifdef WITH_MPI
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
tmp2, nstor, 1.0_rk4, q, ldq)
#else
call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp1, nstor)
call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
tmp1, nstor, 1.0_rk4, q, ldq)
#endif
endif
#endif /* DOUBLE_PRECISION_REAL */
nstor = 0
...
...
@@ -1256,12 +1269,15 @@
#else
call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
#endif
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
#else /* WITH_MPI */
tmp2 = tmp1
#endif /* WITH_MPI */
! tmp2 = tmp1
! Put the result into C
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
#endif /* WITH_MPI */
deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
...
...
@@ -1833,14 +1849,27 @@
#endif
#else /* WITH_MPI */
qmat2 = qmat1 ! is this correct
!
qmat2 = qmat1 ! is this correct
#endif /* WITH_MPI */
do i=1,nlen
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_REAL
call distribute_global_column_double(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else
call distribute_global_column_single(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
call distribute_global_column_double(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else
call distribute_global_column_single(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif
#endif /* WITH_MPI */
enddo
enddo
...
...
src/elpa2_compute_complex_template.X90
View file @
1aaabf51
...
...
@@ -506,9 +506,6 @@
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
...
...
@@ -520,6 +517,35 @@
endif
enddo
#else /* WITH_MPI */
! if (nlc>0) aux2=aux1
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
enddo
! Calculate scalar products of stored Householder vectors.
...
...
@@ -726,6 +752,7 @@
istep*nbw, n_cols, nblk)
#endif
endif
#ifdef WITH_MPI
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
...
...
@@ -733,11 +760,13 @@
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_COMPLEX
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#else
call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
...
...
@@ -745,24 +774,27 @@
stop
endif
endif
#else /* WITH_MPI */
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmp "//errorMessage
stop
endif
endif
! if (l_cols>0) then
! allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when allocating tmp "//errorMessage
! stop
! endif
! tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
!
! umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
! deallocate(tmp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"bandred_complex: error when deallocating tmp "//errorMessage
! stop
! endif
! endif
#endif /* WITH_MPI */
! U = U * Tmat**T
if (useGPU) then
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
...
...
@@ -1138,9 +1170,6 @@
call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
#endif
#else /* WITH_MPI */
h2(1:nc) = h1(1:nc)
#endif /* WITH_MPI */
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
...
...
@@ -1148,6 +1177,27 @@
nc = nc+i
enddo
#else /* WITH_MPI */
! h2(1:nc) = h1(1:nc)
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = conjg(a(1:i-1,i))
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = conjg(a(1:i-1,i))
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%stop("herm_matrix_allreduce_double")
...
...
@@ -1491,16 +1541,30 @@
#endif
#else /* WITH_MPI */
tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
if (l_rows>0) then
if (useGPU) then
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
#endif
#endif /* WITH_MPI */
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
...
...
@@ -1528,6 +1592,9 @@
tmp_dev, n_cols, CONE, q_dev, ldq)
#endif
else ! not useGPU
#ifdef WITH_MPI
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
...
...
@@ -1537,6 +1604,21 @@
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp2, n_cols, CONE, q, ldq)
#endif
#else /* WITH_MPI */
#ifdef DOUBLE_PRECISION_COMPLEX
call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#else
call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
tmp1, n_cols, CONE, q, ldq)
#endif
#endif /* WITH_MPI */
endif
endif
...
...
@@ -3403,7 +3485,9 @@
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
...
...
@@ -3420,7 +3504,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
...
...
@@ -3436,7 +3522,8 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
...
...
@@ -3603,7 +3690,9 @@
#endif
#else /* WITH_MPI */
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
...
...
@@ -3620,7 +3709,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#else /* DOUBLE_PRECISION_COMPLEX */
...
...
@@ -3635,7 +3726,9 @@
#ifdef WITH_MPI
call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
#else
row(1:l_nev) = row(1:l_nev)
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* DOUBLE_PRECISION_COMPLEX */
...
...
src/elpa2_compute_real_template.X90
View file @
1aaabf51
...
...
@@ -1082,11 +1082,13 @@
#else
call mpi_allreduce(umcCUDA, tmpCUDA, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, ierr)
#endif
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
#else /* WITH_MPI */
tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
! tmpCUDA(1 : l_cols * n_cols) = umcCUDA(1 : l_cols * n_cols)
#endif /* WITH_MPI */
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
...
...
@@ -1273,10 +1275,11 @@
call mpi_allreduce(umcCPU, tmpCPU, l_cols*n_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#endif
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
#else /* WITH_MPI */
tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
!
tmpCPU(1:l_cols,1:n_cols) = umcCPU(1:l_cols,1:n_cols)
#endif /* WITH_MPI */
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
...
...
@@ -1588,9 +1591,6 @@
call mpi_allreduce(h1, h2, nc, MPI_REAL4, MPI_SUM, comm, mpierr)
#endif
#else /* WITH_MPI */
h2=h1
#endif /* WITH_MPI */
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
...
...
@@ -1598,6 +1598,24 @@
nc = nc+i
enddo
#else /* WITH_MPI */
! h2=h1
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
a(i,1:i-1) = a(1:i-1,i)
nc = nc+i
enddo
#endif /* WITH_MPI */
! nc = 0
! do i=1,n
! a(1:i,i) = h2(nc+1:nc+i)
! a(i,1:i-1) = a(1:i-1,i)
! nc = nc+i
! enddo
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -1910,7 +1928,7 @@
tmp1(1:l_cols*n_cols) = 0
#else
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
! if MPI is not used (we do not need to transfer because of MPI_ALLREDUCE) we can set to zero on the device
#ifdef WITH_GPU_VERSION
#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -1944,7 +1962,9 @@
#endif
#else /* WITH_MPI */
tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
...
...
@@ -1956,7 +1976,7 @@
if (l_rows>0) then
#ifdef WITH_MPI
! after the allreduce we have to copy back to the device
! after the
mpi_
allreduce we have to copy back to the device
#ifdef DOUBLE_PRECISION_REAL
! copy back to device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_double_real_datatype,cudaMemcpyHostToDevice)
...
...
@@ -2122,26 +2142,59 @@
max_local_rows, 0.0_rk8, t_tmp, cwy_blocking)
#ifdef WITH_MPI
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
#else
t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
#endif
call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp2, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
t_tmp, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif
! call dtrmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk8, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
! call dtrmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk8, tmat_complete(t_rows+1,t_rows+1), cwy_blocking, &
! t_tmp2, cwy_blocking)
#else /* DOUBLE_PRECISION_REAL */
call sgemm('T', 'N', t_rows, t_cols, l_rows, 1.0_rk4, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, 0.0_rk4, t_tmp, cwy_blocking)
#ifdef WITH_MPI
call mpi_allreduce(t_tmp, t_tmp2, cwy_blocking*nbw, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
#else
t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
#endif
call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
cwy_blocking, t_tmp2, cwy_blocking)
#endif /* DOUBLE_PRECISION_REAL */
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
#else
! t_tmp2(1:cwy_blocking,1:nbw) = t_tmp(1:cwy_blocking,1:nbw)
call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp, cwy_blocking)
call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
cwy_blocking, t_tmp, cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp(1:t_rows,1:t_cols)
#endif
! call strmm('L', 'U', 'N', 'N', t_rows, t_cols, 1.0_rk4, tmat_complete, cwy_blocking, t_tmp2, cwy_blocking)
! call strmm('R', 'U', 'N', 'N', t_rows, t_cols, -1.0_rk4, tmat_complete(t_rows+1,t_rows+1), &
! cwy_blocking, t_tmp2, cwy_blocking)
<