Commit 1aaabf51 authored by Andreas Marek

Remove some unnecessary copies if compiled without MPI

parent ad475148
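The change applies one pattern throughout: when ELPA is built without MPI, the result of what would have been an MPI reduction is already in the send buffer, so the copy into the receive buffer (e.g. `aux2 = aux1`, `tmp2 = tmp1`) is dropped and the send buffer is read directly. A minimal standalone sketch of that pattern follows (a hypothetical example, not ELPA code; the variable names only mirror the diff):

    ! Sketch: skip the copy into the reduce buffer when built without MPI.
    ! Compile with -DWITH_MPI (and an MPI library) or without it.
    program no_copy_without_mpi
    #ifdef WITH_MPI
      use mpi
    #endif
      implicit none
      real(kind=8) :: aux1(2), aux2(2), vnorm2, vrl
    #ifdef WITH_MPI
      integer :: mpierr
    #endif
      aux1 = [3.0d0, 4.0d0]
    #ifdef WITH_MPI
      call mpi_init(mpierr)
      ! The reduction writes into aux2, so the result is read from aux2.
      call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, MPI_COMM_WORLD, mpierr)
      vnorm2 = aux2(1)
      vrl    = aux2(2)
      call mpi_finalize(mpierr)
    #else
      ! Old code copied first (aux2 = aux1) and then read aux2;
      ! after this commit the copy is skipped and aux1 is read directly.
      vnorm2 = aux1(1)
      vrl    = aux1(2)
    #endif
      print *, vnorm2, vrl
    end program no_copy_without_mpi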
@@ -275,12 +275,20 @@
 call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-aux2 = aux1
-#endif /* WITH_MPI */
 vnorm2 = aux2(1)
 vrl = aux2(2)
+#else /* WITH_MPI */
+! aux2 = aux1
+vnorm2 = aux1(1)
+vrl = aux1(2)
+#endif /* WITH_MPI */
+! vnorm2 = aux2(1)
+! vrl = aux2(2)
 ! Householder transformation
 #ifdef DOUBLE_PRECISION_COMPLEX
 call hh_transform_complex_double(vrl, vnorm2, xf, tau(istep))
@@ -878,26 +886,58 @@
 #ifdef WITH_MPI
 call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
-#else
-tmp2 = tmp1
-#endif
 if (l_rows>0) then
 call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
 call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, CONE, q, ldq)
 endif
+#else
+! tmp2 = tmp1
+if (l_rows>0) then
+call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
+call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+tmp1, nstor, CONE, q, ldq)
+endif
+#endif
+! if (l_rows>0) then
+! call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
+! call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+! tmp2, nstor, CONE, q, ldq)
+! endif
 #else /* DOUBLE_PRECISION_COMPLEX */
 #ifdef WITH_MPI
 call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
-#else
-tmp2 = tmp1
-#endif
 if (l_rows>0) then
 call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
 call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, CONE, q, ldq)
 endif
+#else
+! tmp2 = tmp1
+if (l_rows>0) then
+call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
+call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+tmp1, nstor, CONE, q, ldq)
+endif
+#endif
+!
+! if (l_rows>0) then
+! call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
+! call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+! tmp2, nstor, CONE, q, ldq)
+! endif
 #endif /* DOUBLE_PRECISION_COMPLEX */
 nstor = 0
 endif
@@ -1199,16 +1239,26 @@
 #ifdef DOUBLE_PRECISION_COMPLEX
 call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
 #else
 call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-tmp2 = tmp1
-#endif /* WITH_MPI */
 ! Put the result into C
 if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
+#else /* WITH_MPI */
+! tmp2 = tmp1
+! Put the result into C
+if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
+#endif /* WITH_MPI */
+! ! Put the result into C
+! if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
 deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
 if (istat .ne. 0) then
 print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage
@@ -1415,6 +1465,7 @@
 #endif
 #endif /* WITH_MPI */
 nc = 0
 do i=1,nblk
 tmp2(1:i,i) = tmp1(nc+1:nc+i)
@@ -1688,6 +1739,7 @@
 #endif
 #endif /* WITH_MPI */
 if (l_row1>1 .and. l_cols-l_col1+1>0) &
 #ifdef DOUBLE_PRECISION_COMPLEX
 call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8), &
...
@@ -313,6 +313,7 @@
 ! Broadcast the Householder vector (and tau) along columns
 if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
 #ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_REAL
@@ -921,14 +922,20 @@
 #ifdef WITH_MPI
 call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
 #else
-tmp2 = tmp1
+! tmp2 = tmp1
 #endif
 ! endif ! useGPU
 if (l_rows>0) then
+#ifdef WITH_MPI
 call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
 call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, 1.0_rk8, q, ldq)
+#else
+call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp1, nstor)
+call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
+tmp1, nstor, 1.0_rk8, q, ldq)
+#endif
 endif
 #else /* DOUBLE_PRECISION_REAL */
@@ -937,14 +944,20 @@
 #ifdef WITH_MPI
 call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
 #else
-tmp2 = tmp1
+! tmp2 = tmp1
 #endif
 ! endif ! useGPU
 if (l_rows>0) then
+#ifdef WITH_MPI
 call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
 call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
 tmp2, nstor, 1.0_rk4, q, ldq)
+#else
+call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp1, nstor)
+call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
+tmp1, nstor, 1.0_rk4, q, ldq)
+#endif
 endif
 #endif /* DOUBLE_PRECISION_REAL */
 nstor = 0
@@ -1256,12 +1269,15 @@
 #else
 call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
 #endif
+! Put the result into C
+if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
 #else /* WITH_MPI */
-tmp2 = tmp1
+! tmp2 = tmp1
-#endif /* WITH_MPI */
 ! Put the result into C
-if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
+if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
+#endif /* WITH_MPI */
 deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
 if (istat .ne. 0) then
@@ -1833,14 +1849,27 @@
 #endif
 #else /* WITH_MPI */
-qmat2 = qmat1 ! is this correct
+! qmat2 = qmat1 ! is this correct
 #endif /* WITH_MPI */
 do i=1,nlen
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_REAL
 call distribute_global_column_double(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
 #else
 call distribute_global_column_single(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_REAL
+call distribute_global_column_double(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+#else
+call distribute_global_column_single(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+#endif
+#endif /* WITH_MPI */
 enddo
 enddo
...
@@ -506,9 +506,6 @@
 if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-if (nlc>0) aux2=aux1
-#endif /* WITH_MPI */
 ! Transform
 nlc = 0
@@ -520,6 +517,35 @@
 endif
 enddo
+#else /* WITH_MPI */
+! if (nlc>0) aux2=aux1
+! Transform
+nlc = 0
+do j=1,lc-1
+lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+if (lcx>0) then
+nlc = nlc+1
+a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+endif
+enddo
+#endif /* WITH_MPI */
+!
+! ! Transform
+!
+! nlc = 0
+! do j=1,lc-1
+! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+! if (lcx>0) then
+! nlc = nlc+1
+! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+! endif
+! enddo
 enddo
 ! Calculate scalar products of stored Householder vectors.
@@ -726,6 +752,7 @@
 istep*nbw, n_cols, nblk)
 #endif
 endif
 #ifdef WITH_MPI
 if (l_cols>0) then
 allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
@@ -733,11 +760,13 @@
 print *,"bandred_complex: error when allocating tmp "//errorMessage
 stop
 endif
 #ifdef DOUBLE_PRECISION_COMPLEX
 call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #else
 call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
 umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
 deallocate(tmp, stat=istat, errmsg=errorMessage)
 if (istat .ne. 0) then
@@ -745,24 +774,27 @@
 stop
 endif
 endif
 #else /* WITH_MPI */
-if (l_cols>0) then
-allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
-if (istat .ne. 0) then
-print *,"bandred_complex: error when allocating tmp "//errorMessage
-stop
-endif
-tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
-umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
-deallocate(tmp, stat=istat, errmsg=errorMessage)
-if (istat .ne. 0) then
-print *,"bandred_complex: error when deallocating tmp "//errorMessage
-stop
-endif
-endif
+! if (l_cols>0) then
+! allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
+! if (istat .ne. 0) then
+! print *,"bandred_complex: error when allocating tmp "//errorMessage
+! stop
+! endif
+! tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
+!
+! umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
+! deallocate(tmp, stat=istat, errmsg=errorMessage)
+! if (istat .ne. 0) then
+! print *,"bandred_complex: error when deallocating tmp "//errorMessage
+! stop
+! endif
+! endif
 #endif /* WITH_MPI */
 ! U = U * Tmat**T
 if (useGPU) then
 if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
@@ -1138,9 +1170,6 @@
 call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
 #endif
-#else /* WITH_MPI */
-h2(1:nc) = h1(1:nc)
-#endif /* WITH_MPI */
 nc = 0
 do i=1,n
 a(1:i,i) = h2(nc+1:nc+i)
@@ -1148,6 +1177,27 @@
 nc = nc+i
 enddo
+#else /* WITH_MPI */
+! h2(1:nc) = h1(1:nc)
+nc = 0
+do i=1,n
+a(1:i,i) = h1(nc+1:nc+i)
+a(i,1:i-1) = conjg(a(1:i-1,i))
+nc = nc+i
+enddo
+#endif /* WITH_MPI */
+! nc = 0
+! do i=1,n
+! a(1:i,i) = h2(nc+1:nc+i)
+! a(i,1:i-1) = conjg(a(1:i-1,i))
+! nc = nc+i
+! enddo
 #ifdef HAVE_DETAILED_TIMINGS
 #ifdef DOUBLE_PRECISION_COMPLEX
 call timer%stop("herm_matrix_allreduce_double")
@@ -1491,16 +1541,30 @@
 #endif
 #else /* WITH_MPI */
-tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
+! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
 #endif /* WITH_MPI */
 if (l_rows>0) then
 if (useGPU) then
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_COMPLEX
 successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
 #else
 successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_COMPLEX
+successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
+#else
+successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
+#endif
+#endif /* WITH_MPI */
 if (.not.(successCUDA)) then
 print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
 stop
@@ -1528,6 +1592,9 @@
 tmp_dev, n_cols, CONE, q_dev, ldq)
 #endif
 else ! not useGPU
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_COMPLEX
 call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
 call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
@@ -1537,6 +1604,21 @@
 call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
 tmp2, n_cols, CONE, q, ldq)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_COMPLEX
+call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
+call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
+tmp1, n_cols, CONE, q, ldq)
+#else
+call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
+call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
+tmp1, n_cols, CONE, q, ldq)
+#endif
+#endif /* WITH_MPI */
 endif
 endif
@@ -3403,7 +3485,9 @@
 #endif
 #else /* WITH_MPI */
-row(1:l_nev) = row(1:l_nev)
+! row(1:l_nev) = row(1:l_nev)
 #endif /* WITH_MPI */
 #else /* WITH_OPENMP */
@@ -3420,7 +3504,9 @@
 #ifdef WITH_MPI
 call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else
-row(1:l_nev) = row(1:l_nev)
+! row(1:l_nev) = row(1:l_nev)
 #endif
 endif
 #else /* DOUBLE_PRECISION_COMPLEX */
@@ -3436,7 +3522,8 @@
 #ifdef WITH_MPI
 call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else
-row(1:l_nev) = row(1:l_nev)
+! row(1:l_nev) = row(1:l_nev)
 #endif
 endif
@@ -3603,7 +3690,9 @@
 #endif
 #else /* WITH_MPI */
-row(1:l_nev) = row(1:l_nev)
+! row(1:l_nev) = row(1:l_nev)
 #endif /* WITH_MPI */
 #else /* WITH_OPENMP */
@@ -3620,7 +3709,9 @@
 #ifdef WITH_MPI
 call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else