elpa / elpa · Commits

Commit 1aaabf51
authored Jun 10, 2016 by Andreas Marek
Remove some unnecessary copies if compiled without MPI
parent ad475148

Changes: 4
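The change follows one pattern in every file: in an MPI build a reduction writes its result into a second buffer, so the non-MPI branch used to copy buffer 1 into buffer 2 and keep reading buffer 2; the new code reads buffer 1 directly in the non-MPI branch and the copy is dropped (usually left behind as a comment). Below is a minimal sketch of that pattern, not taken from the ELPA sources; the subroutine name demo_reduce, the dummy arguments, and the use of real(8) buffers are illustrative assumptions (ELPA uses real and complex variants of this code).

! Minimal sketch of the copy-elimination pattern applied by this commit.
! Hypothetical names; in ELPA the buffers are e.g. aux1/aux2, tmp1/tmp2.
subroutine demo_reduce(aux1, vnorm2, vrl, comm)
   implicit none
#ifdef WITH_MPI
   include 'mpif.h'
#endif
   real(kind=8), intent(in)  :: aux1(2)
   real(kind=8), intent(out) :: vnorm2, vrl
   integer, intent(in)       :: comm
#ifdef WITH_MPI
   real(kind=8) :: aux2(2)
   integer      :: mpierr

   ! MPI build: the reduced values land in aux2, so read them from there.
   call mpi_allreduce(aux1, aux2, 2, MPI_REAL8, MPI_SUM, comm, mpierr)
   vnorm2 = aux2(1)
   vrl    = aux2(2)
#else /* WITH_MPI */
   ! Serial build: there is nothing to reduce, so use aux1 directly.
   ! Old version did:  aux2 = aux1 ; vnorm2 = aux2(1) ; vrl = aux2(2)
   vnorm2 = aux1(1)
   vrl    = aux1(2)
#endif /* WITH_MPI */
end subroutine demo_reduce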
src/elpa1_compute_complex_template.X90

@@ -275,12 +275,20 @@
       call mpi_allreduce(aux1, aux2, 2, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-      aux2 = aux1
-#endif /* WITH_MPI */
       vnorm2 = aux2(1)
       vrl = aux2(2)
+#else /* WITH_MPI */
+!      aux2 = aux1
+      vnorm2 = aux1(1)
+      vrl = aux1(2)
+#endif /* WITH_MPI */
+!      vnorm2 = aux2(1)
+!      vrl = aux2(2)
       ! Householder transformation
 #ifdef DOUBLE_PRECISION_COMPLEX
       call hh_transform_complex_double(vrl, vnorm2, xf, tau(istep))
@@ -878,26 +886,58 @@
 #ifdef WITH_MPI
       call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
-#else
-      tmp2 = tmp1
-#endif
       if (l_rows>0) then
         call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
         call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
                    tmp2, nstor, CONE, q, ldq)
       endif
+#else
+!      tmp2 = tmp1
+      if (l_rows>0) then
+        call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
+        call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+                   tmp1, nstor, CONE, q, ldq)
+      endif
+#endif
+!      if (l_rows>0) then
+!        call ztrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
+!        call zgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+!                   tmp2, nstor, CONE, q, ldq)
+!      endif
 #else /* DOUBLE_PRECISION_COMPLEX */
 #ifdef WITH_MPI
       call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
-#else
-      tmp2 = tmp1
-#endif
       if (l_rows>0) then
         call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
         call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
                    tmp2, nstor, CONE, q, ldq)
       endif
+#else
+!      tmp2 = tmp1
+      if (l_rows>0) then
+        call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp1, nstor)
+        call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+                   tmp1, nstor, CONE, q, ldq)
+      endif
+#endif
+!
+!      if (l_rows>0) then
+!        call ctrmm('L', 'L', 'N', 'N', nstor, l_cols, CONE, tmat, max_stored_rows, tmp2, nstor)
+!        call cgemm('N', 'N', l_rows, l_cols, nstor, -CONE, hvm, ubound(hvm,dim=1), &
+!                   tmp2, nstor, CONE, q, ldq)
+!      endif
 #endif /* DOUBLE_PRECISION_COMPLEX */
       nstor = 0
     endif
@@ -1199,16 +1239,26 @@
 #ifdef DOUBLE_PRECISION_COMPLEX
       call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_DOUBLE_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
 #else
       call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_COMPLEX, MPI_SUM, np, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-      tmp2 = tmp1
-#endif /* WITH_MPI */
       ! Put the result into C
       if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
+#else /* WITH_MPI */
+!      tmp2 = tmp1
+      ! Put the result into C
+      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
+#endif /* WITH_MPI */
+!      ! Put the result into C
+!      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
       deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
         print *,"mult_ah_b_complex: error when deallocating tmp1 "//errorMessage
@@ -1415,6 +1465,7 @@
 #endif
 #endif /* WITH_MPI */
       nc = 0
       do i=1,nblk
         tmp2(1:i,i) = tmp1(nc+1:nc+i)

@@ -1688,6 +1739,7 @@
 #endif
 #endif /* WITH_MPI */
       if (l_row1>1 .and. l_cols-l_col1+1>0) &
 #ifdef DOUBLE_PRECISION_COMPLEX
         call ZGEMM('N', 'N', l_row1-1, l_cols-l_col1+1, nb, (-1.0_rk8,0.0_rk8), &
src/elpa1_compute_real_template.X90

@@ -313,6 +313,7 @@
       ! Broadcast the Householder vector (and tau) along columns
       if(my_pcol==pcol(istep, nblk, np_cols)) vr(l_rows+1) = tau(istep)
 #ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_REAL
@@ -921,14 +922,20 @@
 #ifdef WITH_MPI
       call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL8, MPI_SUM, mpi_comm_rows, mpierr)
 #else
-      tmp2 = tmp1
+!      tmp2 = tmp1
 #endif
 ! endif ! useGPU
       if (l_rows>0) then
+#ifdef WITH_MPI
         call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp2, nstor)
         call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
                    tmp2, nstor, 1.0_rk8, q, ldq)
+#else
+        call dtrmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk8, tmat, max_stored_rows, tmp1, nstor)
+        call dgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk8, hvm, ubound(hvm,dim=1), &
+                   tmp1, nstor, 1.0_rk8, q, ldq)
+#endif
       endif
 #else /* DOUBLE_PRECISION_REAL */

@@ -937,14 +944,20 @@
 #ifdef WITH_MPI
       call mpi_allreduce(tmp1, tmp2, nstor*l_cols, MPI_REAL4, MPI_SUM, mpi_comm_rows, mpierr)
 #else
-      tmp2 = tmp1
+!      tmp2 = tmp1
 #endif
 ! endif ! useGPU
       if (l_rows>0) then
+#ifdef WITH_MPI
         call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp2, nstor)
         call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
                    tmp2, nstor, 1.0_rk4, q, ldq)
+#else
+        call strmm('L', 'L', 'N', 'N', nstor, l_cols, 1.0_rk4, tmat, max_stored_rows, tmp1, nstor)
+        call sgemm('N', 'N', l_rows, l_cols, nstor, -1.0_rk4, hvm, ubound(hvm,dim=1), &
+                   tmp1, nstor, 1.0_rk4, q, ldq)
+#endif
       endif
 #endif /* DOUBLE_PRECISION_REAL */
       nstor = 0
@@ -1256,12 +1269,15 @@
 #else
       call mpi_reduce(tmp1, tmp2, nstor*(lce-lcs+1), MPI_REAL4, MPI_SUM, np, mpi_comm_rows, mpierr)
 #endif
+      ! Put the result into C
+      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
 #else /* WITH_MPI */
-      tmp2 = tmp1
-#endif /* WITH_MPI */
+!      tmp2 = tmp1
       ! Put the result into C
-      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
+      if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
+#endif /* WITH_MPI */
       deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
       if (istat .ne. 0) then
@@ -1833,14 +1849,27 @@
 #endif
 #else /* WITH_MPI */
-      qmat2 = qmat1 ! is this correct
+!      qmat2 = qmat1 ! is this correct
 #endif /* WITH_MPI */
       do i=1,nlen
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_REAL
         call distribute_global_column_double(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
 #else
         call distribute_global_column_single(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_REAL
+        call distribute_global_column_double(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+#else
+        call distribute_global_column_single(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
+#endif
+#endif /* WITH_MPI */
       enddo
     enddo
src/elpa2_compute_complex_template.X90

@@ -506,9 +506,6 @@
       if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
-#else /* WITH_MPI */
-      if (nlc>0) aux2=aux1
-#endif /* WITH_MPI */
       ! Transform
       nlc = 0

@@ -520,6 +517,35 @@
         endif
       enddo
+#else /* WITH_MPI */
+!      if (nlc>0) aux2=aux1
+      ! Transform
+      nlc = 0
+      do j=1,lc-1
+        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+        if (lcx>0) then
+          nlc = nlc+1
+          a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
+        endif
+      enddo
+#endif /* WITH_MPI */
+!
+!      ! Transform
+!
+!      nlc = 0
+!      do j=1,lc-1
+!        lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
+!        if (lcx>0) then
+!          nlc = nlc+1
+!          a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
+!        endif
+!      enddo
     enddo
     ! Calculate scalar products of stored Householder vectors.
@@ -726,6 +752,7 @@
                                istep*nbw, n_cols, nblk)
 #endif
       endif
 #ifdef WITH_MPI
       if (l_cols>0) then
         allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)

@@ -733,11 +760,13 @@
           print *,"bandred_complex: error when allocating tmp "//errorMessage
           stop
         endif
 #ifdef DOUBLE_PRECISION_COMPLEX
         call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_DOUBLE_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #else
         call mpi_allreduce(umc, tmp, l_cols*n_cols, MPI_COMPLEX, MPI_SUM, mpi_comm_rows, mpierr)
 #endif
         umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
         deallocate(tmp, stat=istat, errmsg=errorMessage)
         if (istat .ne. 0) then
@@ -745,24 +774,27 @@
           stop
         endif
       endif
 #else /* WITH_MPI */
-      if (l_cols>0) then
-        allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
-        if (istat .ne. 0) then
-          print *,"bandred_complex: error when allocating tmp "//errorMessage
-          stop
-        endif
-        tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
-        umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
-        deallocate(tmp, stat=istat, errmsg=errorMessage)
-        if (istat .ne. 0) then
-          print *,"bandred_complex: error when deallocating tmp "//errorMessage
-          stop
-        endif
-      endif
+!      if (l_cols>0) then
+!        allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
+!        if (istat .ne. 0) then
+!          print *,"bandred_complex: error when allocating tmp "//errorMessage
+!          stop
+!        endif
+!        tmp(1:l_cols,1:n_cols) = umc(1:l_cols,1:n_cols)
+!
+!        umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
+!        deallocate(tmp, stat=istat, errmsg=errorMessage)
+!        if (istat .ne. 0) then
+!          print *,"bandred_complex: error when deallocating tmp "//errorMessage
+!          stop
+!        endif
+!      endif
 #endif /* WITH_MPI */
       ! U = U * Tmat**T
       if (useGPU) then
         if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
@@ -1138,9 +1170,6 @@
       call mpi_allreduce(h1, h2, nc, MPI_COMPLEX, MPI_SUM, comm, mpierr)
 #endif
-#else /* WITH_MPI */
-      h2(1:nc) = h1(1:nc)
-#endif /* WITH_MPI */
       nc = 0
       do i=1,n
         a(1:i,i) = h2(nc+1:nc+i)

@@ -1148,6 +1177,27 @@
         nc = nc+i
       enddo
+#else /* WITH_MPI */
+!      h2(1:nc) = h1(1:nc)
+      nc = 0
+      do i=1,n
+        a(1:i,i) = h1(nc+1:nc+i)
+        a(i,1:i-1) = conjg(a(1:i-1,i))
+        nc = nc+i
+      enddo
+#endif /* WITH_MPI */
+!      nc = 0
+!      do i=1,n
+!        a(1:i,i) = h2(nc+1:nc+i)
+!        a(i,1:i-1) = conjg(a(1:i-1,i))
+!        nc = nc+i
+!      enddo
 #ifdef HAVE_DETAILED_TIMINGS
 #ifdef DOUBLE_PRECISION_COMPLEX
       call timer%stop("herm_matrix_allreduce_double")
@@ -1491,16 +1541,30 @@
 #endif
 #else /* WITH_MPI */
-      tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
+!      tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
 #endif /* WITH_MPI */
       if (l_rows>0) then
         if (useGPU) then
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_COMPLEX
           successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
 #else
           successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_COMPLEX
+          successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_double_complex_datatype,cudaMemcpyHostToDevice)
+#else
+          successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_single_complex_datatype,cudaMemcpyHostToDevice)
+#endif
+#endif /* WITH_MPI */
           if (.not.(successCUDA)) then
             print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
             stop
@@ -1528,6 +1592,9 @@
                               tmp_dev, n_cols, CONE, q_dev, ldq)
 #endif
       else ! not useGPU
+#ifdef WITH_MPI
 #ifdef DOUBLE_PRECISION_COMPLEX
         call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp2, n_cols)
         call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &

@@ -1537,6 +1604,21 @@
         call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
                    tmp2, n_cols, CONE, q, ldq)
 #endif
+#else /* WITH_MPI */
+#ifdef DOUBLE_PRECISION_COMPLEX
+        call ztrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
+        call zgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
+                   tmp1, n_cols, CONE, q, ldq)
+#else
+        call ctrmm('L', 'U', 'C', 'N', n_cols, l_cols, CONE, tmat(1,1,istep), ubound(tmat,dim=1), tmp1, n_cols)
+        call cgemm('N', 'N', l_rows, l_cols, n_cols, -CONE, hvm, ubound(hvm,dim=1), &
+                   tmp1, n_cols, CONE, q, ldq)
+#endif
+#endif /* WITH_MPI */
       endif
     endif
@@ -3403,7 +3485,9 @@
 #endif
 #else /* WITH_MPI */
-          row(1:l_nev) = row(1:l_nev)
+!          row(1:l_nev) = row(1:l_nev)
 #endif /* WITH_MPI */
 #else /* WITH_OPENMP */

@@ -3420,7 +3504,9 @@
 #ifdef WITH_MPI
           call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else
-          row(1:l_nev) = row(1:l_nev)
+!          row(1:l_nev) = row(1:l_nev)
 #endif
         endif
 #else /* DOUBLE_PRECISION_COMPLEX */
@@ -3436,7 +3522,8 @@
 #ifdef WITH_MPI
           call MPI_Recv(row, l_nev, MPI_COMPLEX8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else
-          row(1:l_nev) = row(1:l_nev)
+!          row(1:l_nev) = row(1:l_nev)
 #endif
         endif
@@ -3603,7 +3690,9 @@
 #endif
 #else /* WITH_MPI */
-          row(1:l_nev) = row(1:l_nev)
+!          row(1:l_nev) = row(1:l_nev)
 #endif /* WITH_MPI */
 #else /* WITH_OPENMP */

@@ -3620,7 +3709,9 @@
 #ifdef WITH_MPI
           call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
 #else