Commit b8a36fee authored by Pavel Kus's avatar Pavel Kus
Browse files

PRECISSION -> PRECISION plus few minor changes

Conflicts:
	src/elpa1_merge_systems_real_template.X90
	src/elpa1_solve_tridi_real_template.X90
	src/elpa1_trans_ev_real_template.X90
	src/elpa1_tridiag_real_template.X90
	src/elpa2_compute_real_template.X90
parent 7d5e595c
This diff is collapsed.
......@@ -52,7 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
subroutine M_solve_tridi_PRECISION( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success )
#ifdef HAVE_DETAILED_TIMINGS
......@@ -81,7 +81,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi" // M_PRECISION_SUFFIX)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -116,7 +116,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Scalapack supports it but delivers no results for these columns,
! which is rather annoying
if (nc==0) then
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
if (wantDebug) write(error_unit,*) 'ELPA1_solve_tridi: ERROR: Problem contains processor column with zero width'
success = .false.
return
......@@ -141,10 +141,10 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
else
nev1 = MIN(nev,l_cols)
endif
call M_solve_tridi_col_PRECISSION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
call M_solve_tridi_col_PRECISION(l_cols, nev1, nc, d(nc+1), e(nc+1), q, ldq, nblk, &
matrixCols, mpi_comm_rows, wantDebug, success)
if (.not.(success)) then
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
return
endif
! If there is only 1 processor column, we are done
......@@ -156,7 +156,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
return
endif
......@@ -215,9 +215,9 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
enddo
! Recursively merge sub problems
call M_merge_recursive_PRECISSION(0, np_cols, wantDebug, success)
call M_merge_recursive_PRECISION(0, np_cols, wantDebug, success)
if (.not.(success)) then
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
return
endif
......@@ -227,11 +227,11 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
call timer%stop("solve_tridi" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi" // M_PRECISION_SUFFIX)
return
contains
recursive subroutine M_merge_recursive_PRECISSION(np_off, nprocs, wantDebug, success)
recursive subroutine M_merge_recursive_PRECISION(np_off, nprocs, wantDebug, success)
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
......@@ -262,9 +262,9 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
np1 = nprocs/2
np2 = nprocs-np1
if (np1 > 1) call M_merge_recursive_PRECISSION(np_off, np1, wantDebug, success)
if (np1 > 1) call M_merge_recursive_PRECISION(np_off, np1, wantDebug, success)
if (.not.(success)) return
if (np2 > 1) call M_merge_recursive_PRECISSION(np_off+np1, np2, wantDebug, success)
if (np2 > 1) call M_merge_recursive_PRECISION(np_off+np1, np2, wantDebug, success)
if (.not.(success)) return
noff = limits(np_off)
......@@ -277,7 +277,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#endif
if (my_pcol==np_off) then
do n=np_off+np1,np_off+nprocs-1
call mpi_send(d(noff+1), nmid, M_MPI_REAL_PRECISSION, n, 1, mpi_comm_cols, mpierr)
call mpi_send(d(noff+1), nmid, M_MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
enddo
endif
#ifdef HAVE_DETAILED_TIMINGS
......@@ -290,7 +290,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(d(noff+1), nmid, M_MPI_REAL_PRECISSION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(d(noff+1), nmid, M_MPI_REAL_PRECISION, np_off, 1, mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -305,7 +305,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_send(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISSION, n, 1, mpi_comm_cols, mpierr)
call mpi_send(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISION, n, 1, mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -318,7 +318,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_recv(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISSION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
call mpi_recv(d(noff+nmid+1), nlen-nmid, M_MPI_REAL_PRECISION, np_off+np1, 1,mpi_comm_cols, MPI_STATUS_IGNORE, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -330,22 +330,22 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Last merge, result distribution must be block cyclic, noff==0,
! p_col_bc is set so that only nev eigenvalues are calculated
call M_merge_systems_PRECISSION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col, p_col, &
l_col_bc, p_col_bc, np_off, nprocs, wantDebug, success )
if (.not.(success)) return
else
! Not last merge, leave dense column distribution
call M_merge_systems_PRECISSION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, noff, &
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, l_col(noff+1), p_col(noff+1), &
l_col(noff+1), p_col(noff+1), np_off, nprocs, wantDebug, success )
if (.not.(success)) return
endif
end subroutine M_merge_recursive_PRECISSION
end subroutine M_merge_recursive_PRECISION
end subroutine M_solve_tridi_PRECISSION
end subroutine M_solve_tridi_PRECISION
subroutine M_solve_tridi_col_PRECISSION( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
subroutine M_solve_tridi_col_PRECISION( na, nev, nqoff, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, wantDebug, success )
! Solves the symmetric, tridiagonal eigenvalue problem on one processor column
! with the divide and conquer method.
......@@ -379,7 +379,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi_col" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi_col" // M_PRECISION_SUFFIX)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
......@@ -445,7 +445,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
noff = limits(n) ! Start of subproblem
nlen = limits(n+1)-noff ! Size of subproblem
call M_solve_tridi_single_problem_PRECISSION(nlen,d(noff+1),e(noff+1), &
call M_solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1), &
q(nqoff+noff+1,noff+1),ubound(q,dim=1), wantDebug, success)
if (.not.(success)) return
......@@ -474,7 +474,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
noff = limits(my_prow) ! Start of subproblem
nlen = limits(my_prow+1)-noff ! Size of subproblem
call M_solve_tridi_single_problem_PRECISSION(nlen,d(noff+1),e(noff+1),qmat1, &
call M_solve_tridi_single_problem_PRECISION(nlen,d(noff+1),e(noff+1),qmat1, &
ubound(qmat1,dim=1), wantDebug, success)
if (.not.(success)) return
......@@ -490,9 +490,9 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call MPI_Bcast(d(noff+1), nlen, M_MPI_REAL_PRECISSION, np, mpi_comm_rows, mpierr)
call MPI_Bcast(d(noff+1), nlen, M_MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
qmat2 = qmat1
call MPI_Bcast(qmat2, max_size*max_size, M_MPI_REAL_PRECISSION, np, mpi_comm_rows, mpierr)
call MPI_Bcast(qmat2, max_size*max_size, M_MPI_REAL_PRECISION, np, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -502,9 +502,9 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
do i=1,nlen
#ifdef WITH_MPI
call M_distribute_global_column_PRECISSION(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
call M_distribute_global_column_PRECISION(qmat2(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#else /* WITH_MPI */
call M_distribute_global_column_PRECISSION(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
call M_distribute_global_column_PRECISION(qmat1(1,i), q(1,noff+i), nqoff+noff, nlen, my_prow, np_rows, nblk)
#endif /* WITH_MPI */
enddo
......@@ -547,7 +547,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
! Last merge, set p_col_o=-1 for unneeded (output) eigenvectors
p_col_o(nev+1:na) = -1
endif
call M_merge_systems_PRECISSION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
call M_merge_systems_PRECISION(nlen, nmid, d(noff+1), e(noff+nmid), q, ldq, nqoff+noff, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_self, l_col(noff+1), p_col_i(noff+1), &
l_col(noff+1), p_col_o(noff+1), 0, 1, wantDebug, success)
if (.not.(success)) return
......@@ -564,11 +564,11 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
call timer%stop("solve_tridi_col" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi_col" // M_PRECISION_SUFFIX)
end subroutine M_solve_tridi_col_PRECISSION
end subroutine M_solve_tridi_col_PRECISION
recursive subroutine M_solve_tridi_single_problem_PRECISSION(nlen, d, e, q, ldq, wantDebug, success)
recursive subroutine M_solve_tridi_single_problem_PRECISION(nlen, d, e, q, ldq, wantDebug, success)
! Solves the symmetric, tridiagonal eigenvalue problem on a single processor.
! Takes precautions if DSTEDC fails or if the eigenvalues are not ordered correctly.
......@@ -594,7 +594,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
integer(kind=ik) :: istat
character(200) :: errorMessage
call timer%start("solve_tridi_single" // M_PRECISSION_SUFFIX)
call timer%start("solve_tridi_single" // M_PRECISION_SUFFIX)
success = .true.
allocate(ds(nlen), es(nlen), stat=istat, errmsg=errorMessage)
......@@ -618,7 +618,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
stop
endif
call M_PRECISSION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
call M_PRECISION_STEDC('I', nlen, d, e, q, ldq, work, lwork, iwork, liwork, info)
if (info /= 0) then
......@@ -628,7 +628,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
d(:) = ds(:)
e(:) = es(:)
call M_PRECISSION_STEQR('I', nlen, d, e, q, ldq, work, info)
call M_PRECISION_STEQR('I', nlen, d, e, q, ldq, work, info)
! If DSTEQR fails also, we don't know what to do further ...
if (info /= 0) then
......@@ -688,7 +688,7 @@ subroutine M_solve_tridi_PRECISSION( na, nev, d, e, q, ldq, nblk, matrixCols, mp
endif
enddo
call timer%stop("solve_tridi_single" // M_PRECISSION_SUFFIX)
call timer%stop("solve_tridi_single" // M_PRECISION_SUFFIX)
end subroutine M_solve_tridi_single_problem_PRECISSION
end subroutine M_solve_tridi_single_problem_PRECISION
......@@ -52,16 +52,16 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_v_add_s_PRECISSION(v,n,s)
subroutine M_v_add_s_PRECISION(v,n,s)
use precision
implicit none
integer(kind=ik) :: n
real(kind=REAL_DATATYPE) :: v(n),s
v(:) = v(:) + s
end subroutine M_v_add_s_PRECISSION
end subroutine M_v_add_s_PRECISION
subroutine M_distribute_global_column_PRECISSION(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
subroutine M_distribute_global_column_PRECISION(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
use precision
implicit none
......@@ -86,9 +86,9 @@
l_col(l_off+js:l_off+je) = g_col(g_off+js-noff:g_off+je-noff)
enddo
end subroutine M_distribute_global_column_PRECISSION
end subroutine M_distribute_global_column_PRECISION
subroutine M_solve_secular_equation_PRECISSION(n, i, d, z, delta, rho, dlam)
subroutine M_solve_secular_equation_PRECISION(n, i, d, z, delta, rho, dlam)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
......@@ -156,7 +156,7 @@
! Upper and lower bound of the shifted solution interval are a and b
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_secular_equation" // M_PRECISSION_SUFFIX)
call timer%start("solve_secular_equation" // M_PRECISION_SUFFIX)
#endif
if (i==n) then
......@@ -222,13 +222,13 @@
dlam = x + dshift
delta(:) = delta(:) - x
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_secular_equation" // M_PRECISSION_SUFFIX)
call timer%stop("solve_secular_equation" // M_PRECISION_SUFFIX)
#endif
end subroutine M_solve_secular_equation_PRECISSION
end subroutine M_solve_secular_equation_PRECISION
!-------------------------------------------------------------------------------
subroutine M_hh_transform_real_PRECISSION(alpha, xnorm_sq, xf, tau)
subroutine M_hh_transform_real_PRECISION(alpha, xnorm_sq, xf, tau)
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
! and returns the factor xf by which x has to be scaled.
! It also hasn't the special handling for numbers < 1.d-300 or > 1.d150
......@@ -266,4 +266,4 @@
XF = 1./ALPHA
ALPHA = BETA
endif
end subroutine M_hh_transform_real_PRECISSION
end subroutine M_hh_transform_real_PRECISION
......@@ -85,7 +85,7 @@
!>
!> \param useGPU If true, GPU version of the subroutine will be used
!>
subroutine M_trans_ev_real_PRECISSION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
subroutine M_trans_ev_real_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
......@@ -125,7 +125,7 @@
!write(*,*) "na", na, "nqc", nqc, "lda", lda, "ldq", ldq, "matrixCols", matrixCols
call timer%start("trans_ev_real" // M_PRECISSION_SUFFIX)
call timer%start("trans_ev_real" // M_PRECISION_SUFFIX)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
......@@ -172,24 +172,20 @@
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_real", "hvm1", istat, errorMessage)
! allocate(tmat_dev(max_stored_rows,max_stored_rows))
! allocate(hvm_dev(max_local_rows*max_stored_rows))
! allocate(tmp_dev(max_local_cols*max_stored_rows))
! allocate(q_dev(ldq,nqc))
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * M_size_of_PRECISSION_real)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * M_size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * M_size_of_PRECISSION_real)
successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * M_size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * M_size_of_PRECISSION_real)
successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * M_size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * M_size_of_PRECISSION_real)
successCUDA = cuda_malloc(q_dev, ldq * matrixCols * M_size_of_PRECISION_real)
check_alloc_cuda("trans_ev", successCUDA)
! q_dev = q
successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy(q_dev, loc(q(1,1)), ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -232,7 +228,7 @@
call timer%start("mpi_communication")
#endif
if (nb>0) &
call MPI_Bcast(hvb, nb, M_MPI_REAL_PRECISSION, cur_pcol, mpi_comm_cols, mpierr)
call MPI_Bcast(hvb, nb, M_MPI_REAL_PRECISION, cur_pcol, mpi_comm_cols, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -256,7 +252,7 @@
tmat = 0
if (l_rows>0) &
call M_PRECISSION_SYRK('U', 'T', nstor, l_rows, &
call M_PRECISION_SYRK('U', 'T', nstor, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
M_CONST_0_0, tmat, max_stored_rows)
......@@ -269,7 +265,7 @@
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
if (nc>0) call mpi_allreduce( h1, h2, nc, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
if (nc>0) call mpi_allreduce( h1, h2, nc, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
......@@ -281,7 +277,7 @@
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n=1,nstor-1
call M_PRECISSION_TRMV('L', 'T', 'N', n, &
call M_PRECISION_TRMV('L', 'T', 'N', n, &
tmat, max_stored_rows, &
h2(nc+1), 1)
tmat(n+1,1:n) = -h2(nc+1:nc+n)*tau(ice-nstor+n+1)
......@@ -295,12 +291,12 @@
!hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)), &
hvm_ubnd * nstor * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
hvm_ubnd * nstor * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
!tmat_dev = tmat
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)), &
max_stored_rows * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
max_stored_rows * max_stored_rows * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -308,13 +304,13 @@
if (l_rows>0) then
if(useGPU) then
call M_cublas_PRECISSION_gemm('T', 'N', nstor, l_cols, l_rows, &
call M_cublas_PRECISION_gemm('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm_dev, hvm_ubnd, &
q_dev, ldq, &
M_CONST_0_0, tmp_dev, nstor)
else
call M_PRECISSION_GEMM('T', 'N', nstor, l_cols, l_rows, &
call M_PRECISION_GEMM('T', 'N', nstor, l_cols, l_rows, &
M_CONST_1_0, hvm, ubound(hvm,dim=1), &
q, ldq, &
M_CONST_0_0, tmp1, nstor)
......@@ -322,13 +318,12 @@
else !l_rows>0
if (useGPU) then
!tmp_dev(1:l_cols*nstor) = 0
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * M_size_of_PRECISSION_real)
successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * M_size_of_PRECISION_real)
check_memcpy_cuda("trans_ev", successCUDA)
else
tmp1(1:l_cols*nstor) = 0
endif
endif
endif !l_rows>0
#ifdef WITH_MPI
#ifdef HAVE_DETAILED_TIMINGS
......@@ -338,16 +333,16 @@
! todo: does it need to be copied whole? Wouldn't be a part sufficient?
if (useGPU) then
successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev, &
max_local_cols * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
max_local_cols * max_stored_rows * M_size_of_PRECISION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
endif
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, M_MPI_REAL_PRECISSION, MPI_SUM, mpi_comm_rows, mpierr)
call mpi_allreduce(tmp1, tmp2, nstor*l_cols, M_MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
! copy back tmp2 - after reduction...
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)), &
max_local_cols * max_stored_rows * M_size_of_PRECISSION_real, cudaMemcpyHostToDevice)
max_local_cols * max_stored_rows * M_size_of_PRECISION_real, cudaMemcpyHostToDevice)
check_memcpy_cuda("trans_ev", successCUDA)
endif
......@@ -360,30 +355,30 @@
if (l_rows>0) then
if(useGPU) then
call M_cublas_PRECISSION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
call M_cublas_PRECISION_trmm('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call M_cublas_PRECISSION_gemm('N', 'N' ,l_rows ,l_cols ,nstor, &
call M_cublas_PRECISION_gemm('N', 'N' ,l_rows ,l_cols ,nstor, &
-M_CONST_1_0, hvm_dev, hvm_ubnd, &
tmp_dev, nstor, &
M_CONST_1_0, q_dev, ldq)
else
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
call M_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp2, nstor)
!q = q - hvm*tmp2
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
call M_PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp2, nstor, &
M_CONST_1_0, q, ldq)
#else
call M_PRECISSION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
call M_PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
M_CONST_1_0, tmat, max_stored_rows, &
tmp1, nstor)
call M_PRECISSION_GEMM('N', 'N', l_rows, l_cols, nstor, &
call M_PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-M_CONST_1_0, hvm, ubound(hvm,dim=1), &
tmp1, nstor, &
M_CONST_1_0, q, ldq)
......@@ -403,7 +398,7 @@
if (useGPU) then
!q = q_dev
successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * M_size_of_PRECISSION_real, cudaMemcpyDeviceToHost)
successCUDA = cuda_memcpy(loc(q(1,1)), q_dev, ldq * matrixCols * M_size_of_PRECISION_real, cudaMemcpyDeviceToHost)
check_memcpy_cuda("trans_ev", successCUDA)
deallocate(hvm1, stat=istat, errmsg=errorMessage)
......@@ -427,6 +422,6 @@
endif
call timer%stop("trans_ev_real" // M_PRECISSION_SUFFIX)
call timer%stop("trans_ev_real" // M_PRECISION_SUFFIX)
end subroutine M_trans_ev_real_PRECISSION
end subroutine M_trans_ev_real_PRECISION
This diff is collapsed.
This diff is collapsed.
#ifdef DOUBLE_PRECISION_REAL
#define M_elpa_transpose_vectors_real_PRECISSION elpa_transpose_vectors_real_double
#define M_elpa_reduce_add_vectors_real_PRECISSION elpa_reduce_add_vectors_real_double
#define M_bandred_real_PRECISSION bandred_real_double
#define M_trans_ev_band_to_full_real_PRECISSION trans_ev_band_to_full_real_double
#define M_tridiag_band_real_PRECISSION tridiag_band_real_double
#define M_trans_ev_tridi_to_band_real_PRECISSION trans_ev_tridi_to_band_real_double
#define M_band_band_real_PRECISSION band_band_real_double
#define M_tridiag_real_PRECISSION tridiag_real_double
#define M_trans_ev_real_PRECISSION trans_ev_real_double
#define M_solve_tridi_PRECISSION solve_tridi_double
#define M_solve_tridi_col_PRECISSION solve_tridi_col_double
#define M_solve_tridi_single_problem_PRECISSION solve_tridi_single_problem_double
#define M_qr_pdgeqrf_2dcomm_PRECISSION qr_pdgeqrf_2dcomm_double
#define M_hh_transform_real_PRECISSION hh_transform_real_double
#define M_symm_matrix_allreduce_PRECISSION symm_matrix_allreduce_double
#define M_redist_band_real_PRECISSION redist_band_real_double
#define M_unpack_row_real_cpu_PRECISSION unpack_row_real_cpu_double
#define M_unpack_row_real_cpu_openmp_PRECISSION unpack_row_real_cpu_openmp_double
#define M_unpack_and_prepare_row_group_real_gpu_PRECISSION unpack_and_prepare_row_group_real_gpu_double
#define M_extract_hh_tau_real_gpu_PRECISSION extract_hh_tau_real_gpu_double
#define M_compute_hh_dot_products_real_gpu_PRECISSION compute_hh_dot_products_real_gpu_double
#define M_compute_hh_trafo_real_cpu_openmp_PRECISSION compute_hh_trafo_real_cpu_openmp_double
#define M_compute_hh_trafo_real_cpu_PRECISSION compute_hh_trafo_real_cpu_double
#define M_pack_row_group_real_gpu_PRECISSION pack_row_group_real_gpu_double
#define M_pack_row_real_cpu_openmp_PRECISSION pack_row_real_cpu_openmp_double
#define M_pack_row_real_cpu_PRECISSION pack_row_real_cpu_double
#define M_wy_gen_PRECISSION wy_gen_double
#define M_wy_right_PRECISSION wy_right_double
#define M_wy_left_PRECISSION wy_left_double
#define M_wy_symm_PRECISSION wy_symm_double
#define M_merge_recursive_PRECISSION merge_recursive_double
#define M_merge_systems_PRECISSION merge_systems_double
#define M_distribute_global_column_PRECISSION distribute_global_column_double
#define M_check_monotony_PRECISSION check_monotony_double
#define M_global_gather_PRECISSION global_gather_double
#define M_resort_ev_PRECISSION resort_ev_double
#define M_transform_columns_PRECISSION transform_columns_double
#define M_solve_secular_equation_PRECISSION solve_secular_equation_double
#define M_global_product_PRECISSION global_product_double
#define M_add_tmp_PRECISSION add_tmp_double
#define M_v_add_s_PRECISSION v_add_s_double
#define M_PRECISSION_SYRK DSYRK
#define M_PRECISSION_TRMV DTRMV
#define M_PRECISSION_GEMM DGEMM
#define M_PRECISSION_GEMV DGEMV
#define M_PRECISSION_TRMM DTRMM
#define M_PRECISSION_SYMV DSYMV
#define M_PRECISSION_SYMM DSYMM
#define M_PRECISSION_SYR2 DSYR2
#define M_PRECISSION_SYR2K DSYR2K
#define M_PRECISSION_GEQRF dgeqrf
#define M_PRECISSION_STEDC dstedc
#define M_PRECISSION_STEQR dsteqr
#define M_PRECISSION_LAMRG DLAMRG
#define M_PRECISSION_LAMCH DLAMCH
#define M_PRECISSION_LAPY2 DLAPY2
#define M_PRECISSION_LAED4 DLAED4
#define M_PRECISSION_LAED5 DLAED5
#define M_cublas_PRECISSION_gemm cublas_dgemm
#define M_cublas_PRECISSION_trmm cublas_dtrmm
#define M_cublas_PRECISSION_gemv cublas_dgemv
#define M_PRECISSION_SUFFIX "_double"
#define M_elpa_transpose_vectors_real_PRECISION elpa_transpose_vectors_real_double
#define M_elpa_reduce_add_vectors_real_PRECISION elpa_reduce_add_vectors_real_double
#define M_bandred_real_PRECISION bandred_real_double
#define M_trans_ev_band_to_full_real_PRECISION trans_ev_band_to_full_real_double
#define M_tridiag_band_real_PRECISION tridiag_band_real_double
#define M_trans_ev_tridi_to_band_real_PRECISION trans_ev_tridi_to_band_real_double
#define M_band_band_real_PRECISION band_band_real_double
#define M_tridiag_real_PRECISION tridiag_real_double
#define M_trans_ev_real_PRECISION trans_ev_real_double
#define M_solve_tridi_PRECISION solve_tridi_double
#define M_solve_tridi_col_PRECISION solve_tridi_col_double
#define M_solve_tridi_single_problem_PRECISION solve_tridi_single_problem_double
#define M_qr_pdgeqrf_2dcomm_PRECISION qr_pdgeqrf_2dcomm_double
#define M_hh_transform_real_PRECISION hh_transform_real_double
#define M_symm_matrix_allreduce_PRECISION symm_matrix_allreduce_double
#define M_redist_band_real_PRECISION redist_band_real_double
#define M_unpack_row_real_cpu_PRECISION unpack_row_real_cpu_double
#define M_unpack_row_real_cpu_openmp_PRECISION unpack_row_real_cpu_openmp_double
#define M_unpack_and_prepare_row_group_real_gpu_PRECISION unpack_and_prepare_row_group_real_gpu_double
#define M_extract_hh_tau_real_gpu_PRECISION extract_hh_tau_real_gpu_double
#define M_compute_hh_dot_products_real_gpu_PRECISION compute_hh_dot_products_real_gpu_double
#define M_compute_hh_trafo_real_cpu_openmp_PRECISION compute_hh_trafo_real_cpu_openmp_double
#define M_compute_hh_trafo_real_cpu_PRECISION compute_hh_trafo_real_cpu_double
#define M_pack_row_group_real_gpu_PRECISION pack_row_group_real_gpu_double
#define M_pack_row_real_cpu_openmp_PRECISION pack_row_real_cpu_openmp_double
#define M_pack_row_real_cpu_PRECISION pack_row_real_cpu_double
#define M_wy_gen_PRECISION wy_gen_double
#define M_wy_right_PRECISION wy_right_double
#define M_wy_left_PRECISION wy_left_double
#define M_wy_symm_PRECISION wy_symm_double
#define M_merge_recursive_PRECISION merge_recursive_double
#define M_merge_systems_PRECISION merge_systems_double
#define M_distribute_global_column_PRECISION distribute_global_column_double
#define M_check_monotony_PRECISION check_monotony_double
#define M_global_gather_PRECISION global_gather_double
#define M_resort_ev_PRECISION resort_ev_double
#define M_transform_columns_PRECISION transform_columns_double
#define M_solve_secular_equation_PRECISION solve_secular_equation_double
#define M_global_product_PRECISION global_product_double
#define M_add_tmp_PRECISION add_tmp_double
#define M_v_add_s_PRECISION v_add_s_double
#define M_PRECISION_SYRK DSYRK
#define M_PRECISION_TRMV DTRMV
#define M_PRECISION_GEMM DGEMM
#define M_PRECISION_GEMV DGEMV
#define M_PRECISION_TRMM DTRMM
#define M_PRECISION_SYMV DSYMV
#define M_PRECISION_SYMM DSYMM
#define M_PRECISION_SYR2 DSYR2
#define M_PRECISION_SYR2K DSYR2K
#define M_PRECISION_GEQRF dgeqrf
#define M_PRECISION_STEDC dstedc
#define M_PRECISION_STEQR dsteqr
#define M_PRECISION_LAMRG DLAMRG
#define M_PRECISION_LAMCH DLAMCH
#define M_PRECISION_LAPY2 DLAPY2
#define M_PRECISION_LAED4 DLAED4
#define M_PRECISION_LAED5 DLAED5
#define M_cublas_PRECISION_gemm cublas_dgemm