Commit 58923f0f authored by Andreas Marek's avatar Andreas Marek

Cleanup of precision_macros.h

parent 1bdd5eb4
......@@ -51,7 +51,10 @@
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
subroutine merge_systems_PRECISION( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
subroutine merge_systems_&
&PRECISION &
( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, wantDebug, success)
#ifdef HAVE_DETAILED_TIMINGS
......@@ -74,8 +77,7 @@
logical, intent(in) :: wantDebug
logical, intent(out) :: success
integer(kind=ik), parameter :: max_strip=128
integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: PRECISION_LAMCH, PRECISION_LAPY2
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
......@@ -118,7 +120,8 @@
endif
#endif
call timer%start("merge_systems" // PRECISION_SUFFIX)
call timer%start("merge_systems" // &
&PRECISION_SUFFIX)
success = .true.
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -146,12 +149,16 @@
else
np_prev = my_pcol - 1
endif
call check_monotony_PRECISION(nm,d,'Input1',wantDebug, success)
call check_monotony_&
&PRECISION&
&(nm,d,'Input1',wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
call check_monotony_PRECISION(na-nm,d(nm+1),'Input2',wantDebug, success)
call check_monotony_&
&PRECISION&
&(na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
return
......@@ -208,7 +215,9 @@
enddo
endif
call global_gather_PRECISION(z, na)
call global_gather_&
&PRECISION&
&(z, na)
! Normalize z so that norm(z) = 1. Since z is the concatenation of
! two normalized vectors, norm2(z) = sqrt(2).
z = z/sqrt(CONST_2_0)
......@@ -238,7 +247,9 @@
enddo
! Rearrange eigenvectors
call resort_ev_PRECISION(idx, na)
call resort_ev_&
&PRECISION &
(idx, na)
call timer%stop("merge_systems" // PRECISION_SUFFIX)
......@@ -330,7 +341,9 @@
qtrans(1,1) = C; qtrans(1,2) =-S
qtrans(2,1) = S; qtrans(2,2) = C
call transform_columns_PRECISION(idx(i), idx1(na1))
call transform_columns_&
&PRECISION &
(idx(i), idx1(na1))
if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
......@@ -350,12 +363,16 @@
endif
enddo
call check_monotony_PRECISION(na1,d1,'Sorted1', wantDebug, success)
call check_monotony_&
&PRECISION&
&(na1,d1,'Sorted1', wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
call check_monotony_PRECISION(na2,d2,'Sorted2', wantDebug, success)
call check_monotony_&
&PRECISION&
&(na2,d2,'Sorted2', wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
return
......@@ -371,7 +388,9 @@
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call timer%stop("blas")
call transform_columns_PRECISION(idx1(1), idx1(2))
call transform_columns_&
&PRECISION&
&(idx1(1), idx1(2))
endif
! Add the deflated eigenvalues
......@@ -397,7 +416,9 @@
idxq1(i) = idx2(idx(i)-na1)
endif
enddo
call resort_ev_PRECISION(idxq1, na)
call resort_ev_&
&PRECISION&
&(idxq1, na)
else if (na1>2) then
! Solve secular equation
......@@ -426,7 +447,9 @@
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
! use the more stable bisection algorithm in solve_secular_equation
! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
call solve_secular_equation_PRECISION(na1, i, d1, z1, delta, rho, s)
call solve_secular_equation_&
&PRECISION&
&(na1, i, d1, z1, delta, rho, s)
endif
! Compute updated z
......@@ -467,11 +490,17 @@
enddo
#endif
call global_product_PRECISION(z, na1)
call global_product_&
&PRECISION&
(z, na1)
z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
call global_gather_PRECISION(dbase, na1)
call global_gather_PRECISION(ddiff, na1)
call global_gather_&
&PRECISION&
&(dbase, na1)
call global_gather_&
&PRECISION&
&(ddiff, na1)
d(1:na1) = dbase(1:na1) - ddiff(1:na1)
! Calculate scale factors for eigenvectors
......@@ -494,7 +523,9 @@
! All we want to calculate is tmp = (d1(1:na1)-dbase(i))+ddiff(i)
! in exactly this order, but we want to prevent compiler optimization
! ev_scale_val = ev_scale(i)
call add_tmp_PRECISION(d1, dbase, ddiff, z, ev_scale(i), na1,i)
call add_tmp_&
&PRECISION&
&(d1, dbase, ddiff, z, ev_scale(i), na1,i)
! ev_scale(i) = ev_scale_val
enddo
#ifdef WITH_OPENMP
......@@ -504,7 +535,9 @@
#endif
call global_gather_PRECISION(ev_scale, na1)
call global_gather_&
&PRECISION&
&(ev_scale, na1)
! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2)
......@@ -517,7 +550,9 @@
do i=1,na
d(i) = tmp(idx(i))
enddo
call check_monotony_PRECISION(na,d,'Output', wantDebug, success)
call check_monotony_&
&PRECISION&
&(na,d,'Output', wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
......@@ -685,7 +720,9 @@
! Calculate the j-th eigenvector of the deflated system
! See above why we are doing it this way!
tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
call v_add_s_PRECISION(tmp,nnzu,ddiff(j))
call v_add_s_&
&PRECISION&
&(tmp,nnzu,ddiff(j))
ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
enddo
......@@ -710,7 +747,9 @@
! Calculate the j-th eigenvector of the deflated system
! See above why we are doing it this way!
tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
call v_add_s_PRECISION(tmp,nnzl,ddiff(j))
call v_add_s_&
&PRECISION&
&(tmp,nnzl,ddiff(j))
ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
enddo
......@@ -756,7 +795,9 @@
return
contains
subroutine add_tmp_PRECISION(d1, dbase, ddiff, z, ev_scale_value, na1,i)
subroutine add_tmp_&
&PRECISION&
&(d1, dbase, ddiff, z, ev_scale_value, na1,i)
use precision
implicit none
......@@ -773,13 +814,18 @@
! in exactly this order, but we want to prevent compiler optimization
tmp(1:na1) = d1(1:na1) -dbase(i)
call v_add_s_PRECISION(tmp(1:na1),na1,ddiff(i))
call v_add_s_&
&PRECISION&
&(tmp(1:na1),na1,ddiff(i))
tmp(1:na1) = z(1:na1) / tmp(1:na1)
ev_scale_value = CONST_1_0/sqrt(dot_product(tmp(1:na1),tmp(1:na1)))
end subroutine add_tmp_PRECISION
end subroutine add_tmp_&
&PRECISION
subroutine resort_ev_PRECISION(idx_ev, nLength)
subroutine resort_ev_&
&PRECISION&
&(idx_ev, nLength)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
......@@ -861,9 +907,12 @@
print *,"resort_ev: error when deallocating qtmp "//errorMessage
stop
endif
end subroutine resort_ev_PRECISION
end subroutine resort_ev_&
&PRECISION
subroutine transform_columns_PRECISION(col1, col2)
subroutine transform_columns_&
&PRECISION&
&(col1, col2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
......@@ -913,9 +962,12 @@
q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
endif
end subroutine transform_columns_PRECISION
end subroutine transform_columns_&
&PRECISION
subroutine global_gather_PRECISION(z, n)
subroutine global_gather_&
&PRECISION&
&(z, n)
! This routine sums up z over all processors.
! It should only be used for gathering distributed results,
! i.e. z(i) should be nonzero on exactly 1 processor column,
......@@ -972,9 +1024,12 @@
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
enddo
end subroutine global_gather_PRECISION
end subroutine global_gather_&
&PRECISION
subroutine global_product_PRECISION(z, n)
subroutine global_product_&
&PRECISION&
&(z, n)
! This routine calculates the global product of z.
use precision
#ifdef HAVE_DETAILED_TIMINGS
......@@ -1050,9 +1105,12 @@
#endif /* WITH_MPI */
endif
end subroutine global_product_PRECISION
end subroutine global_product_&
&PRECISION
subroutine check_monotony_PRECISION(n,d,text, wantDebug, success)
subroutine check_monotony_&
&PRECISION&
&(n,d,text, wantDebug, success)
! This is a test routine for checking if the eigenvalues are monotonically increasing.
! It is for debug purposes only, an error should never be triggered!
use precision
......@@ -1074,6 +1132,8 @@
return
endif
enddo
end subroutine check_monotony_PRECISION
end subroutine check_monotony_&
&PRECISION
end subroutine merge_systems_PRECISION
end subroutine merge_systems_&
&PRECISION
......@@ -54,16 +54,21 @@
#if REALCASE == 1
subroutine v_add_s_PRECISION(v,n,s)
subroutine v_add_s_&
&PRECISION&
&(v,n,s)
use precision
implicit none
integer(kind=ik) :: n
real(kind=REAL_DATATYPE) :: v(n),s
v(:) = v(:) + s
end subroutine v_add_s_PRECISION
end subroutine v_add_s_&
&PRECISION
subroutine distribute_global_column_PRECISION(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
subroutine distribute_global_column_&
&PRECISION&
&(g_col, l_col, noff, nlen, my_prow, np_rows, nblk)
use precision
implicit none
......@@ -88,9 +93,12 @@
l_col(l_off+js:l_off+je) = g_col(g_off+js-noff:g_off+je-noff)
enddo
end subroutine distribute_global_column_PRECISION
end subroutine distribute_global_column_&
&PRECISION
subroutine solve_secular_equation_PRECISION(n, i, d, z, delta, rho, dlam)
subroutine solve_secular_equation_&
&PRECISION&
&(n, i, d, z, delta, rho, dlam)
!-------------------------------------------------------------------------------
! This routine solves the secular equation of a symmetric rank 1 modified
! diagonal matrix:
......@@ -225,7 +233,8 @@
delta(:) = delta(:) - x
call timer%stop("solve_secular_equation" // PRECISION_SUFFIX)
end subroutine solve_secular_equation_PRECISION
end subroutine solve_secular_equation_&
&PRECISION
!-------------------------------------------------------------------------------
#endif
......@@ -271,8 +280,10 @@
real(kind=REAL_DATATYPE) :: BETA
call timer%start("hh_tranform_&
&MATH_DATATYPE&
&" // PRECISION_SUFFIX )
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
#if COMPLEXCASE == 1
ALPHR = real( ALPHA, kind=REAL_DATATYPE )
......@@ -334,8 +345,10 @@
endif
call timer%stop("hh_transform_&
&MATH_DATATYPE&
&" // PRECISION_SUFFIX )
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
#if REALCASE == 1
end subroutine hh_transform_real_&
......
......@@ -212,12 +212,11 @@
ii, pp, transformChunkSize
#endif
#if REALCASE == 1
call timer%start("bandred_real" // PRECISION_SUFFIX)
#endif
#if COMPLEXCASE == 1
call timer%start("bandred_complex" // PRECISION_SUFFIX)
#endif
call timer%start("bandred_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
......@@ -233,14 +232,12 @@
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
#if REALCASE == 1
write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
#endif
#if COMPLEXCASE == 1
write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
#endif
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
......@@ -263,53 +260,48 @@
na_cols = na
#endif
#if REALCASE == 1
! Here we convert the regular host array into a pinned host array
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_PRECISION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(a_dev, lda*na_cols* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed a_dev ", istat
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
#endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* &
#if REALCASE == 1
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_PRECISION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *, " bandred_complex: cuda malloc failed tmat_dev ", istat
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
#endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw* &
#if REALCASE == 1
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_PRECISION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_PRECISION_complex)
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vav_dev ", istat
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
#endif
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
......@@ -352,12 +344,16 @@
vmrCols = na
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_PRECISION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), na, tmat(1,1,1), &
nbw, nbw, dwork_size, 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
#else
call qr_pdgeqrf_2dcomm_PRECISION(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(a(1:lda,1:matrixCols), matrixCols, lda, vmrCPU(1:max(l_rows,1),1:vmrCols), max(l_rows,1), &
vmrCols, tauvector(1:na), na, tmat(1:nbw,1:nbw,1), nbw, &
nbw, dwork_size(1:1), 1, -1, na, nbw, nblk, nblk, na, na, 1, 0, PQRPARAM(1:11), &
mpi_comm_rows, mpi_comm_cols, blockheuristic)
......@@ -379,7 +375,7 @@
endif ! which_qr_decomposition
endif ! useQr
#endif /* useQr */
#endif /* REALCASE */
if (useGPU) then
!#if !defined(USE_ASSUMED_SIZE)
......@@ -390,20 +386,21 @@
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)* &
#if REALCASE == 1
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
size_of_PRECISION_complex, &
#endif
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy faild a_dev ", istat
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
#endif
endif ! useGPU
......@@ -430,23 +427,17 @@
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"bandred_real: error when deallocating vr "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"bandred_complex: error when deallocating vr "//errorMessage
#endif
print *,"bandred_&
&MATH_DATATYPE&
&: error when deallocating vr "//errorMessage
stop
endif
endif
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"bandred_real: error when allocating vr "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"bandred_complex: error when allocating vr "//errorMessage
#endif
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vr "//errorMessage
stop
endif
......@@ -586,46 +577,29 @@
! unify the the name vmr and vmrCPU, as well as vmrGPU
! the same for umcCPU and umcGPU
#if REALCASE == 1
! Allocate vmr and umcCPU to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCPU "//errorMessage
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vmrCPU "//errorMessage
stop
endif
#endif
#if COMPLEXCASE == 1
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmrCPU "//errorMessage
stop
endif
#endif
#if REALCASE == 1
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating umcCPU "//errorMessage
stop
endif
#endif
#if COMPLEXCASE == 1
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umcCPU "//errorMessage
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating umcCPU "//errorMessage
stop
endif
#endif
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"bandred_real: error when allocating vr "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"bandred_complex: error when allocating vr "//errorMessage
#endif
print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vr "//errorMessage
stop
endif
......@@ -680,12 +654,9 @@
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif
if (.not.(successCUDA)) then
#if REALCASE == 1
print *,"bandred_real: error in cudaMemcpy2d"
#endif
#if COMPLEXCASE == 1
print *, "bandred_complex: error in cudaMemcpy2"
#endif
print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMemcpy2d"
stop
endif
......@@ -698,7 +669,9 @@
if (which_qr_decomposition == 1) then
vmrCols = 2*n_cols
#ifdef USE_ASSUMED_SIZE_QR
call qr_pdgeqrf_2dcomm_PRECISION(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
call qr_pdgeqrf_2dcomm_&
&PRECISION&
&(a, lda, matrixCols, vmrCPU, max(l_rows,1), vmrCols, tauvector(1), &
na, tmat(1,1,istep), nbw, nbw, work_blocked, work_size, &
work_size, na, n_cols, nblk, nblk, &