Commit a9994cff authored by Lorenz Huedepohl

Remove tab characters

parent 4de50074
@@ -247,8 +247,8 @@
! Rearrange eigenvectors
call resort_ev_&
&PRECISION &
(idx, na)
call timer%stop("merge_systems" // PRECISION_SUFFIX)
@@ -341,8 +341,8 @@
qtrans(1,1) = C; qtrans(1,2) =-S
qtrans(2,1) = S; qtrans(2,2) = C
call transform_columns_&
&PRECISION &
(idx(i), idx1(na1))
if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2
if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2
@@ -383,22 +383,22 @@
if (na1==1) then
d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation
else ! na1==2
call timer%start("blas")
call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1))
call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2))
call timer%stop("blas")
call transform_columns_&
&PRECISION&
&(idx1(1), idx1(2))
endif
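Annotation (not part of the commit): this branch solves the secular equation of the rank-one update D + \rho z z^T arising in the divide-and-conquer merge; the eigenvalues \lambda are the roots of

    1 + \rho \sum_{j=1}^{na1} \frac{z_j^2}{d_j - \lambda} = 0 .

For na1 == 1 this degenerates to \lambda = d_1 + \rho z_1^2 (the assignment above), and for na1 == 2 xLAED5 returns the two roots in closed form.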
! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2)
! Calculate arrangement of all eigenvalues in output
call timer%start("blas")
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call timer%stop("blas")
! Rearrange eigenvalues
tmp = d
@@ -416,8 +416,8 @@
endif
enddo
call resort_ev_&
&PRECISION&
&(idxq1, na)
else if (na1>2) then
! Solve secular equation
@@ -439,16 +439,16 @@
!$OMP DO
#endif
DO i = my_proc+1, na1, n_procs ! work distributed over all processors
call timer%start("blas")
call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used!
call timer%stop("blas")
if (info/=0) then
! If DLAED4 fails (may happen especially for LAPACK versions before 3.2)
! use the more stable bisection algorithm in solve_secular_equation
! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection'
call solve_secular_equation_&
&PRECISION&
&(na1, i, d1, z1, delta, rho, s)
endif
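Annotation (not part of the commit): a hedged sketch of the bisection fallback described in the comment above; a hypothetical simplified routine, not ELPA's actual solve_secular_equation. For rho > 0 and d sorted ascending, the secular function f(lambda) = 1 + rho*sum(z**2/(d - lambda)) increases monotonically from -inf to +inf on each interior interval (d(i), d(i+1)), so bisection cannot fail where the Newton-like iteration in xLAED4 occasionally does:

subroutine secular_bisection(n, i, d, z, rho, lambda)
  implicit none
  integer, intent(in)  :: n, i          ! i < n assumed (interior root)
  real(8), intent(in)  :: d(n), z(n), rho
  real(8), intent(out) :: lambda
  real(8) :: lo, hi, mid, f
  integer :: it
  lo = d(i)                             ! root is bracketed by d(i) and d(i+1)
  hi = d(i+1)
  do it = 1, 100                        ! plain bisection: robust, linearly convergent
    mid = 0.5d0*(lo + hi)
    f = 1.0d0 + rho*sum(z**2/(d - mid))
    if (f > 0.0d0) then                 ! f is increasing: root lies to the left
      hi = mid
    else
      lo = mid
    end if
  end do
  lambda = 0.5d0*(lo + hi)
end subroutine secular_bisection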
! Compute updated z
@@ -490,16 +490,16 @@
#endif
call global_product_&
&PRECISION&
(z, na1)
z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) )
call global_gather_&
&PRECISION&
&(dbase, na1)
call global_gather_&
&PRECISION&
&(ddiff, na1)
d(1:na1) = dbase(1:na1) - ddiff(1:na1)
! Calculate scale factors for eigenvectors
@@ -523,8 +523,8 @@
! in exactly this order, but we want to prevent compiler optimization
! ev_scale_val = ev_scale(i)
call add_tmp_&
&PRECISION&
&(d1, dbase, ddiff, z, ev_scale(i), na1,i)
! ev_scale(i) = ev_scale_val
enddo
#ifdef WITH_OPENMP
@@ -535,12 +535,12 @@
#endif
call global_gather_&
&PRECISION&
&(ev_scale, na1)
! Add the deflated eigenvalues
d(na1+1:na) = d2(1:na2)
call timer%start("blas")
! Calculate arrangement of all eigenvalues in output
call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx )
call timer%stop("blas")
@@ -550,8 +550,8 @@
d(i) = tmp(idx(i))
enddo
call check_monotony_&
&PRECISION&
&(na,d,'Output', wantDebug, success)
if (.not.(success)) then
call timer%stop("merge_systems" // PRECISION_SUFFIX)
@@ -720,8 +720,8 @@
! See above why we are doing it this way!
tmp(1:nnzu) = d1u(1:nnzu)-dbase(j)
call v_add_s_&
&PRECISION&
&(tmp,nnzu,ddiff(j))
ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j)
enddo
@@ -736,7 +736,7 @@
if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) &
call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), &
CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1))
call timer%stop("blas")
! endif ! useGPU
! Compute eigenvectors of the rank-1 modified matrix.
! Parts for multiplying with lower half of Q:
@@ -747,8 +747,8 @@
! See above why we are doing it this way!
tmp(1:nnzl) = d1l(1:nnzl)-dbase(j)
call v_add_s_&
&PRECISION&
&(tmp,nnzl,ddiff(j))
ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j)
enddo
@@ -763,7 +763,7 @@
if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) &
call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, CONST_1_0, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, &
ubound(ev,dim=1), CONST_1_0, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1))
call timer%stop("blas")
! endif ! useGPU
! Put partial result into (output) Q
@@ -795,8 +795,8 @@
contains
subroutine add_tmp_&
&PRECISION&
&(d1, dbase, ddiff, z, ev_scale_value, na1,i)
use precision
implicit none
@@ -814,17 +814,17 @@
tmp(1:na1) = d1(1:na1) -dbase(i)
call v_add_s_&
&PRECISION&
&(tmp(1:na1),na1,ddiff(i))
tmp(1:na1) = z(1:na1) / tmp(1:na1)
ev_scale_value = CONST_1_0/sqrt(dot_product(tmp(1:na1),tmp(1:na1)))
end subroutine add_tmp_&
&PRECISION
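Annotation (not part of the commit): add_tmp_ computes the eigenvector normalization. For a root \lambda_i of the secular equation, the unnormalized eigenvector of D + \rho z z^T has components z_j / (d_j - \lambda_i), so

    ev_scale_i = \Bigl( \sum_{j=1}^{na1} \frac{z_j^2}{(d_j - \lambda_i)^2} \Bigr)^{-1/2},

where d_j - \lambda_i is evaluated as (d_j - dbase_i) + ddiff_i (the tmp and v_add_s_ steps above) to avoid cancellation when \lambda_i is close to some d_j.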
subroutine resort_ev_&
&PRECISION&
&(idx_ev, nLength)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
@@ -907,11 +907,11 @@
stop 1
endif
end subroutine resort_ev_&
&PRECISION
subroutine transform_columns_&
&PRECISION&
&(col1, col2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
@@ -962,11 +962,11 @@
q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2)
endif
end subroutine transform_columns_&
&PRECISION
subroutine global_gather_&
&PRECISION&
&(z, n)
! This routine sums up z over all processors.
! It should only be used for gathering distributed results,
! i.e. z(i) should be nonzero on exactly 1 processor column,
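Annotation (not part of the commit): a minimal sketch of the gather-as-sum idea just described, as a hypothetical helper rather than the ELPA routine (which reduces over the row and column communicators in turn). Because each entry is nonzero on exactly one process, a SUM allreduce assembles the complete vector everywhere:

subroutine gather_by_allreduce(z, n, comm)
  use mpi
  implicit none
  integer, intent(in)    :: n, comm
  real(8), intent(inout) :: z(n)       ! in: local contribution, out: gathered result
  real(8) :: tmp(n)
  integer :: mpierr
  ! summing acts as a gather because all other ranks contribute zeros
  call MPI_Allreduce(z, tmp, n, MPI_REAL8, MPI_SUM, comm, mpierr)
  z = tmp
end subroutine gather_by_allreduce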
@@ -1024,11 +1024,11 @@
#endif /* WITH_MPI */
enddo
end subroutine global_gather_&
&PRECISION
subroutine global_product_&
&PRECISION&
&(z, n)
! This routine calculates the global product of z.
use precision
#ifdef HAVE_DETAILED_TIMINGS
@@ -1105,11 +1105,11 @@
endif
end subroutine global_product_&
&PRECISION
subroutine check_monotony_&
&PRECISION&
&(n,d,text, wantDebug, success)
! This is a test routine for checking if the eigenvalues are monotonically increasing.
! It is for debug purposes only; an error should never be triggered!
use precision
@@ -1132,7 +1132,7 @@
endif
enddo
end subroutine check_monotony_&
&PRECISION
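Annotation (not part of the commit): the debug check amounts to the following hypothetical simplified sketch (ELPA's version also reports the 'text' label and honours wantDebug):

subroutine check_monotony_sketch(n, d, success)
  implicit none
  integer, intent(in)  :: n
  real(8), intent(in)  :: d(n)
  logical, intent(out) :: success
  integer :: i
  success = .true.
  do i = 1, n-1
    if (d(i+1) < d(i)) then            ! eigenvalues must be non-decreasing
      print *, 'monotony violated at i =', i, ':', d(i), '>', d(i+1)
      success = .false.
      return
    end if
  end do
end subroutine check_monotony_sketch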
end subroutine merge_systems_&
&PRECISION
@@ -242,9 +242,9 @@
if (useGPU) then
! todo: this is used only for copying hvm to the device; it should be possible to go without it
allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
call check_alloc("trans_ev_&
&MATH_DATATYPE&
&", "hvm1", istat, errorMessage)
successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
check_alloc_cuda("trans_ev", successCUDA)
@@ -292,13 +292,13 @@
if (nb>0) &
call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
&MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
, cur_pcol, mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
@@ -320,7 +320,7 @@
! This can be done in different ways; we use dsyrk or zherk
tmat = 0
call timer%start("blas")
if (l_rows>0) &
#if REALCASE == 1
call PRECISION_SYRK('U', 'T', &
@@ -328,8 +328,8 @@
#if COMPLEXCASE == 1
call PRECISION_HERK('U', 'C', &
#endif
nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
call timer%stop("blas")
nc = 0
do n = 1, nstor-1
h1(nc+1:nc+n) = tmat(1:n,n+1)
@@ -344,7 +344,7 @@
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
@@ -356,24 +356,24 @@
nc = 0
tmat(1,1) = tau(ice-nstor+1)
do n = 1, nstor-1
call timer%start("blas")
#if REALCASE == 1
call PRECISION_TRMV('L', 'T', 'N', &
#endif
#if COMPLEXCASE == 1
call PRECISION_TRMV('L', 'C', 'N', &
#endif
n, tmat, max_stored_rows, h2(nc+1), 1)
call timer%stop("blas")
tmat(n+1,1:n) = &
#if REALCASE == 1
-h2(nc+1:nc+n) &
#endif
#if COMPLEXCASE == 1
-conjg(h2(nc+1:nc+n)) &
#endif
*tau(ice-nstor+n+1)
tmat(n+1,n+1) = tau(ice-nstor+n+1)
nc = nc+n
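Annotation (not part of the commit): this loop accumulates the triangular factor T of the compact WY representation from the Gram matrix H^T H produced by SYRK/HERK above. In the lower-triangular convention used here (real case; conjugation enters in the complex case), the factor grows one row at a time via

    T_{n+1} = \begin{pmatrix} T_n & 0 \\ -\tau_{n+1}\, h^T T_n & \tau_{n+1} \end{pmatrix}, \qquad h = H_{1:n}^T v_{n+1},

with the product involving T_n supplied by the TRMV call.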
@@ -399,7 +399,7 @@
if (l_rows>0) then
if (useGPU) then
call timer%start("cublas")
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', &
#endif
@@ -408,18 +408,18 @@
#endif
nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, &
q_dev, ldq, ZERO, tmp_dev, nstor)
call timer%stop("cublas")
call timer%stop("cublas")
else ! useGPU
call timer%start("blas")
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('C', 'N', &
#endif
nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
q_mat, ldq, ZERO, tmp1, nstor)
call timer%stop("blas")
endif ! useGPU
@@ -450,7 +450,7 @@
#if COMPLEXCASE == 1
&MPI_COMPLEX_PRECISION&
#endif
&, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
! copy back tmp2 - after reduction...
if (useGPU) then
@@ -466,33 +466,33 @@
if (l_rows>0) then
if (useGPU) then
call timer%start("cublas")
call timer%start("cublas")
call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', &
nstor, l_cols, ONE, tmat_dev, max_stored_rows, &
tmp_dev, nstor)
call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, &
-ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, &
ONE, q_dev, ldq)
call timer%stop("cublas")
call timer%stop("cublas")
else !useGPU
#ifdef WITH_MPI
! tmp2 = tmat * tmp2
call timer%start("blas")
call timer%start("blas")
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
ONE, tmat, max_stored_rows, tmp2, nstor)
!q_mat = q_mat - hvm*tmp2
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
call timer%stop("blas")
call timer%stop("blas")
#else /* WITH_MPI */
call timer%start("blas")
call timer%start("blas")
call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, &
ONE, tmat, max_stored_rows, tmp1, nstor)
call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, &
-ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
call timer%stop("blas")
call timer%stop("blas")
#endif /* WITH_MPI */
endif ! useGPU
endif ! l_rows>0
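Annotation (not part of the commit): both branches apply the stored block of nstor Householder vectors in compact WY form. With V = hvm and the triangular factor T = tmat, the update is

    Q \leftarrow (I - V\,T\,V^T)\,Q        (V^H in the complex case),

computed as tmp = V^T Q (the GEMM further up), tmp \leftarrow T\,tmp (TRMM), and Q \leftarrow Q - V\,tmp (GEMM), which is exactly the TRMM/GEMM pair in each branch above.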
@@ -504,8 +504,8 @@
deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm "//errorMessage
stop 1
endif
@@ -517,8 +517,8 @@
deallocate(hvm1, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_&
&MATH_DATATYPE&
&: error when deallocating hvm1 "//errorMessage
stop 1
endif
@@ -375,7 +375,7 @@
! copy column l_cols + 1 of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
@@ -385,12 +385,12 @@
endif
if (n_stored_vecs > 0 .and. l_rows > 0) then
call timer%start("blas")
#if COMPLEXCASE == 1
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
call PRECISION_GEMV('N', &
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
#if REALCASE == 1
uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
@@ -400,7 +400,7 @@
#endif
ONE, v_row, 1)
call timer%stop("blas")
endif
@@ -459,12 +459,12 @@
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
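Annotation (not part of the commit): the trailing matrix is updated lazily here; the rank-2 corrections from earlier Householder steps are kept in vu_stored_rows/uv_stored_cols, so the product is formed implicitly as

    u = (A + V U^T + U V^T)\, v = A v + V (U^T v) + U (V^T v),

which is why the GEMV calls below multiply with the stored blocks instead of an explicitly updated A.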
@@ -546,7 +546,7 @@
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
@@ -558,7 +558,7 @@
ONE, ur_p(l_row_beg,my_thread), 1)
endif
call timer%stop("blas")
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
@@ -598,7 +598,7 @@
v_col(l_col_beg), 1, &
ONE, u_row(l_row_beg), 1)
endif
call timer%stop("blas")
endif ! useGPU
#endif /* WITH_OPENMP */
@@ -667,21 +667,21 @@
#endif /* WITH_OPENMP */
if (n_stored_vecs > 0) then
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', &
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMV('C', &
#endif
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
v_row, 1, ZERO, aux, 1)
call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
aux, 1, ONE, u_col, 1)
call timer%stop("blas")
endif
endif ! (l_rows>0 .and. l_cols>0)
@@ -693,12 +693,12 @@
if (tile_size < istep-1) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
mpi_comm_cols, istep-1, 1, nblk)
endif
@@ -717,11 +717,11 @@
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
@@ -140,7 +140,7 @@ subroutine elpa_reduce_add_vectors_&
#endif
do lc=1,nvc
do i = n, nblks_tot-1, lcm_s_t
k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/nps)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc)
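Annotation (not part of the commit): with a block-cyclic distribution over nps source processes, global block i lives on process mod(i, nps) at local block i/nps, hence the local start ns = (i/nps)*nblk. The outer stride lcm_s_t = lcm(nps, npt) is the period after which the (source, target) pairing of blocks repeats, so each residue class n packs its blocks contiguously into the aux buffer via

    k = \frac{i - n}{\mathrm{lcm}(nps, npt)}\, nblk + (lc - 1)\, auxstride .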
@@ -183,7 +183,7 @@ subroutine elpa_reduce_add_vectors_&
#endif
do lc=1,nvc
do i = n, nblks_tot-1, lcm_s_t
k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl)
@@ -91,10 +91,10 @@
& , cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
!write(*,*) cudaGetErrorString(istat)
@@ -139,16 +139,16 @@
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * &
size_of_&
&PRECISION&
&_&
&MATH_DATATYPE&
&, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
&: error in cudaMemcpy"
stop 1
endif
!write(*,*) cudaGetErrorString(istat)
@@ -199,10 +199,10 @@
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_&
&MATH_DATATYPE&
&_gpu_&
&PRECISION&
(row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &