Commit a9994cff by Lorenz Huedepohl

### Remove tab characters

parent 4de50074
 ... ... @@ -247,8 +247,8 @@ ! Rearrange eigenvectors call resort_ev_& &PRECISION & (idx, na) &PRECISION & (idx, na) call timer%stop("merge_systems" // PRECISION_SUFFIX) ... ... @@ -341,8 +341,8 @@ qtrans(1,1) = C; qtrans(1,2) =-S qtrans(2,1) = S; qtrans(2,2) = C call transform_columns_& &PRECISION & (idx(i), idx1(na1)) &PRECISION & (idx(i), idx1(na1)) if (coltyp(idx(i))==1 .and. coltyp(idx1(na1))/=1) coltyp(idx1(na1)) = 2 if (coltyp(idx(i))==3 .and. coltyp(idx1(na1))/=3) coltyp(idx1(na1)) = 2 ... ... @@ -383,22 +383,22 @@ if (na1==1) then d(1) = d1(1) + rho*z1(1)**2 ! solve secular equation else ! na1==2 call timer%start("blas") call timer%start("blas") call PRECISION_LAED5(1, d1, z1, qtrans(1,1), rho, d(1)) call PRECISION_LAED5(2, d1, z1, qtrans(1,2), rho, d(2)) call timer%stop("blas") call timer%stop("blas") call transform_columns_& &PRECISION& &(idx1(1), idx1(2)) &PRECISION& &(idx1(1), idx1(2)) endif ! Add the deflated eigenvalues d(na1+1:na) = d2(1:na2) ! Calculate arrangement of all eigenvalues in output call timer%start("blas") call timer%start("blas") call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call timer%stop("blas") call timer%stop("blas") ! Rearrange eigenvalues tmp = d ... ... @@ -416,8 +416,8 @@ endif enddo call resort_ev_& &PRECISION& &(idxq1, na) &PRECISION& &(idxq1, na) else if (na1>2) then ! Solve secular equation ... ... @@ -439,16 +439,16 @@ !\$OMP DO #endif DO i = my_proc+1, na1, n_procs ! work distributed over all processors call timer%start("blas") call timer%start("blas") call PRECISION_LAED4(na1, i, d1, z1, delta, rho, s, info) ! s is not used! call timer%stop("blas") call timer%stop("blas") if (info/=0) then ! If DLAED4 fails (may happen especially for LAPACK versions before 3.2) ! use the more stable bisection algorithm in solve_secular_equation ! print *,'ERROR DLAED4 n=',na1,'i=',i,' Using Bisection' call solve_secular_equation_& &PRECISION& &(na1, i, d1, z1, delta, rho, s) &PRECISION& &(na1, i, d1, z1, delta, rho, s) endif ! Compute updated z ... ... @@ -490,16 +490,16 @@ #endif call global_product_& &PRECISION& (z, na1) &PRECISION& (z, na1) z(1:na1) = SIGN( SQRT( -z(1:na1) ), z1(1:na1) ) call global_gather_& &PRECISION& &(dbase, na1) &PRECISION& &(dbase, na1) call global_gather_& &PRECISION& &(ddiff, na1) &PRECISION& &(ddiff, na1) d(1:na1) = dbase(1:na1) - ddiff(1:na1) ! Calculate scale factors for eigenvectors ... ... @@ -523,8 +523,8 @@ ! in exactly this order, but we want to prevent compiler optimization ! ev_scale_val = ev_scale(i) call add_tmp_& &PRECISION& &(d1, dbase, ddiff, z, ev_scale(i), na1,i) &PRECISION& &(d1, dbase, ddiff, z, ev_scale(i), na1,i) ! ev_scale(i) = ev_scale_val enddo #ifdef WITH_OPENMP ... ... @@ -535,12 +535,12 @@ #endif call global_gather_& &PRECISION& &(ev_scale, na1) &PRECISION& &(ev_scale, na1) ! Add the deflated eigenvalues d(na1+1:na) = d2(1:na2) call timer%start("blas") call timer%start("blas") ! Calculate arrangement of all eigenvalues in output call PRECISION_LAMRG( na1, na-na1, d, 1, 1, idx ) call timer%stop("blas") ... ... @@ -550,8 +550,8 @@ d(i) = tmp(idx(i)) enddo call check_monotony_& &PRECISION& &(na,d,'Output', wantDebug, success) &PRECISION& &(na,d,'Output', wantDebug, success) if (.not.(success)) then call timer%stop("merge_systems" // PRECISION_SUFFIX) ... ... @@ -720,8 +720,8 @@ ! See above why we are doing it this way! tmp(1:nnzu) = d1u(1:nnzu)-dbase(j) call v_add_s_& &PRECISION& &(tmp,nnzu,ddiff(j)) &PRECISION& &(tmp,nnzu,ddiff(j)) ev(1:nnzu,i) = zu(1:nnzu) / tmp(1:nnzu) * ev_scale(j) enddo ... ... @@ -736,7 +736,7 @@ if (l_rnm>0 .and. ncnt>0 .and. nnzu>0) & call PRECISION_GEMM('N', 'N', l_rnm, ncnt, nnzu, CONST_1_0, qtmp1, ubound(qtmp1,dim=1), ev, ubound(ev,dim=1), & CONST_1_0, qtmp2(1,1), ubound(qtmp2,dim=1)) call timer%stop("blas") call timer%stop("blas") ! endif ! useGPU ! Compute eigenvectors of the rank-1 modified matrix. ! Parts for multiplying with lower half of Q: ... ... @@ -747,8 +747,8 @@ ! See above why we are doing it this way! tmp(1:nnzl) = d1l(1:nnzl)-dbase(j) call v_add_s_& &PRECISION& &(tmp,nnzl,ddiff(j)) &PRECISION& &(tmp,nnzl,ddiff(j)) ev(1:nnzl,i) = zl(1:nnzl) / tmp(1:nnzl) * ev_scale(j) enddo ... ... @@ -763,7 +763,7 @@ if (l_rows-l_rnm>0 .and. ncnt>0 .and. nnzl>0) & call PRECISION_GEMM('N', 'N', l_rows-l_rnm, ncnt, nnzl, CONST_1_0, qtmp1(l_rnm+1,1), ubound(qtmp1,dim=1), ev, & ubound(ev,dim=1), CONST_1_0, qtmp2(l_rnm+1,1), ubound(qtmp2,dim=1)) call timer%stop("blas") call timer%stop("blas") ! endif ! useGPU ! Put partial result into (output) Q ... ... @@ -795,8 +795,8 @@ contains subroutine add_tmp_& &PRECISION& &(d1, dbase, ddiff, z, ev_scale_value, na1,i) &PRECISION& &(d1, dbase, ddiff, z, ev_scale_value, na1,i) use precision implicit none ... ... @@ -814,17 +814,17 @@ tmp(1:na1) = d1(1:na1) -dbase(i) call v_add_s_& &PRECISION& &(tmp(1:na1),na1,ddiff(i)) &PRECISION& &(tmp(1:na1),na1,ddiff(i)) tmp(1:na1) = z(1:na1) / tmp(1:na1) ev_scale_value = CONST_1_0/sqrt(dot_product(tmp(1:na1),tmp(1:na1))) end subroutine add_tmp_& &PRECISION &PRECISION subroutine resort_ev_& &PRECISION& &(idx_ev, nLength) &PRECISION& &(idx_ev, nLength) #ifdef HAVE_DETAILED_TIMINGS use timings #else ... ... @@ -907,11 +907,11 @@ stop 1 endif end subroutine resort_ev_& &PRECISION &PRECISION subroutine transform_columns_& &PRECISION& &(col1, col2) &PRECISION& &(col1, col2) #ifdef HAVE_DETAILED_TIMINGS use timings #else ... ... @@ -962,11 +962,11 @@ q(l_rqs:l_rqe,lc2) = tmp(1:l_rows)*qtrans(1,2) + q(l_rqs:l_rqe,lc2)*qtrans(2,2) endif end subroutine transform_columns_& &PRECISION &PRECISION subroutine global_gather_& &PRECISION& &(z, n) &PRECISION& &(z, n) ! This routine sums up z over all processors. ! It should only be used for gathering distributed results, ! i.e. z(i) should be nonzero on exactly 1 processor column, ... ... @@ -1024,11 +1024,11 @@ #endif /* WITH_MPI */ enddo end subroutine global_gather_& &PRECISION &PRECISION subroutine global_product_& &PRECISION& &(z, n) &PRECISION& &(z, n) ! This routine calculates the global product of z. use precision #ifdef HAVE_DETAILED_TIMINGS ... ... @@ -1105,11 +1105,11 @@ endif end subroutine global_product_& &PRECISION &PRECISION subroutine check_monotony_& &PRECISION& &(n,d,text, wantDebug, success) &PRECISION& &(n,d,text, wantDebug, success) ! This is a test routine for checking if the eigenvalues are monotonically increasing. ! It is for debug purposes only, an error should never be triggered! use precision ... ... @@ -1132,7 +1132,7 @@ endif enddo end subroutine check_monotony_& &PRECISION &PRECISION end subroutine merge_systems_& &PRECISION
 ... ... @@ -242,9 +242,9 @@ if (useGPU) then ! todo: this is used only for copying hmv to device.. it should be possible to go without it allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage) call check_alloc("trans_ev_& &MATH_DATATYPE& &", "hvm1", istat, errorMessage) call check_alloc("trans_ev_& &MATH_DATATYPE& &", "hvm1", istat, errorMessage) successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype) check_alloc_cuda("trans_ev", successCUDA) ... ... @@ -292,13 +292,13 @@ if (nb>0) & call MPI_Bcast(hvb, nb, & #if REALCASE == 1 &MPI_REAL_PRECISION& &MPI_REAL_PRECISION& #endif #if COMPLEXCASE == 1 &MPI_COMPLEX_PRECISION& #endif , cur_pcol, mpi_comm_cols, mpierr) , cur_pcol, mpi_comm_cols, mpierr) call timer%stop("mpi_communication") #endif /* WITH_MPI */ ... ... @@ -320,7 +320,7 @@ ! This can be done in different ways, we use dsyrk or zherk tmat = 0 call timer%start("blas") call timer%start("blas") if (l_rows>0) & #if REALCASE == 1 call PRECISION_SYRK('U', 'T', & ... ... @@ -328,8 +328,8 @@ #if COMPLEXCASE == 1 call PRECISION_HERK('U', 'C', & #endif nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows) call timer%stop("blas") nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows) call timer%stop("blas") nc = 0 do n = 1, nstor-1 h1(nc+1:nc+n) = tmat(1:n,n+1) ... ... @@ -344,7 +344,7 @@ #if COMPLEXCASE == 1 &MPI_COMPLEX_PRECISION& #endif &, MPI_SUM, mpi_comm_rows, mpierr) &, MPI_SUM, mpi_comm_rows, mpierr) call timer%stop("mpi_communication") #else /* WITH_MPI */ ... ... @@ -356,24 +356,24 @@ nc = 0 tmat(1,1) = tau(ice-nstor+1) do n = 1, nstor-1 call timer%start("blas") call timer%start("blas") #if REALCASE == 1 call PRECISION_TRMV('L', 'T', 'N', & #endif #if COMPLEXCASE == 1 call PRECISION_TRMV('L', 'C', 'N', & #endif n, tmat, max_stored_rows, h2(nc+1), 1) call timer%stop("blas") n, tmat, max_stored_rows, h2(nc+1), 1) call timer%stop("blas") tmat(n+1,1:n) = & #if REALCASE == 1 -h2(nc+1:nc+n) & -h2(nc+1:nc+n) & #endif #if COMPLEXCASE == 1 -conjg(h2(nc+1:nc+n)) & #endif *tau(ice-nstor+n+1) *tau(ice-nstor+n+1) tmat(n+1,n+1) = tau(ice-nstor+n+1) nc = nc+n ... ... @@ -399,7 +399,7 @@ if (l_rows>0) then if (useGPU) then call timer%start("cublas") call timer%start("cublas") #if REALCASE == 1 call cublas_PRECISION_GEMM('T', 'N', & #endif ... ... @@ -408,18 +408,18 @@ #endif nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd, & q_dev, ldq, ZERO, tmp_dev, nstor) call timer%stop("cublas") call timer%stop("cublas") else ! useGPU call timer%start("blas") call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMM('T', 'N', & #endif #if COMPLEXCASE == 1 call PRECISION_GEMM('C', 'N', & #endif nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), & nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), & q_mat, ldq, ZERO, tmp1, nstor) call timer%stop("blas") endif ! useGPU ... ... @@ -450,7 +450,7 @@ #if COMPLEXCASE == 1 &MPI_COMPLEX_PRECISION& #endif &, MPI_SUM, mpi_comm_rows, mpierr) &, MPI_SUM, mpi_comm_rows, mpierr) call timer%stop("mpi_communication") ! copy back tmp2 - after reduction... if (useGPU) then ... ... @@ -466,33 +466,33 @@ if (l_rows>0) then if (useGPU) then call timer%start("cublas") call timer%start("cublas") call cublas_PRECISION_TRMM('L', 'L', 'N', 'N', & nstor, l_cols, ONE, tmat_dev, max_stored_rows, & nstor, l_cols, ONE, tmat_dev, max_stored_rows, & tmp_dev, nstor) call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor, & -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor, & ONE, q_dev, ldq) call timer%stop("cublas") call timer%stop("cublas") else !useGPU #ifdef WITH_MPI ! tmp2 = tmat * tmp2 call timer%start("blas") call timer%start("blas") call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, & ONE, tmat, max_stored_rows, tmp2, nstor) !q_mat = q_mat - hvm*tmp2 call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, & -ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq) call timer%stop("blas") call timer%stop("blas") #else /* WITH_MPI */ call timer%start("blas") call timer%start("blas") call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols, & ONE, tmat, max_stored_rows, tmp1, nstor) call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor, & -ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq) call timer%stop("blas") call timer%stop("blas") #endif /* WITH_MPI */ endif ! useGPU endif ! l_rows>0 ... ... @@ -504,8 +504,8 @@ deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_& &MATH_DATATYPE& &: error when deallocating hvm "//errorMessage &MATH_DATATYPE& &: error when deallocating hvm "//errorMessage stop 1 endif ... ... @@ -517,8 +517,8 @@ deallocate(hvm1, stat=istat, errmsg=errorMessage) if (istat .ne. 0) then print *,"trans_ev_& &MATH_DATATYPE& &: error when deallocating hvm1 "//errorMessage &MATH_DATATYPE& &: error when deallocating hvm1 "//errorMessage stop 1 endif ... ...
 ... ... @@ -375,7 +375,7 @@ ! copy l_cols + 1 column of A to v_row if (useGPU) then a_offset = l_cols * lda * size_of_datatype a_offset = l_cols * lda * size_of_datatype ! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice) successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost) ... ... @@ -385,12 +385,12 @@ endif if (n_stored_vecs > 0 .and. l_rows > 0) then call timer%start("blas") call timer%start("blas") #if COMPLEXCASE == 1 aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs)) #endif call PRECISION_GEMV('N', & l_rows, 2*n_stored_vecs, & l_rows, 2*n_stored_vecs, & ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), & #if REALCASE == 1 uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), & ... ... @@ -400,7 +400,7 @@ #endif ONE, v_row, 1) call timer%stop("blas") call timer%stop("blas") endif ... ... @@ -459,12 +459,12 @@ tau(istep) = v_row(l_rows+1) ! Transpose Householder vector v_row -> v_col call elpa_transpose_vectors_& call elpa_transpose_vectors_& &MATH_DATATYPE& &_& &PRECISION & (v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, & 1, istep-1, 1, nblk) 1, istep-1, 1, nblk) ! Calculate u = (A + VU**T + UV**T)*v ... ... @@ -546,7 +546,7 @@ if (l_row_end < l_row_beg) cycle #ifdef WITH_OPENMP if (mod(n_iter,n_threads) == my_thread) then call timer%start("blas") call timer%start("blas") call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, & l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, & ONE, a_mat(l_row_beg,l_col_beg), lda, & ... ... @@ -558,7 +558,7 @@ ONE, ur_p(l_row_beg,my_thread), 1) endif call timer%stop("blas") call timer%stop("blas") endif n_iter = n_iter+1 #else /* WITH_OPENMP */ ... ... @@ -598,7 +598,7 @@ v_col(l_col_beg), 1, & ONE, u_row(l_row_beg), 1) endif call timer%stop("blas") call timer%stop("blas") endif ! useGPU #endif /* WITH_OPENMP */ ... ... @@ -667,21 +667,21 @@ #endif /* WITH_OPENMP */ if (n_stored_vecs > 0) then call timer%start("blas") call timer%start("blas") #if REALCASE == 1 call PRECISION_GEMV('T', & #endif #if COMPLEXCASE == 1 call PRECISION_GEMV('C', & #endif l_rows, 2*n_stored_vecs, & l_rows, 2*n_stored_vecs, & ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), & v_row, 1, ZERO, aux, 1) call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, & ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), & aux, 1, ONE, u_col, 1) call timer%stop("blas") call timer%stop("blas") endif endif ! (l_rows>0 .and. l_cols>0) ... ... @@ -693,12 +693,12 @@ if (tile_size < istep-1) then call elpa_reduce_add_vectors_& &MATH_DATATYPE& &_& &PRECISION & call elpa_reduce_add_vectors_& &MATH_DATATYPE& &_& &PRECISION & (u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), & mpi_comm_cols, istep-1, 1, nblk) mpi_comm_cols, istep-1, 1, nblk) endif ... ... @@ -717,11 +717,11 @@ endif call elpa_transpose_vectors_& &MATH_DATATYPE& &_& &PRECISION & &MATH_DATATYPE& &_& &PRECISION & (u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), & mpi_comm_rows, 1, istep-1, 1, nblk) mpi_comm_rows, 1, istep-1, 1, nblk) ! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v ) #if REALCASE == 1 ... ...
 ... ... @@ -140,7 +140,7 @@ subroutine elpa_reduce_add_vectors_& #endif do lc=1,nvc do i = n, nblks_tot-1, lcm_s_t k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride ns = (i/nps)*nblk ! local start of block i nl = min(nvr-i*nblk,nblk) ! length aux1(k+1:k+nl) = vmat_s(ns+1:ns+nl,lc) ... ... @@ -183,7 +183,7 @@ subroutine elpa_reduce_add_vectors_& #endif do lc=1,nvc do i = n, nblks_tot-1, lcm_s_t k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride k = (i - n)/lcm_s_t * nblk + (lc - 1) * auxstride ns = (i/npt)*nblk ! local start of block i nl = min(nvr-i*nblk,nblk) ! length vmat_t(ns+1:ns+nl,lc) = vmat_t(ns+1:ns+nl,lc) + aux2(k+1:k+nl) ... ...
 ... ... @@ -91,10 +91,10 @@ & , cudaMemcpyDeviceToHost) if (.not.(successCUDA)) then print *,"pack_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &: error in cudaMemcpy" &MATH_DATATYPE& &_gpu_& &PRECISION& &: error in cudaMemcpy" stop 1 endif !write(*,*) cudaGetErrorString(istat) ... ... @@ -139,16 +139,16 @@ successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * & size_of_& &PRECISION& &_& &MATH_DATATYPE& &, cudaMemcpyHostToDevice) &PRECISION& &_& &MATH_DATATYPE& &, cudaMemcpyHostToDevice) if (.not.(successCUDA)) then print *,"unpack_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& &: error in cudaMemcpy" &MATH_DATATYPE& &_gpu_& &PRECISION& &: error in cudaMemcpy" stop 1 endif !write(*,*) cudaGetErrorString(istat) ... ... @@ -199,10 +199,10 @@ if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then ! A flush and a reset must be performed call unpack_row_group_& &MATH_DATATYPE& &_gpu_& &PRECISION& (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &