#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
!> \brief Reduces a distributed symmetric matrix to tridiagonal form (like the ScaLAPACK routine PDSYTRD)
!>
! Parameters
!
!> \param na Order of matrix
!>
!> \param a_mat(lda,matrixCols) Distributed matrix which should be reduced.
!> Distribution is like in ScaLAPACK.
!> As opposed to PDSYTRD, a(:,:) must be set completely (upper and lower half);
!> a(:,:) is overwritten on exit with the Householder vectors
!>
!> \param lda Leading dimension of a
!>
!> \param nblk blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols local columns of matrix
!>
!> \param mpi_comm_rows MPI-Communicator for rows
!> \param mpi_comm_cols MPI-Communicator for columns
!>
!> \param d_vec(na) Diagonal elements (returned), identical on all processors
!>
!> \param e_vec(na) Off-Diagonal elements (returned), identical on all processors
!>
!> \param tau(na) Factors for the Householder vectors (returned), needed for back transformation
!>
!> \param useGPU If true, GPU version of the subroutine will be used
!>
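!>
!> A minimal usage sketch (illustrative, not from this file): for MATH_DATATYPE=real
!> and PRECISION=double this template expands to tridiag_real_double. Assuming row
!> and column communicators split from a 2D process grid, a call might read:
!>
!> call mpi_comm_split(mpi_comm_world, my_pcol, my_prow, mpi_comm_rows, mpierr)
!> call mpi_comm_split(mpi_comm_world, my_prow, my_pcol, mpi_comm_cols, mpierr)
!> call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, &
!> mpi_comm_cols, d, e, tau, useGPU=.false.)
!>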
subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, a_mat, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d_vec, e_vec, tau, useGPU)
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
implicit none
logical, parameter :: new_GPU_multiply = .true.
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU
#if REALCASE == 1
real(kind=REAL_DATATYPE), intent(out) :: tau(na)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), intent(out) :: tau(na)
#endif
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,matrixCols)
#endif
#endif
real(kind=REAL_DATATYPE), intent(out) :: d_vec(na), e_vec(na)
integer(kind=ik), parameter :: max_stored_uv = 32
! id in processor row and column and total numbers of processor rows and columns
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, my_rank
integer(kind=ik) :: mpierr
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
real(kind=rk8), parameter :: ZERO=0.0_rk8, ONE = 1.0_rk8
#else
real(kind=rk4), parameter :: ZERO=0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=ck8), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=ck4), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
integer(kind=ik) :: totalblocks, max_loc_block_rows, max_loc_block_cols, max_local_rows, &
max_local_cols
! updated after each istep (in the main cycle) to contain number of
! local columns and rows of the remaining part of the matrix
integer(kind=ik) :: l_cols, l_rows
integer(kind=ik) :: n_stored_vecs
integer(kind=C_intptr_T) :: a_dev, v_row_dev, v_col_dev, u_row_dev, u_col_dev, vu_stored_rows_dev, &
uv_stored_cols_dev
logical :: successCUDA
integer(kind=ik) :: istep, i, j, l_col_beg, l_col_end, l_row_beg, l_row_end
integer(kind=ik) :: tile_size, l_rows_per_tile, l_cols_per_tile
integer(kind=c_size_t) :: a_offset
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread, n_threads, max_threads, n_iter
integer(kind=ik) :: omp_get_thread_num, omp_get_num_threads, omp_get_max_threads
#endif
real(kind=REAL_DATATYPE) :: vnorm2
#if REALCASE == 1
real(kind=REAL_DATATYPE) :: vav, x, aux(2*max_stored_uv), aux1(2), aux2(2), vrl, xf
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE) :: vav, xc, aux(2*max_stored_uv), aux1(2), aux2(2), aux3(1), vrl, xf
#endif
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp(:), &
v_row(:), & ! used to store calculated Householder vector
v_col(:), & ! the same vector, but transposed - differently distributed among MPI tasks
u_row(:), &
u_col(:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:), v_row(:), v_col(:), u_row(:), u_col(:)
#endif
! the following two matrices store pairs of vectors v and u calculated in each step
! at most max_stored_uv vector pairs are stored, then the matrix A_i is explicitly updated
! u and v are stored both in row and column form
! pattern: v1,u1,v2,u2,v3,u3,....
! todo: It is a little confusing that variables with suffix _row actually store columns, and vice versa
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: vu_stored_rows(:,:)
! pattern: u1,v1,u2,v2,u3,v3,....
real(kind=REAL_DATATYPE), allocatable :: uv_stored_cols(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: vu_stored_rows(:,:), uv_stored_cols(:,:)
#endif
#ifdef WITH_OPENMP
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: ur_p(:,:), uc_p(:,:)
#endif
#endif
#if COMPLEXCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmp_real(:)
#endif
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=c_size_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_&
&MATH_DATATYPE
call timer%start("tridiag_&
&MATH_DATATYPE&
&" // &
PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call timer%stop("mpi_communication")
if((my_prow == 0) .and. (my_pcol == 0)) then
if( new_GPU_multiply) then
write(*,*) "mat vec multiply version 1"
else
write(*,*) "mat vec multiply version 0"
endif
endif
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! a tile is a square submatrix consisting of several blocks:
! it is the smallest possible square submatrix for which the blocks
! distributed among the processors are "aligned" in both rows and columns
! -----------------
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- ...
! | 1 4 | 1 4 | 1 4 | ...
! | 2 5 | 2 5 | 2 5 | ...
! | 3 6 | 3 6 | 3 6 | ...
! ----------------- .
! : : : : : : .
! : : : : : : .
!
! this is a tile; each number represents a block assigned to the processor with that number
! the size of each small block is nblk
! the picture shows the situation with 6 processors, in 3 processor rows and 2 processor columns
! tile_size is thus nblk * 6
!
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_per_tile = tile_size/np_rows ! local rows of a tile
l_cols_per_tile = tile_size/np_cols ! local cols of a tile
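! a worked example (illustrative numbers, not from this file): np_rows = 3, np_cols = 2,
! nblk = 64 give tile_size = 64 * lcm(3,2) = 384; the rounding step leaves 384 unchanged,
! since the local width 384/np_rows = 128 already meets the 128 target, so
! l_rows_per_tile = 128 and l_cols_per_tile = 192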
totalblocks = (na-1)/nblk + 1
max_loc_block_rows = (totalblocks-1)/np_rows + 1
max_loc_block_cols = (totalblocks-1)/np_cols + 1
! the locally owned submatrix has size at most max_local_rows x max_local_cols on each processor
max_local_rows = max_loc_block_rows*nblk
max_local_cols = max_loc_block_cols*nblk
! allocate memory for vectors
! todo: It is a little confusing that variables with suffix _row actually store columns, and vice versa
! todo: if something has length max_local_rows, it is actually a column, no?
! todo: probably one should read it as v_row = vector v distributed among rows
!
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "tmp", istat, errorMessage)
! allocate v_row one element longer, so that tau can be stored and broadcast together with it
allocate(v_row(max_local_rows+1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_row", istat, errorMessage)
allocate(u_row(max_local_rows), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_row", istat, errorMessage)
allocate(v_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "v_col", istat, errorMessage)
allocate(u_col(max_local_cols), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "u_col", istat, errorMessage)
#ifdef WITH_OPENMP
max_threads = omp_get_max_threads()
allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "ur_p", istat, errorMessage)
allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uc_p", istat, errorMessage)
#endif /* WITH_OPENMP */
tmp = 0
v_row = 0
u_row = 0
v_col = 0
u_col = 0
allocate(vu_stored_rows(max_local_rows,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "vu_stored_rows", istat, errorMessage)
allocate(uv_stored_cols(max_local_cols,2*max_stored_uv), stat=istat, errmsg=errorMessage)
call check_alloc("tridiag_&
&MATH_DATATYPE ", "uv_stored_cols", istat, errorMessage)
if (useGPU) then
successCUDA = cuda_malloc(v_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: v_row_dev", successCUDA)
successCUDA = cuda_malloc(u_row_dev, max_local_rows * size_of_datatype)
check_alloc_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_malloc(v_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_malloc(u_col_dev, max_local_cols * size_of_datatype)
check_alloc_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_malloc(vu_stored_rows_dev, max_local_rows * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_malloc(uv_stored_cols_dev, max_local_cols * 2 * max_stored_uv * size_of_datatype)
check_alloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
endif !useGPU
d_vec(:) = 0
e_vec(:) = 0
tau(:) = 0
n_stored_vecs = 0
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a_mat
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local cols of a_mat
if (my_prow == prow(na, nblk, np_rows) .and. my_pcol == pcol(na, nblk, np_cols)) &
d_vec(na) = a_mat(l_rows,l_cols)
if (useGPU) then
! allocate memory for matrix A on the device and then copy the matrix
successCUDA = cuda_malloc(a_dev, lda * matrixCols * size_of_datatype)
check_alloc_cuda("tridiag: a_dev", successCUDA)
successCUDA = cuda_memcpy(a_dev, loc(a_mat(1,1)), lda * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
check_alloc_cuda("tridiag: a_dev", successCUDA)
endif
! main cycle of tridiagonalization
! in each step, 1 Householder vector is calculated
do istep = na, 3 ,-1
! Calculate number of local rows and columns of the still remaining matrix
! on the local processor
l_rows = local_index(istep-1, my_prow, np_rows, nblk, -1)
l_cols = local_index(istep-1, my_pcol, np_cols, nblk, -1)
! Calculate vector for Householder transformation on all procs
! owning column istep
if (my_pcol == pcol(istep, nblk, np_cols)) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
! copy column number l_cols+1 of A to v_row
if (useGPU) then
a_offset = l_cols * lda * size_of_datatype
! we use v_row on the host at the moment! successCUDA = cuda_memcpy(v_row_dev, a_dev + a_offset, (l_rows)*size_of_PRECISION_real, cudaMemcpyDeviceToDevice)
successCUDA = cuda_memcpy(loc(v_row(1)), a_dev + a_offset, (l_rows)* size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag a_dev 1", successCUDA)
else
v_row(1:l_rows) = a_mat(1:l_rows,l_cols+1)
endif
if (n_stored_vecs > 0 .and. l_rows > 0) then
call timer%start("blas")
#if COMPLEXCASE == 1
aux(1:2*n_stored_vecs) = conjg(uv_stored_cols(l_cols+1,1:2*n_stored_vecs))
#endif
call PRECISION_GEMV('N', &
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
#if REALCASE == 1
uv_stored_cols(l_cols+1,1), ubound(uv_stored_cols,dim=1), &
#endif
#if COMPLEXCASE == 1
aux, 1, &
#endif
ONE, v_row, 1)
call timer%stop("blas")
endif
if(my_prow == prow(istep-1, nblk, np_rows)) then
aux1(1) = dot_product(v_row(1:l_rows-1),v_row(1:l_rows-1))
aux1(2) = v_row(l_rows)
else
aux1(1) = dot_product(v_row(1:l_rows),v_row(1:l_rows))
aux1(2) = 0.
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_allreduce(aux1, aux2, 2, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
aux2 = aux1
#endif /* WITH_MPI */
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
#if REALCASE == 1
call hh_transform_real_&
#endif
#if COMPLEXCASE == 1
call hh_transform_complex_&
#endif
&PRECISION &
(vrl, vnorm2, xf, tau(istep))
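! hh_transform returns tau(istep) and the scaling factor xf for the Householder
! reflector H = I - tau * v * v**T; on exit vrl holds the transformed value,
! i.e. the new off-diagonal element of the tridiagonal matrix (used below)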
! Scale v_row and store Householder vector for back transformation
v_row(1:l_rows) = v_row(1:l_rows) * xf
if (my_prow == prow(istep-1, nblk, np_rows)) then
v_row(l_rows) = 1.
! vrl is the newly computed off-diagonal element of the final tridiagonal matrix
e_vec(istep-1) = vrl
endif
! store Householder vector for back transformation
a_mat(1:l_rows,l_cols+1) = v_row(1:l_rows)
! add tau after the end of the actual v_row, to be broadcast with it
v_row(l_rows+1) = tau(istep)
endif !(my_pcol == pcol(istep, nblk, np_cols))
#ifdef WITH_MPI
! Broadcast the Householder vector (and tau) along columns
call MPI_Bcast(v_row, l_rows+1, MPI_MATH_DATATYPE_PRECISION, &
pcol(istep, nblk, np_cols), mpi_comm_cols, mpierr)
#endif /* WITH_MPI */
! recover tau, which has been broadcast together with v_row
tau(istep) = v_row(l_rows+1)
! Transpose Householder vector v_row -> v_col
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(v_row, ubound(v_row,dim=1), mpi_comm_rows, v_col, ubound(v_col,dim=1), mpi_comm_cols, &
1, istep-1, 1, nblk)
! Calculate u = (A + VU**T + UV**T)*v
! For cache efficiency, we use only the upper half of the matrix tiles for this,
! thus the result is partly in u_col(:) and partly in u_row(:)
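! note: a_mat is updated only implicitly (the stored vector pairs are applied
! in bulk once max_stored_uv pairs have accumulated), so the contribution of the
! not-yet-applied pairs to u is added explicitly further below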
u_col(1:l_cols) = 0
u_row(1:l_rows) = 0
if (l_rows > 0 .and. l_cols> 0 ) then
if(useGPU) then
successCUDA = cuda_memset(u_col_dev, 0, l_cols * size_of_datatype)
check_memcpy_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_memset(u_row_dev, 0, l_rows * size_of_datatype)
check_memcpy_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_memcpy(v_col_dev, loc(v_col(1)), l_cols * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_memcpy(v_row_dev, loc(v_row(1)), l_rows * size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
endif ! useGPU
#if REALCASE == 1
! if (useGPU) then
!u_col_dev(1:l_cols) = 0.
!u_row_dev(1:l_rows) = 0.
!v_col_dev(1:l_cols) = v_col(1:l_cols)
!v_row_dev(1:l_rows) = v_row(1:l_rows)
! do i=0,(istep-2)/tile_size
! l_col_beg = i*l_cols_per_tile+1
! l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
! if (l_col_end < l_col_beg) cycle
!write(*,*) "ubound ", ubound(a_mat,1), "lda", lda, "lcols", l_cols
! call PRECISION_SYMV('U', l_cols, &
! 1.d0, a_mat, ubound(a_mat,1), &
! v_row, 1, &
! 0.d0, u_col, 1)
!u_col(1:l_cols) = u_col_dev(1:l_cols)
!u_row(1:l_rows) = u_row_dev(1:l_rows)
! else !do not use GPU
#endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel")
!$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
my_thread = omp_get_thread_num()
n_threads = omp_get_num_threads()
n_iter = 0
uc_p(1:l_cols,my_thread) = 0.
ur_p(1:l_rows,my_thread) = 0.
#endif /* WITH_OPENMP */
do i= 0, (istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if (l_col_end < l_col_beg) cycle
do j = 0, i
l_row_beg = j*l_rows_per_tile+1
l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP
if (mod(n_iter,n_threads) == my_thread) then
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
v_row(l_row_beg), 1, ONE, uc_p(l_col_beg,my_thread), 1)
if (i/=j) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1, &
ONE, ur_p(l_row_beg,my_thread), 1)
endif
call timer%stop("blas")
endif
n_iter = n_iter+1
#else /* WITH_OPENMP */
if (useGPU) then
if(.not. new_GPU_multiply) then
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
call timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * &
size_of_datatype, 1, &
ONE, u_col_dev + (l_col_beg - 1) * &
size_of_datatype, 1)
if(i/=j) then
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1,l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * &
size_of_datatype, 1, &
ONE, u_row_dev + (l_row_beg - 1) * &
size_of_datatype, 1)
endif
call timer%stop("cublas")
endif
else ! useGPU
call timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg, l_col_beg), lda, &
v_row(l_row_beg), 1, &
ONE, u_col(l_col_beg), 1)
if (i/=j) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, &
v_col(l_col_beg), 1, &
ONE, u_row(l_row_beg), 1)
endif
call timer%stop("blas")
endif ! useGPU
#endif /* WITH_OPENMP */
enddo ! j=0,i
enddo ! i=0,(istep-2)/tile_size
#ifdef WITH_OPENMP
!$OMP END PARALLEL
call timer%stop("OpenMP parallel")
! merge the per-thread partial results
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
#endif /* WITH_OPENMP */
if (useGPU) then
if(new_GPU_multiply) then
do i=0,(istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
if (l_col_end < l_col_beg) cycle
! multiply the whole column stripe (all tiles on and above the diagonal) at once
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
call timer%start("cublas")
call cublas_PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_row_dev + (l_row_beg - 1) * size_of_datatype, 1, &
ONE, u_col_dev + (l_col_beg - 1) * size_of_datatype, 1)
call timer%stop("cublas")
enddo
do i=0,(istep-2)/tile_size
l_row_beg = i*l_rows_per_tile+1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
if (l_row_end < l_row_beg) cycle
! the row stripe covers the tiles strictly right of the diagonal tile
l_col_beg = (i+1)*l_cols_per_tile+1
l_col_end = l_cols
if (l_col_end < l_col_beg) cycle
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * size_of_datatype
call timer%start("cublas")
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype, 1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
call timer%stop("cublas")
enddo
endif ! new_GPU_multiply
! copy the partial results back to the host; the reductions below work on u_col, u_row
successCUDA = cuda_memcpy(loc(u_col(1)), u_col_dev, l_cols * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_col_dev 1", successCUDA)
successCUDA = cuda_memcpy(loc(u_row(1)), u_row_dev, l_rows * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
endif ! useGPU
if (n_stored_vecs > 0) then
call timer%start("blas")
#if REALCASE == 1
call PRECISION_GEMV('T', &
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMV('C', &
#endif
l_rows, 2*n_stored_vecs, &
ONE, vu_stored_rows, ubound(vu_stored_rows,dim=1), &
v_row, 1, ZERO, aux, 1)
call PRECISION_GEMV('N', l_cols, 2*n_stored_vecs, &
ONE, uv_stored_cols, ubound(uv_stored_cols,dim=1), &
aux, 1, ONE, u_col, 1)
call timer%stop("blas")
endif
endif ! (l_rows>0 .and. l_cols>0)
! Sum up all u_row(:) parts along rows and add them to the u_col(:) parts
! on the processors containing the diagonal
! This is only necessary if u_row has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (tile_size < istep-1) then
call elpa_reduce_add_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_row, ubound(u_row,dim=1), mpi_comm_rows, u_col, ubound(u_col,dim=1), &
mpi_comm_cols, istep-1, 1, nblk)
endif
! Sum up all the u_col(:) parts, transpose u_col -> u_row
if (l_cols>0) then
tmp(1:l_cols) = u_col(1:l_cols)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_allreduce(tmp, u_col, l_cols, MPI_MATH_DATATYPE_PRECISION, &
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
u_col = tmp
#endif /* WITH_MPI */
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk)
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
#if REALCASE == 1
x = 0
if (l_cols>0) &
x = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#if COMPLEXCASE == 1
xc = 0
if (l_cols>0) &
xc = dot_product(v_col(1:l_cols),u_col(1:l_cols))
#endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
#if REALCASE == 1
call mpi_allreduce(x, vav, 1, MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
call mpi_allreduce(xc, vav, 1 , MPI_MATH_DATATYPE_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
call timer%stop("mpi_communication")
#else /* WITH_MPI */
#if REALCASE == 1
vav = x
#endif
#if COMPLEXCASE == 1
vav = xc
#endif
#endif /* WITH_MPI */
! store u and v in the matrices U and V
! these matrices are stored combined in one array here
do j=1,l_rows
#if REALCASE == 1
vu_stored_rows(j,2*n_stored_vecs+1) = tau(istep)*v_row(j)
vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*tau(istep)*vav*v_row(j) - u_row(j)
#endif
#if COMPLEXCASE == 1
vu_stored_rows(j,2*n_stored_vecs+1) = conjg(tau(istep))*v_row(j)
vu_stored_rows(j,2*n_stored_vecs+2) = 0.5*conjg(tau(istep))*vav*v_row(j) - u_row(j)
#endif
enddo
do j=1,l_cols
#if REALCASE == 1
uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*tau(istep)*vav*v_col(j) - u_col(j)
uv_stored_cols(j,2*n_stored_vecs+2) = tau(istep)*v_col(j)
#endif
#if COMPLEXCASE == 1
uv_stored_cols(j,2*n_stored_vecs+1) = 0.5*conjg(tau(istep))*vav*v_col(j) - u_col(j)
uv_stored_cols(j,2*n_stored_vecs+2) = conjg(tau(istep))*v_col(j)
#endif
enddo
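! with u = A*v and vav = v**T * u, the stored pairs realize the classical rank-2
! Householder update A <- A - tau*v*u**T - tau*u*v**T + tau**2*vav*v*v**T, since
! (tau*v)*(0.5*tau*vav*v - u)**T + (0.5*tau*vav*v - u)*(tau*v)**T expands to exactly
! these three terms (complex case analogously, with conjg(tau))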
! We have calculated another Householder vector; the number of implicitly stored vector pairs has increased
n_stored_vecs = n_stored_vecs+1
! If the limit of max_stored_uv is reached, calculate A + VU**T + UV**T
if (n_stored_vecs == max_stored_uv .or. istep == 3) then
if (useGPU) then
successCUDA = cuda_memcpy(vu_stored_rows_dev, loc(vu_stored_rows(1,1)), &
max_local_rows * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_memcpy(uv_stored_cols_dev, loc(uv_stored_cols(1,1)), &
max_local_cols * 2 * max_stored_uv * &
size_of_datatype, cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: uv_stored_cols_dev", successCUDA)
endif
do i = 0, (istep-2)/tile_size
l_col_beg = i*l_cols_per_tile+1
l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
l_row_beg = 1
l_row_end = min(l_rows,(i+1)*l_rows_per_tile)
if (l_col_end < l_col_beg .or. l_row_end < l_row_beg) cycle
if (useGPU) then
call timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows_dev + (l_row_beg - 1) * size_of_datatype, max_local_rows, &
uv_stored_cols_dev + (l_col_beg - 1) * size_of_datatype, max_local_cols, &
ONE, a_dev + ((l_row_beg - 1) + (l_col_beg - 1) * lda) * size_of_datatype, lda)
call timer%stop("cublas")
else ! useGPU
call timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, &
l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, 2*n_stored_vecs, &
ONE, vu_stored_rows(l_row_beg,1), ubound(vu_stored_rows,dim=1), &
uv_stored_cols(l_col_beg,1), ubound(uv_stored_cols,dim=1), &
ONE, a_mat(l_row_beg,l_col_beg), lda)
call timer%stop("blas")
endif ! useGPU
enddo
n_stored_vecs = 0
endif ! (n_stored_vecs == max_stored_uv .or. istep == 3)
if (my_prow == prow(istep-1, nblk, np_rows) .and. my_pcol == pcol(istep-1, nblk, np_cols)) then
if (useGPU) then
! the diagonal element was updated on the device, fetch it to the host
a_offset = ((l_rows - 1) + lda * (l_cols - 1)) * size_of_datatype
successCUDA = cuda_memcpy(loc(a_mat(l_rows, l_cols)), a_dev + a_offset, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 3", successCUDA)
endif
if (n_stored_vecs > 0) then
a_mat(l_rows,l_cols) = a_mat(l_rows,l_cols) &
+ dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
end if
d_vec(istep-1) = a_mat(l_rows,l_cols)
if (useGPU) then
!a_dev(l_rows,l_cols) = a_mat(l_rows,l_cols)
!successCUDA = cuda_threadsynchronize()
!check_memcpy_cuda("tridiag: a_dev 4a5a", successCUDA)
successCUDA = cuda_memcpy(a_dev + a_offset, int(loc(a_mat(l_rows, l_cols)),kind=c_size_t), &
int(1 * size_of_datatype, kind=c_size_t), cudaMemcpyHostToDevice)
check_memcpy_cuda("tridiag: a_dev 4", successCUDA)
endif
endif
enddo ! main cycle over istep=na,3,-1
#if COMPLEXCASE == 1
! Store e_vec(1) and d_vec(1)
if (my_pcol==pcol(2, nblk, np_cols)) then
if (my_prow==prow(1, nblk, np_rows)) then
! We use last l_cols value of loop above
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 5", successCUDA)
vrl = aux3(1)
else !useGPU
vrl = a_mat(1,l_cols)
endif !useGPU
call hh_transform_complex_&
&PRECISION &
(vrl, CONST_REAL_0_0, xf, tau(2))
e_vec(1) = vrl
a_mat(1,l_cols) = 1. ! for consistency only
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, prow(1, nblk, np_rows), mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_bcast(tau(2), 1, MPI_COMPLEX_PRECISION, pcol(2, nblk, np_cols), mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
if (my_prow == prow(1, nblk, np_rows) .and. my_pcol == pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(aux3(1)), a_dev, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 6", successCUDA)
d_vec(1) = PRECISION_REAL(aux3(1))
else !useGPU
d_vec(1) = PRECISION_REAL(a_mat(1,1))
endif !useGPU
endif
#endif /* COMPLEXCASE == 1 */
#if REALCASE == 1
! Store e_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(2, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(e_vec(1)), a_dev + (lda * (l_cols - 1)) * size_of_datatype, &
1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 7", successCUDA)
else !useGPU
e_vec(1) = a_mat(1,l_cols) ! use last l_cols value of loop above
endif !useGPU
endif
! Store d_vec(1)
if (my_prow==prow(1, nblk, np_rows) .and. my_pcol==pcol(1, nblk, np_cols)) then
if(useGPU) then
successCUDA = cuda_memcpy(loc(d_vec(1)), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
endif !useGPU
endif
#endif
deallocate(tmp, v_row, u_row, v_col, u_col, vu_stored_rows, uv_stored_cols, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
#if REALCASE == 1
print *,"tridiag_real: error when deallocating work arrays "//errorMessage
#endif
#if COMPLEXCASE == 1
print *,"tridiag_complex: error when deallocating work arrays "//errorMessage
#endif
stop 1
endif
if (useGPU) then
! todo: should we leave a_mat on the device for further use?
successCUDA = cuda_free(a_dev)
check_dealloc_cuda("tridiag: a_dev 9", successCUDA)
successCUDA = cuda_free(v_row_dev)
check_dealloc_cuda("tridiag: v_row_dev", successCUDA)
successCUDA = cuda_free(u_row_dev)
check_dealloc_cuda("tridiag: u_row_dev", successCUDA)
successCUDA = cuda_free(v_col_dev)
check_dealloc_cuda("tridiag: v_col_dev", successCUDA)
successCUDA = cuda_free(u_col_dev)
check_dealloc_cuda("tridiag: u_col_dev", successCUDA)
successCUDA = cuda_free(vu_stored_rows_dev)
check_dealloc_cuda("tridiag: vu_stored_rows_dev", successCUDA)
successCUDA = cuda_free(uv_stored_cols_dev)
check_dealloc_cuda("tridiag: uv_stored_cols_dev", successCUDA)
endif
! distribute the arrays d_vec and e_vec to all processors
#if REALCASE == 1
allocate(tmp(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop 1
endif
#endif
#if COMPLEXCASE == 1
allocate(tmp_real(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when allocating tmp_real "//errorMessage
stop 1
endif
#endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
#if REALCASE == 1
tmp = d_vec
call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp = d_vec
call mpi_allreduce(tmp, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
tmp = e_vec
call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp = e_vec
call mpi_allreduce(tmp, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
#if COMPLEXCASE == 1
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp_real = d_vec
call mpi_allreduce(tmp_real, d_vec, na, MPI_REAL_PRECISION ,MPI_SUM, mpi_comm_cols, mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
tmp_real = e_vec
call mpi_allreduce(tmp_real, e_vec, na, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_cols, mpierr)
#endif
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
#if REALCASE == 1
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when deallocating tmp "//errorMessage
stop 1
endif
#endif
#if COMPLEXCASE == 1
deallocate(tmp_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_complex: error when deallocating tmp_real "//errorMessage
stop 1
endif
#endif
call timer%stop("tridiag_&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
)
end subroutine tridiag_&
&MATH_DATATYPE&
&_&
&PRECISION