! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.rzg.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of version 3 of the GNU Lesser General Public
! License as published by the Free Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA2 -- 2-stage solver for ELPA
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
module ELPA2
! Version 1.1.2, 2011-02-21
use elpa_utilities
USE ELPA1
use elpa2_utilities
use elpa_pdgeqrf
use iso_c_binding
#ifdef WITH_GPU_VERSION
! use cuda_routines
! use cuda_c_kernel
! use iso_c_binding
#endif
implicit none
PRIVATE ! By default, all routines contained are private
! The following routines are public:
public :: solve_evp_real_2stage
public :: solve_evp_complex_2stage
public :: bandred_real
public :: tridiag_band_real
public :: trans_ev_tridi_to_band_real
public :: trans_ev_band_to_full_real
public :: bandred_complex
public :: tridiag_band_complex
public :: trans_ev_tridi_to_band_complex
public :: trans_ev_band_to_full_complex
public :: band_band_real
public :: divide_band
integer, public :: which_qr_decomposition = 1 ! defines which QR-decomposition algorithm will be used
! 0 for unblocked
! 1 for blocked (maxrank: nblk)
!-------------------------------------------------------------------------------
! The following array contains the Householder vectors of the
! transformation band -> tridiagonal.
! It is allocated and set in tridiag_band_real and used in
! trans_ev_tridi_to_band_real.
! It must be deallocated by the user after trans_ev_tridi_to_band_real!
real*8, allocatable :: hh_trans_real(:,:)
complex*16, allocatable :: hh_trans_complex(:,:)
!-------------------------------------------------------------------------------
include 'mpif.h'
!******
contains
function solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, &
mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_REAL_ELPA_KERNEL_API,&
useQR) result(success)
!-------------------------------------------------------------------------------
! solve_evp_real_2stage: Solves the real eigenvalue problem with a 2-stage approach
!
! Parameters
!
! na Order of matrix a
!
! nev Number of eigenvalues needed
!
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
! The full matrix must be set (not only one half like in scalapack).
! Destroyed on exit (upper and lower half).
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
! Must be always dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
!
! ldq Leading dimension of q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm_all
! MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
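!
! Minimal calling sketch (comment only, not compiled; it assumes that
! mpi_comm_rows and mpi_comm_cols were created beforehand, e.g. with
! ELPA1's get_elpa_row_col_comms, and that the optional arguments
! THIS_REAL_ELPA_KERNEL_API and useQR are simply omitted):
!
!   success = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
!                                   matrixCols, mpi_comm_rows, mpi_comm_cols, &
!                                   mpi_comm_world)
!
!-------------------------------------------------------------------------------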
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use mod_check_for_gpu
implicit none
logical, intent(in), optional :: useQR
logical :: useQRActual, useQREnvironment
integer, intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
integer :: THIS_REAL_ELPA_KERNEL
integer, intent(in) :: na, nev, lda, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all
integer, intent(in) :: nblk
real*8, intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
integer :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: nbw, num_blocks
real*8, allocatable :: tmat(:,:,:), e(:)
real*8 :: ttt0, ttt1, ttts
integer :: i
logical :: success
logical, save :: firstCall = .true.
logical :: wantDebug
integer :: istat
character(200) :: errorMessage
logical :: useGPU
integer :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_real_2stage")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
success = .true.
useQRActual = .false.
useGPU = .false.
! set usage of qr decomposition via API call
if (present(useQR)) then
if (useQR) useQRActual = .true.
if (.not.(useQR)) useQRActual = .false.
endif
! overwrite this with environment variable settings
if (qr_decomposition_via_environment_variable(useQREnvironment)) then
useQRActual = useQREnvironment
endif
if (useQRActual) then
if (mod(na,nblk) .ne. 0) then
if (wantDebug) then
write(error_unit,*) "solve_evp_real_2stage: QR-decomposition: blocksize does not fit with matrixsize"
endif
print *, "Do not use QR-decomposition for this matrix and blocksize."
success = .false.
return
endif
endif
if (present(THIS_REAL_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
else
! if the kernel is not chosen via the API,
! check whether set by environment variable
THIS_REAL_ELPA_KERNEL = get_actual_real_kernel()
endif
! check whether the chosen kernel is allowed
if (check_allowed_real_kernels(THIS_REAL_ELPA_KERNEL)) then
if (my_pe == 0) then
write(error_unit,*) " "
write(error_unit,*) "The choosen kernel ",REAL_ELPA_KERNEL_NAMES(THIS_REAL_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:"
do i=1,size(REAL_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_REAL_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) REAL_ELPA_KERNEL_NAMES(i)
endif
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel REAL_ELPA_KERNEL_GENERIC will be used !"
endif
THIS_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
endif
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices)) then
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
stop
endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
if (useGPU) then
nbw = nblk
else
nbw = (31/nblk+1)*nblk
endif
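! Worked example for the CPU path (integer division): nblk=16 gives
! nbw = (31/16+1)*16 = 32; nblk=32 gives 32; nblk=64 gives 64, i.e. the
! smallest multiple of nblk that is >= 32.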
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when allocating tmat "//errorMessage
stop
endif
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
call bandred_real(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQRActual)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time bandred_real :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when allocating e "//errorMessage
stop
endif
ttt0 = MPI_Wtime()
call tridiag_band_real(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, &
mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_real :',ttt1-ttt0
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
! Solve tridiagonal system
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
deallocate(e, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating e "//errorMessage
stop
endif
! Backtransform stage 1
ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, mpi_comm_rows, &
mpi_comm_cols, wantDebug, useGPU, success, THIS_REAL_ELPA_KERNEL)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_tridi_to_band_real:',ttt1-ttt0
! We can now deallocate the stored householder vectors
deallocate(hh_trans_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating hh_trans_real "//errorMessage
stop
endif
! Backtransform stage 2
print *,"useGPU== ",useGPU
ttt0 = MPI_Wtime()
call trans_ev_band_to_full_real(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_band_to_full_real :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_real_2stage: error when deallocating tmat"//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_evp_real_2stage")
#endif
1 format(a,f10.3)
end function solve_evp_real_2stage
!-------------------------------------------------------------------------------
!-------------------------------------------------------------------------------
function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API) result(success)
!-------------------------------------------------------------------------------
! solve_evp_complex_2stage: Solves the complex eigenvalue problem with a 2-stage approach
!
! Parameters
!
! na Order of matrix a
!
! nev Number of eigenvalues needed
!
! a(lda,matrixCols) Distributed matrix for which eigenvalues are to be computed.
! Distribution is like in Scalapack.
! The full matrix must be set (not only one half like in scalapack).
! Destroyed on exit (upper and lower half).
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! ev(na) On output: eigenvalues of a, every processor gets the complete set
!
! q(ldq,matrixCols) On output: Eigenvectors of a
! Distribution is like in Scalapack.
! Must be always dimensioned to the full size (corresponding to (na,na))
! even if only a part of the eigenvalues is needed.
!
! ldq Leading dimension of q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm_all
! MPI-Communicator for the total processor set
!
!-------------------------------------------------------------------------------
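!
! The calling sequence mirrors solve_evp_real_2stage (see the usage sketch
! there); the only differences are that a and q are complex*16, ev stays
! real*8, and there is no QR option.
!
!-------------------------------------------------------------------------------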
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use mod_check_for_gpu
implicit none
integer, intent(in), optional :: THIS_COMPLEX_ELPA_KERNEL_API
integer :: THIS_COMPLEX_ELPA_KERNEL
integer, intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
complex*16, intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
real*8, intent(inout) :: ev(na)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr, my_pe, n_pes
integer :: l_cols, l_rows, l_cols_nev, nbw, num_blocks
complex*16, allocatable :: tmat(:,:,:)
real*8, allocatable :: q_real(:,:), e(:)
real*8 :: ttt0, ttt1, ttts
integer :: i
logical :: success, wantDebug
logical, save :: firstCall = .true.
integer :: istat
character(200) :: errorMessage
logical :: useGPU
integer :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_complex_2stage")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
useGPU = .false.
wantDebug = .false.
if (firstCall) then
! are debug messages desired?
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
success = .true.
if (present(THIS_COMPLEX_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_COMPLEX_ELPA_KERNEL = THIS_COMPLEX_ELPA_KERNEL_API
else
! if the kernel is not chosen via the API,
! check whether set by environment variable
THIS_COMPLEX_ELPA_KERNEL = get_actual_complex_kernel()
endif
! check whether the chosen kernel is allowed
if (check_allowed_complex_kernels(THIS_COMPLEX_ELPA_KERNEL)) then
if (my_pe == 0) then
write(error_unit,*) " "
write(error_unit,*) "The choosen kernel ",COMPLEX_ELPA_KERNEL_NAMES(THIS_COMPLEX_ELPA_KERNEL)
write(error_unit,*) "is not in the list of the allowed kernels!"
write(error_unit,*) " "
write(error_unit,*) "Allowed kernels are:"
do i=1,size(COMPLEX_ELPA_KERNEL_NAMES(:))
if (AVAILABLE_COMPLEX_ELPA_KERNELS(i) .ne. 0) then
write(error_unit,*) COMPLEX_ELPA_KERNEL_NAMES(i)
endif
enddo
write(error_unit,*) " "
write(error_unit,*) "The defaul kernel COMPLEX_ELPA_KERNEL_GENERIC will be used !"
endif
THIS_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
endif
if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe, numberOfGPUDevices)) then
useGPU=.true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
stop
endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
nbw = (31/nblk+1)*nblk
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating tmat"//errorMessage
stop
endif
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
call bandred_complex(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success)
if (.not.(success)) then
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_evp_complex_2stage")
#endif
return
endif
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time bandred_complex :',ttt1-ttt0
! Reduction band -> tridiagonal
allocate(e(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating e"//errorMessage
stop
endif
ttt0 = MPI_Wtime()
call tridiag_band_complex(na, nbw, nblk, a, lda, ev, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time tridiag_band_complex :',ttt1-ttt0
call mpi_bcast(ev,na,MPI_REAL8,0,mpi_comm_all,mpierr)
call mpi_bcast(e,na,MPI_REAL8,0,mpi_comm_all,mpierr)
ttt1 = MPI_Wtime()
time_evp_fwd = ttt1-ttts
l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev
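! The eigenvectors of the real tridiagonal matrix are real, so solve_tridi
! works on a real work array q_real; the result is copied into the complex
! array q after the tridiagonal solve.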
allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when allocating q_real"//errorMessage
stop
endif
! Solve tridiagonal system
ttt0 = MPI_Wtime()
call solve_tridi(na, nev, ev, e, q_real, ubound(q_real,dim=1), nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, success)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
time_evp_solve = ttt1-ttt0
ttts = ttt1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
deallocate(e, q_real, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating e, q_real"//errorMessage
stop
endif
! Backtransform stage 1
ttt0 = MPI_Wtime()
call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, &
matrixCols, mpi_comm_rows, mpi_comm_cols,&
wantDebug, useGPU, success,THIS_COMPLEX_ELPA_KERNEL)
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_tridi_to_band_complex:',ttt1-ttt0
! We can now deallocate the stored householder vectors
deallocate(hh_trans_complex, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating hh_trans_complex"//errorMessage
stop
endif
! Backtransform stage 2
ttt0 = MPI_Wtime()
call trans_ev_band_to_full_complex(na, nev, nblk, nbw, a, lda, tmat, q, ldq, matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU)
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time trans_ev_band_to_full_complex :',ttt1-ttt0
time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"solve_evp_complex_2stage: error when deallocating tmat "//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("solve_evp_complex_2stage")
#endif
1 format(a,f10.3)
end function solve_evp_complex_2stage
!-------------------------------------------------------------------------------
subroutine bandred_real(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, useGPU, success, useQR)
!-------------------------------------------------------------------------------
! bandred_real: Reduces a distributed symmetric matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! As opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi-bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
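! Outline: the matrix is processed in blocks of nbw columns. Each block
! column is factored into Householder vectors V and a triangular factor T
! (compact WY form I - V*T*V**T), and the reflection is then applied to the
! remaining matrix from both sides via the symmetric rank-2k update
! "A = A - V*U**T - U*V**T" further below.
!-------------------------------------------------------------------------------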
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
real*8 :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
logical, intent(in) :: useGPU
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: l_cols, l_rows
integer :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer :: istep, ncol, lch, lcx, nlc
integer :: tile_size, l_rows_tile, l_cols_tile
real*8 :: eps
real*8 :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
real*8, allocatable :: tmpCUDA(:), vmrCUDA(:), umcCUDA(:)
real*8, allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
real*8, allocatable :: vr(:)
! needed for blocked QR decomposition
integer :: PQRPARAM(11), work_size
real*8 :: dwork_size(1)
real*8, allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
integer, external :: numroc
integer :: ierr
integer :: cur_l_rows, cur_l_cols, vmr_size, umc_size
integer(kind=c_size_t) :: lc_start, lc_end
integer :: lr_end
integer :: na_rows, na_cols
logical, intent(in) :: wantDebug
logical, intent(out) :: success
logical, intent(in) :: useQR
logical :: successCUDA
integer :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_real")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
! Semi-bandwidth nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_real: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
if (useGPU) then
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
endif
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
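! Worked example: nblk=16, np_rows=4, np_cols=3 gives lcm(4,3)=12 and an
! initial tile_size of 192; rounding up so that local tiles are at least
! 128 wide yields tile_size=576, l_rows_tile=144, l_cols_tile=192.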
if (useQR) then
if (useGPU) then
print *,"qr decomposition at the moment not supported with GPU"
stop
endif
if (which_qr_decomposition == 1) then
call qr_pqrparam_init(pqrparam, nblk,'M',0, nblk,'M',0, nblk,'M',1,'s')
allocate(tauvector(na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tauvector "//errorMessage
stop
endif
allocate(blockheuristic(nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating blockheuristic "//errorMessage
stop
endif
l_rows = local_index(na, my_prow, np_rows, nblk, -1)
allocate(vmrCPU(max(l_rows,1),na), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCPU "//errorMessage
stop
endif
call qr_pdgeqrf_2dcomm(a, lda, vmrCPU, max(l_rows,1), tauvector, tmat(1,1,1), nbw, dwork_size(1), -1, na, &
nbw, nblk, nblk, na, na, 1, 0, PQRPARAM, mpi_comm_rows, mpi_comm_cols, blockheuristic)
work_size = dwork_size(1)
allocate(work_blocked(work_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating work_blocked "//errorMessage
stop
endif
work_blocked = 0.0d0
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating vmrCPU "//errorMessage
stop
endif
endif
endif ! useQr
if (useGPU) then
! Allocate device memory for a, tmat and vav; the matrix a is copied to the device below
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
cur_l_rows = 0
cur_l_cols = 0
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)), (lda)*(na_cols)*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
endif ! useGPU
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
if (useGPU) then
cur_l_rows = max(l_rows, 1)
cur_l_cols = max(l_cols, 1)
vmr_size = cur_l_rows * 2 * n_cols
umc_size = cur_l_cols * 2 * n_cols
! Allocate vmr and umc only if the new size exceeds their current capacity
! Added for FORTRAN CALLS
if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating vr "//errorMessage
stop
endif
endif
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vr "//errorMessage
stop
endif
endif
if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
if (allocated(vmrCUDA)) then
deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
stop
endif
successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cuda_free"
stop
endif
endif
allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
if (allocated(umcCUDA)) then
deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
stop
endif
successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
endif
allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
else ! GPU not used
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCPU "//errorMessage
stop
endif
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating umcCPU "//errorMessage
stop
endif
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vr "//errorMessage
stop
endif
endif ! use GPU
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = 0.
else
vmrCPU(1:l_rows,1:n_cols) = 0.
endif
vr(:) = 0
tmat(:,:,istep) = 0
if (useGPU) then
umcCUDA(1 : umc_size) = 0.
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
if(lc_start .le. 0) lc_start = 1
! Here we assume that the processor grid and the block grid are aligned
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if(my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*size_of_real_datatype, &
(a_dev + ((lc_start-1) * lda*size_of_real_datatype)), &
lda*size_of_real_datatype, lr_end*size_of_real_datatype, &
(lc_end - lc_start+1), cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy2d"
stop
endif
endif
endif ! useGPU
! Reduce current block to lower triangular form
if (useQR) then
if (which_qr_decomposition == 1) then
call qr_pdgeqrf_2dcomm(a, lda, vmrCPU, max(l_rows,1), tauvector(1), &
tmat(1,1,istep), nbw, work_blocked, &
work_size, na, n_cols, nblk, nblk, &
istep*nbw+n_cols-nbw, istep*nbw+n_cols, 1,&
0, PQRPARAM, mpi_comm_rows, mpi_comm_cols,&
blockheuristic)
endif
else
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if (nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call hh_transform_real(vrl, vnorm2, xf, tau)
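! (hh_transform_real returns tau and a scaling factor xf such that the
!  reflection I - tau*v*v**T, with v scaled so that v(lr)=1, annihilates
!  all but the last element of the column, which becomes vrl)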
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = 1.
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
call MPI_Bcast(vr,lr+1,MPI_REAL8,cur_pcol,mpi_comm_cols,mpierr)
if (useGPU) then
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
endif
tau = vr(lr+1)
tmat(lc,lc,istep) = tau ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
aux1 = 0
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
enddo
if (useGPU) then
! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*size_of_real_datatype)), &
lda*size_of_real_datatype, loc(a(1, lc_start)), &
lda*size_of_real_datatype, lr_end*size_of_real_datatype, &
(lc_end - lc_start+1),cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy2d"
stop
endif
endif
endif
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways; we use dsyrk
vav = 0
if (useGPU) then
if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCUDA,cur_l_rows,0.d0,vav,ubound(vav,dim=1))
else
if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCPU,ubound(vmrCPU,dim=1),0.d0,vav,ubound(vav,dim=1))
endif
call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
call dtrmv('U','T','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
tmat(lc,lc+1:n_cols,istep) = -tau * vav(lc+1:n_cols,lc)
endif
enddo
! Transpose vmr -> vmc (stored in umc, second half)
if (useGPU) then
call elpa_transpose_vectors_real (vmrCUDA, cur_l_rows, mpi_comm_rows, &
umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
else
call elpa_transpose_vectors_real (vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
endif
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! Opposed to direct tridiagonalization there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
if (useGPU) then
umcCUDA(1 : l_cols * n_cols) = 0.d0
vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = 0
else
umcCPU(1:l_cols,1:n_cols) = 0.d0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0
endif
if (l_cols>0 .and. l_rows>0) then
if (useGPU) then
successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
if (useGPU) then
call cublas_dgemm('T','N',lce-lcs+1,n_cols,lre, &
1.d0, (a_dev + ((lcs-1)*lda*size_of_real_datatype)), lda, vmr_dev,cur_l_rows, &
1.d0, (umc_dev+ (lcs-1)*size_of_real_datatype), cur_l_cols)
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call cublas_dgemm('N','N',lre,n_cols,lce-lcs+1, &
1.d0, (a_dev+((lcs-1)*lda*size_of_real_datatype)), lda, &
(umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_real_datatype), cur_l_cols, &
1.d0, (vmr_dev+(cur_l_rows * n_cols)*size_of_real_datatype), cur_l_rows)
else
call dgemm('T','N',lce-lcs+1,n_cols,lre,1.d0,a(1,lcs),ubound(a,dim=1), &
vmrCPU,ubound(vmrCPU,dim=1),1.d0,umcCPU(lcs,1),ubound(umcCPU,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call dgemm('N','N',lre,n_cols,lce-lcs+1,1.d0,a(1,lcs),lda, &
umcCPU(lcs,n_cols+1),ubound(umcCPU,dim=1),1.d0,vmrCPU(1,n_cols+1),ubound(vmrCPU,dim=1))
endif
enddo
if (useGPU) then
successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev, vmr_size*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
endif
endif ! l_cols>0 .and. l_rows>0
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (tile_size < istep*nbw) then
if (useGPU) then
call elpa_reduce_add_vectors_real(vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
umcCUDA, cur_l_cols, mpi_comm_cols, &
istep*nbw, n_cols, nblk)
else
call elpa_reduce_add_vectors_real(vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
endif
if (l_cols>0) then
if (useGPU) then
allocate(tmpCUDA(l_cols * n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tmpCUDA "//errorMessage
stop
endif
call mpi_allreduce(umcCUDA,tmpCUDA,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,ierr)
umcCUDA(1 : l_cols * n_cols) = tmpCUDA(1 : l_cols * n_cols)
else
allocate(tmpCPU(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating tmpCPU "//errorMessage
stop
endif
call mpi_allreduce(umcCPU,tmpCPU,l_cols*n_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
umcCPU(1:l_cols,1:n_cols) = tmpCPU(1:l_cols,1:n_cols)
endif
if (allocated(tmpCUDA)) then
deallocate(tmpCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating tmpCUDA "//errorMessage
stop
endif
endif
if (allocated(tmpCPU)) then
deallocate(tmpCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating tmpCPU "//errorMessage
stop
endif
endif
endif ! l_cols
! U = U * Tmat**T
if (useGPU) then
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call cublas_dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols, &
1.d0, tmat_dev,nbw,umc_dev,cur_l_cols)
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call cublas_dgemm('T','N',n_cols,n_cols,l_cols, &
1.d0, umc_dev,cur_l_cols,(umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, &
0.d0, vav_dev,nbw)
call cublas_dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols, &
1.d0, tmat_dev,nbw, vav_dev, nbw)
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev, nbw*nbw*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
call symm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev, loc(vav(1,1)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
else
call dtrmm('Right','Upper','Trans','Nonunit',l_cols,n_cols,1.d0,tmat(1,1,istep), &
ubound(tmat,dim=1),umcCPU,ubound(umcCPU,dim=1))
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
call dgemm('T','N',n_cols,n_cols,l_cols,1.d0,umcCPU,ubound(umcCPU,dim=1),umcCPU(1,n_cols+1), &
ubound(umcCPU,dim=1),0.d0,vav,ubound(vav,dim=1))
call dtrmm('Right','Upper','Trans','Nonunit',n_cols,n_cols,1.d0,tmat(1,1,istep), &
ubound(tmat,dim=1),vav,ubound(vav,dim=1))
call symm_matrix_allreduce(n_cols,vav,nbw,nbw,mpi_comm_cols)
endif
! U = U - 0.5 * V * VAV
if (useGPU) then
call cublas_dgemm('N','N',l_cols,n_cols,n_cols,&
-0.5d0, (umc_dev+(cur_l_cols * n_cols )*size_of_real_datatype),cur_l_cols, vav_dev,nbw,&
1.0d0, umc_dev,cur_l_cols)
successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_real (umcCUDA, cur_l_cols, mpi_comm_cols, &
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_real_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_real_datatype, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMemcpy"
stop
endif
else
call dgemm('N','N',l_cols,n_cols,n_cols,-0.5d0,umcCPU(1,n_cols+1),ubound(umcCPU,dim=1),vav, &
ubound(vav,dim=1),1.d0,umcCPU,ubound(umcCPU,dim=1))
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_real (umcCPU, ubound(umcCPU,dim=1), mpi_comm_cols, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
endif
! A = A - V*U**T - U*V**T
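! (U as computed above combines A*V*T**T with the -0.5*V*VAV correction,
!  so the two-sided transformation (I-V*T*V**T)**T * A * (I-V*T*V**T)
!  reduces to this single symmetric rank-2k update)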
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
if (useGPU) then
call cublas_dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
vmr_dev,cur_l_rows,(umc_dev+(lcs-1)*size_of_real_datatype),cur_l_cols, &
1.d0,(a_dev+(lcs-1)*lda*size_of_real_datatype),lda)
else
call dgemm('N','T',lre,lce-lcs+1,2*n_cols,-1.d0, &
vmrCPU,ubound(vmrCPU,dim=1),umcCPU(lcs,1),ubound(umcCPU,dim=1), &
1.d0,a(1,lcs),lda)
endif
enddo
! ... (the remainder of bandred_real and the head of trans_ev_band_to_full_real
!      are missing from this excerpt; the code below continues inside the useQR
!      branch of trans_ev_band_to_full_real, where the T matrix of the
!      compact-WY representation is assembled blockwise) ...
if (i > 1) then
call dgemm('T', 'N', t_rows, t_cols, l_rows, 1.d0, hvm(1,1), max_local_rows, hvm(1,(i-1)*nbw+1), &
max_local_rows, 0.d0, t_tmp, cwy_blocking)
call mpi_allreduce(t_tmp,t_tmp2,cwy_blocking*nbw,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
call dtrmm('L','U','N','N',t_rows,t_cols,1.0d0,tmat_complete,cwy_blocking,t_tmp2,cwy_blocking)
call dtrmm('R','U','N','N',t_rows,t_cols,-1.0d0,tmat_complete(t_rows+1,t_rows+1),cwy_blocking,t_tmp2,cwy_blocking)
tmat_complete(1:t_rows,t_rows+1:t_rows+t_cols) = t_tmp2(1:t_rows,1:t_cols)
endif
enddo
! Q = Q - V * T**T * V**T * Q
if (l_rows>0) then
call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), &
q,ldq,0.d0,tmp1,n_cols)
else
tmp1(1:l_cols*n_cols) = 0
endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
if (l_rows>0) then
call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat_complete,cwy_blocking,tmp2,n_cols)
call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), tmp2,n_cols,1.d0,q,ldq)
endif
enddo
else ! do not useQR
do istep=1,(na-1)/nbw
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
call MPI_Bcast(hvb(ns+1),nb-ns,MPI_REAL8,pcol(ncol, nblk, np_cols),mpi_comm_cols,mpierr)
ns = nb
endif
enddo
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*nbw+lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.
nb = nb+l_rows
enddo
if (useGPU) then
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
endif
l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
! Q = Q - V * T**T * V**T * Q
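! (done in three steps: tmp = V**T * Q via dgemm, tmp = T**T * tmp via
!  dtrmm, then the rank-n_cols update Q = Q - V * tmp via dgemm)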
if (l_rows>0) then
if (useGPU) then
call cublas_dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm_dev,max_local_rows, &
q_dev,ldq ,0.d0,tmp_dev,n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
else ! GPU not used
call dgemm('T','N',n_cols,l_cols,l_rows,1.d0,hvm,ubound(hvm,dim=1), &
q,ldq,0.d0,tmp1,n_cols)
endif
else
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_real_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error in cudaMemset"
! stop
! endif
!
!#else
tmp1(1:l_cols*n_cols) = 0
!#endif
endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_REAL8,MPI_SUM,mpi_comm_rows,mpierr)
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
if (l_rows>0) then
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
call cublas_dtrmm('L','U','T','N',n_cols,l_cols,1.0d0, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm_dev,max_local_rows, &
tmp_dev,n_cols,1.d0,q_dev,ldq)
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
else ! GPU is not used
call dtrmm('L','U','T','N',n_cols,l_cols,1.0d0,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols)
call dgemm('N','N',l_rows,l_cols,n_cols,-1.d0,hvm,ubound(hvm,dim=1), &
tmp2,n_cols,1.d0,q,ldq)
endif
endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_real_datatype),cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!
!#endif
enddo
endif ! useQR
deallocate(tmp1, tmp2, hvb, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating tmp1 tmp2 hvb "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hvm_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmp_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
successCUDA = cuda_memcpy(loc(q), q_dev, ldq*matrixCols*size_of_real_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
successCUDA = cuda_free(q_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaFree"
stop
endif
! deallocate(q_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when deallocating q_temp "//errorMessage
! stop
! endif
! deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_real: error when deallocating tmat_temp "//errorMessage
! stop
! endif
endif ! useGPU
deallocate(hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating hvm "//errorMessage
stop
endif
if (useQr) then
deallocate(tmat_complete, t_tmp, t_tmp2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_real: error when deallocating tmat_complete, t_tmp, t_tmp2 "//errorMessage
stop
endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_band_to_full_real")
#endif
end subroutine trans_ev_band_to_full_real
! --------------------------------------------------------------------------------------------------
subroutine tridiag_band_real(na, nb, nblk, a, lda, d, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm)
!-------------------------------------------------------------------------------
! tridiag_band_real:
! Reduces a real symmetric band matrix to tridiagonal form
!
! na Order of matrix a
!
! nb Semi-bandwidth
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
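! Outline: the band is distributed in contiguous blocks over all MPI tasks
! (see divide_band below). Each task eliminates its part of the band with
! Householder transformations and passes the resulting fill-in (bulge) on
! to the next task, so the tasks work as a pipeline, each one sweep behind
! its predecessor.
!-------------------------------------------------------------------------------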
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
real*8, intent(in) :: a(lda,matrixCols)
real*8, intent(out) :: d(na), e(na) ! set only on PE 0
real*8 :: vnorm2, hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
real*8 :: hd(nb), hs(nb)
integer :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
integer :: my_pe, n_pes, mpierr
integer :: my_prow, np_rows, my_pcol, np_cols
integer :: ireq_ab, ireq_hv
integer :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
integer :: max_threads, my_thread, my_block_s, my_block_e, iter
integer :: mpi_status(MPI_STATUS_SIZE)
integer, allocatable :: mpi_statuses(:,:), global_id_tmp(:,:)
integer, allocatable :: omp_block_limits(:)
real*8, allocatable :: hv_t(:,:), tau_t(:)
#endif
integer, allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
integer, allocatable :: limits(:), snd_limits(:,:)
integer, allocatable :: block_limits(:)
real*8, allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
! ! dummies for calling redist_band
! complex*16 :: c_a(1,1), c_ab(1,1)
#ifdef WITH_OPENMP
integer :: omp_get_max_threads
#endif
integer :: istat
character(200) :: errorMessage
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("tridiag_band_real")
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
! Get global_id mapping 2D processor coordinates to global id
allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id "//errorMessage
stop
endif
global_id(:,:) = 0
global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_OPENMP
allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id_tmp "//errorMessage
stop
endif
#endif
#ifndef WITH_OPENMP
call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
#else
global_id_tmp(:,:) = global_id(:,:)
call mpi_allreduce(global_id_tmp, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
deallocate(global_id_tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating global_id_tmp "//errorMessage
stop
endif
#endif
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
! Set work distribution
allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating block_limits"//errorMessage
stop
endif
call divide_band(nblocks_total, n_pes, block_limits)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
! allocate the part of the band matrix which is needed by this PE
! The size is 1 block larger than needed to avoid extensive shifts
allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ab"//errorMessage
stop
endif
ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
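! Storage sketch for ab (lower band, one array column per matrix column j):
!   ab(1,j)   = A(j,j)     (diagonal;    cf. d(istep) = ab(1,...) below)
!   ab(2,j)   = A(j+1,j)   (subdiagonal; cf. e(istep) = ab(2,...) below)
!   ab(1+i,j) = A(j+i,j)   for i up to the semi-bandwidth nb
! Diagonal blocks are addressed as ab(1,ns) and subdiagonal blocks as
! ab(nb+1,ns), using leading dimension 2*nb-1 in the BLAS calls below.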
! Redistribute band in a to ab
call redist_band_real(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating limits"//errorMessage
stop
endif
call determine_workload(na, nb, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
num_hh_vecs = 0
num_chunks = 0
nx = na
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
endif
nx = nx - nb
enddo
! Allocate space for HH vectors
allocate(hh_trans_real(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_trans_real"//errorMessage
stop
endif
! Allocate and init MPI requests
allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ireq_hhr"//errorMessage
stop
endif
allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating ireq_hhs"//errorMessage
stop
endif
num_hh_vecs = 0
num_chunks = 0
nx = na
nt = 0
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_chunks = num_chunks+1
call mpi_irecv(hh_trans_real(1,num_hh_vecs+1), nb*local_size, mpi_real8, nt, &
10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
num_hh_vecs = num_hh_vecs + local_size
endif
nx = nx - nb
if (n == block_limits(nt+1)) then
nt = nt + 1
endif
enddo
ireq_hhs(:) = MPI_REQUEST_NULL
! Buffers for gathering/sending the HH vectors
allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_gath"//errorMessage
stop
endif
allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_send"//errorMessage
stop
endif
hh_gath(:,:,:) = 0
hh_send(:,:,:) = 0
! Some counters
allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_cnt"//errorMessage
stop
endif
allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hh_dst"//errorMessage
stop
endif
hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
! Limits for sending
allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating snd_limits"//errorMessage
stop
endif
do iblk=1,nblocks
call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
enddo
#ifdef WITH_OPENMP
! OpenMP work distribution:
max_threads = 1
max_threads = omp_get_max_threads()
! For OpenMP we need at least 2 blocks for every thread
max_threads = MIN(max_threads, nblocks/2)
if (max_threads==0) max_threads = 1
allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating omp_block_limits"//errorMessage
stop
endif
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating hv_t, tau_t"//errorMessage
stop
endif
hv_t = 0
tau_t = 0
#endif /* WITH_OPENMP */
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first column to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
#ifdef WITH_OPENMP
do istep=1,na-1-block_limits(my_pe)*nb
#else
do istep=1,na-1
#endif
if (my_pe==0) then
n = MIN(na-na_s,nb) ! number of rows to be reduced
hv(:) = 0
tau = 0
! The last step (istep=na-1) is only needed for sending the last HH vectors.
! We don't want the sign of the last element flipped (analogous to the other sweeps)
if (istep < na-1) then
! Transform first column of remaining matrix
vnorm2 = sum(ab(3:n+1,na_s-n_off)**2)
call hh_transform_real(ab(2,na_s-n_off),vnorm2,hf,tau)
hv(1) = 1
hv(2:n) = ab(3:n+1,na_s-n_off)*hf
endif
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na-1) then
d(na) = ab(1,na_s+1-n_off)
e(na) = 0
endif
else
if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS,mpierr)
#else
call mpi_recv(hv,nb,mpi_real8,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr)
#endif
tau = hv(1)
hv(1) = 1.
endif
endif
na_s = na_s+1
if (na_s-n_off > nb) then
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
n_off = n_off + nb
endif
#ifdef WITH_OPENMP
if (max_threads > 1) then
! Codepath for OpenMP
! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
! This simulates the behaviour of the MPI tasks which also work after each other.
! The code would be considerably easier, if the MPI communication would be made within
! the parallel region - this is avoided here since this would require
! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
hv_t(:,1) = hv
tau_t(1) = tau
do iter = 1, 2
! iter=1 : work on first block
! iter=2 : work on remaining blocks
! This is done in 2 iterations so that we have a barrier in between:
! After the first iteration, it is guaranteed that the last row of the last block
! is completed by the next thread.
! After the first iteration it is also the place to exchange the last row
! with MPI calls
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if (iter == 1) then
my_block_s = omp_block_limits(my_thread-1) + 1
my_block_e = my_block_s
else
my_block_s = omp_block_limits(my_thread-1) + 2
my_block_e = omp_block_limits(my_thread)
endif
do iblk = my_block_s, my_block_e
ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
ne = ns+nb-1 ! last column in block
if (istep<my_thread .or. ns+n_off>na) exit
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1)
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1)
hv_t(:,my_thread) = 0
tau_t(my_thread) = 0
if (nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1)
if (nr>1) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation for first column
! (stored in hv_t(:,my_thread) and tau_t(my_thread))
vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
hv_t(1 ,my_thread) = 1.
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
call DGEMV('T',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,0.d0,h(2),1)
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_t(1:nr,my_thread)*h(i) - hs(1:nr)*hv(i)
enddo
else
! No new Householder transformation for nr=1, just complete the old one
ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
enddo
! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
hv_t(1,my_thread) = 1.
endif
enddo
enddo ! my_thread
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
if (iter==1) then
! We are at the end of the first block
! Send our first column to previous PE
if (my_pe>0 .and. na_s <= na) then
call mpi_wait(ireq_ab,mpi_status,mpierr)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
! Request last column from next PE
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne <= na) then
call mpi_recv(ab(1,ne-n_off),nb+1,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr)
endif
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
call mpi_wait(ireq_hv,mpi_status,mpierr)
hv_s(1) = tau_t(max_threads)
hv_s(2:) = hv_t(2:,max_threads)
call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
! "Send" HH vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
enddo
endif
enddo ! iter
else
! Codepath for 1 thread without OpenMP
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-1 ! last column in block
if (ns+n_off>na) exit
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
#ifndef WITH_OPENMP
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
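! Example: for na=10, nb=4 and first global column ns+n_off=9 we get
! nc = min(10-9+1,4) = 2 remaining columns in the diagonal block and
! nr = min(10-4-9+1,4) = -2 < 0, i.e. no subdiagonal block is left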
! Multiply diagonal block and subdiagonal block with Householder vector
if (iblk==nblocks .and. nc==nb) then
! We need the last column from the next PE.
! First do the matrix multiplications without last column ...
! Diagonal block, the contribution of the last element is added below!
ab(1,ne) = 0
call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1)
! Subdiagonal block
if (nr>0) call DGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1)
! ... then request last column ...
#ifdef WITH_OPENMP
call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS,mpierr)
#else
call mpi_recv(ab(1,ne),nb+1,mpi_real8,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr)
#endif
! ... and complete the result
hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
else
! Normal matrix multiply
call DSYMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,0.d0,hd,1)
if (nr>0) call DGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,0.d0,hs,1)
endif
! Calculate first column of subdiagonal block and calculate new
! Householder transformation for this column
hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
tau_new = 0
if (nr>0) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation ...
if (nr>1) then
vnorm2 = sum(ab(nb+2:nb+nr,ns)**2)
call hh_transform_real(ab(nb+1,ns),vnorm2,hf,tau_new)
hv_new(1) = 1.
hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0
endif
! ... and send it away immediately if this is the last block
if (iblk==nblocks) then
#ifdef WITH_OPENMP
call mpi_wait(ireq_hv,MPI_STATUS,mpierr)
#else
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#endif
hv_s(1) = tau_new
hv_s(2:) = hv_new(2:)
call mpi_isend(hv_s,nb,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
endif
! Transform diagonal block
x = dot_product(hv(1:nc),hd(1:nc))*tau
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
if (my_pe>0 .and. iblk==1) then
! The first column of the diagonal block has to be sent to the previous PE
! Calculate first column only ...
ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*hv(1) - hv(1:nc)*hd(1)
! ... send it away ...
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,MPI_STATUS,mpierr)
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#endif
ab_s(1:nb+1) = ab(1:nb+1,ns)
call mpi_isend(ab_s,nb+1,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
! ... and calculate remaining columns with rank-2 update
if (nc>1) call DSYR2('L',nc-1,-1.d0,hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1)
else
! No need to send, just a rank-2 update
call DSYR2('L',nc,-1.d0,hd,1,hv,1,ab(1,ns),2*nb-1)
endif
! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
if (nr>0) then
if (nr>1) then
call DGEMV('T',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,0.d0,h(2),1)
x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*h(i) - hs(1:nr)*hv(i)
enddo
else
! No double Householder transformation for nr=1, just complete the row
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*hv(i)
enddo
endif
endif
! Use new HH vector for the next block
hv(:) = hv_new(:)
tau = tau_new
enddo
#ifdef WITH_OPENMP
endif
do iblk = 1, nblocks
if (hh_dst(iblk) >= np_rows) exit
if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_real8, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
enddo
#endif
enddo
! Finish the last outstanding requests
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,MPI_STATUS,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS,mpierr)
allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating mpi_statuses"//errorMessage
stop
endif
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating mpi_statuses"//errorMessage
stop
endif
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
#endif
call mpi_barrier(mpi_comm,mpierr)
deallocate(ab, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating ab"//errorMessage
stop
endif
deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating ireq_hhr, ireq_hhs"//errorMessage
stop
endif
deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating hh_cnt, hh_dst"//errorMessage
stop
endif
deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating hh_gath, hh_send"//errorMessage
stop
endif
deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating limits, send_limits"//errorMessage
stop
endif
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when deallocating block_limits"//errorMessage
stop
endif
deallocate(global_id, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_real: error when allocating global_id"//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_band_real")
#endif
end subroutine tridiag_band_real
! --------------------------------------------------------------------------------------------------
subroutine trans_ev_tridi_to_band_real(na, nev, nblk, nbw, q, ldq, matrixCols, &
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
THIS_REAL_ELPA_KERNEL)
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_real:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
! nev Number of eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI communicators for process rows/columns
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
implicit none
integer, intent(in) :: THIS_REAL_ELPA_KERNEL
logical, intent(in) :: useGPU
integer, intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
real*8 :: q(ldq,matrixCols)
integer :: np_rows, my_prow, np_cols, my_pcol
integer :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer :: current_n, current_local_n, current_n_start, current_n_end
integer :: next_n, next_local_n, next_n_start, next_n_end
integer :: bottom_msg_length, top_msg_length, next_top_msg_length
integer :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer :: thread_width, csw, b_off, b_len
#endif
integer :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer :: a_off, current_tv_off, max_blk_size
integer :: mpierr, src, src_offset, dst, offset, nfact, num_blk
#ifdef WITH_OPENMP
integer :: mpi_status(MPI_STATUS_SIZE)
#endif
logical :: flag
#ifdef WITH_OPENMP
real*8, allocatable :: a(:,:,:,:), row(:)
#else
real*8, allocatable :: a(:,:,:), row(:)
#endif
real*8, allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
real*8, allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
real*8, allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
real*8, allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
real*8, allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
real*8, allocatable :: result_buffer(:,:,:)
real*8, allocatable :: bcast_buffer(:,:)
integer :: tmp
! real*8, allocatable, device :: a_dev(:,:,:)
! real*8, allocatable, device :: bcast_buffer_dev(:,:)
! real*8, allocatable, device :: row_dev(:)
! real*8, allocatable, device :: row_group_dev(:,:)
! real*8, allocatable, device :: hh_dot_dev(:)
! real*8, allocatable, device :: hh_tau_dev(:)
integer(kind=c_intptr_t) :: a_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer :: top, chunk, this_chunk
integer :: row_group_size, unpack_idx
integer :: n_off
integer, allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer, allocatable :: top_send_request(:), bottom_send_request(:)
integer, allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
integer, allocatable :: mpi_statuses(:,:)
#endif
! MPI send/recv tags, arbitrary
integer, parameter :: bottom_recv_tag = 111
integer, parameter :: top_recv_tag = 222
integer, parameter :: result_recv_tag = 333
! Just for measuring the kernel performance
real*8 :: kernel_time
integer*8 :: kernel_flops
#ifdef WITH_OPENMP
integer :: max_threads, my_thread
integer :: omp_get_max_threads
#endif
logical, intent(in) :: wantDebug
logical :: success
integer :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_tridi_to_band_real")
#endif
#ifdef WITH_GPU_VERSION
unpack_idx = 0
row_group_size = 0
#endif
success = .true.
kernel_time = 1.d-100
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
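! Since nbw is a multiple of nblk (checked above), every bandwidth slab of
! nbw rows consists of exactly nfact blocks of the cyclic distribution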
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary (L1) cache
if (.not.(useGPU)) then
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
stripe_width = 48 ! Must be a multiple of 4
#ifdef WITH_OPENMP
stripe_count = (thread_width-1)/stripe_width + 1
#else
stripe_count = (l_nev-1)/stripe_width + 1
#endif
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
stripe_width = (thread_width-1)/stripe_count + 1
#else
stripe_width = (l_nev-1)/stripe_count + 1
#endif
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
else ! GPUs are used
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
endif
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
endif
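! Example (CPU case without OpenMP): l_nev=1000 yields
! stripe_count = 999/48+1 = 21, stripe_width = 999/21+1 = 48 (already a
! multiple of 4) and last_stripe_width = 1000-20*48 = 40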
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating limits"//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
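! The nbw extra columns allow shifting the working window by up to nbw
! before the data must be copied back to the front (see "Shift the
! remaining rows" at the end of the sweep loop)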
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)*size_of_real_datatype
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(a_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
else ! GPUs are not used
!DEC$ ATTRIBUTES ALIGN: 64:: a
#ifdef WITH_OPENMP
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating a"//errorMessage
stop
endif
a(:,:,:) = 0
#endif
endif !useGPU
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating row"//errorMessage
stop
endif
row(:) = 0
if (useGPU) then
num = (l_nev)*size_of_real_datatype
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc "//errorMessage
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating row_group"//errorMessage
stop
endif
row_group(:, :) = 0
num = (l_nev*nblk)*size_of_real_datatype
! call cuda_malloc2d( row_group_dev,l_nev*size_of_real_datatype,nblk*size_of_real_datatype)
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"//errorMessage
stop
endif
endif ! useGPU
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
#ifdef WITH_OPENMP
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation!
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#endif /* WITH_OPENMP */
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(ip),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call unpack_row_real_cpu(row,i-limits(ip))
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (.not.(useGPU)) row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(ip),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
call unpack_row_real_cpu(row,i-limits(ip))
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been sent
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if (mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
call MPI_Send(row, l_nev, MPI_REAL8, dst, 0, mpi_comm_rows, mpierr)
endif
enddo
enddo
else if (my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
call MPI_Send(row, l_nev, MPI_REAL8, ip, 0, mpi_comm_rows, mpierr)
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(my_prow),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_real_gpu(i - limits(my_prow), .false.)
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call unpack_row_real_cpu(row,i-limits(my_prow))
endif
#endif /* WITH_OPENMP */
endif
enddo
endif
enddo
if (useGPU) then
! Force an unpacking of all remaining rows that haven't been unpacked yet
call unpack_and_prepare_row_group_real_gpu(-1, .true.)
successCUDA = cuda_devicesynchronize()
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaDeviceSynchronize"//errorMessage
stop
endif
endif
! Set up result buffer queue
num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
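! Example: na=1000, nblk=32, np_rows=4 gives 999/32+1 = 32 result blocks
! (numbers 0..31); distributed cyclically, every process row holds 8 of them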
num_result_buffers = 4*nfact
allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_buffer"//errorMessage
stop
endif
allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_send_request"//errorMessage
stop
endif
allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating result_recv_request"//errorMessage
stop
endif
result_send_request(:) = MPI_REQUEST_NULL
result_recv_request(:) = MPI_REQUEST_NULL
! Queue up buffers
if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
do j = 1, min(num_result_buffers, num_result_blocks)
call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(j), mpierr)
enddo
endif
num_bufs_recvd = 0 ! No buffers received yet
! Initialize top/bottom requests
allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_send_request"//errorMessage
stop
endif
allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_recv_request"//errorMessage
stop
endif
allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_send_request"//errorMessage
stop
endif
allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_recv_request"//errorMessage
stop
endif
top_send_request(:) = MPI_REQUEST_NULL
top_recv_request(:) = MPI_REQUEST_NULL
bottom_send_request(:) = MPI_REQUEST_NULL
bottom_recv_request(:) = MPI_REQUEST_NULL
#ifdef WITH_OPENMP
allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_buffer"//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage
stop
endif
top_border_send_buffer(:,:) = 0
top_border_recv_buffer(:,:) = 0
bottom_border_send_buffer(:,:) = 0
bottom_border_recv_buffer(:,:) = 0
! Initialize broadcast buffer
#else
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_bufer"//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_recv_buffer"//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_send_buffer"//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bottom_border_recv_buffer"//errorMessage
stop
endif
top_border_send_buffer(:,:,:) = 0
top_border_recv_buffer(:,:,:) = 0
bottom_border_send_buffer(:,:,:) = 0
bottom_border_recv_buffer(:,:,:) = 0
#endif /* WITH_OPENMP */
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating bcast_buffer"//errorMessage
stop
endif
bcast_buffer = 0
if (useGPU) then
num = ( nbw * max_blk_size) * size_of_real_datatype
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
num = (max_blk_size-1)*size_of_real_datatype
successCUDA = cuda_malloc( hh_dot_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_dot_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
num = (max_blk_size)*size_of_real_datatype
successCUDA = cuda_malloc( hh_tau_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_tau_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
endif ! useGPU
current_tv_off = 0 ! Offset of next row to be broadcast
! ------------------- start of work loop -------------------
a_off = 0 ! offset in A (to avoid unnecessary shifts)
top_msg_length = 0
bottom_msg_length = 0
do sweep = 0, (na-1)/nbw
current_n = na - sweep*nbw
call determine_workload(current_n, nbw, np_rows, limits)
current_n_start = limits(my_prow)
current_n_end = limits(my_prow+1)
current_local_n = current_n_end - current_n_start
next_n = max(current_n - nbw, 0)
call determine_workload(next_n, nbw, np_rows, limits)
next_n_start = limits(my_prow)
next_n_end = limits(my_prow+1)
next_local_n = next_n_end - next_n_start
if (next_n_end < next_n) then
bottom_msg_length = current_n_end - next_n_end
else
bottom_msg_length = 0
endif
if (next_local_n > 0) then
next_top_msg_length = current_n_start - next_n_start
else
next_top_msg_length = 0
endif
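! bottom_msg_length: rows at the lower border which must be sent to the
! next PE for its next sweep; next_top_msg_length: rows at the upper border
! which we will receive from the previous PE for our next sweep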
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
b_len = csw*nbw*max_threads
call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
enddo
endif
if (current_local_n > 1) then
if (my_pcol == mod(sweep,np_cols)) then
bcast_buffer(:,1:current_local_n) = hh_trans_real(:,current_tv_off+1:current_tv_off+current_local_n)
current_tv_off = current_tv_off + current_local_n
endif
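! All process columns need the same Householder vectors, so the owning
! column (mod(sweep,np_cols)) broadcasts its slab of hh_trans_real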
call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_REAL8, mod(sweep,np_cols), mpi_comm_cols, mpierr)
if (useGPU) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), &
nbw * current_local_n * size_of_real_datatype , cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
call extract_hh_tau_real_gpu(nbw, current_local_n, .false.)
call compute_hh_dot_products_real_gpu(nbw, current_local_n)
endif
else
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real
bcast_buffer(:,1) = 0
if (useGPU) then
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_real_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemset"
stop
endif
call extract_hh_tau_real_gpu(nbw, 1, .true.)
endif
endif
if (l_nev == 0) cycle
if (current_local_n > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
! Get the real stripe width for stripe i;
! the last OpenMP tasks may have an even smaller stripe width,
! but we don't care about this, i.e. we send/recv a bit too much in this case.
! csw: current_stripe_width
csw = min(stripe_width, thread_width-(i-1)*stripe_width)
#endif /* WITH_OPENMP */
!wait_b
if (current_n_end < current_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(bottom_recv_request(i), MPI_STATUS, mpierr)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
b_len = csw*nbw
b_off = (my_thread-1)*b_len
a(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
n_off = current_local_n+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *size_of_real_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset , loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw*size_of_real_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
endif
#endif /* WITH_OPENMP */
if (next_n_end < next_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
MPI_REAL8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif
endif
endif
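! If the strip is too small to overlap computation and communication,
! do all the work in one chunk; otherwise (else branch) compute the bottom
! part first so that it can be sent away as early as possible, then the
! interior, and finally the top part once its border data has arrived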
if (current_local_n <= bottom_msg_length + top_msg_length) then
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_recv_request(i), MPI_STATUS, mpierr)
#else /* WITH_OPENMP */
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
successCUDA = cuda_memcpy( a_dev+dev_offset , loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length*size_of_real_datatype , cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif ! useGPU
#endif /* WITH_OPENMP */
endif ! top_msg_length
!compute
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call compute_hh_trafo_real(0, current_local_n, i, my_thread, &
THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
call compute_hh_trafo_real(0, current_local_n, i, &
THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
endif
#else /* WITH_OPENMP */
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_real_datatype ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
endif
#endif /* WITH_OPENMP */
else
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
call compute_hh_trafo_real(current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
!send_b
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_REAL8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
endif
#else /* WITH_OPENMP */
call compute_hh_trafo_real(current_local_n - bottom_msg_length, bottom_msg_length, i, &
THIS_REAL_ELPA_KERNEL)
!send_b
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
if (useGPU) then
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width*bottom_msg_length*size_of_real_datatype ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error cudaMemcpy"
stop
endif
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_REAL8, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
endif
#endif /* WITH_OPENMP */
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call compute_hh_trafo_real(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
call compute_hh_trafo_real(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset , loc( top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length *size_of_real_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call compute_hh_trafo_real(0, top_msg_length, i, my_thread, THIS_REAL_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
call compute_hh_trafo_real(0, top_msg_length, i, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */
endif
if (next_top_msg_length > 0) then
!request top_border data
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
b_len = csw*next_top_msg_length*max_threads
call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_REAL8, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#else /* WITH_OPENMP */
call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_REAL8, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
!send_t
if (my_prow > 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_send_request(i), mpi_status, mpierr)
b_len = csw*nbw*max_threads
top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_REAL8, &
my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#else /* WITH_OPENMP */
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
if (useGPU) then
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_real_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, &
stripe_width*nbw*size_of_real_datatype ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaMemcpy"
stop
endif
else
top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i)
endif
call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_REAL8, my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
! Take care that there are not too many outstanding top_recv_requests
if (stripe_count > 1) then
if (i>1) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_recv_request(i-1), MPI_STATUS, mpierr)
#else
call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
#endif
else
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS, mpierr)
#else
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
#endif
endif
endif
enddo
top_msg_length = next_top_msg_length
else
! wait for last top_send_request
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(top_send_request(i), MPI_STATUS, mpierr)
#else
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
enddo
endif
! Care about the result
if (my_prow == 0) then
! topmost process sends nbw rows to destination processes
do j=0,nfact-1
num_blk = sweep*nfact+j ! global number of destination block, 0 based
if (num_blk*nblk >= na) exit
nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(result_send_request(nbuf), MPI_STATUS, mpierr)
#else
call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
dst = mod(num_blk, np_rows)
if (dst == 0) then
if (useGPU) then
row_group_size = min(na - num_blk*nblk, nblk)
call pack_row_group_real_gpu(row_group(:, :), j * nblk + a_off, row_group_size)
do i = 1, row_group_size
q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
enddo
else
do i = 1, min(na - num_blk*nblk, nblk)
call pack_row_real_cpu(row, j*nblk+i+a_off)
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif
else
if (useGPU) then
call pack_row_group_real_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else
do i = 1, nblk
call pack_row_real_cpu(result_buffer(:,i,nbuf),j*nblk+i+a_off)
enddo
endif
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, dst, &
result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
endif
enddo
else
! receive and store final result
do j = num_bufs_recvd, num_result_blocks-1
nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
! If there is still work to do, just test for the next result request
! and leave the loop if it is not ready, otherwise wait for all
! outstanding requests
if (next_local_n > 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS, mpierr)
#else
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
#endif
if (.not.flag) exit
else
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS, mpierr)
#else
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
endif
! Fill result buffer into q
num_blk = j*np_rows + my_prow ! global number of current block, 0 based
do i = 1, min(na - num_blk*nblk, nblk)
q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
enddo
! Queue result buffer again if there are outstanding blocks left
if (j+num_result_buffers < num_result_blocks) &
call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_REAL8, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(nbuf), mpierr)
enddo
num_bufs_recvd = j
endif
! Shift the remaining rows to the front of A (if necessary)
offset = nbw - top_msg_length
if (offset<0) then
if (wantDebug) write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_real: internal error, offset for shifting = ',offset
success = .false.
return
endif
a_off = a_off + offset
if (a_off + next_local_n + nbw > a_dim2) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
do i = 1, stripe_count
do j = top_msg_length+1, top_msg_length+next_local_n
A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread)
enddo
enddo
enddo
#else /* WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
chunk = min(next_local_n - 1, a_off)
do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
top = min(j + chunk, top_msg_length + next_local_n)
this_chunk = top - j + 1
dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_real_datatype
! It is wasteful to pass the value of the parameter
! "cudaMemcpyDeviceToDevice" on every call; this should be done ONCE at startup
! tmp = cuda_d2d(1)
successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev +dev_offset_1, &
stripe_width*this_chunk*size_of_real_datatype, cudaMemcpyDeviceToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error cudaMemcpy"
stop
endif
enddo
else ! not useGPU
do j = top_msg_length+1, top_msg_length+next_local_n
A(:,j,i) = A(:,j+a_off,i)
enddo
endif
enddo ! stripe_count
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#endif
a_off = 0
endif
enddo
! Just for safety:
if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
if (my_prow == 0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
endif
allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating mpi_statuses"//errorMessage
stop
endif
call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating mpi_statuses"//errorMessage
stop
endif
#else
call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
#endif
endif
if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating a "//errorMessage
stop
endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating row "//errorMessage
stop
endif
deallocate(limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating limits"//errorMessage
stop
endif
deallocate(result_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_send_request "//errorMessage
stop
endif
deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_recv_request "//errorMessage
stop
endif
deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_border_recv_buffer "//errorMessage
stop
endif
deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_send_buffer "//errorMessage
stop
endif
deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_border_recv_buffer "//errorMessage
stop
endif
deallocate(result_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating result_buffer "//errorMessage
stop
endif
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bcast_buffer "//errorMessage
stop
endif
deallocate(top_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_send_request "//errorMessage
stop
endif
deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating top_recv_request "//errorMessage
stop
endif
deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_send_request "//errorMessage
stop
endif
deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating bottom_recv_request "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hh_dot_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(hh_tau_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(row_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
deallocate(row_group, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when deallocating row_group "//errorMessage
stop
endif
successCUDA = cuda_free(row_group_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
successCUDA = cuda_free(bcast_buffer_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_real: error in cudaFree "//errorMessage
stop
endif
endif ! useGPU
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_tridi_to_band_real")
#endif
return
contains
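! Pack one row of the stripe-structured matrix a into the contiguous vector row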
subroutine pack_row_real_cpu(row, n)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
real*8 :: row(:)
integer :: n, i, noff, nl
#ifdef WITH_OPENMP
integer :: nt
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("pack_row_real_cpu")
#endif
#ifdef WITH_OPENMP
do nt = 1, max_threads
do i = 1, stripe_count
noff = (nt-1)*thread_width + (i-1)*stripe_width
nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
if (nl<=0) exit
row(noff+1:noff+nl) = a(1:nl,n,i,nt)
enddo
enddo
#else
do i=1,stripe_count
nl = merge(stripe_width, last_stripe_width, i<stripe_count)
noff = (i-1)*stripe_width
row(noff+1:noff+nl) = a(1:nl,n,i)
enddo
#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("pack_row_real_cpu")
#endif
end subroutine
! Pack a filled row group (i.e. an array of consecutive rows)
subroutine pack_row_group_real_gpu(rows, n_offset, row_count)
use cuda_c_kernel
implicit none
integer, intent(in) :: n_offset, row_count
real*8, intent(out) :: rows(:, :)
integer :: max_idx
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Use one kernel call to pack the entire row group
! call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)
call launch_my_pack_c_kernel_real(row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, &
l_nev, a_dev, row_group_dev)
! Issue one single transfer call for all rows (device to host)
! rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)
successCUDA = cuda_memcpy( loc(rows(:, 1: row_count)), row_group_dev , row_count * l_nev * size_of_real_datatype , &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"pack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
end subroutine
! Unpack a filled row group (i.e. an array of consecutive rows)
subroutine unpack_row_group_real_gpu(rows, n_offset, row_count)
use cuda_c_kernel
implicit none
integer, intent(in) :: n_offset, row_count
real*8, intent(in) :: rows(:, :)
integer :: max_idx
integer :: i
logical :: successCUDA
! Use many blocks for higher GPU occupancy
max_idx = (stripe_count - 1) * stripe_width + last_stripe_width
! Issue one single transfer call for all rows (host to device)
! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
!istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)),row_count * l_nev * size_of_real_datatype , &
! cudaMemcpyHostToDevice)
successCUDA = cuda_memcpy( row_group_dev , loc(rows(1, 1)),row_count * l_nev * size_of_real_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"unpack_row_group_real_gpu: error in cudaMemcpy"
stop
endif
!write(*,*) cudaGetErrorString(istat)
! Use one kernel call to pack the entire row group
! call my_unpack_kernel<<>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
call launch_my_unpack_c_kernel_real( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
row_group_dev,a_dev)
end subroutine
! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
! occurs when the queue is full or when the next row belongs to another group
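! (illustrative example: with nblk = 4, rows queued with consecutive indices
! are flushed in groups of four, while a jump in next_unpack_idx forces an
! immediate flush of the partially filled group)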
subroutine unpack_and_prepare_row_group_real_gpu(next_unpack_idx, force)
implicit none
integer, intent(in) :: next_unpack_idx
logical, intent(in) :: force
if (row_group_size == 0) then
! Nothing to flush, just prepare for the upcoming row
row_group_size = 1
else
if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
! A flush and a reset must be performed
call unpack_row_group_real_gpu(row_group(:, :), unpack_idx - row_group_size, row_group_size)
row_group_size = 1
else
! Just prepare for the upcoming row
row_group_size = row_group_size + 1
endif
endif
! Always update the index for the upcoming row
unpack_idx = next_unpack_idx
end subroutine
! The host wrapper for computing the dot products between consecutive HH reflectors (see the kernel below)
subroutine compute_hh_dot_products_real_gpu(nbw, n)
use cuda_c_kernel
implicit none
integer, value :: nbw, n
if (n .le. 1) return
call launch_compute_hh_dotp_c_kernel_real( bcast_buffer_dev, hh_dot_dev, nbw, n)
end subroutine
! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
subroutine extract_hh_tau_real_gpu(nbw, n, is_zero)
use cuda_c_kernel
implicit none
integer, value :: nbw, n
logical, value :: is_zero
integer :: val_is_zero
if (is_zero) then
val_is_zero = 1
else
val_is_zero = 0
endif
call launch_extract_hh_tau_c_kernel_real(bcast_buffer_dev,hh_tau_dev, nbw, n, val_is_zero)
end subroutine
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
! Reset a reduction block
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Reset 2 reduction blocks without an explicit synchronization at the end
! Limitation: the thread-block size must be a divisor of the reduction block's size
! Perform a reduction on an initialized, 128-element shared block
! Compute the dot-product between 2 consecutive HH vectors
! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
! Limitation 2: the size of the warp must be equal to 32
!
! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels
!
! -------------------------------------------
! Fortran back-transformation support kernels
! -------------------------------------------
!
! This is the simplest and slowest available backtransformation kernel
!
! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
! 2 Householder reflectors per iteration
!
! ---------------------------------
! Row packing and unpacking kernels
! ---------------------------------
!
! The row group packing kernel
! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits best performance)
! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel
#ifdef WITH_OPENMP
subroutine compute_hh_trafo_real_openmp(off, ncols, istripe, my_thread, THIS_REAL_ELPA_KERNEL)
#else
subroutine compute_hh_trafo_real(off, ncols, istripe, THIS_REAL_ELPA_KERNEL)
#endif
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
use real_generic_simple_kernel, only : double_hh_trafo_generic_simple
#endif
!#if defined(WITH_REAL_GENERIC_KERNEL)
! use real_generic_kernel, only : double_hh_trafo_generic
!#endif
#if defined(WITH_REAL_BGP_KERNEL)
use real_bgp_kernel, only : double_hh_trafo_bgp
#endif
#if defined(WITH_REAL_BGQ_KERNEL)
use real_bgq_kernel, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_c_kernel
implicit none
integer, intent(in) :: THIS_REAL_ELPA_KERNEL
! Private variables in OMP regions (my_thread) should better be in the argument list!
integer :: off, ncols, istripe
#ifdef WITH_OPENMP
integer :: my_thread, noff
#endif
integer :: j, nl, jj, jjj
real*8 :: w(nbw,6), ttt
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
if (ncols < 1) return
endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
call timer%start("compute_hh_trafo_real_openmp")
#else
call timer%start("compute_hh_trafo_real")
#endif
#endif
ttt = mpi_wtime()
#ifndef WITH_OPENMP
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(stripe_width, my_thread*thread_width-noff, l_nev-noff)
#endif
! ... (kernel dispatch on THIS_REAL_ELPA_KERNEL and the remainder of this routine elided) ...
end subroutine
end subroutine trans_ev_tridi_to_band_real
!-------------------------------------------------------------------------------
subroutine determine_workload(na, nb, nprocs, limits)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: na, nb, nprocs
integer, intent(out) :: limits(0:nprocs)
integer :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("determine_workload")
#endif
if (na <= 0) then
limits(:) = 0
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
return
endif
if (nb*nprocs > na) then
! there is not enough work for all
do i = 0, nprocs
limits(i) = min(na, i*nb)
enddo
else
do i = 0, nprocs
limits(i) = (i*na)/nprocs
enddo
endif
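! Illustrative example: na=10, nb=4, nprocs=4 gives nb*nprocs=16 > 10 and thus
! limits = (0,4,8,10,10); PE i owns rows limits(i)+1..limits(i+1), so PEs 0
! and 1 get full blocks of 4 rows, PE 2 gets 2 rows and PE 3 stays idle.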
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("determine_workload")
#endif
end subroutine
!-------------------------------------------------------------------------------
subroutine bandred_complex(na, a, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, tmat, wantDebug, &
useGPU, success)
!-------------------------------------------------------------------------------
! bandred_complex: Reduces a distributed hermitian matrix to band form
!
! Parameters
!
! na Order of matrix
!
! a(lda,matrixCols) Distributed matrix which should be reduced.
! Distribution is like in Scalapack.
! As opposed to Scalapack, a(:,:) must be set completely (upper and lower half)
! a(:,:) is overwritten on exit with the band and the Householder vectors
! in the upper half.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of output matrix
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
! tmat(nbw,nbw,numBlocks) where numBlocks = (na-1)/nbw + 1
! Factors for the Householder vectors (returned), needed for back transformation
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
complex*16 :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
integer :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer :: l_cols, l_rows
integer :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
integer :: istep, ncol, lch, lcx, nlc
integer :: tile_size, l_rows_tile, l_cols_tile
real*8 :: vnorm2
complex*16 :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
complex*16, allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
integer(kind=c_intptr_t):: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev
integer :: cur_l_rows, cur_l_cols,vmr_size ,umc_size
integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1
integer :: na_rows, na_cols
integer, external :: numroc
logical, intent(in) :: wantDebug
logical, intent(out) :: success
character(200) :: errorMessage
integer :: istat
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("bandred_complex")
#endif
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
success = .true.
! Semi bandwidth nbw must be a multiple of blocksize nblk
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_bandred_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_bandred_complex: ELPA2 works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
if (useGPU) then
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
! if (na_rows .ne. na_rows2) then
! print *,"bandred_complex: Why is na_rows not equal? ",na_rows,na_rows2
! endif
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
! if (na_cols .ne. na_cols2) then
! print *,"bandred_complex: Why is na_cols not equal? ",na_cols,na_cols2
! endif
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *, " bandred_complex: cuda malloc failed tmat_dev ", istat
stop
endif
successCUDA = cuda_malloc(vav_dev, nbw*nbw*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vav_dev ", istat
stop
endif
successCUDA = cuda_malloc(a_dev, lda*na_cols*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed a_dev ", istat
stop
endif
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
tile_size = nblk*least_common_multiple(np_rows,np_cols) ! minimum global tile size
tile_size = ((128*max(np_rows,np_cols)-1)/tile_size+1)*tile_size ! make local tiles at least 128 wide
l_rows_tile = tile_size/np_rows ! local rows of a tile
l_cols_tile = tile_size/np_cols ! local cols of a tile
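! Illustrative example: np_rows=4, np_cols=3, nblk=16 gives a minimum global
! tile size of 16*lcm(4,3) = 192, rounded up to ((128*4-1)/192+1)*192 = 576,
! so that each process row holds local tiles of width 576/4 = 144 >= 128.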
if (useGPU) then
if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
print *,"bandred_complex: sizes of a wrong ? ",lda,size(a,dim=1),na_cols,size(a,dim=2)
endif
successCUDA = cuda_memcpy(a_dev, loc(a(1,1)),(lda)*(na_cols)*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy faild a_dev ", istat
stop
endif
endif
do istep = (na-1)/nbw, 1, -1
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
! Number of local columns/rows of remaining matrix
l_cols = local_index(istep*nbw, my_pcol, np_cols, nblk, -1)
l_rows = local_index(istep*nbw, my_prow, np_rows, nblk, -1)
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
if (useGPU) then
cur_l_rows = max(l_rows, 1)
cur_l_cols = max(l_cols, 1)
vmr_size = cur_l_rows * 2 * n_cols
umc_size = cur_l_cols * 2 * n_cols
if ((.not. allocated(umc)) .or. (umc_size .gt. ubound(umc, dim=1))) then
if (allocated(umc)) then
deallocate(umc, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
if (max(l_cols,1) * 2*n_cols .gt. umc_size) then
print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size
endif
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed umc_dev ", istat
stop
endif
endif
if ((.not. allocated(vmr)) .or. (vmr_size .gt. ubound(vmr, dim=1))) then
if (allocated(vmr)) then
deallocate(vmr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating vmr "//errorMessage
stop
endif
successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmr "//errorMessage
stop
endif
if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then
print *,"bandred_complex: vmc_size ",max(l_rows,1) * 2*n_cols,vmr_size
endif
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vmr_dev ", istat
stop
endif
endif
if ((.not. allocated(vr)) .or. (l_rows + 1 .gt. ubound(vr, dim=1))) then
if (allocated(vr)) then
deallocate(vr, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating vr "//errorMessage
stop
endif
endif
allocate(vr(l_rows + 1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vr "//errorMessage
stop
endif
endif
else ! GPU not used
allocate(vmr(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmr "//errorMessage
stop
endif
allocate(umc(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umc "//errorMessage
stop
endif
allocate(vr(l_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vr "//errorMessage
stop
endif
endif ! useGPU
vmr(1:l_rows,1:n_cols) = 0.
vr(:) = 0
tmat(:,:,istep) = 0
if (useGPU) then
lc_start = local_index(istep*nbw+1, my_pcol, np_cols, nblk, -1)
lc_end = local_index(istep*nbw+n_cols, my_pcol, np_cols, nblk, -1)
lr_end = local_index((istep-1)*nbw + n_cols, my_prow, np_rows, nblk, -1)
if (lc_start .le. 0) lc_start = 1
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_complex_datatype,kind=c_size_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_complex_datatype),kind=c_size_t )), &
int(lda*size_of_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_complex_datatype,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_complex: error in cudaMemcpy2"
stop
endif
endif
endif
! Reduce current block to lower triangular form
do lc = n_cols, 1, -1
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! Absolute number of pivot row
lr = local_index(nrow, my_prow, np_rows, nblk, -1) ! current row length
lch = local_index(ncol, my_pcol, np_cols, nblk, -1) ! HV local column number
tau = 0
if(nrow == 1) exit ! Nothing to do
cur_pcol = pcol(ncol, nblk, np_cols) ! Processor column owning current block
if (my_pcol==cur_pcol) then
! Get vector to be transformed; distribute last element and norm of
! remaining elements to all procs in current column
vr(1:lr) = a(1:lr,lch) ! vector to be transformed
if (my_prow==prow(nrow, nblk, np_rows)) then
aux1(1) = dot_product(vr(1:lr-1),vr(1:lr-1))
aux1(2) = vr(lr)
else
aux1(1) = dot_product(vr(1:lr),vr(1:lr))
aux1(2) = 0.
endif
call mpi_allreduce(aux1,aux2,2,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
vnorm2 = aux2(1)
vrl = aux2(2)
! Householder transformation
call hh_transform_complex(vrl, vnorm2, xf, tau)
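! (hh_transform_complex determines tau and the scaling factor xf of the
! reflector H = I - tau*v*v**H with v(1) = 1; vrl is overwritten with the
! transformed pivot element that is stored back into a below)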
! Scale vr and store Householder vector for back transformation
vr(1:lr) = vr(1:lr) * xf
if (my_prow==prow(nrow, nblk, np_rows)) then
a(1:lr-1,lch) = vr(1:lr-1)
a(lr,lch) = vrl
vr(lr) = 1.
else
a(1:lr,lch) = vr(1:lr)
endif
endif
! Broadcast Householder vector and tau along columns
vr(lr+1) = tau
call MPI_Bcast(vr,lr+1,MPI_DOUBLE_COMPLEX,cur_pcol,mpi_comm_cols,mpierr)
vmr(1:lr,lc) = vr(1:lr)
tau = vr(lr+1)
tmat(lc,lc,istep) = conjg(tau) ! Store tau in diagonal of tmat
! Transform remaining columns in current block with Householder vector
! Local dot product
aux1 = 0
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
if (nlc>0) call mpi_allreduce(aux1,aux2,nlc,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
enddo
! Calculate scalar products of stored Householder vectors.
! This can be done in different ways, we use zherk
if (useGPU) then
cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_complex_datatype),kind=c_size_t)), &
int(lda*size_of_complex_datatype,kind=c_size_t), loc(a(1,lc_start)), &
int(lda*size_of_complex_datatype,kind=c_size_t), &
int(lr_end*size_of_complex_datatype,kind=c_size_t), int((lc_end - lc_start+1),kind=c_size_t) &
,int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy a_dev failed ", istat
stop
endif
endif
endif
vav = 0
if (l_rows>0) &
call zherk('U','C',n_cols,l_rows,CONE,vmr,ubound(vmr,dim=1),CZERO,vav,ubound(vav,dim=1))
call herm_matrix_allreduce(n_cols,vav, nbw,nbw,mpi_comm_rows)
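! vav now holds (the upper triangle of) V**H * V; from it the loop below
! builds the triangular factor T of the compact WY representation,
! i.e. H_1*H_2*...*H_ncols = I - V*T*V**H with the vectors V stored in vmr.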
! Calculate triangular matrix T for block Householder Transformation
do lc=n_cols,1,-1
tau = tmat(lc,lc,istep)
if (lc<n_cols) then
call ztrmv('U','C','N',n_cols-lc,tmat(lc+1,lc+1,istep),ubound(tmat,dim=1),vav(lc+1,lc),1)
tmat(lc,lc+1:n_cols,istep) = -conjg(tau)*conjg(vav(lc+1:n_cols,lc))
endif
enddo
! Transpose vmr -> vmc (stored in umc, second half)
call elpa_transpose_vectors_complex (vmr, ubound(vmr,dim=1), mpi_comm_rows, &
umc(1,n_cols+1), ubound(umc,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed
! Opposed to direct tridiagonalization there is no need to use the cache locality
! of the tiles, so we can use strips of the matrix
umc(1:l_cols,1:n_cols) = 0.d0
vmr(1:l_rows,n_cols+1:2*n_cols) = 0
if (l_cols>0 .and. l_rows>0) then
if (useGPU) then
if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
print *,"bandred_complex: vmr size 2 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
stop
endif
successCUDA = cuda_memcpy(vmr_dev, loc(vmr(1,1)),vmr_size*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
stop
endif
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 2 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
stop
endif
endif
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
if (useGPU) then
call cublas_zgemm('C','N',lce-lcs+1,n_cols,lre,CONE,(a_dev+((lcs-1)*lda*size_of_complex_datatype)),lda, &
vmr_dev,cur_l_rows,CONE,(umc_dev+(lcs-1)*size_of_complex_datatype),cur_l_cols)
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call cublas_zgemm('N','N',lre,n_cols,lce-lcs+1,CONE,(a_dev+((lcs-1)*lda*size_of_complex_datatype)),lda, &
(umc_dev+(cur_l_cols*n_cols+lcs-1)*size_of_complex_datatype),cur_l_cols, &
CONE,(vmr_dev+(cur_l_rows*n_cols)*size_of_complex_datatype),cur_l_rows)
else
call zgemm('C','N',lce-lcs+1,n_cols,lre,CONE,a(1,lcs),ubound(a,dim=1), &
vmr,ubound(vmr,dim=1),CONE,umc(lcs,1),ubound(umc,dim=1))
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call zgemm('N','N',lre,n_cols,lce-lcs+1,CONE,a(1,lcs),lda, &
umc(lcs,n_cols+1),ubound(umc,dim=1),CONE,vmr(1,n_cols+1),ubound(vmr,dim=1))
endif
enddo
if (useGPU) then
successCUDA = cuda_memcpy(loc(vmr(1,1)),vmr_dev,vmr_size*size_of_complex_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy vmr failed ", istat
stop
endif
successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_complex_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy umc failed ", istat
stop
endif
endif
! Sum up all ur(:) parts along rows and add them to the uc(:) parts
! on the processors containing the diagonal
! This is only necessary if ur has been calculated, i.e. if the
! global tile size is smaller than the global remaining matrix
if (tile_size < istep*nbw) then
call elpa_reduce_add_vectors_complex(vmr(1,n_cols+1),ubound(vmr,dim=1),mpi_comm_rows, &
umc, ubound(umc,dim=1), mpi_comm_cols, &
istep*nbw, n_cols, nblk)
endif
if (l_cols>0) then
allocate(tmp(l_cols,n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating tmp "//errorMessage
stop
endif
call mpi_allreduce(umc,tmp,l_cols*n_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
umc(1:l_cols,1:n_cols) = tmp(1:l_cols,1:n_cols)
deallocate(tmp, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating tmp "//errorMessage
stop
endif
endif
! U = U * Tmat**T
if (useGPU) then
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 4 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
successCUDA = cuda_memcpy(umc_dev, loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed umc_dev ", istat
stop
endif
successCUDA = cuda_memcpy(tmat_dev,loc(tmat(1,1,istep)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed tmat_dev ", istat
stop
endif
call cublas_ztrmm('Right','Upper','C','Nonunit',l_cols,n_cols,CONE,tmat_dev,nbw,umc_dev,cur_l_cols)
else ! not useGPU
call ztrmm('Right','Upper','C','Nonunit',l_cols,n_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),umc,ubound(umc,dim=1))
endif
! VAV = Tmat * V**T * A * V * Tmat**T = (U*Tmat**T)**T * V * Tmat**T
if (useGPU) then
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)), nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
call cublas_zgemm('C','N',n_cols,n_cols,l_cols,CONE,umc_dev,cur_l_cols,(umc_dev +( cur_l_cols *n_cols) &
*size_of_complex_datatype ), cur_l_cols,CZERO,vav_dev, nbw)
call cublas_ztrmm('Right','Upper','C','Nonunit',n_cols,n_cols,CONE,tmat_dev,nbw,vav_dev,nbw)
successCUDA = cuda_memcpy(loc(vav(1,1)), vav_dev,nbw*nbw*size_of_complex_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav ", istat
stop
endif
call herm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_cols)
successCUDA = cuda_memcpy(vav_dev,loc(vav(1,1)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed vav_dev ", istat
stop
endif
else
call zgemm('C','N',n_cols,n_cols,l_cols,CONE,umc,ubound(umc,dim=1),umc(1,n_cols+1), &
ubound(umc,dim=1),CZERO,vav,ubound(vav,dim=1))
call ztrmm('Right','Upper','C','Nonunit',n_cols,n_cols,CONE,tmat(1,1,istep), &
ubound(tmat,dim=1),vav,ubound(vav,dim=1))
call herm_matrix_allreduce(n_cols,vav,nbw,nbw,mpi_comm_cols)
endif
! U = U - 0.5 * V * VAV
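! (the -0.5*V*VAV correction ensures that the final rank-2k update
! A - V*U**H - U*V**H below equals the two-sided transformation
! (I - V*T*V**H)**H * A * (I - V*T*V**H))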
if (useGPU) then
call cublas_zgemm('N','N',l_cols,n_cols,n_cols,(-0.5d0,0.d0),(umc_dev + (cur_l_cols * n_cols )*size_of_complex_datatype), &
cur_l_cols,vav_dev, nbw,CONE,umc_dev,cur_l_cols)
! Transpose umc -> umr (stored in vmr, second half)
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 5 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
successCUDA = cuda_memcpy(loc(umc(1,1)),umc_dev,umc_size*size_of_complex_datatype,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuad memcpy failed umc ", istat
stop
endif
call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
if (size(vmr,dim=1)*size(vmr,dim=2) .gt. vmr_size) then
print *,"bandred_complex: vmr size 4 :",size(vmr,dim=1)*size(vmr,dim=2),vmr_size
stop
endif
successCUDA = cuda_memcpy(vmr_dev,loc(vmr(1,1)),vmr_size*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed vav_dev", istat
stop
endif
if (size(umc,dim=1)*size(umc,dim=2) .gt. umc_size) then
print *,"bandred_complex: umc size 6 :",size(umc,dim=1)*size(umc,dim=2),umc_size
stop
endif
successCUDA = cuda_memcpy(umc_dev,loc(umc(1,1)),umc_size*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy failed umc_dev ", istat
stop
endif
else ! not useGPU
call zgemm('N','N',l_cols,n_cols,n_cols,(-0.5d0,0.d0),umc(1,n_cols+1),ubound(umc,dim=1),vav,ubound(vav,dim=1), &
CONE,umc,ubound(umc,dim=1))
! Transpose umc -> umr (stored in vmr, second half)
call elpa_transpose_vectors_complex (umc, ubound(umc,dim=1), mpi_comm_cols, &
vmr(1,n_cols+1), ubound(vmr,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
endif
! A = A - V*U**T - U*V**T
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
lre = min(l_rows,(i+1)*l_rows_tile)
if (lce<lcs .or. lre<1) cycle
if (useGPU) then
call cublas_zgemm('N','N',lre,lce-lcs+1,2*n_cols,-CONE, &
vmr_dev,cur_l_rows,(umc_dev+(lcs-1)*size_of_complex_datatype),cur_l_cols, &
CONE,(a_dev+(lcs-1)*lda*size_of_complex_datatype),lda)
else
call zgemm('N','N',lre,lce-lcs+1,2*n_cols,-CONE, &
vmr,ubound(vmr,dim=1),umc(lcs,1),ubound(umc,dim=1), &
CONE,a(1,lcs),lda)
endif
enddo
! ... (end of bandred_complex, the herm_matrix_allreduce helper and the head
! of trans_ev_band_to_full_complex are elided here) ...
if (l_rows>0) then
if (useGPU) then
call cublas_zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm_dev,max_local_rows, &
q_dev,ldq,CZERO,tmp_dev,n_cols)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_complex_datatype, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
else
call zgemm('C','N',n_cols,l_cols,l_rows,CONE,hvm,ubound(hvm,dim=1), &
q,ldq,CZERO,tmp1,n_cols)
endif
else
if (useGPU) then
if (l_cols*n_cols .gt. (max_local_cols)*(nbw)) then
print *,"trans_ev_band_to_full_complex: tmp_dev ",l_cols*n_cols,max_local_cols
stop
endif
! istat = cuda_memset(tmp_dev, 0, l_cols*n_cols*size_of_complex_datatype)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemset"
! stop
! endif
endif
tmp1(1:l_cols*n_cols) = 0
endif
call mpi_allreduce(tmp1,tmp2,n_cols*l_cols,MPI_DOUBLE_COMPLEX,MPI_SUM,mpi_comm_rows,mpierr)
if (l_rows>0) then
if (useGPU) then
successCUDA = cuda_memcpy(tmp_dev,loc(tmp2),l_cols*n_cols*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
! tmat_temp(1:nbw,1:nbw) = tmat(1:nbw,1:nbw,istep)
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)),nbw*nbw*size_of_complex_datatype,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
call cublas_ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat_dev,nbw,tmp_dev,n_cols)
call cublas_zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm_dev, max_local_rows, &
tmp_dev,n_cols,CONE,q_dev,ldq)
else ! not useGPU
call ztrmm('L','U','C','N',n_cols,l_cols,CONE,tmat(1,1,istep),ubound(tmat,dim=1),tmp2,n_cols)
call zgemm('N','N',l_rows,l_cols,n_cols,-CONE,hvm,ubound(hvm,dim=1), &
tmp2,n_cols,CONE,q,ldq)
endif
endif
!#ifdef WITH_GPU_VERSION
! istat =cuda_memcpy(loc(hvm(1,1)),hvm_dev,((max_local_rows)*nbw*size_of_complex_datatype),cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
! stop
! endif
!#endif
enddo
deallocate(tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_complex: error when deallocating tmp1, tmp2, hvb, hvm "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(hvm_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmp_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
successCUDA = cuda_memcpy(loc(q), q_dev,ldq*matrixCols*size_of_complex_datatype, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
! q(1:ldq,1:na_cols) = q_temp(1:ldq,1:na_cols)
successCUDA = cuda_free(q_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaFree"
stop
endif
! deallocate(q_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"trans_ev_band_to_full_complex: error when deallocating q_temp "//errorMessage
! endif
!deallocate(tmat_temp, stat=istat, errmsg=errorMessage)
!if (istat .ne. 0) then
!print *,"trans_ev_band_to_full_complex: error when deallocating tmat_temp "//errorMessage
!endif
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_band_to_full_complex")
#endif
end subroutine trans_ev_band_to_full_complex
!---------------------------------------------------------------------------------------------------
subroutine tridiag_band_complex(na, nb, nblk, a, lda, d, e, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm)
!-------------------------------------------------------------------------------
! tridiag_band_complex:
! Reduces a complex Hermitian band matrix to tridiagonal form
!
! na Order of matrix a
!
! nb Semi bandwidth
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! a(lda,matrixCols) Distributed system matrix reduced to banded form in the upper diagonal
!
! lda Leading dimension of a
! matrixCols local columns of matrix a
!
! d(na) Diagonal of tridiagonal matrix, set only on PE 0 (output)
!
! e(na) Subdiagonal of tridiagonal matrix, set only on PE 0 (output)
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
! mpi_comm
! MPI-Communicator for the total processor set
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
complex*16, intent(in) :: a(lda,matrixCols)
real*8, intent(out) :: d(na), e(na) ! set only on PE 0
integer :: mpierr
real*8 :: vnorm2
complex*16 :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
complex*16 :: hd(nb), hs(nb)
!#ifdef WITH_GPU_VERSION
! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev
! complex*16, allocatable :: ab_temp(:,:)
!#endif
integer :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
integer :: my_pe, n_pes, mpier
integer :: my_prow, np_rows, my_pcol, np_cols
integer :: ireq_ab, ireq_hv
integer :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
integer, allocatable :: mpi_statuses(:,:)
integer, allocatable :: omp_block_limits(:)
integer :: max_threads, my_thread, my_block_s, my_block_e, iter
integer :: omp_get_max_threads
integer :: mpi_status(MPI_STATUS_SIZE)
complex*16, allocatable :: hv_t(:,:), tau_t(:)
#endif
integer, allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
integer, allocatable :: limits(:), snd_limits(:,:)
integer, allocatable :: block_limits(:)
complex*16, allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
integer :: istat
character(200) :: errorMessage
! ! dummies for calling redist_band
! real*8 :: r_a(1,1), r_ab(1,1)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("tridiag_band_complex")
#endif
call mpi_comm_rank(mpi_comm,my_pe,mpierr)
call mpi_comm_size(mpi_comm,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
!#ifdef WITH_GPU_VERSION
! t_1 = 0
! t_2 = 0
!#endif
! Get global_id, mapping 2D processor coordinates to the global id
allocate(global_id(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating global_id "//errorMessage
stop
endif
global_id(:,:) = 0
global_id(my_prow, my_pcol) = my_pe
call mpi_allreduce(mpi_in_place, global_id, np_rows*np_cols, mpi_integer, mpi_sum, mpi_comm, mpierr)
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
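! (e.g. na=1000, nb=32 gives nblocks_total = 999/32 + 1 = 32 blocks)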
! Set work distribution
allocate(block_limits(0:n_pes), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating block_limits "//errorMessage
stop
endif
call divide_band(nblocks_total, n_pes, block_limits)
! nblocks: the number of blocks for my task
nblocks = block_limits(my_pe+1) - block_limits(my_pe)
! allocate the part of the band matrix which is needed by this PE
! The size is 1 block larger than needed to avoid extensive shifts
allocate(ab(2*nb,(nblocks+1)*nb), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ab "//errorMessage
stop
endif
!#ifdef WITH_GPU_VERSION
! allocate(ab_temp(2*nb,nblocks*nb), stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when allocating ab_temp "//errorMessage
! stop
! endif
!#endif
ab = 0 ! needed for lower half, the extra block should also be set to 0 for safety
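! ab uses columnwise lower band storage: ab(1,j) holds the diagonal element
! of global column j+n_off, ab(2,j) the first subdiagonal and so on; the
! leading dimension 2*nb leaves room for the fill-in of the bulge chasing.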
!#ifdef WITH_GPU_VERSION
!
! istat = cuda_malloc(ab_dev, 2*nb*(nblocks+1)*nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed ab_dev", istat
!
! istat = cuda_malloc(hv_new_dev, nb*size_of_complex_datatype )
! if (istat .ne. 0) print *, " cuda malloc failed hv_new_dev", istat
!
!! istat = cuda_malloc(temp_c_dev, nb*nb*size_of_complex_datatype )
!! if(istat .ne. 0) print *, " cuda malloc failed temp_c", istat
!
! istat = cuda_malloc(h_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed h_dev", istat
!
! istat = cuda_malloc(hs_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hs_dev", istat
!
! istat = cuda_malloc(x_dev , 1*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed x_dev", istat
!
! istat = cuda_malloc( tau_new_dev , 1*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed tau_new_dev", istat
!
! istat = cuda_malloc(hv_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hv_dev", istat
!
! istat = cuda_malloc(hd_dev , nb*size_of_complex_datatype)
! if (istat .ne. 0) print *, " cuda malloc failed hd_dev", istat
!#endif
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
! Redistribute band in a to ab
call redist_band_complex(a, lda, na, nblk, nb, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm, ab)
! Calculate the workload for each sweep in the back transformation
! and the space requirements to hold the HH vectors
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nb, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
num_hh_vecs = 0
num_chunks = 0
nx = na
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
! add to number of householder vectors
! please note: for nx==1 the one and only HH vector is 0 and is neither calculated nor sent below!
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_hh_vecs = num_hh_vecs + local_size
num_chunks = num_chunks+1
endif
nx = nx - nb
enddo
! Allocate space for HH vectors
allocate(hh_trans_complex(nb,num_hh_vecs), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_trans_comples "//errorMessage
stop
endif
! Allocate and init MPI requests
allocate(ireq_hhr(num_chunks), stat=istat, errmsg=errorMessage) ! Recv requests
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ireq_hhr "//errorMessage
stop
endif
allocate(ireq_hhs(nblocks), stat=istat, errmsg=errorMessage) ! Send requests
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating ireq_hhs "//errorMessage
stop
endif
num_hh_vecs = 0
num_chunks = 0
nx = na
nt = 0
do n = 1, nblocks_total
call determine_workload(nx, nb, np_rows, limits)
local_size = limits(my_prow+1) - limits(my_prow)
if (mod(n-1,np_cols) == my_pcol .and. local_size>0 .and. nx>1) then
num_chunks = num_chunks+1
call mpi_irecv(hh_trans_complex(1,num_hh_vecs+1), nb*local_size, MPI_COMPLEX16, nt, &
10+n-block_limits(nt), mpi_comm, ireq_hhr(num_chunks), mpierr)
num_hh_vecs = num_hh_vecs + local_size
endif
nx = nx - nb
if (n == block_limits(nt+1)) then
nt = nt + 1
endif
enddo
ireq_hhs(:) = MPI_REQUEST_NULL
! Buffers for gathering/sending the HH vectors
allocate(hh_gath(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! gathers HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_gath "//errorMessage
stop
endif
allocate(hh_send(nb,max_blk_size,nblocks), stat=istat, errmsg=errorMessage) ! send buffer for HH vectors
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_sebd "//errorMessage
stop
endif
hh_gath(:,:,:) = 0
hh_send(:,:,:) = 0
! Some counters
allocate(hh_cnt(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_cnt "//errorMessage
stop
endif
allocate(hh_dst(nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hh_dst "//errorMessage
stop
endif
hh_cnt(:) = 1 ! The first transformation vector is always 0 and not calculated at all
hh_dst(:) = 0 ! PE number for receive
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
! Limits for sending
allocate(snd_limits(0:np_rows,nblocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating snd_limits "//errorMessage
stop
endif
do iblk=1,nblocks
call determine_workload(na-(iblk+block_limits(my_pe)-1)*nb, nb, np_rows, snd_limits(:,iblk))
enddo
#ifdef WITH_OPENMP
! OpenMP work distribution:
max_threads = 1
!$ max_threads = omp_get_max_threads()
! For OpenMP we need at least 2 blocks for every thread
max_threads = MIN(max_threads, nblocks/2)
if (max_threads==0) max_threads = 1
allocate(omp_block_limits(0:max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating omp_block_limits "//errorMessage
stop
endif
! Get the OpenMP block limits
call divide_band(nblocks, max_threads, omp_block_limits)
allocate(hv_t(nb,max_threads), tau_t(max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating hv_t, tau_t "//errorMessage
stop
endif
hv_t = 0
tau_t = 0
#endif
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first column to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
#ifdef WITH_OPENMP
do istep=1,na-1-block_limits(my_pe)*nb
#else
do istep=1,na-1
#endif
if (my_pe==0) then
n = MIN(na-na_s,nb) ! number of rows to be reduced
hv(:) = 0
tau = 0
! Transform first column of remaining matrix
! Opposed to the real case, the last step (istep=na-1) is needed here for making
! the last subdiagonal element a real number
vnorm2 = sum(dble(ab(3:n+1,na_s-n_off))**2+dimag(ab(3:n+1,na_s-n_off))**2)
if (n<2) vnorm2 = 0. ! Safety only
call hh_transform_complex(ab(2,na_s-n_off),vnorm2,hf,tau)
hv(1) = 1
hv(2:n) = ab(3:n+1,na_s-n_off)*hf
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na-1) then
d(na) = ab(1,na_s+1-n_off)
e(na) = 0
endif
else
if (na>na_s) then
! Receive Householder vector from previous task, from PE owning subdiagonal
#ifdef WITH_OPENMP
call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,mpi_status,mpierr)
#else
call mpi_recv(hv,nb,MPI_COMPLEX16,my_pe-1,2,mpi_comm,MPI_STATUS_IGNORE,mpierr)
#endif
tau = hv(1)
hv(1) = 1.
endif
endif
na_s = na_s+1
if (na_s-n_off > nb) then
!#ifdef WITH_GPU_VERSION
! ab_temp(:,1:nblocks*nb) = ab(:,nb+1:(nblocks +1)*nb)
! ab(:, 1:nblocks*nb) = ab_temp(:, 1:nblocks*nb)
!#else
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
!#endif
ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
n_off = n_off + nb
endif
#ifdef WITH_OPENMP
if (max_threads > 1) then
! Codepath for OpenMP
! Please note that in this case it is absolutely necessary to have at least 2 blocks per thread!
! Every thread is one reduction cycle behind its predecessor and thus starts one step later.
! This simulates the behaviour of the MPI tasks which also work after each other.
! The code would be considerably easier, if the MPI communication would be made within
! the parallel region - this is avoided here since this would require
! MPI_Init_thread(MPI_THREAD_MULTIPLE) at the start of the program.
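! Illustrative example: with max_threads=3 and nblocks=6, divide_band yields
! omp_block_limits = (0,2,4,6); iter 1 processes blocks 1, 3 and 5 (one per
! thread), iter 2 the remaining blocks 2, 4 and 6, and the "-my_thread+1"
! offset in ns below keeps each thread exactly one column behind its predecessor.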
hv_t(:,1) = hv
tau_t(1) = tau
do iter = 1, 2
! iter=1 : work on first block
! iter=2 : work on remaining blocks
! This is done in 2 iterations so that we have a barrier in between:
! After the first iteration, it is guaranteed that the last row of the last block
! is completed by the next thread.
! After the first iteration it is also the place to exchange the last row
! with MPI calls
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if (iter == 1) then
my_block_s = omp_block_limits(my_thread-1) + 1
my_block_e = my_block_s
else
my_block_s = omp_block_limits(my_thread-1) + 2
my_block_e = omp_block_limits(my_thread)
endif
do iblk = my_block_s, my_block_e
ns = na_s + (iblk-1)*nb - n_off - my_thread + 1 ! first column in block
ne = ns+nb-1 ! last column in block
if (istep<my_thread .or. ns+n_off>na) exit
hv = hv_t(:,my_thread)
tau = tau_t(my_thread)
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Transform diagonal block
call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1)
x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1)
hv_t(:,my_thread) = 0
tau_t(my_thread) = 0
if (nr<=0) cycle ! No subdiagonal block present any more
! Transform subdiagonal block
call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1)
if (nr>1) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation for first column
! (stored in hv_t(:,my_thread) and tau_t(my_thread))
vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_t(my_thread))
hv_t(1 ,my_thread) = 1.
hv_t(2:nr,my_thread) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0
! update subdiagonal block for old and new Householder transformation
! This way we can use a nonsymmetric rank 2 update which is (hopefully) faster
call ZGEMV('C',nr,nb-1,tau_t(my_thread),ab(nb,ns+1),2*nb-1,hv_t(1,my_thread),1,(0.d0,0.d0),h(2),1)
x = dot_product(hs(1:nr),hv_t(1:nr,my_thread))*tau_t(my_thread)
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update ("DGER2")
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) &
- hv_t(1:nr,my_thread)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
enddo
else
! No new Householder transformation for nr=1, just complete the old one
ab(nb+1,ns) = ab(nb+1,ns) - hs(1) ! Note: hv(1) == 1
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
enddo
! For safety: there is one remaining dummy transformation (but tau is 0 anyways)
hv_t(1,my_thread) = 1.
endif
enddo
enddo ! my_thread
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
if (iter==1) then
! We are at the end of the first block
! Send our first column to previous PE
if (my_pe>0 .and. na_s <= na) then
call mpi_wait(ireq_ab,mpi_status,mpierr)
ab_s(1:nb+1) = ab(1:nb+1,na_s-n_off)
call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
! Request last column from next PE
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne <= na) then
call mpi_recv(ab(1,ne-n_off),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr)
endif
else
! We are at the end of all blocks
! Send last HH vector and TAU to next PE if it has been calculated above
ne = na_s + nblocks*nb - (max_threads-1) - 1
if (istep>=max_threads .and. ne < na) then
call mpi_wait(ireq_hv,mpi_status,mpierr)
hv_s(1) = tau_t(max_threads)
hv_s(2:) = hv_t(2:,max_threads)
call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
! "Send" HH vector and TAU to next OpenMP thread
do my_thread = max_threads, 2, -1
hv_t(:,my_thread) = hv_t(:,my_thread-1)
tau_t(my_thread) = tau_t(my_thread-1)
enddo
endif
enddo ! iter
else
! Codepath for 1 thread without OpenMP
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* WITH_OPENMP */
!#ifdef WITH_GPU_VERSION
! call cpu_time(start)
!#endif
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-1 ! last column in block
if (ns+n_off>na) exit
! Store Householder vector for back transformation
hh_cnt(iblk) = hh_cnt(iblk) + 1
hh_gath(1 ,hh_cnt(iblk),iblk) = tau
hh_gath(2:nb,hh_cnt(iblk),iblk) = hv(2:nb)
#ifndef WITH_OPENMP
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), MPI_STATUS_IGNORE, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), MPI_COMPLEX16, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
! The following code is structured in a way to keep waiting times for
! other PEs at a minimum, especially if there is only one block.
! For this reason, it requests the last column as late as possible
! and sends the Householder vector and the first column as early
! as possible.
#endif /* OpenMP */
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
! Multiply diagonal block and subdiagonal block with Householder vector
if (iblk==nblocks .and. nc==nb) then
! We need the last column from the next PE.
! First do the matrix multiplications without last column ...
! Diagonal block, the contribution of the last element is added below!
ab(1,ne) = 0
call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1)
! Subdiagonal block
if (nr>0) call ZGEMV('N',nr,nb-1,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1)
! ... then request last column ...
#ifdef WITH_OPENMP
call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,mpi_status,mpierr)
#else
call mpi_recv(ab(1,ne),nb+1,MPI_COMPLEX16,my_pe+1,1,mpi_comm,MPI_STATUS_IGNORE,mpierr)
#endif
! ... and complete the result
hs(1:nr) = hs(1:nr) + ab(2:nr+1,ne)*tau*hv(nb)
hd(nb) = hd(nb) + ab(1,ne)*hv(nb)*tau
else
! Normal matrix multiply
call ZHEMV('L',nc,tau,ab(1,ns),2*nb-1,hv,1,(0.d0,0.d0),hd,1)
if (nr>0) call ZGEMV('N',nr,nb,tau,ab(nb+1,ns),2*nb-1,hv,1,(0.d0,0.d0),hs,1)
endif
! Calculate first column of subdiagonal block and calculate new
! Householder transformation for this column
hv_new(:) = 0 ! Needed, last rows must be 0 for nr < nb
!#ifdef WITH_GPU_VERSION
! istat = cuda_memset(hv_new_dev, 0,nb*size_of_complex_datatype )
! if (istat .ne. 0) print *, " cuda memset failed hv_new_dev", istat
!#endif
tau_new = 0
if (nr>0) then
! complete (old) Householder transformation for first column
ab(nb+1:nb+nr,ns) = ab(nb+1:nb+nr,ns) - hs(1:nr) ! Note: hv(1) == 1
! calculate new Householder transformation ...
if (nr>1) then
vnorm2 = sum(dble(ab(nb+2:nb+nr,ns))**2+dimag(ab(nb+2:nb+nr,ns))**2)
call hh_transform_complex(ab(nb+1,ns),vnorm2,hf,tau_new)
hv_new(1) = 1.
hv_new(2:nr) = ab(nb+2:nb+nr,ns)*hf
ab(nb+2:,ns) = 0
endif
! ... and send it away immediately if this is the last block
if (iblk==nblocks) then
#ifdef WITH_OPENMP
call mpi_wait(ireq_hv,mpi_status,mpierr)
#else
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
#endif
hv_s(1) = tau_new
hv_s(2:) = hv_new(2:)
call mpi_isend(hv_s,nb,MPI_COMPLEX16,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
endif
! Transform diagonal block
x = dot_product(hv(1:nc),hd(1:nc))*conjg(tau)
hd(1:nc) = hd(1:nc) - 0.5*x*hv(1:nc)
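! With hd = tau*A*hv and x = conjg(tau)*hv**H*hd, subtracting 0.5*x*hv from
! hd makes the rank-2 update A - hd*hv**H - hv*hd**H (the ZHER2 calls below)
! equal to the two-sided reflector update H*A*H**H on the diagonal block.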
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy2d((ab_dev + (ns-1)*2*nb*size_of_complex_datatype), 2*nb*size_of_complex_datatype,loc(a(1,ns)), 2*nb*size_of_complex_datatype, 2*size_of_complex_datatype , &
! 2*nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *, "cuda memcpy a_dev H2D failed ", istat
! istat =cuda_memcpy(hv_dev,loc(hv),nc*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hv_dev", istat
! istat =cuda_memcpy(hd_dev,loc(hd), nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
!#endif
if (my_pe>0 .and. iblk==1) then
! The first column of the diagonal block has to be send to the previous PE
! Calculate first column only ...
!#ifdef WITH_GPU_VERSION
! call double_hh_transform_2( ns, nc, nb )
! istat=cuda_memcpy(loc(ab),ab_dev,(2*nb*(nblocks+1)*nb)*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed ab ", istat
!#else
ab(1:nc,ns) = ab(1:nc,ns) - hd(1:nc)*conjg(hv(1)) - hv(1:nc)*conjg(hd(1))
!#endif
! ... send it away ...
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,mpi_status,mpierr)
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
#endif
ab_s(1:nb+1) = ab(1:nb+1,ns)
call mpi_isend(ab_s,nb+1,MPI_COMPLEX16,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
! ... and calculate remaining columns with rank-2 update
if (nc>1) then
!#ifdef WITH_GPU_VERSION
! call cublas_ZHER2( 'L',nc -1,(-1.d0,0.d0), hd_dev + 1*16, 1, hv_dev +1*16, 1 , ab_dev + (ns*2*nb )*16, 2*nb-1)
!#else
call ZHER2('L',nc-1,(-1.d0,0.d0),hd(2),1,hv(2),1,ab(1,ns+1),2*nb-1)
!#endif
endif
else
! No need to send, just a rank-2 update
!#ifdef WITH_GPU_VERSION
! call cublas_ZHER2( 'L',nc ,(-1.d0,0.d0), hd_dev, 1, hv_dev, 1 , ab_dev + ((ns-1)*2*nb )*16, 2*nb-1)
!#else
call ZHER2('L',nc,(-1.d0,0.d0),hd,1,hv,1,ab(1,ns),2*nb-1)
!#endif
endif
!#ifdef WITH_GPU_VERSION
! istat=cuda_memcpy( loc(hd),hd_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *,"cuda memcpy failed hd_dev", istat
!#endif
! Do the remaining double Householder transformation on the subdiagonal block cols 2 ... nb
!#ifdef WITH_GPU_VERSION
! istat =cuda_memcpy(hs_dev,loc(hs),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hs_dev", istat
!#endif
if (nr>0) then
if (nr>1) then
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(hv_new_dev,loc(hv_new),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed hv_new_dev", istat
!
! istat = cuda_memcpy(h_dev,loc(h),nb*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed h_dev", istat
!
! call cublas_ZGEMV('C',nr,nb-1,tau_new,ab_dev + (nb-1 + ns *2*nb)*16,2*nb-1,hv_new_dev,1,(0.d0,0.d0),h_dev + 1* 16,1)
!
! istat = cuda_memcpy(tau_new_dev,loc(tau_new),1*size_of_complex_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) print *,"cuda memcpy failed tau_new_dev", istat
!
! call dot_product_kernel(nr , tau_new)
! call dot_product_kernel_1( nb, nr , ns)
!
! istat = cuda_memcpy(loc(x),x_dev,1*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed x_dev ", istat
!
! istat =cuda_memcpy(loc(h),h_dev,nb*size_of_complex_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) print *, " cuda memcpy failed h ", istat
!#else
call ZGEMV('C',nr,nb-1,tau_new,ab(nb,ns+1),2*nb-1,hv_new,1,(0.d0,0.d0),h(2),1)
x = dot_product(hs(1:nr),hv_new(1:nr))*tau_new
h(2:nb) = h(2:nb) - x*hv(2:nb)
! Unfortunately there is no BLAS routine like DSYR2 for a nonsymmetric rank 2 update
do i=2,nb
ab(2+nb-i:1+nb+nr-i,i+ns-1) = ab(2+nb-i:1+nb+nr-i,i+ns-1) - hv_new(1:nr)*conjg(h(i)) - hs(1:nr)*conjg(hv(i))
enddo
!#endif
else
! No double Householder transformation for nr=1, just complete the row
!#ifdef WITH_GPU_VERSION
! call double_hh_transform_1(nb, ns)
!#else
do i=2,nb
ab(2+nb-i,i+ns-1) = ab(2+nb-i,i+ns-1) - hs(1)*conjg(hv(i))
enddo
!#endif
endif
endif
! Use new HH vector for the next block
hv(:) = hv_new(:)
tau = tau_new
enddo
!#ifdef WITH_GPU_VERSION
! call cpu_time(finish)
! tstep2 = finish-start
! t_2 = t_2 + tstep2
!#endif
#ifdef WITH_OPENMP
endif
#endif
#ifdef WITH_OPENMP
do iblk = 1, nblocks
if (hh_dst(iblk) >= np_rows) exit
if (snd_limits(hh_dst(iblk)+1,iblk) == snd_limits(hh_dst(iblk),iblk)) exit
if (hh_cnt(iblk) == snd_limits(hh_dst(iblk)+1,iblk)-snd_limits(hh_dst(iblk),iblk)) then
! Wait for last transfer to finish
call mpi_wait(ireq_hhs(iblk), mpi_status, mpierr)
! Copy vectors into send buffer
hh_send(:,1:hh_cnt(iblk),iblk) = hh_gath(:,1:hh_cnt(iblk),iblk)
! Send to destination
call mpi_isend(hh_send(1,1,iblk), nb*hh_cnt(iblk), mpi_complex16, &
global_id(hh_dst(iblk),mod(iblk+block_limits(my_pe)-1,np_cols)), &
10+iblk, mpi_comm, ireq_hhs(iblk), mpierr)
! Reset counter and increase destination row
hh_cnt(iblk) = 0
hh_dst(iblk) = hh_dst(iblk)+1
endif
enddo
#endif
enddo
!#ifdef WITH_GPU_VERSION
! call cpu_time(finish_1)
! tstep1 = finish_1-start_1
! t_1 = t_1 + tstep1
!#endif
! Finish the last outstanding requests
#ifdef WITH_OPENMP
call mpi_wait(ireq_ab,mpi_status,mpierr)
call mpi_wait(ireq_hv,mpi_status,mpierr)
allocate(mpi_statuses(MPI_STATUS_SIZE,max(nblocks,num_chunks)), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when allocating mpi_statuses "//errorMessage
stop
endif
call mpi_waitall(nblocks, ireq_hhs, mpi_statuses, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, mpi_statuses, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating mpi_statuses "//errorMessage
stop
endif
#else
call mpi_wait(ireq_ab,MPI_STATUS_IGNORE,mpierr)
call mpi_wait(ireq_hv,MPI_STATUS_IGNORE,mpierr)
call mpi_waitall(nblocks, ireq_hhs, MPI_STATUSES_IGNORE, mpierr)
call mpi_waitall(num_chunks, ireq_hhr, MPI_STATUSES_IGNORE, mpierr)
#endif /* WITH_OPENMP */
call mpi_barrier(mpi_comm,mpierr)
deallocate(ab, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating ab "//errorMessage
stop
endif
!#ifdef WITH_GPU_VERSION
! deallocate(ab_temp, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
! print *,"error when deallocating ab_temp "//errorMessage
! stop
! endif
!
!#endif
deallocate(ireq_hhr, ireq_hhs, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating ireq_hhr, ireq_hhs "//errorMessage
stop
endif
deallocate(hh_cnt, hh_dst, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating hh_cnt, hh_dst "//errorMessage
stop
endif
deallocate(hh_gath, hh_send, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating hh_gath, hh_send, "//errorMessage
stop
endif
deallocate(limits, snd_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating limits, snd_limits "//errorMessage
stop
endif
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating block_limits, "//errorMessage
stop
endif
deallocate(global_id, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_band_complex: error when deallocating global_id, "//errorMessage
stop
endif
!#ifdef WITH_GPU_VERSION
! istat = cuda_free(ab_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(hv_new_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(hs_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(h_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(tau_new_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
! istat = cuda_free(x_dev)
! if (istat .ne. 0) then
! print *,"error in cudaFree"
! stop
! endif
!
!#endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("tridiag_band_complex")
#endif
!#ifdef WITH_GPU_VERSION
! contains
!
! subroutine dot_product_kernel(nr,tau_new)
! implicit none
! integer, intent(in) :: nr
! complex*16, intent(in) :: tau_new
!
! call launch_dot_product_kernel( hs_dev,hv_new_dev,tau_new,x_dev,h_dev,hv_dev, nr )
! end subroutine
!
! subroutine dot_product_kernel_1( nb , nr , ns)
! implicit none
! integer, intent(in) :: nb, nr, ns
!
! call launch_dot_product_kernel_1(ab_dev,hs_dev, hv_new_dev,x_dev,h_dev,hv_dev,nb , nr, ns)
! end subroutine
!
! subroutine double_hh_transform_1( nb , ns)
! implicit none
! integer, intent(in) :: nb, ns
!
! call launch_double_hh_transform_1(ab_dev,hs_dev,hv_dev,nb , ns)
! end subroutine
!
! subroutine double_hh_transform_2( ns,nc, nb)
! implicit none
! integer, intent(in) :: nc, ns, nb
!
! call launch_double_hh_transform_2(ab_dev,hd_dev,hv_dev,nc , ns, nb)
! end subroutine
!#endif
end subroutine tridiag_band_complex ! has to be checked for GPU
!---------------------------------------------------------------------------------------------------
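! Convenience macros for copying the complete stripe array "a" to the device
! (ATODEV) and back to the host (ATOHOST) in a single cuda_memcpy: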
#define ATODEV istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyHostToDevice)
#define ATOHOST istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
subroutine trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, &
mpi_comm_rows, mpi_comm_cols, &
wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_complex:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number of eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth of the band matrix
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
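! A minimal call sketch (illustrative only, assuming the generic kernel
! constant COMPLEX_ELPA_KERNEL_GENERIC from elpa2_utilities and communicators
! set up as for ELPA1):
!
!   call trans_ev_tridi_to_band_complex(na, nev, nblk, nbw, q, ldq, matrixCols, &
!                                       mpi_comm_rows, mpi_comm_cols, &
!                                       wantDebug=.false., useGPU=.false., &
!                                       success=success, &
!                                       THIS_COMPLEX_ELPA_KERNEL=COMPLEX_ELPA_KERNEL_GENERIC)
!   if (.not. success) then
!     ! e.g. nbw was not a multiple of nblk
!   endif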
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use cuda_functions
implicit none
logical, intent(in) :: useGPU
integer, intent(in) :: THIS_COMPLEX_ELPA_KERNEL
integer, intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
complex*16 :: q(ldq,matrixCols)
! complex*16 :: q(ldq,*)
integer :: np_rows, my_prow, np_cols, my_pcol
integer :: tmp
integer :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer :: current_n, current_local_n, current_n_start, current_n_end
integer :: next_n, next_local_n, next_n_start, next_n_end
integer :: bottom_msg_length, top_msg_length, next_top_msg_length
integer :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer :: thread_width, csw, b_off, b_len
#endif
integer :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer :: a_off, current_tv_off, max_blk_size
integer :: mpierr, src, src_offset, dst, offset, nfact, num_blk
logical :: flag
integer :: n_times
#ifdef WITH_OPENMP
complex*16, allocatable :: a(:,:,:,:)
#else
complex*16, allocatable :: a(:,:,:)
#endif
complex*16, allocatable :: row(:)
complex*16, allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
complex*16, allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex*16, allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
complex*16, allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex*16, allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
complex*16, allocatable :: result_buffer(:,:,:)
complex*16, allocatable :: bcast_buffer(:,:)
integer(kind=c_intptr_t):: a_dev
integer(kind=c_intptr_t):: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t):: row_dev
integer(kind=c_intptr_t):: row_group_dev
integer(kind=c_intptr_t):: hh_tau_dev
integer(kind=c_intptr_t):: hh_dot_dev
integer :: row_group_size, unpack_idx
integer :: top, chunk, this_chunk
integer :: n_off
integer, allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer, allocatable :: top_send_request(:), bottom_send_request(:)
integer, allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
integer, allocatable :: mpi_statuses(:,:)
integer :: mpi_status(MPI_STATUS_SIZE)
#endif
integer, external :: numroc
integer :: na_rows, na_cols
! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
! t0_mpi_time, t1_mpi_time,t2_mpi_time
! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
! t1_mpi_recv_time,t2_mpi_recv_time
! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
! t2_mpi_outer_wait_time, time0
! real*4 :: time1
! MPI send/recv tags, arbitrary
integer, parameter :: bottom_recv_tag = 111
integer, parameter :: top_recv_tag = 222
integer, parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer :: max_threads, my_thread
integer :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real*8 :: kernel_time
integer*8 :: kernel_flops
logical, intent(in) :: wantDebug
logical :: success
integer :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("trans_ev_tridi_to_band_complex")
#endif
if (useGPU) then
n_times =0
! n_times_1 =0
unpack_idx = 0
row_group_size = 0
! time0=0
! t0_compute_kernel=0
endif
kernel_time = 1.d-100
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
if (useGPU) then
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
endif
success = .true.
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
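! e.g. nbw = 64 and nblk = 16 give nfact = 4 (nbw == n*nblk was checked above)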
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 - should this be reduced for the complex case ???
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
if (useGPU) then
stripe_width = 256
else
stripe_width = 48 ! Must be a multiple of 4
endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_count = (thread_width-1)/stripe_width + 1
#else /* WITH_OPENMP */
stripe_count = (l_nev-1)/stripe_width + 1
#endif /* WITH_OPENMP */
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_width = (thread_width-1)/stripe_count + 1
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
stripe_width = (l_nev-1)/stripe_count + 1
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 !!!
endif
#endif /* WITH_OPENMP */
#ifndef WITH_OPENMP
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif /* WITH_OPENMP */
endif
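! Worked example for the non-OpenMP CPU path: for l_nev = 100 the initial
! stripe_width = 48 gives stripe_count = (100-1)/48 + 1 = 3; the width is then
! rebalanced to (100-1)/3 + 1 = 34 and rounded up to the next multiple of 4,
! i.e. stripe_width = 36, leaving last_stripe_width = 100 - 2*36 = 28.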
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
!DEC$ ATTRIBUTES ALIGN: 64:: a
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
! a(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
allocate(a(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
a(:,:,:) = 0
endif
#endif /* WITH_OPENMP */
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage
stop
endif
row(:) = 0
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)*size_of_complex_datatype
if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
print *,"trans_ev_tridi_to_band_complex a_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
! stop
endif
successCUDA = cuda_malloc(a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
if (num .gt. na_rows * na_cols) then
print *,"trans_ev_tridi_to_band_complex a_dev 1",num, na_rows * na_cols
! stop
endif
successCUDA = cuda_memset(a_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
num = (l_nev)*size_of_complex_datatype
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage
stop
endif
row_group(:, :) = 0
num = (l_nev*nblk)*size_of_complex_datatype
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
endif ! useGPU
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
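! Example: with np_rows = 4 the loop below runs ip = 3, 2, 1, 0, so the rows
! owned by the last process row are assembled first.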
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! Please note about the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
a(:,:,:,my_thread) = 0 ! if possible, do first touch allocation!
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#endif /* WITH_OPENMP */
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu(i - limits(ip), .false.)
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp(row,i-limits(ip),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu(row,i-limits(ip))
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu(i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
row(:) = q(src_offset, 1:l_nev)
endif
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp(row,i-limits(ip),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu(row,i-limits(ip))
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been sent
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if(mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
call MPI_Send(row, l_nev, MPI_COMPLEX16, dst, 0, mpi_comm_rows, mpierr)
endif
enddo
enddo
else if(my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
call MPI_Send(row, l_nev, MPI_COMPLEX16, ip, 0, mpi_comm_rows, mpierr)
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, mpi_status, mpierr)
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu(i - limits(my_prow), .false.)
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else
call MPI_Recv(row, l_nev, MPI_COMPLEX16, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp(row,i-limits(my_prow),my_thread)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu(row,i-limits(my_prow))
endif
#endif /* WITH_OPENMP */
endif
enddo
endif
enddo
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu(-1, .true.)
successCUDA = cuda_devicesynchronize()
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaDeviceSynchronize"
stop
endif
endif
! Set up result buffer queue
num_result_blocks = ((na-1)/nblk + np_rows - my_prow) / np_rows
num_result_buffers = 4*nfact
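! Example: na = 1000, nblk = 32, np_rows = 4 gives (999/32 + 4 - my_prow)/4
! = 8 result blocks on every process row, while only 4*nfact receive buffers
! are kept in flight at any time.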
allocate(result_buffer(l_nev,nblk,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_buffer "//errorMessage
stop
endif
allocate(result_send_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_send_request "//errorMessage
stop
endif
allocate(result_recv_request(num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating result_recv_request "//errorMessage
stop
endif
result_send_request(:) = MPI_REQUEST_NULL
result_recv_request(:) = MPI_REQUEST_NULL
! Queue up buffers
if (my_prow > 0 .and. l_nev>0) then ! note: row 0 always sends
do j = 1, min(num_result_buffers, num_result_blocks)
call MPI_Irecv(result_buffer(1,1,j), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(j), mpierr)
enddo
endif
num_bufs_recvd = 0 ! No buffers received yet
! Initialize top/bottom requests
allocate(top_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_send_request "//errorMessage
stop
endif
allocate(top_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_recv_request "//errorMessage
stop
endif
allocate(bottom_send_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_send_request "//errorMessage
stop
endif
allocate(bottom_recv_request(stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_recv_request "//errorMessage
stop
endif
top_send_request(:) = MPI_REQUEST_NULL
top_recv_request(:) = MPI_REQUEST_NULL
bottom_send_request(:) = MPI_REQUEST_NULL
bottom_recv_request(:) = MPI_REQUEST_NULL
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
allocate(top_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width*nbw*max_threads, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
stop
endif
top_border_send_buffer(:,:) = 0
top_border_recv_buffer(:,:) = 0
bottom_border_send_buffer(:,:) = 0
bottom_border_recv_buffer(:,:) = 0
#else /* OpenMP */
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_send_buffer "//errorMessage
stop
endif
allocate(top_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating top_border_recv_buffer "//errorMessage
stop
endif
allocate(bottom_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_send_buffer "//errorMessage
stop
endif
allocate(bottom_border_recv_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bottom_border_recv_buffer "//errorMessage
stop
endif
top_border_send_buffer(:,:,:) = 0
top_border_recv_buffer(:,:,:) = 0
bottom_border_send_buffer(:,:,:) = 0
bottom_border_recv_buffer(:,:,:) = 0
#endif /* WITH_OPENMP */
! Initialize broadcast buffer
allocate(bcast_buffer(nbw, max_blk_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating bcast_buffer "//errorMessage
stop
endif
bcast_buffer = 0
if (useGPU) then
num = ( nbw * max_blk_size) * size_of_complex_datatype
successCUDA = cuda_malloc(bcast_buffer_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( bcast_buffer_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
num = ((max_blk_size-1))*size_of_complex_datatype
successCUDA = cuda_malloc( hh_dot_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_dot_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
num = (max_blk_size)*size_of_complex_datatype
successCUDA = cuda_malloc( hh_tau_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memset( hh_tau_dev, 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
endif ! useGPU
current_tv_off = 0 ! Offset of next row to be broadcast
! ------------------- start of work loop -------------------
a_off = 0 ! offset in A (to avoid unnecessary shifts)
top_msg_length = 0
bottom_msg_length = 0
#ifdef WITH_GPU_VERSION
!! istat = cuda_ProfilerStart()
!! istat = cudaFuncSetCacheConfig ( launch_compute_hh_trafo_c_kernel_complex, cudaFuncCachePreferShared)
!! t0_compute_kernel = 0
! t0_mpi_time = 0
! t0_cuda_memcpy =0
! t0_cpu_code =0
! t0_outer_do_time =0
! t0_inner_do_time =0
! t1_outer_do_time =MPI_Wtime()
! t0_block_time =0
! t0_mpi_wait_time = 0
! t0_memcpy_time = 0
! t0_mpi_outer_wait_time=0
#endif
do sweep = 0, (na-1)/nbw
#ifdef WITH_GPU_VERSION
! t1_cpu_code =MPI_Wtime()
#endif
current_n = na - sweep*nbw
call determine_workload(current_n, nbw, np_rows, limits)
current_n_start = limits(my_prow)
current_n_end = limits(my_prow+1)
current_local_n = current_n_end - current_n_start
next_n = max(current_n - nbw, 0)
call determine_workload(next_n, nbw, np_rows, limits)
next_n_start = limits(my_prow)
next_n_end = limits(my_prow+1)
next_local_n = next_n_end - next_n_start
if (next_n_end < next_n) then
bottom_msg_length = current_n_end - next_n_end
else
bottom_msg_length = 0
endif
if (next_local_n > 0) then
next_top_msg_length = current_n_start - next_n_start
else
next_top_msg_length = 0
endif
#ifdef WITH_GPU_VERSION
! t2_cpu_code =MPI_Wtime()
! t0_cpu_code = t0_cpu_code + (t2_cpu_code - t1_cpu_code)
#endif
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
csw = min(stripe_width, thread_width-(i-1)*stripe_width) ! "current_stripe_width"
b_len = csw*nbw*max_threads
call MPI_Irecv(bottom_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else /* WITH_OPENMP */
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif /* WITH_OPENMP */
enddo
endif
#ifdef WITH_GPU_VERSION
! t1_block_time = MPI_Wtime()
#endif
if (current_local_n > 1) then
if (my_pcol == mod(sweep,np_cols)) then
bcast_buffer(:,1:current_local_n) = hh_trans_complex(:,current_tv_off+1:current_tv_off+current_local_n)
current_tv_off = current_tv_off + current_local_n
endif
call mpi_bcast(bcast_buffer, nbw*current_local_n, MPI_COMPLEX16, mod(sweep,np_cols), mpi_comm_cols, mpierr)
if (useGPU) then
successCUDA = cuda_memcpy(bcast_buffer_dev, loc(bcast_buffer(1,1)), nbw * current_local_n * size_of_complex_datatype , &
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
call extract_hh_tau_complex_gpu(nbw, current_local_n, .false.)
call compute_hh_dot_products_complex_gpu(nbw, current_local_n)
endif
else
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_complex
bcast_buffer(:,1) = 0
if (useGPU) then
successCUDA = cuda_memset(bcast_buffer_dev, 0, nbw * size_of_complex_datatype)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset"
stop
endif
call extract_hh_tau_complex_gpu(nbw, 1, .true.)
!NOTE(ca): I commented out the following line
! istat = cuda_memcpy(loc(bcast_buffer(1,1)),bcast_buffer_dev,nbw*current_local_n * size_of_complex_datatype ,
! cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
! stop
! endif
endif ! useGPU
endif
#ifdef WITH_GPU_VERSION
! t2_block_time =MPI_Wtime()
! t0_block_time = t0_block_time + ( t2_block_time - t1_block_time)
#endif
if (l_nev == 0) cycle
if (current_local_n > 0) then
#ifdef WITH_GPU_VERSION
! t1_inner_do_time =MPI_Wtime()
#endif
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! Get the real stripe width for stripe i;
! the last OpenMP tasks may have an even smaller stripe width,
! but we don't care about this, i.e. we send/recv a bit too much in this case.
! csw: current_stripe_width
csw = min(stripe_width, thread_width-(i-1)*stripe_width)
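! Example: thread_width = 100, stripe_width = 48 gives csw = 48 for stripes
! 1 and 2, and csw = min(48, 100 - 2*48) = 4 for stripe 3.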
#endif /* WITH_OPENMP */
!wait_b
if (current_n_end < current_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Wait(bottom_recv_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(bottom_recv_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time - t1_mpi_wait_time)
!
! t1_memcpy_time =MPI_Wtime()
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
b_len = csw*nbw
b_off = (my_thread-1)*b_len
a(1:csw,n_off+1:n_off+nbw,i,my_thread) = &
reshape(bottom_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, nbw /))
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
n_off = current_local_n+a_off
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset ,loc(bottom_border_recv_buffer(1,1,i)), &
stripe_width*nbw*size_of_complex_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
else
a(:,n_off+1:n_off+nbw,i) = bottom_border_recv_buffer(:,1:nbw,i)
endif
#endif /* WITH_OPENMP */
if (next_n_end < next_n) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"not yet implemented"
stop
endif
call MPI_Irecv(bottom_border_recv_buffer(1,i), csw*nbw*max_threads, &
MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#else /* WITH_OPENMP */
call MPI_Irecv(bottom_border_recv_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow+1, bottom_recv_tag, &
mpi_comm_rows, bottom_recv_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
endif
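! Two strategies for the transformation below: if the local strip is no larger
! than the two border messages, wait for the top border and transform the whole
! strip at once; otherwise transform the bottom part first (so it can be sent
! to the next process row as early as possible), then the interior, then the
! top part, overlapping the computation with the border communication.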
if (current_local_n <= bottom_msg_length + top_msg_length) then
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
if (useGPU) then
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time -t1_mpi_wait_time)
! t1_memcpy_time =MPI_Wtime()
!
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *size_of_complex_datatype
! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ))* 16
successCUDA = cuda_memcpy( a_dev+dev_offset ,loc(top_border_recv_buffer(1,1,i)), &
stripe_width*top_msg_length*size_of_complex_datatype , cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time - t1_memcpy_time)
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif ! useGPU
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call compute_hh_trafo_complex_cpu_openmp(0, current_local_n, i, my_thread, &
THIS_COMPLEX_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
call compute_hh_trafo_complex_gpu(0, current_local_n, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu(0, current_local_n, i)
else
call compute_hh_trafo_complex_cpu(0, current_local_n, i, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
if (bottom_msg_length>0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_complex_datatype , cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
else
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
call compute_hh_trafo_complex_cpu_openmp(current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
THIS_COMPLEX_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
call compute_hh_trafo_complex_gpu(current_local_n -bottom_msg_length, bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu(current_local_n -bottom_msg_length, bottom_msg_length, i)
else
call compute_hh_trafo_complex_cpu(current_local_n - bottom_msg_length, bottom_msg_length, i, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#endif /* WITH_OPENMP */
!send_b
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call MPI_Wait(bottom_send_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time + ( t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
if (bottom_msg_length > 0) then
n_off = current_local_n+nbw-bottom_msg_length+a_off
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
b_len = csw*bottom_msg_length*max_threads
bottom_border_send_buffer(1:b_len,i) = &
reshape(a(1:csw,n_off+1:n_off+bottom_msg_length,i,:), (/ b_len /))
call MPI_Isend(bottom_border_send_buffer(1,i), b_len, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
dev_offset = (0 + (n_off * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
successCUDA = cuda_memcpy( loc(bottom_border_send_buffer(1,1,i)), a_dev + dev_offset, &
stripe_width * bottom_msg_length * size_of_complex_datatype , cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time -t1_memcpy_time)
else
bottom_border_send_buffer(:,1:bottom_msg_length,i) = a(:,n_off+1:n_off+bottom_msg_length,i)
endif
call MPI_Isend(bottom_border_send_buffer(1,1,i), bottom_msg_length*stripe_width, MPI_COMPLEX16, my_prow+1, &
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call compute_hh_trafo_complex_cpu_openmp(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
THIS_COMPLEX_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! call compute_hh_trafo_complex_gpu(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i)
call compute_hh_trafo_complex_gpu(top_msg_length,current_local_n-top_msg_length-bottom_msg_length, i, a_off, &
dev_offset, dev_offset_1, dev_offset_2)
else
call compute_hh_trafo_complex_cpu(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#endif /* WITH_OPENMP */
!wait_t
if (top_msg_length>0) then
#ifdef WITH_OPENMP
call MPI_Wait(top_recv_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(top_recv_request(i), MPI_STATUS_IGNORE, mpierr)
if (useGPU) then
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time +(t2_mpi_wait_time-t1_mpi_wait_time)
!
! t1_memcpy_time =MPI_Wtime()
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) *size_of_complex_datatype
successCUDA = cuda_memcpy( a_dev + dev_offset , loc(top_border_recv_buffer(:,1,i)), &
stripe_width * top_msg_length *size_of_complex_datatype ,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
!
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + ( t2_memcpy_time-t1_memcpy_time)
else
a(:,a_off+1:a_off+top_msg_length,i) = top_border_recv_buffer(:,1:top_msg_length,i)
endif
#endif /* WITH_OPENMP */
endif
!compute
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
if (top_msg_length>0) then
b_len = csw*top_msg_length
b_off = (my_thread-1)*b_len
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif
call compute_hh_trafo_complex_cpu_openmp(0, top_msg_length, i, my_thread, &
THIS_COMPLEX_ELPA_KERNEL)
enddo
!$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /* WITH_OPENMP */
if (useGPU) then
call compute_hh_trafo_complex_gpu(0, top_msg_length, i, a_off, dev_offset, dev_offset_1, dev_offset_2)
! call compute_hh_trafo_complex_gpu(0, top_msg_length, i)
else
call compute_hh_trafo_complex_cpu(0, top_msg_length, i, &
THIS_COMPLEX_ELPA_KERNEL)
endif
#endif /* WITH_OPENMP */
endif
if (next_top_msg_length > 0) then
!request top_border data
#ifdef WITH_OPENMP
b_len = csw*next_top_msg_length*max_threads
call MPI_Irecv(top_border_recv_buffer(1,i), b_len, MPI_COMPLEX16, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#else /* WITH_OPENMP */
call MPI_Irecv(top_border_recv_buffer(1,1,i), next_top_msg_length*stripe_width, MPI_COMPLEX16, my_prow-1, &
top_recv_tag, mpi_comm_rows, top_recv_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
!send_t
if (my_prow > 0) then
#ifdef WITH_OPENMP
call MPI_Wait(top_send_request(i), mpi_status, mpierr)
#else /* WITH_OPENMP */
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time)
#endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
b_len = csw*nbw*max_threads
top_border_send_buffer(1:b_len,i) = reshape(a(1:csw,a_off+1:a_off+nbw,i,:), (/ b_len /))
call MPI_Isend(top_border_send_buffer(1,i), b_len, MPI_COMPLEX16, &
my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#else /* WITH_OPENMP */
if (useGPU) then
! t1_memcpy_time =MPI_Wtime()
dev_offset = (0 + (a_off * stripe_width) + ( (i-1) * stripe_width *a_dim2 )) * size_of_complex_datatype
successCUDA = cuda_memcpy( loc(top_border_send_buffer(:,1,i)), a_dev + dev_offset, &
stripe_width*nbw*size_of_complex_datatype ,cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
! t2_memcpy_time =MPI_Wtime()
! t0_memcpy_time = t0_memcpy_time + (t2_memcpy_time-t1_memcpy_time)
!
else
top_border_send_buffer(:,1:nbw,i) = a(:,a_off+1:a_off+nbw,i)
endif
call MPI_Isend(top_border_send_buffer(1,1,i), nbw*stripe_width, MPI_COMPLEX16, my_prow-1, bottom_recv_tag, &
mpi_comm_rows, top_send_request(i), mpierr)
#endif /* WITH_OPENMP */
endif
! Make sure that there are not too many outstanding top_recv_request's
#ifdef WITH_GPU_VERSION
! t1_mpi_wait_time =MPI_Wtime()
#endif
if (stripe_count > 1) then
if (i>1) then
#ifdef WITH_OPENMP
call MPI_Wait(top_recv_request(i-1), mpi_status, mpierr)
#else
call MPI_Wait(top_recv_request(i-1), MPI_STATUS_IGNORE, mpierr)
#endif
else
#ifdef WITH_OPENMP
call MPI_Wait(top_recv_request(stripe_count), mpi_status, mpierr)
#else
call MPI_Wait(top_recv_request(stripe_count), MPI_STATUS_IGNORE, mpierr)
#endif
endif
endif
#ifdef WITH_GPU_VERSION
! t2_mpi_wait_time =MPI_Wtime()
! t0_mpi_wait_time = t0_mpi_wait_time+(t2_mpi_wait_time-t1_mpi_wait_time)
#endif
enddo
#ifdef WITH_GPU_VERSION
! t2_inner_do_time =MPI_Wtime()
! t0_inner_do_time = t0_inner_do_time + ( t2_inner_do_time - t1_inner_do_time)
#endif
top_msg_length = next_top_msg_length
else
! wait for last top_send_request
#ifdef WITH_GPU_VERSION
! t1_mpi_outer_wait_time =MPI_Wtime()
#endif
do i = 1, stripe_count
#ifdef WITH_OPENMP
call MPI_Wait(top_send_request(i), mpi_status, mpierr)
#else
call MPI_Wait(top_send_request(i), MPI_STATUS_IGNORE, mpierr)
#endif
enddo
#ifdef WITH_GPU_VERSION
! t2_mpi_outer_wait_time =MPI_Wtime()
! t0_mpi_outer_wait_time =t0_mpi_outer_wait_time+(t2_mpi_outer_wait_time-t1_mpi_outer_wait_time)
#endif
endif
#ifdef WITH_GPU_VERSION
! t0_result_time = MPI_Wtime()
#endif
! Take care of the result
if (my_prow == 0) then
! topmost process sends nbw rows to destination processes
do j=0,nfact-1
num_blk = sweep*nfact+j ! global number of destination block, 0 based
if (num_blk*nblk >= na) exit
nbuf = mod(num_blk, num_result_buffers) + 1 ! buffer number to get this block
#ifdef WITH_OPENMP
call MPI_Wait(result_send_request(nbuf), mpi_status, mpierr)
#else
call MPI_Wait(result_send_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
dst = mod(num_blk, np_rows)
if (dst == 0) then
if (useGPU) then
row_group_size = min(na - num_blk*nblk, nblk)
call pack_row_group_complex_gpu(row_group(:, :), j * nblk + a_off,row_group_size)
do i = 1, row_group_size
q((num_blk / np_rows) * nblk + i, 1 : l_nev) = row_group(:, i)
enddo
else
do i = 1, min(na - num_blk*nblk, nblk)
call pack_row_complex_cpu(row, j*nblk+i+a_off)
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif
else
if (useGPU) then
call pack_row_group_complex_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else
do i = 1, nblk
call pack_row_complex_cpu(result_buffer(:,i,nbuf),j*nblk+i+a_off)
enddo
endif
call MPI_Isend(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, dst, &
result_recv_tag, mpi_comm_rows, result_send_request(nbuf), mpierr)
endif
enddo
else
! receive and store final result
do j = num_bufs_recvd, num_result_blocks-1
nbuf = mod(j, num_result_buffers) + 1 ! buffer number to get this block
! If there is still work to do, just test for the next result request
! and leave the loop if it is not ready, otherwise wait for all
! outstanding requests
if (next_local_n > 0) then
#ifdef WITH_OPENMP
call MPI_Test(result_recv_request(nbuf), flag, mpi_status, mpierr)
#else
call MPI_Test(result_recv_request(nbuf), flag, MPI_STATUS_IGNORE, mpierr)
#endif
if (.not.flag) exit
else
#ifdef WITH_OPENMP
call MPI_Wait(result_recv_request(nbuf), mpi_status, mpierr)
#else
call MPI_Wait(result_recv_request(nbuf), MPI_STATUS_IGNORE, mpierr)
#endif
endif
! Fill result buffer into q
num_blk = j*np_rows + my_prow ! global number of current block, 0 based
do i = 1, min(na - num_blk*nblk, nblk)
q(j*nblk+i, 1:l_nev) = result_buffer(1:l_nev, i, nbuf)
enddo
! Queue result buffer again if there are outstanding blocks left
if (j+num_result_buffers < num_result_blocks) &
call MPI_Irecv(result_buffer(1,1,nbuf), l_nev*nblk, MPI_COMPLEX16, 0, result_recv_tag, &
mpi_comm_rows, result_recv_request(nbuf), mpierr)
enddo
num_bufs_recvd = j
endif
#ifdef WITH_GPU_VERSION
! t2_result_time =MPI_Wtime()
! t0_result_time = t0_result_time + ( t2_result_time - t1_result_time)
#endif
! Shift the remaining rows to the front of A (if necessary)
offset = nbw - top_msg_length
if (offset<0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: internal error, offset for shifting = ',offset
endif
success = .false.
return
endif
a_off = a_off + offset
if (a_off + next_local_n + nbw > a_dim2) then
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("OpenMP parallel")
#endif
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
do i = 1, stripe_count
do j = top_msg_length+1, top_msg_length+next_local_n
A(:,j,i,my_thread) = A(:,j+a_off,i,my_thread)
enddo
enddo
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("OpenMP parallel")
#endif
#else /*WITH_OPENMP */
do i = 1, stripe_count
if (useGPU) then
chunk = min(next_local_n - 1, a_off)
do j = top_msg_length + 1, top_msg_length + next_local_n, chunk
top = min(j + chunk, top_msg_length + next_local_n)
this_chunk = top - j + 1
dev_offset = (0 + ( (j-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) * size_of_complex_datatype
dev_offset_1 = (0 + ( (j + a_off-1) * stripe_width) + ( (i-1) * stripe_width * a_dim2 )) *size_of_complex_datatype
! It is wasteful to pass the parameter "cudaMemcpyDeviceToDevice" on every call; do this ONCE at startup
! tmp = cuda_d2d(1)
successCUDA = cuda_memcpy( a_dev + dev_offset , a_dev+dev_offset_1, &
stripe_width*this_chunk*size_of_complex_datatype, cudaMemcpyDeviceToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemcpy"
stop
endif
enddo
else ! useGPU
do j = top_msg_length+1, top_msg_length+next_local_n
A(:,j,i) = A(:,j+a_off,i)
enddo
endif
enddo
#endif /*WITH_OPENMP */
a_off = 0
endif
enddo
#ifdef WITH_GPU_VERSION
! t2_outer_do_time =MPI_Wtime()
! t0_outer_do_time = t0_outer_do_time + ( t2_outer_do_time - t1_outer_do_time)
!
! istat = cuda_ProfilerStop()
#endif
! Just for safety:
if (ANY(top_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_send_request ***',my_prow,my_pcol
if (ANY(bottom_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_send_request ***',my_prow,my_pcol
if (ANY(top_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR top_recv_request ***',my_prow,my_pcol
if (ANY(bottom_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR bottom_recv_request ***',my_prow,my_pcol
if (my_prow == 0) then
#ifdef WITH_OPENMP
allocate(mpi_statuses(MPI_STATUS_SIZE,num_result_buffers), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating mpi_statuses "//errorMessage
stop
endif
call MPI_Waitall(num_result_buffers, result_send_request, mpi_statuses, mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating mpi_statuses "//errorMessage
stop
endif
#else
call MPI_Waitall(num_result_buffers, result_send_request, MPI_STATUSES_IGNORE, mpierr)
#endif
endif
if (ANY(result_send_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_send_request ***',my_prow,my_pcol
if (ANY(result_recv_request /= MPI_REQUEST_NULL)) write(error_unit,*) '*** ERROR result_recv_request ***',my_prow,my_pcol
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,'(" Kernel time:",f10.3," MFlops: ",es12.5)') kernel_time, kernel_flops/kernel_time*1.d-6
! deallocate all working space
if (.not.(useGPU)) then
deallocate(a, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating a "//errorMessage
stop
endif
endif
deallocate(row, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating row "//errorMessage
stop
endif
deallocate(limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating limits "//errorMessage
stop
endif
deallocate(result_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_send_request "//errorMessage
stop
endif
deallocate(result_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_recv_request "//errorMessage
stop
endif
deallocate(top_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(top_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_recv_buffer "//errorMessage
stop
endif
deallocate(bottom_border_send_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_border_send_buffer "//errorMessage
stop
endif
deallocate(bottom_border_recv_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_border_recv_buffer "//errorMessage
stop
endif
deallocate(result_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating result_buffer "//errorMessage
stop
endif
deallocate(bcast_buffer, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bcast_buffer "//errorMessage
stop
endif
deallocate(top_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_send_request "//errorMessage
stop
endif
deallocate(top_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating top_recv_request "//errorMessage
stop
endif
deallocate(bottom_send_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_send_request "//errorMessage
stop
endif
deallocate(bottom_recv_request, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating bottom_recv_request "//errorMessage
stop
endif
if (useGPU) then
successCUDA = cuda_free(a_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(hh_tau_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(hh_dot_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(row_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
deallocate(row_group, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error deallocating row_group "//errorMessage
stop
endif
successCUDA= cuda_free(row_group_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
successCUDA = cuda_free(bcast_buffer_dev)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaFree"
stop
endif
endif ! useGPU
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("trans_ev_tridi_to_band_complex")
#endif
return
contains
#ifdef WITH_OPENMP
subroutine pack_row_complex_cpu_openmp(row, n)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
complex*16 :: row(:)
integer :: n, i, noff, nl, nt
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("pack_row_complex")
#endif
do nt = 1, max_threads
do i = 1, stripe_count
noff = (nt-1)*thread_width + (i-1)*stripe_width
nl = min(stripe_width, nt*thread_width-noff, l_nev-noff)
if (nl<=0) exit
row(noff+1:noff+nl) = a(1:nl,n,i,nt)
enddo
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("pack_row_complex")
#endif
end subroutine pack_row_complex_cpu_openmp
#else /* WITH_OPENMP */
subroutine pack_row_complex_cpu(row, n)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
complex*16 :: row(:)
integer :: n, i, noff, nl
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("unpack_row_complex_cpu")
#endif
do i=1,stripe_count
nl = merge(stripe_width, last_stripe_width, i<stripe_count)
noff = (i-1)*stripe_width
row(noff+1:noff+nl) = a(1:nl,n,i)
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("pack_row_complex_cpu")
#endif
end subroutine pack_row_complex_cpu
#endif /* WITH_OPENMP */
! (the remaining contained helper routines of trans_ev_tridi_to_band_complex
!  are missing here)
end subroutine trans_ev_tridi_to_band_complex
!---------------------------------------------------------------------------------------------------
subroutine band_band_real(na, nb, nb2, ab, ab2, d, e, mpi_comm)
! band_band_real:
! Reduces a real symmetric band matrix of semi bandwidth nb to semi bandwidth nb2.
! (declarations and the setup of block_limits, block_limits2 and ireq_ab2
!  are missing here)
if (nb2>1) then
do i=0,nblocks2-1
call mpi_irecv(ab2(1,i*nb2+1),2*nb2*nb2,mpi_real8,0,3,mpi_comm,ireq_ab2(i+1),mpierr)
enddo
endif
! n_off: Offset of ab within band
n_off = block_limits(my_pe)*nb
lwork = nb*nb2
dest = 0
ireq_ab = MPI_REQUEST_NULL
ireq_hv = MPI_REQUEST_NULL
! ---------------------------------------------------------------------------
! Start of calculations
na_s = block_limits(my_pe)*nb + 1
if (my_pe>0 .and. na_s<=na) then
! send first nb2 columns to previous PE
! Only the PE owning the diagonal does that (sending 1 element of the subdiagonal block also)
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,na_s-n_off+i-1)
enddo
call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
do istep=1,na/nb2
if (my_pe==0) then
n = MIN(na-na_s-nb2+1,nb) ! number of rows to be reduced
hv(:,:) = 0
tau(:) = 0
! The last step (istep=na/nb2) is only needed for sending the last HH vectors.
! We don't want the sign of the last element flipped (analogous to the other sweeps)
if (istep < na/nb2) then
! Transform first block column of remaining matrix
call dgeqrf(n, nb2, ab(1+nb2,na_s-n_off), 2*nb-1, tau, work, lwork, info)
do i=1,nb2
hv(i,i) = 1.0
hv(i+1:n,i) = ab(1+nb2+1:1+nb2+n-i,na_s-n_off+i-1)
ab(1+nb2+1:2*nb,na_s-n_off+i-1) = 0
enddo
endif
if (nb2==1) then
d(istep) = ab(1,na_s-n_off)
e(istep) = ab(2,na_s-n_off)
if (istep == na) then
e(na) = 0
endif
else
ab_s2 = 0
ab_s2(:,:) = ab(1:nb2+1,na_s-n_off:na_s-n_off+nb2-1)
if (block_limits2(dest+1)<istep) dest = dest+1
call mpi_send(ab_s2,(nb2+1)*nb2,mpi_real8,dest,3,mpi_comm,mpierr)
endif
else
if (na>na_s+nb2-1) then
! Receive Householder vectors from previous task, from PE owning subdiagonal
call mpi_recv(hv,nb*nb2,mpi_real8,my_pe-1,2,mpi_comm,mpi_status,mpierr)
do i=1,nb2
tau(i) = hv(i,i)
hv(i,i) = 1.
enddo
endif
endif
na_s = na_s+nb2
if (na_s-n_off > nb) then
ab(:,1:nblocks*nb) = ab(:,nb+1:(nblocks+1)*nb)
ab(:,nblocks*nb+1:(nblocks+1)*nb) = 0
n_off = n_off + nb
endif
do iblk=1,nblocks
ns = na_s + (iblk-1)*nb - n_off ! first column in block
ne = ns+nb-nb2 ! last column in block
if (ns+n_off>na) exit
nc = MIN(na-ns-n_off+1,nb) ! number of columns in diagonal block
nr = MIN(na-nb-ns-n_off+1,nb) ! rows in subdiagonal block (may be < 0!!!)
! Note that nr>=0 implies that diagonal block is full (nc==nb)!
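! Example: na = 10, nb = 4, ns+n_off = 9 gives nc = min(10-9+1,4) = 2 (partial
! diagonal block) and nr = min(10-4-9+1,4) = -2, i.e. no subdiagonal block left.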
call wy_gen(nc,nb2,w,hv,tau,work,nb)
if (iblk==nblocks .and. nc==nb) then
!get the last nb2 columns from the next PE
call mpi_recv(ab_r,(nb+1)*nb2,mpi_real8,my_pe+1,1,mpi_comm,mpi_status,mpierr)
do i=1,nb2
ab(1:nb+1,ne+i-1) = ab_r(:,i)
enddo
endif
hv_new(:,:) = 0 ! Needed, last rows must be 0 for nr < nb
tau_new(:) = 0
if (nr>0) then
call wy_right(nr,nb,nb2,ab(nb+1,ns),2*nb-1,w,hv,work,nb)
call dgeqrf(nr,nb2,ab(nb+1,ns),2*nb-1,tau_new,work,lwork,info)
do i=1,nb2
hv_new(i,i) = 1.0
hv_new(i+1:,i) = ab(nb+2:2*nb-i+1,ns+i-1)
ab(nb+2:,ns+i-1) = 0
enddo
!send hh-vector
if (iblk==nblocks) then
call mpi_wait(ireq_hv,mpi_status,mpierr)
hv_s = hv_new
do i=1,nb2
hv_s(i,i) = tau_new(i)
enddo
call mpi_isend(hv_s,nb*nb2,mpi_real8,my_pe+1,2,mpi_comm,ireq_hv,mpierr)
endif
endif
call wy_symm(nc,nb2,ab(1,ns),2*nb-1,w,hv,work,work2,nb)
if (my_pe>0 .and. iblk==1) then
!send first nb2 columns to previous PE
call mpi_wait(ireq_ab,mpi_status,mpierr)
do i=1,nb2
ab_s(1:nb+1,i) = ab(1:nb+1,ns+i-1)
enddo
call mpi_isend(ab_s,(nb+1)*nb2,mpi_real8,my_pe-1,1,mpi_comm,ireq_ab,mpierr)
endif
if (nr>0) then
call wy_gen(nr,nb2,w_new,hv_new,tau_new,work,nb)
call wy_left(nb-nb2,nr,nb2,ab(nb+1-nb2,ns+nb2),2*nb-1,w_new,hv_new,work,nb)
endif
! Use new HH vector for the next block
hv(:,:) = hv_new(:,:)
tau = tau_new
enddo
enddo
! Finish the last outstanding requests
call mpi_wait(ireq_ab,mpi_status,mpierr)
call mpi_wait(ireq_hv,mpi_status,mpierr)
allocate(mpi_statuses(MPI_STATUS_SIZE,nblocks2), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error allocating mpi_statuses "//errorMessage
stop
endif
call mpi_waitall(nblocks2,ireq_ab2,mpi_statuses,mpierr)
deallocate(mpi_statuses, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating mpi_statuses "//errorMessage
stop
endif
call mpi_barrier(mpi_comm,mpierr)
deallocate(block_limits, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating block_limits "//errorMessage
stop
endif
deallocate(block_limits2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating block_limits2 "//errorMessage
stop
endif
deallocate(ireq_ab2, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"error deallocating ireq_ab2 "//errorMessage
stop
endif
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("band_band_real")
#endif
end subroutine band_band_real
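! --------------------------------------------------------------------------------------------------
! The wy_* routines below implement the compact WY representation: nb
! Householder reflectors of length n, stored as the columns of Y with scalar
! factors tau, are aggregated into a single blocked transformation
! Q = H_1 * ... * H_nb = I - W * Y^T (wy_gen builds W), so that applying Q
! from the left, right or both sides reduces to a few BLAS-3 calls
! (wy_left, wy_right, wy_symm).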
subroutine wy_gen(n, nb, W, Y, tau, mem, lda)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: n !length of householder-vectors
integer, intent(in) :: nb !number of householder-vectors
integer, intent(in) :: lda !leading dimension of Y and W
real*8, intent(in) :: Y(lda,nb) !matrix containing nb householder-vectors of length n
real*8, intent(in) :: tau(nb) !tau values
real*8, intent(out) :: W(lda,nb) !output matrix W of the compact WY representation
real*8, intent(inout) :: mem(nb) !workspace vector of length nb (written by DGEMV)
integer :: i
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_gen")
#endif
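! Column-wise recurrence: with Q_{i-1} = I - W_{1:i-1} * Y_{1:i-1}^T, appending
! the reflector H_i = I - tau_i * y_i * y_i^T gives the new column
! W_i = tau_i * (y_i - W_{1:i-1} * (Y_{1:i-1}^T * y_i)).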
W(1:n,1) = tau(1)*Y(1:n,1)
do i=2,nb
W(1:n,i) = tau(i)*Y(1:n,i)
call DGEMV('T',n,i-1,1.d0,Y,lda,W(1,i),1,0.d0,mem,1)
call DGEMV('N',n,i-1,-1.d0,W,lda,mem,1,1.d0,W(1,i),1)
enddo
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_gen")
#endif
end subroutine wy_gen
subroutine wy_left(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: n !number of columns of A
integer, intent(in) :: m !number of rows of A, W and Y (length of the reflectors)
integer, intent(in) :: nb !number of columns of W and Y (number of reflectors)
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(lda2,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(lda2,nb) !blocked transformation matrix Y
real*8, intent(inout) :: mem(nb,n) !workspace for an nb x n temporary matrix
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_left")
#endif
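! mem = W^T * A, then A <- A - Y * mem, i.e. A <- (I - Y * W^T) * A = Q^T * A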
call DGEMM('T', 'N', nb, n, m, 1.d0, W, lda2, A, lda, 0.d0, mem, nb)
call DGEMM('N', 'N', m, n, nb, -1.d0, Y, lda2, mem, nb, 1.d0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_left")
#endif
end subroutine wy_left
! --------------------------------------------------------------------------------------------------
subroutine wy_right(n, m, nb, A, lda, W, Y, mem, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: n !number of rows of A
integer, intent(in) :: m !number of columns of A; number of rows of W and Y
integer, intent(in) :: nb !number of columns of W and Y (number of reflectors)
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(lda2,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(lda2,nb) !blocked transformation matrix Y
real*8, intent(inout) :: mem(n,nb) !workspace for an n x nb temporary matrix
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_right")
#endif
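! mem = A * W, then A <- A - mem * Y^T, i.e. A <- A * (I - W * Y^T) = A * Q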
call DGEMM('N', 'N', n, nb, m, 1.d0, A, lda, W, lda2, 0.d0, mem, n)
call DGEMM('N', 'T', n, m, nb, -1.d0, mem, n, Y, lda2, 1.d0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_right")
#endif
end subroutine wy_right
! --------------------------------------------------------------------------------------------------
subroutine wy_symm(n, nb, A, lda, W, Y, mem, mem2, lda2)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer, intent(in) :: n !width/height of the matrix A; number of rows of W and Y
integer, intent(in) :: nb !number of columns of W and Y (number of reflectors)
integer, intent(in) :: lda !leading dimension of A
integer, intent(in) :: lda2 !leading dimension of W and Y
real*8, intent(inout) :: A(lda,*) !matrix to be transformed
real*8, intent(in) :: W(lda2,nb) !blocked transformation matrix W
real*8, intent(in) :: Y(lda2,nb) !blocked transformation matrix Y
real*8, intent(inout) :: mem(n,nb) !workspace for an n x nb temporary matrix
real*8, intent(inout) :: mem2(nb,nb) !workspace for an nb x nb temporary matrix
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("wy_symm")
#endif
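! Symmetric two-sided update A <- Q^T * A * Q on the lower triangle:
! mem = A*W (DSYMM), mem2 = W^T*A*W, mem <- mem - 0.5*Y*mem2, and finally the
! rank-2k update A <- A - Y*mem^T - mem*Y^T (DSYR2K).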
call DSYMM('L', 'L', n, nb, 1.d0, A, lda, W, lda2, 0.d0, mem, n)
call DGEMM('T', 'N', nb, nb, n, 1.d0, mem, n, W, lda2, 0.d0, mem2, nb)
call DGEMM('N', 'N', n, nb, nb, -0.5d0, Y, lda2, mem2, nb, 1.d0, mem, n)
call DSYR2K('L', 'N', n, nb, -1.d0, Y, lda2, mem, n, 1.d0, A, lda)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("wy_symm")
#endif
end subroutine wy_symm
end module ELPA2