Commit b6283c67 authored by Andreas Marek

Remove obsolete files from unify of real/complex trans_ev_tridi_to_band

parent 3a63d86a
subroutine trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_complex:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number of eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw Semi bandwidth of the band matrix
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
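! Example usage (illustrative sketch only): the actual call is made from the
! ELPA 2-stage solver; the communicators, the kernel constant and all arrays
! are assumed to be set up by that driver and are not defined in this file.
!
!   call trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, matrixCols, &
!        hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
!        wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
!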
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa2_workload
use pack_unpack_complex
use compute_hh_trafo_complex
use precision
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols)
#endif
complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: tmp
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:)
#endif
complex(kind=COMPLEX_DATATYPE) :: a_complex
type(c_ptr) :: aIntern_ptr
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
integer(kind=ik) :: n_times
integer(kind=ik) :: top, chunk, this_chunk
complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
! integer(kind=ik), allocatable :: mpi_statuses(:,:)
#ifdef WITH_MPI
! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE)
#endif
#endif
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: na_rows, na_cols
! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
! t0_mpi_time, t1_mpi_time,t2_mpi_time
! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
! t1_mpi_recv_time,t2_mpi_recv_time
! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
! t2_mpi_outer_wait_time, time0
! real*4 :: time1
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
logical, intent(in) :: wantDebug
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
logical :: success
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
call timer%start("trans_ev_tridi_to_band_complex" // PRECISION_SUFFIX)
if (useGPU) then
n_times =0
! n_times_1 =0
unpack_idx = 0
row_group_size = 0
! time0=0
! t0_compute_kernel=0
endif
kernel_time = 0.0
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call timer%start("mpi_communication")
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
call timer%stop("mpi_communication")
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_rows = na
na_cols = na
#endif
endif
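! numroc is the ScaLAPACK helper returning how many rows/columns of a
! block-cyclically distributed matrix the calling process owns. As a worked
! example: numroc(10, 2, 0, 0, 2) = 6 and numroc(10, 2, 1, 0, 2) = 4, since
! the five blocks of size 2 are dealt round-robin to the two process rows.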
success = .true.
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
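! Example: nbw = 64 with nblk = 16 passes this check (giving nfact = 4 below),
! whereas nbw = 60 with nblk = 16 would abort here, since the band
! backtransformation assumes nbw to be an integer multiple of nblk.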
nfact = nbw / nblk
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
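! l_nev is the number of eigenvector columns this process column stores in the
! block-cyclic layout. Sketch of the intended semantics of local_index (defined
! elsewhere in ELPA): with nev = 100, nblk = 16 and np_cols = 4, process column 0
! holds global columns 1-16 and 65-80, i.e. l_nev = 32 there.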
if (l_nev==0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 - should this be reduced for the complex case ???
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
if (useGPU) then
stripe_width = 256
else
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_count = (thread_width-1)/stripe_width + 1
#else /* WITH_OPENMP */
stripe_count = (l_nev-1)/stripe_width + 1
#endif /* WITH_OPENMP */
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_width = (thread_width-1)/stripe_count + 1
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
stripe_width = (l_nev-1)/stripe_count + 1
endif
#endif /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = ((stripe_width+3)/4)*4 ! Rounded up to a multiple of 4; the 32-byte AVX/SSE memory alignment
! only requires a multiple of 2 here (2 * sizeof(double complex) == 32)
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
endif
#ifndef WITH_OPENMP
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif /* WITH_OPENMP */
endif
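! Arithmetic note: with Fortran integer division, ((x+3)/4)*4 rounds x up to the
! next multiple of 4, e.g. x = 49 yields (52/4)*4 = 52 while x = 48 stays 48;
! this is what enforces the alignment requirements stated in the comments above.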
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
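! determine_workload (from elpa2_workload) is expected to split the na rows into
! np_rows consecutive slices with boundaries limits(0) = 0, ..., limits(np_rows) = na,
! so max_blk_size is the largest such slice; a_dim2 reserves nbw additional rows of
! workspace on top of it (description inferred from the usage in this routine).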
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
if (.not.(useGPU)) then
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads] )
! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count] )
! allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
aIntern(:,:,:) = 0
endif
#endif /* WITH_OPENMP */
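! Note: the commented-out allocate statements above are the plain Fortran
! equivalent of the posix_memalign/c_f_pointer pair used here; the explicit
! call is used to guarantee 64-byte alignment of aIntern (cf. the !DEC$
! ATTRIBUTES ALIGN directive above), which a plain allocate does not promise.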
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage
stop
endif
row(:) = 0
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)*size_of_PRECISION_complex
if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
print *,"trans_ev_tridi_to_band_complex aIntern_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
! stop
endif
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
if (num .gt. na_rows * na_cols) then
print *,"trans_ev_tridi_to_band_complex aIntern_dev 1",num, na_rows * na_cols
! stop
endif
successCUDA = cuda_memset(aIntern_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
num = (l_nev)*size_of_PRECISION_complex
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage
stop
endif
row_group(:, :) = CONST_COMPLEX_0_0
num = (l_nev*nblk)*size_of_PRECISION_complex
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
endif ! useGPU
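! Summary of the GPU path set up above: instead of unpacking rows one by one,
! up to nblk rows are collected in row_group on the host and moved to
! row_group_dev in batches; the unpack_and_prepare_row_group_* calls below
! (from pack_unpack_complex) presumably update row_group_size accordingly.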
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first since it is the first one to start below
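! In the redistribution loop below, global row i of q lives on process row
! src = mod((i-1)/nblk, np_rows) in the block-cyclic input layout, while the
! target layout gives process row ip the contiguous range limits(ip)+1 .. limits(ip+1);
! the rows are exchanged point-to-point with MPI_Send/MPI_Recv (tag 0) on mpi_comm_rows.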
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! Note on the OpenMP usage below:
! this is not done for speed, but so that the matrix aIntern ends up in the memory
! and in the cache of the correct thread (first-touch placement, if possible)
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
aIntern(:,:,:,my_thread) = CONST_COMPLEX_0_0 ! if possible, do first touch allocation!
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#endif /* WITH_OPENMP */
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, &
MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(ip), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, &
mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
row(:) = q(src_offset, 1:l_nev)
endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been sent
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if(mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Send(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, dst, 0, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
enddo
enddo
else if(my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Send(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, ip, 0, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, &
MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(my_prow), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, &
mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! todo: which of these is correct? This differed between the double and single precision versions
#ifdef DOUBLE_PRECISION_COMPLEX
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#else
row_group(1:l_nev,row_group_size) = row_group(1:l_nev,row_group_size) ! is this correct
#endif
#endif
else
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(my_prow),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */