elpa / elpa · Commits · b6283c67

Commit b6283c67
Authored Jan 25, 2017 by Andreas Marek

Remove obsolete files from unify of real/complex trans_ev_tridi_to_band

Parent: 3a63d86a
Changes: 2 · Pipelines: 1

Deleted file (100644 → 0): src/elpa2_trans_ev_tridi_to_band_complex_template.X90
subroutine trans_ev_tridi_to_band_complex_PRECISION(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
wantDebug, useGPU, success, THIS_COMPLEX_ELPA_KERNEL)
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_complex:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number of eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw Semi bandwidth of the band matrix
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in ScaLAPACK.
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI communicators for rows/columns
!
!-------------------------------------------------------------------------------
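! For orientation only: a sketch of what a call might look like after template
! expansion, where the _double suffix and all argument values are assumed examples
! rather than anything prescribed by this file:
!
!   call trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, matrixCols, &
!                                              hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
!                                              wantDebug=.false., useGPU=.false., success=success, &
!                                              THIS_COMPLEX_ELPA_KERNEL=kernel)
!
! with e.g. nblk = 16 and nbw = 64; nbw must be a multiple of nblk (see the check below).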
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa2_workload
use pack_unpack_complex
use compute_hh_trafo_complex
use precision
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols)
#endif
complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: tmp
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:)
#endif
complex(kind=COMPLEX_DATATYPE) :: a_complex
type(c_ptr) :: aIntern_ptr
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
integer(kind=ik) :: n_times
integer(kind=ik) :: top, chunk, this_chunk
complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:)
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
! integer(kind=ik), allocatable :: mpi_statuses(:,:)
#ifdef WITH_MPI
! integer(kind=ik) :: my_mpi_status(MPI_STATUS_SIZE)
#endif
#endif
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: na_rows, na_cols
! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
! t0_mpi_time, t1_mpi_time,t2_mpi_time
! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
! t1_mpi_recv_time,t2_mpi_recv_time
! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
! t2_mpi_outer_wait_time, time0
! real*4 :: time1
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
logical, intent(in) :: wantDebug
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
logical :: success
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
call timer%start("trans_ev_tridi_to_band_complex" // PRECISION_SUFFIX)
if (useGPU) then
n_times =0
! n_times_1 =0
unpack_idx = 0
row_group_size = 0
! time0=0
! t0_compute_kernel=0
endif
kernel_time = 0.0
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call timer%start("mpi_communication")
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
call timer%stop("mpi_communication")
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_rows = na
na_cols = na
#endif
endif
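! numroc is the standard ScaLAPACK helper: it returns how many of the na global
! rows/columns are stored locally by this process row/column under the block-cyclic
! distribution with block size nblk (first block owned by process 0). Without MPI the
! whole matrix is local, hence na_rows = na_cols = na.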
success = .true.
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_complex: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
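! Worked example of the check above (numbers are illustrative only): with nblk = 16
! the backtransform accepts nbw = 32, 48, 64, ... giving nfact = 2, 3, 4, ...,
! whereas nbw = 40 would fail mod(nbw,nblk) /= 0 and return with success = .false.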
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else
! Suggested stripe width is 48 - should this be reduced for the complex case ???
#ifdef WITH_OPENMP
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#endif
if (useGPU) then
stripe_width = 256
else
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_count = (thread_width-1)/stripe_width + 1
#else /* WITH_OPENMP */
stripe_count = (l_nev-1)/stripe_width + 1
#endif /* WITH_OPENMP */
! Adapt stripe width so that last one doesn't get too small
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
stripe_width = (thread_width-1)/stripe_count + 1
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
stripe_width = (l_nev-1)/stripe_count + 1
endif
#endif /* WITH_OPENMP */
if (.not.(useGPU)) then
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
! (2 * sizeof(double complex) == 32)
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
endif
#ifndef WITH_OPENMP
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif /* WITH_OPENMP */
endif
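! Worked example of the CPU stripe sizing above (illustrative numbers; non-OpenMP,
! double-precision case): for l_nev = 1000 and the suggested stripe_width = 48,
!   stripe_count      = (1000-1)/48 + 1 = 21
!   stripe_width      = (1000-1)/21 + 1 = 48   (already a multiple of 4)
!   last_stripe_width = 1000 - 20*48    = 40
! so only the last stripe ends up narrower than the others.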
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error when allocating limits "//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
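! determine_workload (presumably in units of nbw) splits the na rows over the np_rows
! process rows: limits(ip+1)-limits(ip) is the row block assigned to process row ip,
! max_blk_size is the largest such block, and a_dim2 adds one extra bandwidth nbw of
! workspace rows on top of it.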
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
if (.not.(useGPU)) then
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads] )
! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
endif
#else /* OpenMP */
if (.not.(useGPU)) then
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*C_SIZEOF(a_complex)) /= 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating a "//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count] )
! allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
aIntern(:,:,:) = 0
endif
#endif /* WITH_OPENMP */
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row "//errorMessage
stop
endif
row(:) = 0
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)*size_of_PRECISION_complex
if (na_rows * na_cols .lt. stripe_width*a_dim2*stripe_count) then
print *,"trans_ev_tridi_to_band_complex aIntern_dev ",na_rows * na_cols, stripe_width*a_dim2*stripe_count
! stop
endif
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
if (num .gt. na_rows * na_cols) then
print *,"trans_ev_tridi_to_band_complex aIntern_dev 1",num, na_rows * na_cols
! stop
endif
successCUDA = cuda_memset(aIntern_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
num = (l_nev)*size_of_PRECISION_complex
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
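! (As far as this template goes, the idea seems to be that incoming rows are collected
! in the host buffer row_group until up to nblk of them are available, and are then
! copied to row_group_dev and unpacked on the device in one batch instead of
! transferring every row to the GPU individually.)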
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_complex: error allocating row_group "//errorMessage
stop
endif
row_group(:, :) = CONST_COMPLEX_0_0
num = (l_nev*nblk)*size_of_PRECISION_complex
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_complex: error in cudaMemset "
stop
endif
endif ! useGPU
! Copy q from a block cyclic distribution into a distribution with contiguous rows,
! and transpose the matrix using stripes of given stripe_width for cache blocking.
! The peculiar way it is done below is due to the fact that the last row should be
! ready first, since it is the first one needed when the computation below starts.
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
! A note on the OMP usage below:
! This is not for speed, but because we want the matrix a in the memory and
! in the cache of the correct thread (if possible)
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
aIntern(:,:,:,my_thread) = CONST_COMPLEX_0_0 ! if possible, do first touch allocation!
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#endif /* WITH_OPENMP */
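! (First-touch note: on NUMA systems the zero-initialisation above makes each thread
! touch its own slice of aIntern first, so the backing pages tend to be placed on that
! thread's NUMA node; this is why the initialisation happens here in a parallel region
! rather than at allocation time.)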
do ip = np_rows-1, 0, -1
if (my_prow == ip) then
! Receive my rows which have not yet been received
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src < my_prow) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, &
MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(ip), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, &
mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
row_group(1:l_nev, row_group_size) = row(1:l_nev) ! is this correct?
#endif
else
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(ip),.false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else
row(:) = q(src_offset, 1:l_nev)
endif
#ifdef WITH_OPENMP
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been sent
src_offset = 0
do dst = 0, ip-1
do i=limits(dst)+1,limits(dst+1)
if(mod((i-1)/nblk, np_rows) == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Send(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, dst, 0, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
enddo
enddo
else if(my_prow < ip) then
! Send all rows going to PE ip
src_offset = local_index(limits(ip), my_prow, np_rows, nblk, -1)
do i=limits(ip)+1,limits(ip+1)
src = mod((i-1)/nblk, np_rows)
if (src == my_prow) then
src_offset = src_offset+1
row(:) = q(src_offset, 1:l_nev)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Send(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, ip, 0, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
endif
enddo
! Receive all rows from PE ip
do i=limits(my_prow)+1,limits(my_prow+1)
src = mod((i-1)/nblk, np_rows)
if (src == ip) then
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, &
MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! row(1:l_nev) = row(1:l_nev)
#endif /* WITH_MPI */
#else /* WITH_OPENMP */
if (useGPU) then
call unpack_and_prepare_row_group_complex_gpu_PRECISION(i - limits(my_prow), .false.)
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row_group(:, row_group_size), l_nev,MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! todo: which of these is correct? Was different for double/single precision
#ifdef DOUBLE_PRECISION_COMPLEX
row_group(1:l_nev,row_group_size) = row(1:l_nev) ! is this correct ?
#else
row_group(1:l_nev,row_group_size) = row_group(1:l_nev,row_group_size) ! is this correct
#endif
#endif
else
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Recv(row, l_nev, MPI_COMPLEX_EXPLICIT_PRECISION, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call timer%stop("mpi_communication")
#else
! row(1:l_nev) = row(1:l_nev)
#endif
endif
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_complex: not yet implemented"
stop
endif
call timer%start("OpenMP parallel_PRECISION")
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
call unpack_row_complex_cpu_openmp_PRECISION(aIntern, row,i-limits(my_prow),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo
!$omp end parallel do
call timer%stop("OpenMP parallel_PRECISION")
#else /* WITH_OPENMP */
if (.not.(useGPU)) then
call unpack_row_complex_cpu_PRECISION(aIntern, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif
#endif /* WITH_OPENMP */