Commit 55d014a1 authored by Andreas Marek's avatar Andreas Marek
Browse files

Unify real/complex ev_tridi_to_band

parent 51003ba0
......@@ -57,9 +57,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_symm_matrix_allreduce_real_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_template.X90 \
src/elpa2_herm_matrix_allreduce_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
......@@ -913,8 +912,7 @@ EXTRA_DIST = \
src/elpa2_template.X90 \
src/elpa2_tridiag_band_template.X90 \
src/elpa2_trans_ev_band_to_full_template.X90 \
src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
src/elpa2_trans_ev_tridi_to_band_template.X90 \
src/precision_macros.h \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
......
......@@ -69,8 +69,7 @@
#define COMPLEXCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef COMPLEXCASE
#include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90"
subroutine compute_hh_dot_products_complex_gpu_PRECISION(nbw, n)
use cuda_c_kernel
......
......@@ -69,8 +69,7 @@
#define REALCASE 1
#include "elpa2_trans_ev_band_to_full_template.X90"
#include "elpa2_tridiag_band_template.X90"
#undef REALCASE
#include "elpa2_trans_ev_tridi_to_band_real_template.X90"
#include "elpa2_trans_ev_tridi_to_band_template.X90"
......
#if REALCASE == 1
subroutine trans_ev_tridi_to_band_real_PRECISION &
#endif
#if COMPLEXCASE == 1
subroutine trans_ev_tridi_to_band_complex_PRECISION &
#endif
(na, nev, nblk, nbw, q, &
#if REALCASE == 1
q_dev, &
#endif
ldq, matrixCols, &
#if REALCASE == 1
hh_trans_real, &
#endif
#if COMPLEXCASE == 1
hh_trans_complex, &
#endif
mpi_comm_rows, mpi_comm_cols, wantDebug, useGPU, success, &
#if REALCASE == 1
THIS_REAL_ELPA_KERNEL)
#endif
#if COMPLEXCASE == 1
THIS_COMPLEX_ELPA_KERNEL)
#endif
!-------------------------------------------------------------------------------
! trans_ev_tridi_to_band_real/complex:
! Transforms the eigenvectors of a tridiagonal matrix back to the eigenvectors of the band matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nev Number eigenvectors to compute (= columns of matrix q)
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nb semi bandwith
!
! q On input: Eigenvectors of tridiagonal matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! q_dev GPU device pointer to q
!
! ldq Leading dimension of q
! matrixCols local columns of matrix q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns/both
!
!-------------------------------------------------------------------------------
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use elpa2_workload
#if REALCASE == 1
use pack_unpack_real
use pack_unpack_real_gpu
use compute_hh_trafo_real
#endif
#if COMPLEXCASE == 1
use pack_unpack_complex
use compute_hh_trafo_complex
#endif
use cuda_functions
use precision
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
#if REALCASE == 1
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
#endif
#if COMPLEXCASE == 1
integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
#endif
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE) :: q(ldq,matrixCols)
#endif
real(kind=REAL_DATATYPE), intent(in) :: hh_trans_real(:,:)
integer(kind=c_intptr_t) :: q_dev
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols)
#endif
complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:)
#endif
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
integer(kind=ik) :: tmp
integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
logical :: flag
#if REALCASE == 1
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
real(kind=REAL_DATATYPE), pointer :: aIntern(:,:,:)
#endif
real(kind=REAL_DATATYPE) :: a_real
#endif
#if COMPLEXCASE == 1
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:,:)
#else
complex(kind=COMPLEX_DATATYPE), pointer :: aIntern(:,:,:)
#endif
complex(kind=COMPLEX_DATATYPE) :: a_complex
#endif
type(c_ptr) :: aIntern_ptr
#if REALCASE == 1
real(kind=REAL_DATATYPE) , allocatable :: row(:)
real(kind=REAL_DATATYPE) , allocatable :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: row(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: row_group(:,:)
#endif
#if REALCASE == 1
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
real(kind=REAL_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
real(kind=REAL_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
#endif
integer(kind=c_intptr_t) :: aIntern_dev
integer(kind=c_intptr_t) :: bcast_buffer_dev
integer(kind=c_size_t) :: num
integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
integer(kind=c_intptr_t) :: row_dev
integer(kind=c_intptr_t) :: row_group_dev
integer(kind=c_intptr_t) :: hh_tau_dev
integer(kind=c_intptr_t) :: hh_dot_dev
integer(kind=ik) :: row_group_size, unpack_idx
#if COMPLEXCASE == 1
integer(kind=ik) :: n_times
#endif
integer(kind=ik) :: top, chunk, this_chunk
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: result_buffer(:,:,:)
real(kind=REAL_DATATYPE), allocatable :: bcast_buffer(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:)
#endif
integer(kind=ik) :: n_off
integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
! integer(kind=ik), allocatable :: mpi_statuses(:,:)
#endif
#ifdef WITH_OPENMP
#ifdef WITH_MPI
! integer(kind=ik) :: my_MPI_STATUS_(MPI_STATUS_SIZE)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: na_rows, na_cols
#endif
! MPI send/recv tags, arbitrary
integer(kind=ik), parameter :: bottom_recv_tag = 111
integer(kind=ik), parameter :: top_recv_tag = 222
integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
integer(kind=ik) :: max_threads, my_thread
integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
real(kind=c_double) :: kernel_time, kernel_time_recv ! MPI_WTIME always needs double
! long integer
integer(kind=lik) :: kernel_flops, kernel_flops_recv
logical, intent(in) :: wantDebug
logical :: success
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
#ifndef WITH_MPI
integer(kind=ik) :: j1
#endif
call timer%start("trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&_" // &
&PRECISION_SUFFIX &
)
if (useGPU) then
#if COMPLEXCASE == 1
n_times = 0
#endif
unpack_idx = 0
row_group_size = 0
endif
success = .true.
kernel_time = 0.0
kernel_flops = 0
#ifdef WITH_OPENMP
max_threads = 1
max_threads = omp_get_max_threads()
#endif
call timer%start("mpi_communication")
call MPI_Comm_rank(mpi_comm_rows, my_prow, mpierr)
call MPI_Comm_size(mpi_comm_rows, np_rows, mpierr)
call MPI_Comm_rank(mpi_comm_cols, my_pcol, mpierr)
call MPI_Comm_size(mpi_comm_cols, np_cols, mpierr)
call timer%stop("mpi_communication")
#if COMPLEXCASE == 1
if (useGPU) then
#ifdef WITH_MPI
na_rows = numroc(na, nblk, my_prow, 0, np_rows)
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_rows = na
na_cols = na
#endif
endif
#endif /* COMPLEXCASEc*/
if (mod(nbw,nblk)/=0) then
if (my_prow==0 .and. my_pcol==0) then
if (wantDebug) then
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: ERROR: nbw=',nbw,', nblk=',nblk
write(error_unit,*) 'ELPA2_trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: band backtransform works only for nbw==n*nblk'
endif
success = .false.
return
endif
endif
nfact = nbw / nblk
! local number of eigenvectors
l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
if (l_nev==0) then
#ifdef WITH_OPENMP
thread_width = 0
#endif
stripe_width = 0
stripe_count = 0
last_stripe_width = 0
else ! l_nev
#if WITH_OPENMP
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
if (useGPU) then
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
else ! useGPU
! openmp only in non-GPU case
thread_width = (l_nev-1)/max_threads + 1 ! number of eigenvectors per OMP thread
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 4
#else
stripe_width = 96 ! Must be a multiple of 8
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
#endif /* COMPLEXCASE */
stripe_count = (thread_width-1)/stripe_width + 1
! Adapt stripe width so that last one doesn't get too small
stripe_width = (thread_width-1)/stripe_count + 1
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
! (2 * sizeof(double complex) == 32)
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
#endif /* COMPLEXCASE */
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
endif ! useGPU
#else /* WITH_OPENMP */
! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
! every primary cache
! Suggested stripe width is 48 - should this be reduced for the complex case ???
if (useGPU) then
stripe_width = 256 ! Must be a multiple of 4
stripe_count = (l_nev - 1) / stripe_width + 1
else ! useGPU
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
stripe_width = 48 ! Must be a multiple of 4
#else
stripe_width = 96 ! Must be a multiple of 8
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = 48 ! Must be a multiple of 2
#else
stripe_width = 48 ! Must be a multiple of 4
#endif
#endif /* COMPLEXCASE */
stripe_count = (l_nev-1)/stripe_width + 1
! Adapt stripe width so that last one doesn't get too small
stripe_width = (l_nev-1)/stripe_count + 1
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(double) == 32)
#else
stripe_width = ((stripe_width+7)/8)*8 ! Must be a multiple of 8 because of AVX/SSE memory alignment of 32 bytes
! (8 * sizeof(float) == 32)
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 2 because of AVX/SSE memory alignment of 32 bytes
! (2 * sizeof(double complex) == 32)
#else
stripe_width = ((stripe_width+3)/4)*4 ! Must be a multiple of 4 because of AVX/SSE memory alignment of 32 bytes
! (4 * sizeof(float complex) == 32)
#endif
#endif /* COMPLEXCASE */
endif ! useGPU
last_stripe_width = l_nev - (stripe_count-1)*stripe_width
#endif /* WITH_OPENMP */
endif ! l_nev
! Determine the matrix distribution at the beginning
allocate(limits(0:np_rows), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating limits"//errorMessage
stop
endif
call determine_workload(na, nbw, np_rows, limits)
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
if (useGPU) then
num = (stripe_width*a_dim2*stripe_count)* &
#if REALCASE == 1
size_of_PRECISION_real
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex
#endif
successCUDA = cuda_malloc(aIntern_dev, stripe_width*a_dim2*stripe_count* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(aIntern_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"//errorMessage
stop
endif
num = (l_nev)* &
#if REALCASE == 1
size_of_PRECISION_real
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex
#endif
successCUDA = cuda_malloc( row_dev,num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc "
stop
endif
successCUDA = cuda_memset(row_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset "
stop
endif
! "row_group" and "row_group_dev" are needed for GPU optimizations
allocate(row_group(l_nev, nblk), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating row_group"//errorMessage
stop
endif
#if REALCASE == 1
row_group(:, :) = CONST_0_0
#endif
#if COMPLEXCASE == 1
row_group(:, :) = CONST_COMPLEX_0_0
#endif
num = (l_nev*nblk)* &
#if REALCASE == 1
size_of_PRECISION_real
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex
#endif
successCUDA = cuda_malloc(row_group_dev, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMalloc"//errorMessage
stop
endif
successCUDA = cuda_memset(row_group_dev , 0, num)
if (.not.(successCUDA)) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error in cudaMemset"//errorMessage
stop
endif
else ! GPUs are not used
#if 0
! realcase or complexcase
!DEC$ ATTRIBUTES ALIGN: 64:: aIntern
#endif
#ifdef WITH_OPENMP
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count*max_threads* &
#if REALCASE == 1
C_SIZEOF(a_real)) /= 0) then
#endif
#if COMPLEXCASE == 1
C_SIZEOF(a_complex)) /= 0) then
#endif
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&^: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern, [stripe_width,a_dim2,stripe_count,max_threads])
! allocate(aIntern(stripe_width,a_dim2,stripe_count,max_threads), stat=istat, errmsg=errorMessage)
! aIntern(:,:,:,:) should be set to 0 in a parallel region, not here!
#else
if (posix_memalign(aIntern_ptr, 64_C_SIZE_T, stripe_width*a_dim2*stripe_count* &
#if REALCASE == 1
C_SIZEOF(a_real)) /= 0) then
#endif
#if COMPLEXCASE == 1
C_SIZEOF(a_complex)) /= 0) then
#endif
print *,"trans_ev_tridi_to_band_real: error when allocating aIntern"//errorMessage
stop
endif
call c_f_pointer(aIntern_ptr, aIntern,[stripe_width,a_dim2,stripe_count] )
!allocate(aIntern(stripe_width,a_dim2,stripe_count), stat=istat, errmsg=errorMessage)
#if REALCASE == 1
aIntern(:,:,:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
aIntern(:,:,:) = 0
#endif
#endif /* WITH_OPENMP */
endif !useGPU
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_&
&MATH_DATATYPE&
&: error when allocating row"//errorMessage
stop
endif
#if REALCASE == 1
row(:) = CONST_0_0
#endif
#if COMPLEXCASE == 1
row(:) = 0
#endif