Commit a0934d4e authored by Andreas Marek
Browse files

Merge branch 'master' into ELPA_GPU

parents 98a4db33 33a94bfc
...@@ -17,6 +17,11 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \ ...@@ -17,6 +17,11 @@ libelpa@SUFFIX@_la_SOURCES = src/mod_precision.f90 \
src/check_for_gpu.F90 \ src/check_for_gpu.F90 \
src/mod_cuda.F90 \ src/mod_cuda.F90 \
src/interface_c_kernel.F90 \ src/interface_c_kernel.F90 \
src/mod_pack_unpack_real.F90 \
src/elpa2_kernels/mod_single_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_real.F90 \
src/mod_compute_hh_trafo_complex.F90 \
src/mod_pack_unpack_complex.F90 \
src/elpa2_compute.F90 \ src/elpa2_compute.F90 \
src/elpa2.F90 \ src/elpa2.F90 \
src/elpa_c_interface.F90 \ src/elpa_c_interface.F90 \
...@@ -314,6 +319,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90 ...@@ -314,6 +319,9 @@ elpa2.i: $(top_srcdir)/src/elpa2.F90
elpa1.i: $(top_srcdir)/src/elpa1.F90 elpa1.i: $(top_srcdir)/src/elpa1.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@ $(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1.F90 -o $@
# Preprocess src/mod_compute_hh_trafo_real.F90 into mod_compute_hh_trafo_real.i.
# Both the build tree and the source tree are placed on the include path,
# matching the sibling elpa1.i rule, so that headers located in either tree
# can be found when building outside the source directory.
mod_compute_hh_trafo_real.i: $(top_srcdir)/src/mod_compute_hh_trafo_real.F90
	$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_compute_hh_trafo_real.F90 -o $@
include doxygen.am include doxygen.am
CLEANFILES = \ CLEANFILES = \
......
...@@ -298,6 +298,8 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp ...@@ -298,6 +298,8 @@ function solve_evp_real_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mp
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) real(kind=rk) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
! was
! real a(lda,*), q(ldq,*)
integer(kind=ik) :: my_prow, my_pcol, mpierr integer(kind=ik) :: my_prow, my_pcol, mpierr
real(kind=rk), allocatable :: e(:), tau(:) real(kind=rk), allocatable :: e(:), tau(:)
...@@ -397,6 +399,8 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, ...@@ -397,6 +399,8 @@ function solve_evp_complex_1stage(na, nev, a, lda, ev, q, ldq, nblk, matrixCols,
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols) complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols)
! was
! complex a(lda,*), q(ldq,*)
real(kind=rk) :: ev(na) real(kind=rk) :: ev(na)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
......
...@@ -138,6 +138,8 @@ module ELPA1_compute ...@@ -138,6 +138,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), d(na), e(na), tau(na) real(kind=rk) :: a(lda,matrixCols), d(na), e(na), tau(na)
! was
! real a(lda,*)
integer(kind=ik), parameter :: max_stored_rows = 32 integer(kind=ik), parameter :: max_stored_rows = 32
...@@ -479,6 +481,8 @@ module ELPA1_compute ...@@ -479,6 +481,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na) real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! real a(lda,*), q(ldq,*)
integer(kind=ik) :: max_stored_rows integer(kind=ik) :: max_stored_rows
...@@ -911,6 +915,8 @@ module ELPA1_compute ...@@ -911,6 +915,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), tau(na) complex(kind=ck) :: a(lda,matrixCols), tau(na)
! was
! complex a(lda,*)
real(kind=rk) :: d(na), e(na) real(kind=rk) :: d(na), e(na)
integer(kind=ik), parameter :: max_stored_rows = 32 integer(kind=ik), parameter :: max_stored_rows = 32
...@@ -1278,6 +1284,8 @@ module ELPA1_compute ...@@ -1278,6 +1284,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na) complex(kind=ck) :: a(lda,matrixCols), q(ldq,matrixCols), tau(na)
! was
! complex a(lda,*), q(ldq,*)
integer(kind=ik) :: max_stored_rows integer(kind=ik) :: max_stored_rows
...@@ -1678,6 +1686,8 @@ module ELPA1_compute ...@@ -1678,6 +1686,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, nev, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols) real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows integer(kind=ik) :: i, j, n, np, nc, nev1, l_cols, l_rows
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
...@@ -1902,6 +1912,8 @@ module ELPA1_compute ...@@ -1902,6 +1912,8 @@ module ELPA1_compute
integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows integer(kind=ik) :: na, nev, nqoff, ldq, nblk, matrixCols, mpi_comm_rows
real(kind=rk) :: d(na), e(na), q(ldq,matrixCols) real(kind=rk) :: d(na), e(na), q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik), parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used integer(kind=ik), parameter :: min_submatrix_size = 16 ! Minimum size of the submatrices to be used
...@@ -2175,6 +2187,8 @@ module ELPA1_compute ...@@ -2175,6 +2187,8 @@ module ELPA1_compute
mpi_comm_cols, npc_0, npc_n mpi_comm_cols, npc_0, npc_n
integer(kind=ik) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na) integer(kind=ik) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=rk) :: d(na), e, q(ldq,matrixCols) real(kind=rk) :: d(na), e, q(ldq,matrixCols)
! was
! real q(ldq,*)
integer(kind=ik), parameter :: max_strip=128 integer(kind=ik), parameter :: max_strip=128
...@@ -3309,6 +3323,8 @@ module ELPA1_compute ...@@ -3309,6 +3323,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols) real(kind=rk) :: a(lda,matrixCols)
! was
! real a(lda, *)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
...@@ -3490,6 +3506,8 @@ module ELPA1_compute ...@@ -3490,6 +3506,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols) real(kind=rk) :: a(lda,matrixCols)
! was
! real a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
...@@ -3626,6 +3644,8 @@ module ELPA1_compute ...@@ -3626,6 +3644,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols) complex(kind=ck) :: a(lda,matrixCols)
! was
! complex a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
...@@ -3803,6 +3823,8 @@ module ELPA1_compute ...@@ -3803,6 +3823,8 @@ module ELPA1_compute
integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
complex(kind=ck) :: a(lda,matrixCols) complex(kind=ck) :: a(lda,matrixCols)
! was
! complex a(lda,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx integer(kind=ik) :: l_cols, l_rows, l_col1, l_row1, l_colx, l_rowx
......
...@@ -156,6 +156,8 @@ contains ...@@ -156,6 +156,8 @@ contains
mpi_comm_cols, mpi_comm_all mpi_comm_cols, mpi_comm_all
integer(kind=ik), intent(in) :: nblk integer(kind=ik), intent(in) :: nblk
real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols) real(kind=rk), intent(inout) :: a(lda,matrixCols), ev(na), q(ldq,matrixCols)
! was
! real a(lda,*), q(ldq,*)
real(kind=rk), allocatable :: hh_trans_real(:,:) real(kind=rk), allocatable :: hh_trans_real(:,:)
integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
...@@ -429,6 +431,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, & ...@@ -429,6 +431,8 @@ function solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, &
integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL integer(kind=ik) :: THIS_COMPLEX_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all integer(kind=ik), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols) complex(kind=ck), intent(inout) :: a(lda,matrixCols), q(ldq,matrixCols)
! was
! complex a(lda,*), q(ldq,*)
real(kind=rk), intent(inout) :: ev(na) real(kind=rk), intent(inout) :: ev(na)
complex(kind=ck), allocatable :: hh_trans_complex(:,:) complex(kind=ck), allocatable :: hh_trans_complex(:,:)
......
...@@ -143,6 +143,8 @@ module ELPA2_compute ...@@ -143,6 +143,8 @@ module ELPA2_compute
integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks) real(kind=rk) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
! was
! real a(lda,*), tmat(nbw,nbw,*)
real(kind=rk) :: eps real(kind=rk) :: eps
logical, intent(in) :: useGPU logical, intent(in) :: useGPU
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
...@@ -625,9 +627,7 @@ module ELPA2_compute ...@@ -625,9 +627,7 @@ module ELPA2_compute
else else
if (l_rows>0) & if (l_rows>0) &
call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCPU,ubound(vmrCPU,dim=1),0.d0,vav,ubound(vav,dim=1)) call dsyrk('U','T',n_cols,l_rows,1.d0,vmrCPU,ubound(vmrCPU,dim=1),0.d0,vav,ubound(vav,dim=1))
endif endif
call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows) call symm_matrix_allreduce(n_cols,vav, nbw, nbw,mpi_comm_rows)
! Calculate triangular matrix T for block Householder Transformation ! Calculate triangular matrix T for block Householder Transformation
...@@ -1238,6 +1238,8 @@ module ELPA2_compute ...@@ -1238,6 +1238,8 @@ module ELPA2_compute
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks) real(kind=rk) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
! was
! real a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, & integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
...@@ -1689,6 +1691,8 @@ module ELPA2_compute ...@@ -1689,6 +1691,8 @@ module ELPA2_compute
integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
real(kind=rk), intent(in) :: a(lda,matrixCols) real(kind=rk), intent(in) :: a(lda,matrixCols)
! was
! real a(lda,*)
real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0 real(kind=rk), intent(out) :: d(na), e(na) ! set only on PE 0
real(kind=rk), intent(out), & real(kind=rk), intent(out), &
allocatable :: hh_trans_real(:,:) allocatable :: hh_trans_real(:,:)
...@@ -2470,12 +2474,16 @@ module ELPA2_compute ...@@ -2470,12 +2474,16 @@ module ELPA2_compute
#endif #endif
use cuda_functions use cuda_functions
use precision use precision
use pack_unpack_real
use compute_hh_trafo_real
implicit none implicit none
logical, intent(in) :: useGPU logical, intent(in) :: useGPU
integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL
integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
real(kind=rk) :: q(ldq,matrixCols) real(kind=rk) :: q(ldq,matrixCols)
! was
! real q(ldq,*)
real(kind=rk), intent(out) :: hh_trans_real(:,:) real(kind=rk), intent(out) :: hh_trans_real(:,:)
integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
...@@ -2776,7 +2784,8 @@ module ELPA2_compute ...@@ -2776,7 +2784,8 @@ module ELPA2_compute
!$omp parallel do private(my_thread), schedule(static, 1) !$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(ip),my_thread) call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, stripe_count, &
thread_width, stripe_width, l_nev)
enddo enddo
!$omp end parallel do !$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -2791,7 +2800,7 @@ module ELPA2_compute ...@@ -2791,7 +2800,7 @@ module ELPA2_compute
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else else
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call unpack_row_real_cpu(row,i-limits(ip)) call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif endif
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
...@@ -2812,7 +2821,8 @@ module ELPA2_compute ...@@ -2812,7 +2821,8 @@ module ELPA2_compute
!$omp parallel do private(my_thread), schedule(static, 1) !$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(ip),my_thread) call unpack_row_real_cpu_openmp(a, row,i-limits(ip),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo enddo
!$omp end parallel do !$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -2820,13 +2830,12 @@ module ELPA2_compute ...@@ -2820,13 +2830,12 @@ module ELPA2_compute
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
if (useGPU) then if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row ! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.) call unpack_and_prepare_row_group_real_gpu(i - limits(ip), .false.)
row_group(:, row_group_size) = q(src_offset, 1:l_nev) row_group(:, row_group_size) = q(src_offset, 1:l_nev)
else else
call unpack_row_real_cpu(row,i-limits(ip)) call unpack_row_real_cpu(a, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
endif endif
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
endif endif
...@@ -2869,7 +2878,8 @@ module ELPA2_compute ...@@ -2869,7 +2878,8 @@ module ELPA2_compute
!$omp parallel do private(my_thread), schedule(static, 1) !$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads do my_thread = 1, max_threads
call unpack_row_real_cpu_openmp(row,i-limits(my_prow),my_thread) call unpack_row_real_cpu_openmp(a, row,i-limits(my_prow),my_thread, &
stripe_count, thread_width, stripe_width, l_nev)
enddo enddo
!$omp end parallel do !$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -2877,14 +2887,13 @@ module ELPA2_compute ...@@ -2877,14 +2887,13 @@ module ELPA2_compute
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
if (useGPU) then if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row ! An unpacking of the current row group may occur before queuing the next row
call unpack_and_prepare_row_group_real_gpu(i - limits(my_prow), .false.) call unpack_and_prepare_row_group_real_gpu(i - limits(my_prow), .false.)
call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call MPI_Recv(row_group(:, row_group_size), l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
else else
call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr) call MPI_Recv(row, l_nev, MPI_REAL8, src, 0, mpi_comm_rows, MPI_STATUS_IGNORE, mpierr)
call unpack_row_real_cpu(row,i-limits(my_prow)) call unpack_row_real_cpu(a, row,i-limits(my_prow), stripe_count, stripe_width, last_stripe_width)
endif endif
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
...@@ -3084,7 +3093,6 @@ module ELPA2_compute ...@@ -3084,7 +3093,6 @@ module ELPA2_compute
current_tv_off = 0 ! Offset of next row to be broadcast current_tv_off = 0 ! Offset of next row to be broadcast
! ------------------- start of work loop ------------------- ! ------------------- start of work loop -------------------
a_off = 0 ! offset in A (to avoid unnecessary shifts) a_off = 0 ! offset in A (to avoid unnecessary shifts)
...@@ -3300,7 +3308,9 @@ module ELPA2_compute ...@@ -3300,7 +3308,9 @@ module ELPA2_compute
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif endif
call compute_hh_trafo_real(0, current_local_n, i, my_thread, & call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, my_thread, &
THIS_REAL_ELPA_KERNEL) THIS_REAL_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
...@@ -3309,8 +3319,10 @@ module ELPA2_compute ...@@ -3309,8 +3319,10 @@ module ELPA2_compute
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
call compute_hh_trafo_real(0, current_local_n, i, & call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, &
THIS_REAL_ELPA_KERNEL) a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!send_b !send_b
...@@ -3365,7 +3377,9 @@ module ELPA2_compute ...@@ -3365,7 +3377,9 @@ module ELPA2_compute
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1) !$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads do my_thread = 1, max_threads
call compute_hh_trafo_real(current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, & call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
THIS_REAL_ELPA_KERNEL) THIS_REAL_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
...@@ -3384,8 +3398,10 @@ module ELPA2_compute ...@@ -3384,8 +3398,10 @@ module ELPA2_compute
top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr) top_recv_tag, mpi_comm_rows, bottom_send_request(i), mpierr)
endif endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
call compute_hh_trafo_real(current_local_n - bottom_msg_length, bottom_msg_length, i, & call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, &
THIS_REAL_ELPA_KERNEL) a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
!send_b !send_b
call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr) call MPI_Wait(bottom_send_request(i), MPI_STATUS_IGNORE, mpierr)
...@@ -3421,7 +3437,9 @@ module ELPA2_compute ...@@ -3421,7 +3437,9 @@ module ELPA2_compute
!$omp parallel do private(my_thread), schedule(static, 1) !$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads do my_thread = 1, max_threads
call compute_hh_trafo_real(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, & call compute_hh_trafo_real_cpu_openmp(a,stripe_width,a_dim2,stripe_count, max_threads, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
THIS_REAL_ELPA_KERNEL) THIS_REAL_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
...@@ -3430,9 +3448,10 @@ module ELPA2_compute ...@@ -3430,9 +3448,10 @@ module ELPA2_compute
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
call compute_hh_trafo_real(top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, & call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, &
THIS_REAL_ELPA_KERNEL) a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
!wait_t !wait_t
...@@ -3480,7 +3499,9 @@ module ELPA2_compute ...@@ -3480,7 +3499,9 @@ module ELPA2_compute
a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = & a(1:csw,a_off+1:a_off+top_msg_length,i,my_thread) = &
reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /)) reshape(top_border_recv_buffer(b_off+1:b_off+b_len,i), (/ csw, top_msg_length /))
endif endif
call compute_hh_trafo_real(0, top_msg_length, i, my_thread, THIS_REAL_ELPA_KERNEL) call compute_hh_trafo_real_cpu_openmp(a, stripe_width,a_dim2,stripe_count, max_threads, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, my_thread, THIS_REAL_ELPA_KERNEL)
enddo enddo
!$omp end parallel do !$omp end parallel do
#ifdef HAVE_DETAILED_TIMINGS #ifdef HAVE_DETAILED_TIMINGS
...@@ -3488,7 +3509,10 @@ module ELPA2_compute ...@@ -3488,7 +3509,10 @@ module ELPA2_compute
#endif #endif
#else /* WITH_OPENMP */ #else /* WITH_OPENMP */
call compute_hh_trafo_real(0, top_msg_length, i, THIS_REAL_ELPA_KERNEL) call compute_hh_trafo_real_cpu(a, stripe_width,a_dim2,stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif /* WITH_OPENMP */ #endif /* WITH_OPENMP */
endif endif
...@@ -3637,7 +3661,7 @@ module ELPA2_compute ...@@ -3637,7 +3661,7 @@ module ELPA2_compute
call pack_row_group_real_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk) call pack_row_group_real_gpu(result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
else else
do i = 1, nblk