Commit cfa307bb authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master_pre_stage' into skew

parents 5d0b533f fa78e003
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -148,7 +148,7 @@ def set_cflags_fcflags(instr, cc, fc, instruction_set): ...@@ -148,7 +148,7 @@ def set_cflags_fcflags(instr, cc, fc, instruction_set):
FCFLAGS += "-O3 -xMIC-AVX512" FCFLAGS += "-O3 -xMIC-AVX512"
if (instr == "avx2"): if (instr == "avx2"):
INSTRUCTION_OPTIONS = instruction_set[instr] INSTRUCTION_OPTIONS = instruction_set[instr] + " --disable-avx512"
if (cc == "gnu"): if (cc == "gnu"):
CFLAGS += "-O3 -mavx2 -mfma" CFLAGS += "-O3 -mavx2 -mfma"
else: else:
......
...@@ -1313,6 +1313,29 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then ...@@ -1313,6 +1313,29 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build]) AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1 ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1 ELPA_2STAGE_REAL_GPU_COMPILED=1
dnl Optional NVTX profiling support for the GPU build.
dnl --enable-nvtx (default: no) requests the NVTX range wrappers; when it is
dnl requested, libnvToolsExt must be linkable or configure aborts.
AC_MSG_CHECKING([whether --enable-nvtx is specified])
AC_ARG_ENABLE([nvtx],
AS_HELP_STRING([--enable-nvtx],
[build and install nvtx wrapper for profiling the GPU version, default no.]),
[
if test x"$enableval" = x"yes"; then
enable_nvtx=yes
else
enable_nvtx=no
fi
],
[enable_nvtx=no])
AC_MSG_RESULT([${enable_nvtx}])
if test x"${enable_nvtx}" = x"yes"; then
dnl nvToolsExt is a C library, so the link test is done with the C compiler.
AC_LANG_PUSH([C])
AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no])
if test x"${have_nvtoolsext}" = x"no"; then
AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or do not use --enable-nvtx])
fi
AC_LANG_POP([C])
dnl Only record WITH_NVTX once the library is known to link.
AC_DEFINE([WITH_NVTX],[1],[enable NVTX support])
fi
else else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0 ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0 ELPA_2STAGE_REAL_GPU_COMPILED=0
......
...@@ -465,8 +465,41 @@ module cuda_functions ...@@ -465,8 +465,41 @@ module cuda_functions
end interface end interface
#ifdef WITH_NVTX
! NVTX profiling interfaces
! Explicit interfaces to the C symbols nvtxRangePushA and nvtxRangePop.
! They are only compiled when WITH_NVTX is defined.
! NOTE(review): the NVTX C prototypes return an int; binding them as
! subroutines discards that value - confirm this is acceptable on all
! supported platforms.
interface nvtxRangePushA
! name: assumed-size array of single C characters handed to the C routine.
! NOTE(review): presumably this has to be a NUL-terminated C string; the
! wrapper nvtxRangePush in this module appends char(0) before calling.
subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
use iso_c_binding
character(kind=C_CHAR,len=1) :: name(*)
end subroutine
end interface
interface nvtxRangePop
! Takes no arguments; the visible callers invoke it once after each
! nvtxRangePush to close the range.
subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
end subroutine
end interface
#endif
contains contains
#ifdef WITH_NVTX
! Wrapper around the C binding nvtxRangePushA.
! It is needed for the string conversion: the Fortran string range_name is
! copied character by character into an array of C characters and a
! terminating NUL is appended before the array is passed on.
!
! range_name (in): label of the range to open; every character, including
!                  trailing blanks, is forwarded unchanged.
subroutine nvtxRangePush(range_name)
  ! Import the interoperable kinds locally so the routine does not depend on
  ! what the enclosing module happens to make visible.
  use, intrinsic :: iso_c_binding, only : c_char, c_null_char
  implicit none
  character(len=*), intent(in) :: range_name
  ! One extra element for the terminating NUL character.
  character(kind=c_char,len=1), dimension(len(range_name)+1) :: c_name
  integer :: i

  do i = 1, len(range_name)
    c_name(i) = range_name(i:i)
  end do
  ! c_null_char is the interoperable C string terminator.
  c_name(len(range_name)+1) = c_null_char

  call nvtxRangePushA(c_name)
end subroutine nvtxRangePush
#endif
! functions to set and query the CUDA devices ! functions to set and query the CUDA devices
function cublas_create(handle) result(success) function cublas_create(handle) result(success)
......
...@@ -142,6 +142,9 @@ function elpa_solve_evp_& ...@@ -142,6 +142,9 @@ function elpa_solve_evp_&
#else #else
nrThreads = 1 nrThreads = 1
#endif #endif
#ifdef WITH_NVTX
call nvtxRangePush("elpa1")
#endif
success = .true. success = .true.
...@@ -352,6 +355,9 @@ function elpa_solve_evp_& ...@@ -352,6 +355,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStartRegion("tridi") call likwid_markerStartRegion("tridi")
#endif #endif
#ifdef WITH_NVTX
call nvtxRangePush("tridi")
#endif
call tridiag_& call tridiag_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -359,6 +365,9 @@ function elpa_solve_evp_& ...@@ -359,6 +365,9 @@ function elpa_solve_evp_&
&PRECISION& &PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads) & (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStopRegion("tridi") call likwid_markerStopRegion("tridi")
#endif #endif
...@@ -370,6 +379,9 @@ function elpa_solve_evp_& ...@@ -370,6 +379,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStartRegion("solve") call likwid_markerStartRegion("solve")
#endif #endif
#ifdef WITH_NVTX
call nvtxRangePush("solve")
#endif
call solve_tridi_& call solve_tridi_&
&PRECISION& &PRECISION&
...@@ -382,6 +394,9 @@ function elpa_solve_evp_& ...@@ -382,6 +394,9 @@ function elpa_solve_evp_&
#endif #endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads) nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStopRegion("solve") call likwid_markerStopRegion("solve")
#endif #endif
...@@ -447,6 +462,9 @@ function elpa_solve_evp_& ...@@ -447,6 +462,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStartRegion("trans_ev") call likwid_markerStartRegion("trans_ev")
#endif #endif
#ifdef WITH_NVTX
call nvtxRangePush("trans_ev")
#endif
! In the skew-symmetric case this transforms the real part ! In the skew-symmetric case this transforms the real part
call trans_ev_& call trans_ev_&
...@@ -465,6 +483,9 @@ function elpa_solve_evp_& ...@@ -465,6 +483,9 @@ function elpa_solve_evp_&
mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev) mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
endif endif
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID #ifdef HAVE_LIKWID
call likwid_markerStopRegion("trans_ev") call likwid_markerStopRegion("trans_ev")
#endif #endif
...@@ -505,6 +526,9 @@ function elpa_solve_evp_& ...@@ -505,6 +526,9 @@ function elpa_solve_evp_&
endif endif
endif endif
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
! restore original OpenMP settings ! restore original OpenMP settings
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
! store the number of OpenMP threads used in the calling function ! store the number of OpenMP threads used in the calling function
......
...@@ -1552,7 +1552,7 @@ ...@@ -1552,7 +1552,7 @@
endif endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, & call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
(-0.5_rk, 0.0_rk), & (-0.5_rk, 0.0_rk), &
umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, & umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, &
int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND)) int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND))
......
...@@ -21,7 +21,7 @@ subroutine elpa_cssmv(n, alpha, a, lda, x, y) ...@@ -21,7 +21,7 @@ subroutine elpa_cssmv(n, alpha, a, lda, x, y)
implicit none implicit none
#include "./precision_kinds.F90" #include "./precision_kinds.F90"
integer(kind=ik) :: n, lda integer(kind=BLAS_KIND) :: n, lda
MATH_DATATYPE(kind=rck) :: alpha MATH_DATATYPE(kind=rck) :: alpha
MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * ) MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * )
integer(kind=ik), parameter :: nb = 64 integer(kind=ik), parameter :: nb = 64
......
...@@ -21,7 +21,7 @@ subroutine elpa_cssr2(n, x, y, a, lda ) ...@@ -21,7 +21,7 @@ subroutine elpa_cssr2(n, x, y, a, lda )
implicit none implicit none
#include "./precision_kinds.F90" #include "./precision_kinds.F90"
integer(kind=ik) :: n, lda integer(kind=BLAS_KIND) :: n, lda
MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * ) MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * )
integer(kind=ik), parameter :: nb = 64 integer(kind=ik), parameter :: nb = 64
MATH_DATATYPE(kind=rck) :: temp1, temp2 MATH_DATATYPE(kind=rck) :: temp1, temp2
...@@ -69,7 +69,7 @@ subroutine elpa_cssr2(n, x, y, a, lda ) ...@@ -69,7 +69,7 @@ subroutine elpa_cssr2(n, x, y, a, lda )
#if REALCASE == 1 #if REALCASE == 1
call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -one, x( ix ), 1_BLAS_KIND, y( jy ), 1_BLAS_KIND, & call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -one, x( ix ), 1_BLAS_KIND, y( jy ), 1_BLAS_KIND, &
a( ii, jj ), int(lda,kind=BLAS_KIND) ) a( ii, jj ), int(lda,kind=BLAS_KIND) )
call PRECISION_GER(ic, int(nb,kind=BLAS_KIND), one, y( iy ), 1_BLAS_KIND, x( jx ), 1_BLAS_KIND, & call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), one, y( iy ), 1_BLAS_KIND, x( jx ), 1_BLAS_KIND, &
a( ii, jj ), int(lda,kind=BLAS_KIND) ) a( ii, jj ), int(lda,kind=BLAS_KIND) )
#endif #endif
end do end do
......
...@@ -176,8 +176,9 @@ program test ...@@ -176,8 +176,9 @@ program test
print *,'' print *,''
endif endif
call set_up_blacsgrid(mpi_comm_world, np_rows, np_cols, layout, & call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, &
my_blacs_ctxt, my_prow, my_pcol) np_cols, layout, &
my_blacs_ctxt, my_prow, my_pcol)
call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, & call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
na_rows, na_cols, sc_desc, my_blacs_ctxt, info) na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
...@@ -278,7 +279,7 @@ program test ...@@ -278,7 +279,7 @@ program test
call e_skewsymmetric%set("solver", elpa_solver_2stage, error_elpa) call e_skewsymmetric%set("solver", elpa_solver_2stage, error_elpa)
call e_skewsymmetric%get("is_skewsymmetric", i,error_elpa) call e_skewsymmetric%get("is_skewsymmetric", int(i,kind=c_int),error_elpa)
call e_skewsymmetric%timer_start("eigenvectors: skewsymmetric ") call e_skewsymmetric%timer_start("eigenvectors: skewsymmetric ")
call e_skewsymmetric%eigenvectors(a_skewsymmetric, ev_skewsymmetric, z_skewsymmetric, error_elpa) call e_skewsymmetric%eigenvectors(a_skewsymmetric, ev_skewsymmetric, z_skewsymmetric, error_elpa)
...@@ -321,9 +322,14 @@ program test ...@@ -321,9 +322,14 @@ program test
#ifdef WITH_MPI #ifdef WITH_MPI
call MPI_BARRIER(MPI_COMM_WORLD, mpierr) call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
#endif #endif
status = check_correctness_evp_numeric_residuals_ss(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#ifdef TEST_SINGLE
status = check_correctness_evp_numeric_residuals_ss_real_single(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#else
status = check_correctness_evp_numeric_residuals_ss_real_double(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#endif
#ifdef WITH_MPI #ifdef WITH_MPI
call MPI_BARRIER(MPI_COMM_WORLD, mpierr) call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
......
...@@ -69,30 +69,31 @@ ...@@ -69,30 +69,31 @@
#endif #endif
#if REALCASE == 1 #if REALCASE == 1
function check_correctness_evp_numeric_residuals_ss_& function check_correctness_evp_numeric_residuals_ss_real_&
&MATH_DATATYPE&
&_&
&PRECISION& &PRECISION&
& (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) & (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status)
use tests_blas_interfaces use tests_blas_interfaces
use tests_scalapack_interfaces use tests_scalapack_interfaces
use precision_for_tests
use iso_c_binding
implicit none implicit none
#include "../../src/general/precision_kinds.F90" #include "../../src/general/precision_kinds.F90"
integer(kind=ik) :: status, na_cols, na_rows integer(kind=BLAS_KIND) :: status, na_cols, na_rows
integer(kind=ik), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol integer(kind=BLAS_KIND), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol
real(kind=rk), intent(in) :: as(:,:) real(kind=rk), intent(in) :: as(:,:)
real(kind=rk) :: tmpr real(kind=rk) :: tmpr
complex(kind=rck), intent(in) :: z(:,:) complex(kind=rck), intent(in) :: z(:,:)
real(kind=rk) :: ev(:) real(kind=rk) :: ev(:)
complex(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2 complex(kind=rck), dimension(size(as,dim=1),size(as,dim=2)) :: tmp1, tmp2
complex(kind=rck) :: xc complex(kind=rck) :: xc
complex(kind=rck), allocatable :: as_complex(:,:) complex(kind=rck), allocatable :: as_complex(:,:)
integer(kind=ik) :: sc_desc(:) integer(kind=BLAS_KIND) :: sc_desc(:)
integer(kind=ik) :: i, j, rowLocal, colLocal integer(kind=BLAS_KIND) :: i, j, rowLocal, colLocal
real(kind=rck) :: err, errmax integer(kind=c_int) :: row_Local, col_Local
real(kind=rck) :: err, errmax
integer :: mpierr integer :: mpierr
...@@ -259,7 +260,11 @@ ...@@ -259,7 +260,11 @@
! First check, whether the elements on diagonal are 1 .. "normality" of the vectors ! First check, whether the elements on diagonal are 1 .. "normality" of the vectors
err = 0.0_rk err = 0.0_rk
do i=1, nev do i=1, nev
if (map_global_array_index_to_local_index(i, i, rowLocal, colLocal, nblk, np_rows, np_cols, my_prow, my_pcol)) then if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int), row_Local, col_Local, &
int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
int(my_prow,kind=c_int), int(my_pcol,kind=c_int)) ) then
rowLocal = int(row_Local,kind=INT_TYPE)
colLocal = int(col_Local,kind=INT_TYPE)
err = max(err, abs(tmp1(rowLocal,colLocal) - CONE)) err = max(err, abs(tmp1(rowLocal,colLocal) - CONE))
endif endif
end do end do
...@@ -312,31 +317,48 @@ ...@@ -312,31 +317,48 @@
deallocate(as_complex) deallocate(as_complex)
end function end function
#endif #endif /* REALCASE */
#if REALCASE == 1 #if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL #ifdef DOUBLE_PRECISION_REAL
!c> int check_correctness_evp_numeric_residuals_real_double_f(int na, int nev, int na_rows, int na_cols, !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
!c> double *as, double *z, double *ev, int sc_desc[9], !c> double *as, complex double *z, double *ev, TEST_C_INT_TYPE sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol); !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
#else #else
!c> int check_correctness_evp_numeric_residuals_real_single_f(int na, int nev, int na_rows, int na_cols, !c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
!c> float *as, float *z, float *ev, int sc_desc[9], !c> float *as, complex float *z, float *ev, TEST_C_INT_TYPE sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol); !c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
#endif #endif
#endif /* REALCASE */ #endif /* REALCASE */
#if COMPLEXCASE == 1 #if REALCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX function check_correctness_evp_numeric_residuals_ss_real_&
!c> int check_correctness_evp_numeric_residuals_complex_double_f(int na, int nev, int na_rows, int na_cols, &PRECISION&
!c> complex double *as, complex double *z, double *ev, int sc_desc[9], &_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) &
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol); bind(C,name="check_correctness_evp_numeric_residuals_ss_&
#else &MATH_DATATYPE&
!c> int check_correctness_evp_numeric_residuals_complex_single_f(int na, int nev, int na_rows, int na_cols, &_&
!c> complex float *as, complex float *z, float *ev, int sc_desc[9], &PRECISION&
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol); &_f")
#endif
#endif /* COMPLEXCASE */ use precision_for_tests
use iso_c_binding
implicit none
#include "./test_precision_kinds.F90"
TEST_INT_TYPE :: status
TEST_INT_TYPE, value :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol
real(kind=rck) :: as(1:na_rows,1:na_cols)
complex(kind=rck) :: z(1:na_rows,1:na_cols)
real(kind=rck) :: ev(1:na)
TEST_INT_TYPE :: sc_desc(1:9)
status = check_correctness_evp_numeric_residuals_ss_real_&
&PRECISION&
& (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
end function
#endif /* REALCASE */
function check_correctness_evp_numeric_residuals_& function check_correctness_evp_numeric_residuals_&
&MATH_DATATYPE& &MATH_DATATYPE&
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment