Commit cfa307bb authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'master_pre_stage' into skew

parents 5d0b533f fa78e003
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -148,7 +148,7 @@ def set_cflags_fcflags(instr, cc, fc, instruction_set):
FCFLAGS += "-O3 -xMIC-AVX512"
if (instr == "avx2"):
INSTRUCTION_OPTIONS = instruction_set[instr]
INSTRUCTION_OPTIONS = instruction_set[instr] + " --disable-avx512"
if (cc == "gnu"):
CFLAGS += "-O3 -mavx2 -mfma"
else:
......
......@@ -1313,6 +1313,29 @@ if test x"$use_real_gpu" = x"yes" -o x"$use_complex_gpu" = x"yes" ; then
AC_DEFINE([WITH_GPU_KERNEL],[1],[GPU kernel should be build])
ELPA_2STAGE_COMPLEX_GPU_COMPILED=1
ELPA_2STAGE_REAL_GPU_COMPILED=1
dnl Optional NVTX instrumentation. With --enable-nvtx the WITH_NVTX macro is
dnl defined (it guards the NVTX interfaces and range calls in the Fortran
dnl sources) and configure verifies that the nvToolsExt library can be linked.
AC_MSG_CHECKING(whether --enable-nvtx is specified)
AC_ARG_ENABLE([nvtx],
  AS_HELP_STRING([--enable-nvtx],
                 [build and install nvtx wrapper for profiling the GPU version, default no.]),
  [
    if test x"$enableval" = x"yes"; then
      enable_nvtx=yes
    else
      enable_nvtx=no
    fi
  ],
  [enable_nvtx=no])
AC_MSG_RESULT([${enable_nvtx}])
if test x"${enable_nvtx}" = x"yes"; then
  AC_DEFINE([WITH_NVTX],[1],[enable NVTX support])
  dnl nvtxRangePop is a C symbol, so the link test has to run with the C compiler.
  AC_LANG_PUSH([C])
  AC_SEARCH_LIBS([nvtxRangePop],[nvToolsExt],[have_nvtoolsext=yes],[have_nvtoolsext=no])
  if test x"${have_nvtoolsext}" = x"no"; then
    dnl Only the optional NVTX wrapper failed to link; point the user at the
    dnl option that actually controls it.
    AC_MSG_ERROR([Could not link nvToolsExt; try to set the cuda-path or disable NVTX support (--disable-nvtx)])
  fi
  AC_LANG_POP([C])
fi
else
ELPA_2STAGE_COMPLEX_GPU_COMPILED=0
ELPA_2STAGE_REAL_GPU_COMPILED=0
......
......@@ -465,8 +465,41 @@ module cuda_functions
end interface
#ifdef WITH_NVTX
! NVTX profiling interfaces
! C binding for nvtxRangePushA of the NVIDIA Tools Extension library.
!   name : range label as an array of single C characters. The C routine
!          expects a NUL-terminated string, so callers have to append the
!          terminator themselves. The label is only read, hence intent(in).
interface nvtxRangePushA
  subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA')
    use, intrinsic :: iso_c_binding, only : c_char
    implicit none
    character(kind=c_char,len=1), intent(in) :: name(*)
  end subroutine nvtxRangePushA
end interface nvtxRangePushA
! C binding for nvtxRangePop of the NVIDIA Tools Extension library.
! Takes no arguments; in this file it is called once after each matching
! range push to close that range.
interface nvtxRangePop
  subroutine nvtxRangePop() bind(C, name='nvtxRangePop')
    implicit none
  end subroutine nvtxRangePop
end interface nvtxRangePop
#endif
contains
#ifdef WITH_NVTX
! this wrapper is needed for the string conversion
! Open an NVTX range labelled with a Fortran string.
!   range_name : label of the range; every character, including trailing
!                blanks, is copied unchanged.
! The label is copied character by character into a C character array with
! one extra element that holds the terminating NUL, and that array is then
! handed to the C routine nvtxRangePushA.
subroutine nvtxRangePush(range_name)
  use, intrinsic :: iso_c_binding, only : c_char, c_null_char
  implicit none
  character(len=*), intent(in)  :: range_name
  ! automatic array: label length plus one slot for the terminator
  character(kind=c_char,len=1)  :: c_name(len(range_name)+1)
  integer                       :: i, name_len

  name_len = len(range_name)
  do i = 1, name_len
    c_name(i) = range_name(i:i)
  end do
  c_name(name_len+1) = c_null_char

  call nvtxRangePushA(c_name)
end subroutine nvtxRangePush
#endif
! functions to set and query the CUDA devices
function cublas_create(handle) result(success)
......
......@@ -142,6 +142,9 @@ function elpa_solve_evp_&
#else
nrThreads = 1
#endif
#ifdef WITH_NVTX
call nvtxRangePush("elpa1")
#endif
success = .true.
......@@ -352,6 +355,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("tridi")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("tridi")
#endif
call tridiag_&
&MATH_DATATYPE&
......@@ -359,6 +365,9 @@ function elpa_solve_evp_&
&PRECISION&
& (obj, na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU_tridiag, wantDebug, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("tridi")
#endif
......@@ -370,6 +379,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("solve")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("solve")
#endif
call solve_tridi_&
&PRECISION&
......@@ -382,6 +394,9 @@ function elpa_solve_evp_&
#endif
nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_solve_tridi, wantDebug, success, nrThreads)
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("solve")
#endif
......@@ -447,6 +462,9 @@ function elpa_solve_evp_&
#ifdef HAVE_LIKWID
call likwid_markerStartRegion("trans_ev")
#endif
#ifdef WITH_NVTX
call nvtxRangePush("trans_ev")
#endif
! In the skew-symmetric case this transforms the real part
call trans_ev_&
......@@ -465,6 +483,9 @@ function elpa_solve_evp_&
mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
endif
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
#ifdef HAVE_LIKWID
call likwid_markerStopRegion("trans_ev")
#endif
......@@ -505,6 +526,9 @@ function elpa_solve_evp_&
endif
endif
#ifdef WITH_NVTX
call nvtxRangePop()
#endif
! restore original OpenMP settings
#ifdef WITH_OPENMP
! store the number of OpenMP threads used in the calling function
......
......@@ -1552,7 +1552,7 @@
endif
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, &
call PRECISION_GEMM('N', 'N', int(l_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), int(n_cols,kind=BLAS_KIND), &
(-0.5_rk, 0.0_rk), &
umcCPU(1,n_cols+1), int(ubound(umcCPU,dim=1),kind=BLAS_KIND), vav, &
int(ubound(vav,dim=1),kind=BLAS_KIND), ONE, umcCPU, int(ubound(umcCPU,dim=1),kind=BLAS_KIND))
......
......@@ -21,7 +21,7 @@ subroutine elpa_cssmv(n, alpha, a, lda, x, y)
implicit none
#include "./precision_kinds.F90"
integer(kind=ik) :: n, lda
integer(kind=BLAS_KIND) :: n, lda
MATH_DATATYPE(kind=rck) :: alpha
MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * )
integer(kind=ik), parameter :: nb = 64
......
......@@ -21,7 +21,7 @@ subroutine elpa_cssr2(n, x, y, a, lda )
implicit none
#include "./precision_kinds.F90"
integer(kind=ik) :: n, lda
integer(kind=BLAS_KIND) :: n, lda
MATH_DATATYPE(kind=rck) :: a( lda, * ), x( * ), y( * )
integer(kind=ik), parameter :: nb = 64
MATH_DATATYPE(kind=rck) :: temp1, temp2
......@@ -69,7 +69,7 @@ subroutine elpa_cssr2(n, x, y, a, lda )
#if REALCASE == 1
call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), -one, x( ix ), 1_BLAS_KIND, y( jy ), 1_BLAS_KIND, &
a( ii, jj ), int(lda,kind=BLAS_KIND) )
call PRECISION_GER(ic, int(nb,kind=BLAS_KIND), one, y( iy ), 1_BLAS_KIND, x( jx ), 1_BLAS_KIND, &
call PRECISION_GER(int(ic,kind=BLAS_KIND), int(nb,kind=BLAS_KIND), one, y( iy ), 1_BLAS_KIND, x( jx ), 1_BLAS_KIND, &
a( ii, jj ), int(lda,kind=BLAS_KIND) )
#endif
end do
......
......@@ -176,7 +176,8 @@ program test
print *,''
endif
call set_up_blacsgrid(mpi_comm_world, np_rows, np_cols, layout, &
call set_up_blacsgrid(int(mpi_comm_world,kind=BLAS_KIND), np_rows, &
np_cols, layout, &
my_blacs_ctxt, my_prow, my_pcol)
call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
......@@ -278,7 +279,7 @@ program test
call e_skewsymmetric%set("solver", elpa_solver_2stage, error_elpa)
call e_skewsymmetric%get("is_skewsymmetric", i,error_elpa)
call e_skewsymmetric%get("is_skewsymmetric", int(i,kind=c_int),error_elpa)
call e_skewsymmetric%timer_start("eigenvectors: skewsymmetric ")
call e_skewsymmetric%eigenvectors(a_skewsymmetric, ev_skewsymmetric, z_skewsymmetric, error_elpa)
......@@ -321,9 +322,14 @@ program test
#ifdef WITH_MPI
call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
#endif
status = check_correctness_evp_numeric_residuals_ss(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#ifdef TEST_SINGLE
status = check_correctness_evp_numeric_residuals_ss_real_single(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#else
status = check_correctness_evp_numeric_residuals_ss_real_double(na, nev, as_skewsymmetric, z_complex, ev_skewsymmetric, &
sc_desc, nblk, myid, np_rows,np_cols, my_prow, my_pcol)
#endif
#ifdef WITH_MPI
call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
......
......@@ -69,17 +69,17 @@
#endif
#if REALCASE == 1
function check_correctness_evp_numeric_residuals_ss_&
&MATH_DATATYPE&
&_&
function check_correctness_evp_numeric_residuals_ss_real_&
&PRECISION&
& (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status)
use tests_blas_interfaces
use tests_scalapack_interfaces
use precision_for_tests
use iso_c_binding
implicit none
#include "../../src/general/precision_kinds.F90"
integer(kind=ik) :: status, na_cols, na_rows
integer(kind=ik), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol
integer(kind=BLAS_KIND) :: status, na_cols, na_rows
integer(kind=BLAS_KIND), intent(in) :: na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol
real(kind=rk), intent(in) :: as(:,:)
real(kind=rk) :: tmpr
complex(kind=rck), intent(in) :: z(:,:)
......@@ -89,9 +89,10 @@
complex(kind=rck), allocatable :: as_complex(:,:)
integer(kind=ik) :: sc_desc(:)
integer(kind=BLAS_KIND) :: sc_desc(:)
integer(kind=ik) :: i, j, rowLocal, colLocal
integer(kind=BLAS_KIND) :: i, j, rowLocal, colLocal
integer(kind=c_int) :: row_Local, col_Local
real(kind=rck) :: err, errmax
integer :: mpierr
......@@ -259,7 +260,11 @@
! First check, whether the elements on diagonal are 1 .. "normality" of the vectors
err = 0.0_rk
do i=1, nev
if (map_global_array_index_to_local_index(i, i, rowLocal, colLocal, nblk, np_rows, np_cols, my_prow, my_pcol)) then
if (map_global_array_index_to_local_index(int(i,kind=c_int), int(i,kind=c_int), row_Local, col_Local, &
int(nblk,kind=c_int), int(np_rows,kind=c_int), int(np_cols,kind=c_int), &
int(my_prow,kind=c_int), int(my_pcol,kind=c_int)) ) then
rowLocal = int(row_Local,kind=INT_TYPE)
colLocal = int(col_Local,kind=INT_TYPE)
err = max(err, abs(tmp1(rowLocal,colLocal) - CONE))
endif
end do
......@@ -312,31 +317,48 @@
deallocate(as_complex)
end function
#endif
#endif /* REALCASE */
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
!c> int check_correctness_evp_numeric_residuals_real_double_f(int na, int nev, int na_rows, int na_cols,
!c> double *as, double *z, double *ev, int sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol);
!c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_double_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
!c> double *as, complex double *z, double *ev, TEST_C_INT_TYPE sc_desc[9],
!c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
#else
!c> int check_correctness_evp_numeric_residuals_real_single_f(int na, int nev, int na_rows, int na_cols,
!c> float *as, float *z, float *ev, int sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol);
!c> TEST_C_INT_TYPE check_correctness_evp_numeric_residuals_ss_real_single_f(TEST_C_INT_TYPE na, TEST_C_INT_TYPE nev, TEST_C_INT_TYPE na_rows, TEST_C_INT_TYPE na_cols,
!c> float *as, complex float *z, float *ev, TEST_C_INT_TYPE sc_desc[9],
!c> TEST_C_INT_TYPE nblk, TEST_C_INT_TYPE myid, TEST_C_INT_TYPE np_rows, TEST_C_INT_TYPE np_cols, TEST_C_INT_TYPE my_prow, TEST_C_INT_TYPE my_pcol);
#endif
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
!c> int check_correctness_evp_numeric_residuals_complex_double_f(int na, int nev, int na_rows, int na_cols,
!c> complex double *as, complex double *z, double *ev, int sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol);
#else
!c> int check_correctness_evp_numeric_residuals_complex_single_f(int na, int nev, int na_rows, int na_cols,
!c> complex float *as, complex float *z, float *ev, int sc_desc[9],
!c> int nblk, int myid, int np_rows, int np_cols, int my_prow, int my_pcol);
#endif
#endif /* COMPLEXCASE */
#if REALCASE == 1
! C-callable entry point for the skew-symmetric real residual check.
! The exported C symbol name is assembled from preprocessor macros in the
! bind(C) clause below.
! Arguments:
!   na, nev, nblk, myid, np_rows, np_cols, my_prow, my_pcol : integer scalars,
!       received by value and forwarded unchanged
!   na_rows, na_cols : local array extents; used here only to give as and z
!       an explicit shape, they are not forwarded
!   as : real array (na_rows x na_cols)
!   z  : complex array (na_rows x na_cols)
!   ev : real array of length na
!   sc_desc : integer array with 9 entries
! Result:
!   status : value returned by the Fortran routine this wrapper forwards to
function check_correctness_evp_numeric_residuals_ss_real_&
&PRECISION&
&_f (na, nev, na_rows, na_cols, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol) result(status) &
bind(C,name="check_correctness_evp_numeric_residuals_ss_&
&MATH_DATATYPE&
&_&
&PRECISION&
&_f")
use precision_for_tests
use iso_c_binding
implicit none
#include "./test_precision_kinds.F90"
TEST_INT_TYPE :: status
! all integer scalars arrive by value from the C caller
TEST_INT_TYPE, value :: na, nev, myid, na_rows, na_cols, nblk, np_rows, np_cols, my_prow, my_pcol
real(kind=rck) :: as(1:na_rows,1:na_cols)
complex(kind=rck) :: z(1:na_rows,1:na_cols)
real(kind=rck) :: ev(1:na)
TEST_INT_TYPE :: sc_desc(1:9)
! forward to the Fortran implementation; the explicit local extents are dropped
status = check_correctness_evp_numeric_residuals_ss_real_&
&PRECISION&
& (na, nev, as, z, ev, sc_desc, nblk, myid, np_rows, np_cols, my_prow, my_pcol)
end function
#endif /* REALCASE */
function check_correctness_evp_numeric_residuals_&
&MATH_DATATYPE&
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment