Commit c2c83c2f authored by Pavel Kus

Interface of ELPA1 changed to allow use of the GPU, as in ELPA2.

GPU tests for ELPA1 added.
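
For illustration only (not part of this commit's diff): a minimal sketch of a call through the extended ELPA1 driver. It assumes mpi_comm_rows and mpi_comm_cols were obtained via get_elpa_communicators and that nblk is 128, which the GPU code path currently requires; the kernel argument is optional and falls back to DEFAULT_REAL_ELPA_KERNEL when omitted.

    success = solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk,        &
                                           matrixCols, mpi_comm_rows, mpi_comm_cols, &
                                           mpi_comm_all,                             &
                                           THIS_REAL_ELPA_KERNEL_API=REAL_ELPA_KERNEL_GPU)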

Conflicts:
	src/elpa1.F90
	src/elpa1_tridiag_real_template.X90
	src/elpa_c_interface.F90
	test/Fortran/test_complex.F90
	test/Fortran/test_complex_single.F90
	test/Fortran/test_real.F90
	test/Fortran/test_real_single.F90
	test/Fortran/test_real_with_c.F90
parent f59e2758
Makefile.am
@@ -15,6 +15,7 @@ libelpa@SUFFIX@_public_la_SOURCES = \
src/elpa1.F90 \
src/elpa2.F90 \
src/elpa1_auxiliary.F90 \
src/elpa1_utilities.F90 \
src/elpa2_utilities.F90 \
src/elpa_utilities.F90
@@ -377,16 +378,20 @@ endif
if WITH_GPU_VERSION
noinst_PROGRAMS += \
elpa1_test_complex_gpu@SUFFIX@ \
elpa1_test_real_gpu@SUFFIX@ \
elpa2_test_complex_gpu@SUFFIX@ \
elpa2_test_real_gpu@SUFFIX@
if WANT_SINGLE_PRECISION_REAL
noinst_PROGRAMS += \
elpa1_test_real_gpu_single_precision@SUFFIX@ \
elpa2_test_real_gpu_single_precision@SUFFIX@
endif
if WANT_SINGLE_PRECISION_COMPLEX
noinst_PROGRAMS += \
elpa1_test_complex_gpu_single_precision@SUFFIX@ \
elpa2_test_complex_gpu_single_precision@SUFFIX@
endif
@@ -652,6 +657,16 @@ EXTRA_elpa2_test_complex_api_single_precision@SUFFIX@_DEPENDENCIES = test/Fortra
endif
if WITH_GPU_VERSION
elpa1_test_real_gpu@SUFFIX@_SOURCES = test/Fortran/test_real_gpu.F90
elpa1_test_real_gpu@SUFFIX@_LDADD = $(build_lib)
elpa1_test_real_gpu@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa1_test_real_gpu@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_complex_gpu@SUFFIX@_SOURCES = test/Fortran/test_complex_gpu.F90
elpa1_test_complex_gpu@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex_gpu@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa1_test_complex_gpu@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_gpu@SUFFIX@_SOURCES = test/Fortran/test_real2_gpu.F90
elpa2_test_real_gpu@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_gpu@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
@@ -663,6 +678,11 @@ elpa2_test_complex_gpu@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_module
EXTRA_elpa2_test_complex_gpu@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
if WANT_SINGLE_PRECISION_REAL
elpa1_test_real_gpu_single_precision@SUFFIX@_SOURCES = test/Fortran/test_real_gpu_single.F90
elpa1_test_real_gpu_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa1_test_real_gpu_single_precision@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa1_test_real_gpu_single_precision@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_gpu_single_precision@SUFFIX@_SOURCES = test/Fortran/test_real2_gpu_single.F90
elpa2_test_real_gpu_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_gpu_single_precision@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
@@ -670,6 +690,11 @@ EXTRA_elpa2_test_real_gpu_single_precision@SUFFIX@_DEPENDENCIES = test/Fortran/e
endif
if WANT_SINGLE_PRECISION_COMPLEX
elpa1_test_complex_gpu_single_precision@SUFFIX@_SOURCES = test/Fortran/test_complex_gpu_single.F90
elpa1_test_complex_gpu_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex_gpu_single_precision@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa1_test_complex_gpu_single_precision@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_complex_gpu_single_precision@SUFFIX@_SOURCES = test/Fortran/test_complex2_gpu_single.F90
elpa2_test_complex_gpu_single_precision@SUFFIX@_LDADD = $(build_lib)
elpa2_test_complex_gpu_single_precision@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
@@ -736,14 +761,18 @@ endif
if WITH_GPU_VERSION
check_SCRIPTS += \
elpa1_test_real_gpu@SUFFIX@.sh \
elpa1_test_complex_gpu@SUFFIX@.sh \
elpa2_test_real_gpu@SUFFIX@.sh \
elpa2_test_complex_gpu@SUFFIX@.sh
if WANT_SINGLE_PRECISION_REAL
check_SCRIPTS += \
elpa1_test_real_gpu_single_precision@SUFFIX@.sh \
elpa2_test_real_gpu_single_precision@SUFFIX@.sh
endif
if WANT_SINGLE_PRECISION_COMPLEX
check_SCRIPTS += \
elpa1_test_complex_gpu_single_precision@SUFFIX@.sh \
elpa2_test_complex_gpu_single_precision@SUFFIX@.sh
endif
@@ -765,6 +794,9 @@ TESTS = $(check_SCRIPTS)
mod_precision.i: $(top_srcdir)/src/mod_precision.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/mod_precision.F90 -o $@
elpa1_utilities.i: $(top_srcdir)/src/elpa1_utilities.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa1_utilities.F90 -o $@
elpa2_utilities.i: $(top_srcdir)/src/elpa2_utilities.F90
$(CPP) $(CPPFLAGS) -I$(top_builddir)/ -I$(top_srcdir)/ -c $(top_srcdir)/src/elpa2_utilities.F90 -o $@
elpa/elpa_kernel_constants.h
@@ -30,3 +30,14 @@
#define ELPA2_COMPLEX_KERNEL_GPU 12
#define ELPA2_NUMBER_OF_COMPLEX_KERNELS 12
#define ELPA1_REAL_KERNEL_GENERIC 1
#define ELPA1_REAL_KERNEL_GPU 2
#define ELPA1_NUMBER_OF_REAL_KERNELS 2
#define ELPA1_COMPLEX_KERNEL_GENERIC 1
#define ELPA1_COMPLEX_KERNEL_GPU 2
#define ELPA1_NUMBER_OF_COMPLEX_KERNELS 2
src/elpa.F90
@@ -252,7 +252,7 @@ module ELPA
if (useELPA1) then
success = solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols)
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
else
success = solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
@@ -350,7 +350,7 @@ module ELPA
if (useELPA1) then
success = solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols)
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
else
success = solve_evp_real_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
@@ -446,7 +446,7 @@ module ELPA
if (useELPA1) then
success = solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols)
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
else
success = solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
@@ -540,7 +540,7 @@ module ELPA
if (useELPA1) then
success = solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols)
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all)
else
success = solve_evp_complex_2stage_single(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
src/elpa1.F90
@@ -85,6 +85,7 @@ module ELPA1
use, intrinsic :: iso_c_binding
use elpa_utilities
use elpa1_auxiliary
use elpa1_utilities
implicit none
@@ -522,8 +523,11 @@ end function get_elpa_communicators
#define COMPLEX_DATATYPE c_float
function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols) result(success)
matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API) result(success)
use iso_c_binding
use cuda_functions
use mod_check_for_gpu
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
@@ -531,15 +535,20 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
real(kind=REAL_DATATYPE) :: ev(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*), q(ldq,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
integer(kind=ik) :: THIS_REAL_ELPA_KERNEL
integer(kind=c_int) :: my_prow, my_pcol, mpierr
logical :: useGPU
integer(kind=ik) :: numberOfGPUDevices
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success
@@ -553,6 +562,10 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
#ifdef HAVE_DETAILED_TIMINGS
@@ -566,14 +579,43 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
wantDebug = debug_messages_via_environment_variable()
firstCall = .false.
endif
useGPU = .false.
if (present(THIS_REAL_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
else
! kernel was not chosen via the API call;
! fall back to the default kernel
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
#else
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
@@ -657,7 +699,10 @@ end function solve_evp_real_1stage_double
function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols) result(success)
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API) result(success)
use cuda_functions
use mod_check_for_gpu
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
@@ -666,7 +711,7 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
real(kind=REAL_DATATYPE) :: ev(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*), q(ldq,*)
@@ -674,13 +719,21 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
#endif
integer(kind=c_int) :: my_prow, my_pcol, mpierr
integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, mpierr
real(kind=REAL_DATATYPE), allocatable :: e(:), tau(:)
real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
logical :: success
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
integer(kind=ik) :: THIS_REAL_ELPA_KERNEL
logical :: useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_real_1stage_single")
#endif
@@ -689,6 +742,9 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
call timer%start("mpi_communication")
#endif
call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -704,13 +760,41 @@ function solve_evp_real_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixC
firstCall = .false.
endif
useGPU = .false.
if (present(THIS_REAL_ELPA_KERNEL_API)) then
! user defined kernel via the optional argument in the API call
THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
else
! kernel was not chosen via the API call;
! fall back to the default kernel
THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
endif
if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
endif
! set the necessary parameters
cudaMemcpyHostToDevice = cuda_memcpyHostToDevice()
cudaMemcpyDeviceToHost = cuda_memcpyDeviceToHost()
cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
cudaHostRegisterPortable = cuda_hostRegisterPortable()
cudaHostRegisterMapped = cuda_hostRegisterMapped()
endif
allocate(e(na), tau(na))
ttt0 = MPI_Wtime()
#ifdef DOUBLE_PRECISION_REAL
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
#else
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
call tridiag_real_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
#endif
ttt1 = MPI_Wtime()
if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
@@ -793,17 +877,18 @@ end function solve_evp_real_1stage_single
!> \result success
function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols) result(success)
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
! use precision
use precision
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*)
#else
@@ -821,6 +906,12 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
integer(kind=ik) :: THIS_REAL_ELPA_KERNEL
logical :: useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_complex_1stage_double")
#endif
@@ -944,16 +1035,18 @@ end function solve_evp_complex_1stage_double
function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols) result(success)
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API) result(success)
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use precision
use iso_c_binding
use elpa_mpi
use elpa1_compute
implicit none
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
integer(kind=c_int), intent(in) :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*)
#else
@@ -971,6 +1064,12 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
logical, save :: firstCall = .true.
logical :: wantDebug
integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
integer(kind=ik) :: THIS_REAL_ELPA_KERNEL
logical :: useGPU
integer(kind=ik) :: numberOfGPUDevices
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("solve_evp_complex_1stage_single")
#endif
src/elpa1_tridiag_real_template.X90
@@ -52,7 +52,7 @@
! distributed along with the original code in the file "COPYING".
#endif
subroutine M_tridiag_real_PRECISSION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
subroutine M_tridiag_real_PRECISSION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU)
!-------------------------------------------------------------------------------
! tridiag_real: Reduces a distributed symmetric matrix to tridiagonal form
! (like Scalapack Routine PDSYTRD)
@@ -82,6 +82,9 @@
! tau(na) Factors for the Householder vectors (returned), needed for back transformation
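! useGPU If .true., use the GPU code path of the tridiagonalization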
!
!-------------------------------------------------------------------------------
use cuda_functions
use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
@@ -89,6 +92,8 @@
implicit none
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU
real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na), tau(na)
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: a(lda,*)
@@ -107,6 +112,15 @@
! updated after each istep (in the main cycle) to contain number of
! local columns and rows of the remaining part of the matrix
integer(kind=ik) :: l_cols, l_rows
! number of local columns used for allocation of a_dev
! pkus: isn't it the same as matrixCols?
integer(kind=ik) :: na_cols
integer(kind=C_intptr_T) :: a_dev
#ifdef WITH_MPI
integer(kind=ik), external :: numroc
#endif
integer(kind=ik) :: nstor
integer(kind=ik) :: istep, i, j, lcs, lce, lrs, lre
@@ -146,6 +160,17 @@
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("mpi_communication")
#endif
! pkus: what is the difference between na_cols and matrixCols?
if (useGPU) then
#ifdef WITH_MPI
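! numroc (ScaLAPACK tool routine): number of columns of a block-cyclically
! distributed matrix of global size na with block size nblk that are stored
! locally in process column my_pcol (distribution starts at process column 0,
! np_cols process columns in total)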
na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else
na_cols = na
#endif
write(*,*) "na_cols", na_cols, "matrixCols", matrixCols
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
! pkus: what is tile size exactly?
@@ -165,10 +190,7 @@
max_local_cols = max_blocks_col*nblk
allocate(tmp(MAX(max_local_rows,max_local_cols)), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"tridiag_real: error when allocating tmp "//errorMessage
stop
endif
call check_alloc("tridiag_real", "tmp", istat, errorMessage)
allocate(vr(max_local_rows+1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
src/elpa1_utilities.F90 (new file)
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#include "config-f90.h"
#include <elpa/elpa_kernel_constants.h>
module ELPA1_utilities
use ELPA_utilities
use precision
implicit none
PRIVATE ! By default, all routines contained are private
! The following routines are public:
public :: get_actual_real_kernel_name, get_actual_complex_kernel_name
public :: REAL_ELPA_KERNEL_GENERIC, REAL_ELPA_KERNEL_GPU, DEFAULT_REAL_ELPA_KERNEL
public :: COMPLEX_ELPA_KERNEL_GENERIC, COMPLEX_ELPA_KERNEL_GPU, DEFAULT_COMPLEX_ELPA_KERNEL
public :: REAL_ELPA_KERNEL_NAMES, COMPLEX_ELPA_KERNEL_NAMES
public :: get_actual_complex_kernel, get_actual_real_kernel
public :: check_allowed_complex_kernels, check_allowed_real_kernels
public :: AVAILABLE_COMPLEX_ELPA_KERNELS, AVAILABLE_REAL_ELPA_KERNELS
public :: print_available_real_kernels, print_available_complex_kernels
public :: query_available_real_kernels, query_available_complex_kernels
integer, parameter :: number_of_real_kernels = ELPA1_NUMBER_OF_REAL_KERNELS
integer, parameter :: REAL_ELPA_KERNEL_GENERIC = ELPA1_REAL_KERNEL_GENERIC
integer(kind=ik), parameter :: REAL_ELPA_KERNEL_GPU = ELPA1_REAL_KERNEL_GPU
! #ifdef WITH_GPU_VERSION
! integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GPU
! #else
integer(kind=ik), parameter :: DEFAULT_REAL_ELPA_KERNEL = REAL_ELPA_KERNEL_GENERIC
! #endif
character(35), parameter, dimension(number_of_real_kernels) :: &
REAL_ELPA_KERNEL_NAMES = (/"REAL_ELPA_KERNEL_GENERIC ", &
"REAL_ELPA_KERNEL_GPU "/)
integer, parameter :: number_of_complex_kernels = ELPA1_NUMBER_OF_COMPLEX_KERNELS
integer, parameter :: COMPLEX_ELPA_KERNEL_GENERIC = ELPA1_COMPLEX_KERNEL_GENERIC
integer(kind=ik), parameter :: COMPLEX_ELPA_KERNEL_GPU = ELPA1_COMPLEX_KERNEL_GPU
! #ifdef WITH_GPU_VERSION
! integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GPU
! #else
integer(kind=ik), parameter :: DEFAULT_COMPLEX_ELPA_KERNEL = COMPLEX_ELPA_KERNEL_GENERIC
! #endif
character(35), parameter, dimension(number_of_complex_kernels) :: &
COMPLEX_ELPA_KERNEL_NAMES = (/"COMPLEX_ELPA_KERNEL_GENERIC ", &
"COMPLEX_ELPA_KERNEL_GPU "/)
integer(kind=ik), parameter :: &
AVAILABLE_REAL_ELPA_KERNELS(number_of_real_kernels) = &
(/ &
#if WITH_REAL_GENERIC_KERNEL
1 &
#else
0 &
#endif
#ifdef WITH_GPU_VERSION
,1 &
#else
,0 &