Commit 6dac0853 authored by Pavel Kus's avatar Pavel Kus

tridiag_real from elpa1 ported to GPU. So far, performance seems to be much worse than on CPU

Conflicts:
	src/elpa1_auxiliary.F90
	src/elpa1_tridiag_real_template.X90
parent c2c83c2f
......@@ -596,8 +596,8 @@ function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
useGPU = .true.
endif
if (nblk .ne. 128) then
print *,"At the moment GPU version needs blocksize 128"
error stop
print *,"Warning: using GPU with blocksize different from 128"
! error stop
endif
! set the neccessary parameters
......
......@@ -54,6 +54,8 @@
!> \brief Fortran module which provides helper routines for matrix calculations
module ELPA1_AUXILIARY
use elpa_utilities
implicit none
public :: elpa_mult_at_b_real_double !< Multiply double-precision real matrices A**T * B
......
......@@ -101,9 +101,6 @@ module ELPA1_COMPUTE
public :: trans_ev_complex_single ! Transform complex single-precision eigenvectors of a tridiagonal matrix back
#endif
public :: local_index ! Get local index of a block cyclic distributed matrix
public :: least_common_multiple ! Get least common multiple
public :: hh_transform_real_double
public :: hh_transform_real
public :: elpa_reduce_add_vectors_real_double
......
......@@ -228,82 +228,6 @@
end subroutine M_solve_secular_equation_PRECISSION
!-------------------------------------------------------------------------------
#ifndef ALREADY_DEFINED
integer function local_index(idx, my_proc, num_procs, nblk, iflag)
!-------------------------------------------------------------------------------
! local_index: returns the local index for a given global index
! If the global index has no local index on the
! processor my_proc behaviour is defined by iflag
!
! Parameters
!
! idx Global index
!
! my_proc Processor row/column for which to calculate the local index
!
! num_procs Total number of processors along row/column
!
! nblk Blocksize
!
! iflag Controls the behaviour if idx is not on local processor
! iflag< 0 : Return last local index before that row/col
! iflag==0 : Return 0
! iflag> 0 : Return next local index after that row/col
!-------------------------------------------------------------------------------
use precision
implicit none
integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag
integer(kind=ik) :: iblk
iblk = (idx-1)/nblk ! global block number, 0 based
if (mod(iblk,num_procs) == my_proc) then
! block is local, always return local row/col number
local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1
else
! non local block
if (iflag == 0) then
local_index = 0
else
local_index = (iblk/num_procs)*nblk
if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk
if (iflag>0) local_index = local_index + 1
endif
endif
end function local_index
#endif /* ALREADY_DEFINED */
#ifndef ALREADY_DEFINED
integer function least_common_multiple(a, b)
! Returns the least common multiple of a and b
! There may be more efficient ways to do this, we use the most simple approach
use precision
implicit none
integer(kind=ik), intent(in) :: a, b
do least_common_multiple = a, a*(b-1), a
if(mod(least_common_multiple,b)==0) exit
enddo
! if the loop is left regularly, least_common_multiple = a*b
end function least_common_multiple
#endif /* ALREADY_DEFINED */
subroutine M_hh_transform_real_PRECISSION(alpha, xnorm_sq, xf, tau)
! Similar to LAPACK routine DLARFP, but uses ||x||**2 instead of x(:)
! and returns the factor xf by which x has to be scaled.
......@@ -343,4 +267,3 @@
ALPHA = BETA
endif
end subroutine M_hh_transform_real_PRECISSION
#define ALREADY_DEFINED 1
This diff is collapsed.
This diff is collapsed.
......@@ -45,6 +45,7 @@
module elpa_pdgeqrf
use elpa_utilities
use elpa1_compute
use elpa_pdlarfb
use qr_utils_mod
......
......@@ -45,6 +45,7 @@
module qr_utils_mod
use elpa_mpi
use elpa1_compute
use elpa_utilities
implicit none
PRIVATE
......
......@@ -58,8 +58,13 @@ module ELPA_utilities
private ! By default, all routines contained are private
public :: debug_messages_via_environment_variable, pcol, prow, error_unit, check_alloc
public :: debug_messages_via_environment_variable, error_unit
public :: check_alloc, check_alloc_CUDA, check_memcpy_CUDA
public :: map_global_array_index_to_local_index
public :: pcol, prow
public :: local_index ! Get local index of a block cyclic distributed matrix
public :: least_common_multiple ! Get least common multiple
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 0
#endif
......@@ -102,23 +107,23 @@ module ELPA_utilities
!-------------------------------------------------------------------------------
!Processor col for global col number
pure function pcol(i, nblk, np_cols) result(col)
pure function pcol(global_col, nblk, np_cols) result(local_col)
use precision
implicit none
integer(kind=ik), intent(in) :: i, nblk, np_cols
integer(kind=ik) :: col
col = MOD((i-1)/nblk,np_cols)
integer(kind=ik), intent(in) :: global_col, nblk, np_cols
integer(kind=ik) :: local_col
local_col = MOD((global_col-1)/nblk,np_cols)
end function
!-------------------------------------------------------------------------------
!Processor row for global row number
pure function prow(i, nblk, np_rows) result(row)
pure function prow(global_row, nblk, np_rows) result(local_row)
use precision
implicit none
integer(kind=ik), intent(in) :: i, nblk, np_rows
integer(kind=ik) :: row
row = MOD((i-1)/nblk,np_rows)
integer(kind=ik), intent(in) :: global_row, nblk, np_rows
integer(kind=ik) :: local_row
local_row = MOD((global_row-1)/nblk,np_rows)
end function
!-------------------------------------------------------------------------------
......@@ -162,6 +167,79 @@ module ELPA_utilities
end function
integer function local_index(idx, my_proc, num_procs, nblk, iflag)
!-------------------------------------------------------------------------------
! local_index: returns the local index for a given global index
! If the global index has no local index on the
! processor my_proc behaviour is defined by iflag
!
! Parameters
!
! idx Global index
!
! my_proc Processor row/column for which to calculate the local index
!
! num_procs Total number of processors along row/column
!
! nblk Blocksize
!
! iflag Controls the behaviour if idx is not on local processor
! iflag< 0 : Return last local index before that row/col
! iflag==0 : Return 0
! iflag> 0 : Return next local index after that row/col
!-------------------------------------------------------------------------------
use precision
implicit none
integer(kind=ik) :: idx, my_proc, num_procs, nblk, iflag
integer(kind=ik) :: iblk
iblk = (idx-1)/nblk ! global block number, 0 based
if (mod(iblk,num_procs) == my_proc) then
! block is local, always return local row/col number
local_index = (iblk/num_procs)*nblk + mod(idx-1,nblk) + 1
else
! non local block
if (iflag == 0) then
local_index = 0
else
local_index = (iblk/num_procs)*nblk
if (mod(iblk,num_procs) > my_proc) local_index = local_index + nblk
if (iflag>0) local_index = local_index + 1
endif
endif
end function local_index
integer function least_common_multiple(a, b)
! Returns the least common multiple of a and b
! There may be more efficient ways to do this, we use the most simple approach
use precision
implicit none
integer(kind=ik), intent(in) :: a, b
do least_common_multiple = a, a*(b-1), a
if(mod(least_common_multiple,b)==0) exit
enddo
! if the loop is left regularly, least_common_multiple = a*b
end function least_common_multiple
subroutine check_alloc(function_name, variable_name, istat, errorMessage)
use precision
......@@ -174,8 +252,38 @@ module ELPA_utilities
character(len=*), intent(in) :: errorMessage
if (istat .ne. 0) then
print *, function_name, ": error when allocating ", variable_name, " ", errorMessage
stop
print *, function_name, ": error when allocating ", variable_name, " ", errorMessage
stop
endif
end subroutine
subroutine check_alloc_CUDA(function_name, variable_name, successCUDA)
use precision
implicit none
character(len=*), intent(in) :: function_name
character(len=*), intent(in) :: variable_name
logical :: successCUDA
if (.not.(successCUDA)) then
print *, function_name, ": error in cuda_malloc when allocating ", variable_name
stop
endif
end subroutine
subroutine check_memcpy_CUDA(file_name, line, successCUDA)
use precision
implicit none
character(len=*), intent(in) :: file_name
integer(kind=ik), intent(in) :: line
logical :: successCUDA
if (.not.(successCUDA)) then
print *, file_name, ":", line, " error in cuda_memcpy when copying "
stop
endif
end subroutine
......
......@@ -345,6 +345,35 @@ module cuda_functions
end subroutine cublas_ctrmm_c
end interface
interface
subroutine cublas_dgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) bind(C,name='cublasDgemv')
use iso_c_binding
implicit none
character(1,C_CHAR),value :: cta
integer(kind=C_INT),value :: m,n
integer(kind=C_INT), intent(in), value :: lda,incx,incy
real(kind=C_DOUBLE),value :: alpha,beta
integer(kind=C_intptr_T), value :: a, x, y
end subroutine cublas_dgemv_c
end interface
interface
subroutine cublas_sgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy) bind(C,name='cublasSgemv')
use iso_c_binding
implicit none
character(1,C_CHAR),value :: cta
integer(kind=C_INT),value :: m,n
integer(kind=C_INT), intent(in), value :: lda,incx,incy
real(kind=C_FLOAT),value :: alpha,beta
integer(kind=C_intptr_T), value :: a, x, y
end subroutine cublas_sgemv_c
end interface
contains
......@@ -661,5 +690,33 @@ module cuda_functions
#endif
end subroutine cublas_ctrmm
subroutine cublas_dgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
use iso_c_binding
implicit none
character(1,C_CHAR),value :: cta
integer(kind=C_INT) :: m,n
integer(kind=C_INT), intent(in) :: lda,incx,incy
real(kind=C_DOUBLE) :: alpha,beta
integer(kind=C_intptr_T) :: a, x, y
#ifdef WITH_GPU_VERSION
call cublas_dgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
end subroutine cublas_dgemv
subroutine cublas_sgemv(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
use iso_c_binding
implicit none
character(1,C_CHAR),value :: cta
integer(kind=C_INT) :: m,n
integer(kind=C_INT), intent(in) :: lda,incx,incy
real(kind=C_FLOAT) :: alpha,beta
integer(kind=C_intptr_T) :: a, x, y
#ifdef WITH_GPU_VERSION
call cublas_sgemv_c(cta, m, n, alpha, a, lda, x, incx, beta, y, incy)
#endif
end subroutine cublas_sgemv
end module cuda_functions
......@@ -63,6 +63,7 @@
#define M_cublas_PRECISSION_gemm cublas_dgemm
#define M_cublas_PRECISSION_trmm cublas_dtrmm
#define M_cublas_PRECISSION_gemv cublas_dgemv
#define M_PRECISSION_SUFFIX "_double"
#define M_CONST_0_0 0.0_rk8
......@@ -70,7 +71,7 @@
#define M_CONST_1_0 1.0_rk8
#define M_CONST_2_0 2.0_rk8
#define M_CONST_8_0 8.0_rk8
#define M_size_of_PRECISSION_real_datatape size_of_double_real_datatype
#define M_size_of_PRECISSION_real size_of_double_real_datatype
#define M_MPI_REAL_PRECISSION MPI_REAL8
#else
......@@ -139,6 +140,7 @@
#undef M_cublas_PRECISSION_gemm
#undef M_cublas_PRECISSION_trmm
#undef M_cublas_PRECISSION_gemv
#undef M_PRECISSION_SUFFIX
#undef M_CONST_0_0
......@@ -146,7 +148,7 @@
#undef M_CONST_1_0
#undef M_CONST_2_0
#undef M_CONST_8_0
#undef M_size_of_PRECISSION_real_datatape
#undef M_size_of_PRECISSION_real
#undef M_MPI_REAL_PRECISSION
#define M_elpa_transpose_vectors_real_PRECISSION elpa_transpose_vectors_real_single
......@@ -213,6 +215,7 @@
#define M_cublas_PRECISSION_gemm cublas_sgemm
#define M_cublas_PRECISSION_trmm cublas_strmm
#define M_cublas_PRECISSION_gemv cublas_sgemv
#define M_PRECISSION_SUFFIX "_single"
#define M_CONST_0_0 0.0_rk4
......@@ -220,7 +223,7 @@
#define M_CONST_1_0 1.0_rk4
#define M_CONST_2_0 2.0_rk4
#define M_CONST_8_0 8.0_rk4
#define M_size_of_PRECISSION_real_datatape size_of_single_real_datatype
#define M_size_of_PRECISSION_real size_of_single_real_datatype
#define M_MPI_REAL_PRECISSION MPI_REAL4
#endif
......@@ -65,6 +65,44 @@
endif
#endif
#ifdef ELPA1
#ifdef REALCASE
#ifdef DOUBLE_PRECISION_REAL
if (myid .eq. 0) then
print *," "
print *,"Real valued double-precision version of ELPA1 is used"
print *," "
endif
#else
if (myid .eq. 0) then
print *," "
print *,"Real valued single-precision version of ELPA1 is used"
print *," "
endif
#endif
#endif
#ifdef COMPLEXCASE
#ifdef DOUBLE_PRECISION_COMPLEX
if (myid .eq. 0) then
print *," "
print *,"Complex valued double-precision version of ELPA1 is used"
print *," "
endif
#else
if (myid .eq. 0) then
print *," "
print *,"Complex valued single-precision version of ELPA1 is used"
print *," "
endif
#endif
#endif /* DATATYPE */
#else /* ELPA1 */
#ifdef REALCASE
#ifdef DOUBLE_PRECISION_REAL
if (myid .eq. 0) then
......@@ -99,6 +137,8 @@
#endif /* DATATYPE */
#endif /* ELPA1 */
#ifdef WITH_MPI
call MPI_BARRIER(MPI_COMM_WORLD, mpierr)
#endif
......
......@@ -182,7 +182,7 @@ program test_real_double_precision
if(myid==0) then
print *
print '(a)','Standard eigenvalue problem - REAL version'
print '(a)','Standard eigenvalue problem - ELPA1, REAL version'
print *
print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment