Commit 7bbb47f6 authored by Pavel Kus

make complex gpu tests accept useGPU flag

Conflicts:
	src/elpa1.F90
	src/elpa1_tridiag_complex_template.X90
parent 2e8df093
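
Summary, with a condensed sketch of the new control flow (assembled from the hunks below, not verbatim source): the complex 1-stage drivers now decide a useGPU flag, using the rank on the newly queried mpi_comm_all for the device check, and pass the flag down to the tridiagonalization and back-transformation templates.

    ! Condensed sketch; all names appear in the diff below.
    useGPU = .false.
    if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
      ! my_pe is the rank on mpi_comm_all, obtained by the new code below
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) useGPU = .true.
    endif
    call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
    call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)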
@@ -525,12 +525,13 @@ end function get_elpa_communicators
 function solve_evp_real_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
           matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
           THIS_REAL_ELPA_KERNEL_API) result(success)
-  use iso_c_binding
+  use precision
   use cuda_functions
   use mod_check_for_gpu
 #ifdef HAVE_DETAILED_TIMINGS
   use timings
 #endif
+  use iso_c_binding
   use elpa_mpi
   use elpa1_compute
   implicit none
@@ -883,6 +884,8 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
   use timings
 #endif
   use precision
+  use cuda_functions
+  use mod_check_for_gpu
   use iso_c_binding
   use elpa_mpi
   use elpa1_compute
@@ -896,15 +899,15 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
 #endif
   real(kind=REAL_DATATYPE) :: ev(na)
-  integer(kind=c_int) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+  integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int) :: l_rows, l_cols, l_cols_nev
   real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:)
   complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:)
   real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
   logical :: success
   logical, save :: firstCall = .true.
   logical :: wantDebug
   integer(kind=ik), intent(in), optional :: THIS_REAL_ELPA_KERNEL_API
   integer(kind=ik) :: THIS_REAL_ELPA_KERNEL
@@ -919,6 +922,10 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
 #ifdef HAVE_DETAILED_TIMINGS
   call timer%start("mpi_communication")
 #endif
+  call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
+  call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -936,6 +943,34 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
     firstCall = .false.
   endif
+  useGPU = .false.
+
+  if (present(THIS_REAL_ELPA_KERNEL_API)) then
+    ! user defined kernel via the optional argument in the API call
+    THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
+  else
+    ! if the kernel is not chosen via the API,
+    ! check whether it is set by an environment variable
+    THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
+  endif
+
+  if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
+    if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
+      useGPU = .true.
+    endif
+    if (nblk .ne. 128) then
+      print *,"At the moment GPU version needs blocksize 128"
+      error stop
+    endif
+
+    ! set the necessary parameters
+    cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
+    cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
+    cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
+    cudaHostRegisterPortable = cuda_hostRegisterPortable()
+    cudaHostRegisterMapped   = cuda_hostRegisterMapped()
+  endif
+
   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
@@ -947,9 +982,9 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
   ttt0 = MPI_Wtime()
 #ifdef DOUBLE_PRECISION_COMPLEX
-  call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
+  call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
 #else
-  call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
+  call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
 #endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
@@ -972,9 +1007,9 @@ function solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, matr
   ttt0 = MPI_Wtime()
   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
 #ifdef DOUBLE_PRECISION_COMPLEX
-  call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
+  call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 #else
-  call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
+  call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 #endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
@@ -1041,6 +1076,8 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
   use timings
 #endif
   use precision
+  use cuda_functions
+  use mod_check_for_gpu
   use iso_c_binding
   use elpa_mpi
   use elpa1_compute
@@ -1054,8 +1091,8 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
 #endif
   real(kind=REAL_DATATYPE) :: ev(na)
-  integer(kind=c_int) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+  integer(kind=c_int) :: my_pe, n_pes, my_prow, my_pcol, np_rows, np_cols, mpierr
   integer(kind=c_int) :: l_rows, l_cols, l_cols_nev
   real(kind=REAL_DATATYPE), allocatable :: q_real(:,:), e(:)
   complex(kind=COMPLEX_DATATYPE), allocatable :: tau(:)
   real(kind=c_double) :: ttt0, ttt1 ! MPI_WTIME always needs double
@@ -1077,6 +1114,10 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
 #ifdef HAVE_DETAILED_TIMINGS
   call timer%start("mpi_communication")
 #endif
+  call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
+  call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
   call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
   call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
   call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
@@ -1094,6 +1135,33 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
     firstCall = .false.
   endif
+  useGPU = .false.
+
+  if (present(THIS_REAL_ELPA_KERNEL_API)) then
+    ! user defined kernel via the optional argument in the API call
+    THIS_REAL_ELPA_KERNEL = THIS_REAL_ELPA_KERNEL_API
+  else
+    ! if the kernel is not chosen via the API,
+    ! check whether it is set by an environment variable
+    THIS_REAL_ELPA_KERNEL = DEFAULT_REAL_ELPA_KERNEL
+  endif
+
+  if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
+    if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
+      useGPU = .true.
+    endif
+    if (nblk .ne. 128) then
+      print *,"At the moment GPU version needs blocksize 128"
+      error stop
+    endif
+
+    ! set the necessary parameters
+    cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
+    cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
+    cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
+    cudaHostRegisterPortable = cuda_hostRegisterPortable()
+    cudaHostRegisterMapped   = cuda_hostRegisterMapped()
+  endif
   l_rows = local_index(na, my_prow, np_rows, nblk, -1) ! Local rows of a and q
   l_cols = local_index(na, my_pcol, np_cols, nblk, -1) ! Local columns of q
@@ -1105,9 +1173,9 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
   ttt0 = MPI_Wtime()
 #ifdef DOUBLE_PRECISION_COMPLEX
-  call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
+  call tridiag_complex_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
 #else
-  call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau)
+  call tridiag_complex_single(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, useGPU)
 #endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_complex :',ttt1-ttt0
@@ -1130,9 +1198,9 @@ function solve_evp_complex_1stage_single(na, nev, a, lda, ev, q, ldq, nblk, matr
   ttt0 = MPI_Wtime()
   q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
 #ifdef DOUBLE_PRECISION_COMPLEX
-  call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
+  call trans_ev_complex_double(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 #else
-  call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
+  call trans_ev_complex_single(na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 #endif
   ttt1 = MPI_Wtime()
   if(my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_complex:',ttt1-ttt0
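
The hunks below are the receiving side of the flag: both complex template routines gain a useGPU dummy argument. The GPU code paths themselves are not part of this excerpt; the following is a hypothetical, self-contained illustration (not from this commit) of the dummy-argument pattern the templates adopt, where the caller decides useGPU once and each stage branches on it:

    program usegpu_flag_demo
      implicit none
      call tridiag_step(useGPU=.false.)   ! host path chosen by the caller
    contains
      subroutine tridiag_step(useGPU)
        logical, intent(in) :: useGPU
        if (useGPU) then
          print *, 'device path (placeholder for the CUDA branch)'
        else
          print *, 'host path (placeholder for the BLAS branch)'
        end if
      end subroutine tridiag_step
    end program usegpu_flag_demo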
@@ -86,7 +86,7 @@
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine trans_ev_complex_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols)
+subroutine trans_ev_complex_PRECISION(na, nqc, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
 #ifdef HAVE_DETAILED_TIMINGS
   use timings
@@ -103,6 +103,8 @@
 #else
   complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols)
 #endif
+  logical, intent(in) :: useGPU
+
   integer(kind=ik) :: max_stored_rows
 #ifdef DOUBLE_PRECISION_COMPLEX
   complex(kind=ck8), parameter :: CZERO = (0.0_rk8,0.0_rk8), CONE = (1.0_rk8,0.0_rk8)
@@ -80,7 +80,7 @@
 !>
 !> \param useGPU If true, GPU version of the subroutine will be used
 !>
-subroutine tridiag_complex_PRECISION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau)
+subroutine tridiag_complex_PRECISION(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, d, e, tau, useGPU)
 #ifdef HAVE_DETAILED_TIMINGS
   use timings
 #else
@@ -89,7 +89,9 @@
   use precision
   implicit none
-  integer(kind=ik) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+  integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
+  logical, intent(in) :: useGPU
+
   complex(kind=COMPLEX_DATATYPE) :: tau(na)
 #ifdef USE_ASSUMED_SIZE
   complex(kind=COMPLEX_DATATYPE) :: a(lda,*)
@@ -174,7 +176,7 @@
   allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
   call check_alloc("tridiag_complex", "uc_p", istat, errorMessage)
 #endif
   tmp = 0
   v_row = 0
   u_row = 0
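
For completeness, a hedged usage sketch from the caller's side (hypothetical test driver, not part of this commit): selecting the GPU kernel through the optional argument is what makes the driver set useGPU and forward it. The complex driver's full argument list is assumed here to mirror solve_evp_real_1stage_double from the first hunk.

    ! Hypothetical call from a test program; REAL_ELPA_KERNEL_GPU and the
    ! optional keyword argument are taken from the diff above.
    success = solve_evp_complex_1stage_double(na, nev, a, lda, ev, q, ldq, nblk, &
              matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all,            &
              THIS_REAL_ELPA_KERNEL_API=REAL_ELPA_KERNEL_GPU)
    ! With the GPU kernel selected, the driver requires nblk == 128 and
    ! stops otherwise (see the added checks above).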