Some debug changes for ELPA GPU

parent 2845f74c
This diff is collapsed.
......@@ -113,9 +113,9 @@
end function
!c> int elpa_solve_evp_real_stage2(int na, int nev, int ncols, double *a, int lda, double *ev, double *q, int ldq, int nblk, int mpi_comm_rows, int mpi_comm_cols, int THIS_REAL_ELPA_KERNEL_API, int useQR);
!c> int elpa_solve_evp_real_2stage(int na, int nev, int ncols, double *a, int lda, double *ev, double *q, int ldq, int nblk, int na_rows, int na_cols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_REAL_ELPA_KERNEL_API, int useQR);
function solve_elpa2_evp_real_wrapper(na, nev, ncols, a, lda, ev, q, ldq, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API, useQR) &
result(success) bind(C,name="elpa_solve_evp_real_2stage")
......@@ -123,7 +123,7 @@
use elpa2, only : solve_evp_real_2stage
integer(kind=c_int) :: success
integer(kind=c_int), value, intent(in) :: na, nev, ncols, lda, ldq, nblk, mpi_comm_cols, mpi_comm_rows, &
integer(kind=c_int), value, intent(in) :: na, nev, ncols, lda, ldq, nblk, na_rows, na_cols, mpi_comm_cols, mpi_comm_rows, &
mpi_comm_all
integer(kind=c_int), value, intent(in) :: THIS_REAL_ELPA_KERNEL_API, useQR
real(kind=c_double) :: a(1:lda,1:ncols), ev(1:na), q(1:ldq,1:ncols)
......@@ -138,7 +138,7 @@
useQRFortran = .true.
endif
successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
successFortran = solve_evp_real_2stage(na, nev, a, lda, ev, q, ldq, nblk, na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_REAL_ELPA_KERNEL_API, useQRFortran)
if (successFortran) then
......@@ -149,9 +149,9 @@
end function
! int elpa_solve_evp_complex_stage2(int na, int nev, int ncols, double_complex *a, int lda, double *ev, double_complex *q, int ldq, int nblk, int mpi_comm_rows, int mpi_comm_cols);
! int elpa_solve_evp_complex_2stage(int na, int nev, int ncols, double_complex *a, int lda, double *ev, double_complex *q, int ldq, int nblk, int na_rows, int na_cols, int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all, int THIS_COMPLEX_ELPA_KERNEL_API);
function solve_elpa2_evp_complex_wrapper(na, nev, ncols, a, lda, ev, q, ldq, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
THIS_COMPLEX_ELPA_KERNEL_API) &
result(success) bind(C,name="elpa_solve_evp_complex_2stage")
......@@ -159,14 +159,14 @@
use elpa2, only : solve_evp_complex_2stage
integer(kind=c_int) :: success
integer(kind=c_int), value, intent(in) :: na, nev, ncols, lda, ldq, nblk, mpi_comm_cols, mpi_comm_rows, &
integer(kind=c_int), value, intent(in) :: na, nev, ncols, lda, ldq, nblk, na_rows, na_cols, mpi_comm_cols, mpi_comm_rows, &
mpi_comm_all
integer(kind=c_int), value, intent(in) :: THIS_COMPLEX_ELPA_KERNEL_API
complex(kind=c_double_complex) :: a(1:lda,1:ncols), q(1:ldq,1:ncols)
real(kind=c_double) :: ev(1:na)
logical :: successFortran
successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, mpi_comm_rows, mpi_comm_cols, &
successFortran = solve_evp_complex_2stage(na, nev, a, lda, ev, q, ldq, nblk, na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL_API)
if (successFortran) then
......
......@@ -346,7 +346,7 @@ program test_complex2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
if (.not.(success)) then
write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..."
......
......@@ -79,6 +79,10 @@ program test_complex2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use elpa2_utilities
use mod_read_input_parameters
......@@ -113,39 +117,44 @@ program test_complex2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: ev(:), xr(:,:)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -160,6 +169,17 @@ program test_complex2
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
#ifdef WITH_OPENMP
......@@ -324,7 +344,7 @@ program test_complex2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
......
......@@ -79,6 +79,9 @@ program test_complex2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use elpa2_utilities
use mod_read_input_parameters
......@@ -112,38 +115,43 @@ program test_complex2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: ev(:), xr(:,:)
real*8, allocatable :: ev(:), xr(:,:)
complex*16, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
complex*16, parameter :: CZERO = (0.d0,0.d0), CONE = (1.d0,0.d0)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -157,6 +165,15 @@ program test_complex2
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
......@@ -325,7 +342,7 @@ program test_complex2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_complex_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
if (.not.(success)) then
write(error_unit,*) "solve_evp_complex_2stage produced an error! Aborting..."
......
......@@ -80,6 +80,9 @@ program test_real2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
#ifdef WITH_OPENMP
use test_util
#endif
......@@ -109,35 +112,39 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -152,6 +159,16 @@ program test_real2
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
#ifdef WITH_OPENMP
if (myid .eq. 0) then
......@@ -180,6 +197,9 @@ program test_real2
if (myid .eq. 0) then
print *," "
print *,"This ELPA2 is build with"
#ifdef WITH_GPU_SUPPORT
print *,"GPU support"
#endif
#ifdef WITH_REAL_AVX_BLOCK2_KERNEL
print *,"AVX optimized kernel (2 blocking) for real matrices"
#endif
......@@ -321,7 +341,7 @@ program test_real2
end if
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, na_rows, na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
if (.not.(success)) then
......
......@@ -80,6 +80,9 @@ program test_real2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use elpa2_utilities
......@@ -114,34 +117,40 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
write_to_file = .false.
success = .true.
......@@ -157,6 +166,16 @@ program test_real2
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
#ifdef WITH_OPENMP
......@@ -313,7 +332,7 @@ program test_real2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world, &
REAL_ELPA_KERNEL_GENERIC_SIMPLE)
if (.not.(success)) then
......
......@@ -80,6 +80,10 @@ program test_real2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use elpa2_utilities
use mod_read_input_parameters
......@@ -112,33 +116,39 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6
#endif
logical :: success
logical :: success
#ifdef WITH_GPU_VERSION
character(len=1024) :: envname
integer :: istat, devnum
#endif
success = .true.
write_to_file = .false.
......@@ -153,6 +163,16 @@ program test_real2
! MPI Initialization
call setup_mpi(myid, nprocs)
#ifdef WITH_GPU_VERSION
devnum = 0
istat = cuda_setdevice(devnum)
if (istat .ne. 0) then
print *,"Cannot set CudaDevice"
stop
endif
#endif
STATUS = 0
#ifdef WITH_OPENMP
......@@ -312,7 +332,7 @@ program test_real2
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
success = solve_evp_real_2stage(na, nev, a, na_rows, ev, z, na_rows, nblk, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
na_rows, na_cols, mpi_comm_rows, mpi_comm_cols, mpi_comm_world)
if (.not.(success)) then
write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..."
......
......@@ -81,6 +81,10 @@ program test_real2
use ELPA1
use ELPA2
#ifdef WITH_GPU_VERSION
use cuda_routines
#endif
use elpa2_utilities
use mod_read_input_parameters
......@@ -113,34 +117,40 @@ program test_real2
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer :: nblk
integer na, nev
integer :: nblk
integer :: na, nev
!-------------------------------------------------------------------------------
! Local Variables
integer np_rows, np_cols, na_rows, na_cols
integer :: np_rows, np_cols, na_rows, na_cols
integer myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer, external :: numroc
integer, external :: numroc
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
real*8, allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
integer :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer :: STATUS
#ifdef WITH_OPENMP
integer :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
integer :: omp_get_max_threads, required_mpi_thread_level, &
provided_mpi_thread_level
#endif
logical :: write_to_file
logical :: write_to_file
#ifndef HAVE_ISO_FORTRAN_ENV
integer, parameter :: error_unit = 6
integer, parameter :: error_unit = 6