Commit cf009642 authored by Pavel Kus

interface for already banded matrix added

parent 1dccb3c6
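
This commit adds an optional bandwidth argument to the two-stage solvers, so a matrix that is already banded can skip the full-to-band reduction. As a guide to the diff below, here is an excerpt-style calling sketch for the double-precision real case, following the call pattern of the test program added in this commit; it is not a standalone program, and the setup of MPI, the BLACS grid and the distributed arrays a, ev, z is assumed to be in place as in that test:

    ! bandwidth must be a nonzero multiple of nblk; the input matrix is assumed
    ! to be banded, i.e. a(i,j) = 0 for abs(i-j) > bandwidth
    successELPA = elpa_solve_evp_real_2stage_double(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, &
                                                    mpi_comm_rows, mpi_comm_cols, mpi_comm_world,       &
                                                    bandwidth = bandwidth)
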
@@ -315,6 +315,7 @@ dist_files_DATA = \
test/Fortran/test_real2_default.F90 \
test/Fortran/test_real2_qr.F90 \
test/Fortran/test_real2_api.F90 \
test/Fortran/test_real2_banded.F90 \
test/Fortran/test_real.F90 \
test/Fortran/test_real_with_c.F90 \
test/Fortran/test_toeplitz.F90 \
@@ -345,6 +346,7 @@ noinst_PROGRAMS = \
elpa2_test_real_default@SUFFIX@ \
elpa2_test_real_qr@SUFFIX@ \
elpa2_test_real_api@SUFFIX@ \
elpa2_test_real_banded@SUFFIX@ \
elpa2_test_complex@SUFFIX@ \
elpa2_test_complex_default@SUFFIX@ \
elpa2_test_complex_api@SUFFIX@ \
@@ -532,6 +534,11 @@ elpa2_test_real_api@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_api@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_api@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa2_test_real_banded@SUFFIX@_SOURCES = test/Fortran/test_real2_banded.F90
elpa2_test_real_banded@SUFFIX@_LDADD = $(build_lib)
elpa2_test_real_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
EXTRA_elpa2_test_real_banded@SUFFIX@_DEPENDENCIES = test/Fortran/elpa_print_headers.X90
elpa1_test_complex@SUFFIX@_SOURCES = test/Fortran/test_complex.F90
elpa1_test_complex@SUFFIX@_LDADD = $(build_lib)
elpa1_test_complex@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) @FC_MODOUT@private_modules @FC_MODINC@private_modules
@@ -731,6 +738,7 @@ check_SCRIPTS = \
elpa2_test_real_qr@SUFFIX@.sh \
elpa2_test_complex_default@SUFFIX@.sh \
elpa2_test_real_api@SUFFIX@.sh \
elpa2_test_real_banded@SUFFIX@.sh \
elpa2_test_complex_api@SUFFIX@.sh \
elpa_driver_real@SUFFIX@.sh \
elpa_driver_complex@SUFFIX@.sh \
......
#if REALCASE == 1
function solve_evp_real_2stage_PRECISION (na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useQR, useGPU) result(success)
mpi_comm_all, THIS_ELPA_KERNEL_API, useQR, useGPU, bandwidth) result(success)
#elif COMPLEXCASE == 1
function solve_evp_complex_2stage_PRECISION(na, nev, a, lda, ev, q, ldq, nblk, &
matrixCols, mpi_comm_rows, mpi_comm_cols, &
mpi_comm_all, THIS_ELPA_KERNEL_API, useGPU) result(success)
mpi_comm_all, THIS_ELPA_KERNEL_API, useGPU, bandwidth) result(success)
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
@@ -23,8 +23,11 @@
logical, intent(in), optional :: useGPU
#if REALCASE == 1
logical, intent(in), optional :: useQR
#endif
#endif
logical :: useQRActual, useQREnvironment
integer(kind=c_int), intent(in), optional :: bandwidth
integer(kind=c_int), intent(in), optional :: THIS_ELPA_KERNEL_API
integer(kind=c_int) :: THIS_ELPA_KERNEL
@@ -190,44 +193,61 @@
endif
endif
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
! on this and maybe allow a run-time optimization here
if (do_useGPU) then
nbw = nblk
if(present(bandwidth)) then
nbw = bandwidth
if ((nbw == 0) .or. (mod(nbw, nblk) .ne. 0)) then
if (wantDebug) then
write(error_unit,*) "Specified bandwidth has to be a multiple of blocksize"
endif
print *, "Specified bandwidth has to be a multiple of blocksize"
success = .false.
return
endif
ttts = MPI_Wtime()
else
! Choose bandwidth, must be a multiple of nblk, set to a value >= 32
! On older systems (IBM Bluegene/P, Intel Nehalem) a value of 32 was optimal.
! For Intel(R) Xeon(R) E5 v2 and v3, better use 64 instead of 32!
! For IBM Bluegene/Q this is not clear at the moment. We have to keep an eye
! on this and maybe allow a run-time optimization here
if (do_useGPU) then
nbw = nblk
else
#if REALCASE == 1
nbw = (63/nblk+1)*nblk
nbw = (63/nblk+1)*nblk
#elif COMPLEXCASE == 1
nbw = (31/nblk+1)*nblk
nbw = (31/nblk+1)*nblk
#endif
endif
endif
num_blocks = (na-1)/nbw + 1
num_blocks = (na-1)/nbw + 1
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when allocating tmat "//errorMessage
stop
endif
allocate(tmat(nbw,nbw,num_blocks), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when allocating tmat "//errorMessage
stop
endif
! Reduction full -> band
! Reduction full -> band
ttt0 = MPI_Wtime()
ttts = ttt0
ttt0 = MPI_Wtime()
ttts = ttt0
#if REALCASE == 1
call bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
call bandred_real_PRECISION(na, a, a_dev, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, tmat_dev, wantDebug, do_useGPU, success, useQRActual)
#elif COMPLEXCASE == 1
call bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, do_useGPU, success)
call bandred_complex_PRECISION(na, a, lda, nblk, nbw, matrixCols, num_blocks, mpi_comm_rows, mpi_comm_cols, &
tmat, wantDebug, do_useGPU, success)
#endif
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//bandred_NUMBER_PRECISION_STR//' :',ttt1-ttt0
if (.not.(success)) return
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//bandred_NUMBER_PRECISION_STR//' :',ttt1-ttt0
end if ! matrix not already banded on input
! Reduction band -> tridiagonal
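
The hunk above only validates a caller-supplied bandwidth and otherwise keeps the existing defaults derived from nblk. A small standalone sketch of that selection logic (illustration only, not ELPA code; nblk and nbw are just local names here):

    program bandwidth_choice_sketch
      implicit none
      integer :: nblk, nbw
      nblk = 16
      nbw  = 48                                    ! caller-supplied bandwidth
      if (nbw == 0 .or. mod(nbw, nblk) /= 0) then  ! accepted only if a nonzero multiple of nblk
        print *, 'Specified bandwidth has to be a multiple of blocksize'
      else
        print *, 'accepted bandwidth nbw =', nbw
      end if
      ! default in the real case: smallest multiple of nblk not smaller than 64
      print *, 'default nbw for nblk =', nblk, ':', (63/nblk + 1)*nblk
    end program bandwidth_choice_sketch
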
@@ -327,32 +347,36 @@
stop
endif
! Backtransform stage 2
ttt0 = MPI_Wtime()
if(present(bandwidth)) then
time_evp_back = ttt1-ttts
else
! Backtransform stage 2
ttt0 = MPI_Wtime()
#if REALCASE == 1
call trans_ev_band_to_full_real_PRECISION(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, &
q, q_dev, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU, useQRActual)
call trans_ev_band_to_full_real_PRECISION(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, &
q, q_dev, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU, useQRActual)
#elif COMPLEXCASE == 1
call trans_ev_band_to_full_complex_PRECISION(na, nev, nblk, nbw, a, lda, tmat, &
q, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU)
call trans_ev_band_to_full_complex_PRECISION(na, nev, nblk, nbw, a, lda, tmat, &
q, ldq, matrixCols, num_blocks, &
mpi_comm_rows, mpi_comm_cols, do_useGPU)
#endif
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//trans_ev_band_to_full_NUMBER_PRECISION_STR//' :',ttt1-ttt0
time_evp_back = ttt1-ttts
ttt1 = MPI_Wtime()
if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) &
write(error_unit,*) 'Time '//trans_ev_band_to_full_NUMBER_PRECISION_STR//' :',ttt1-ttt0
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when deallocating tmat"//errorMessage
stop
endif
time_evp_back = ttt1-ttts
deallocate(tmat, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,solve_evp_NUMBER_2stage_PRECISION_STR//": error when deallocating tmat"//errorMessage
stop
endif
endif ! not present(bandwidth)
call timer%stop(solve_evp_NUMBER_2stage_PRECISION_STR)
1 format(a,f10.3)
end function solve_evp_NUMBER_2stage_PRECISION
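
With the new argument the solver branches on present(bandwidth): for an already banded input both the full-to-band reduction (bandred) and the band-to-full back-transformation (trans_ev_band_to_full) are skipped, and only the band-to-tridiagonal part of the pipeline runs. A tiny standalone sketch of that optional-argument control flow (illustration only, not ELPA code):

    program banded_branch_sketch
      implicit none
      call plan_stages()                ! dense input: full two-stage pipeline
      call plan_stages(bandwidth = 64)  ! already banded input
    contains
      subroutine plan_stages(bandwidth)
        integer, intent(in), optional :: bandwidth
        if (present(bandwidth)) then
          print *, 'banded input, nbw =', bandwidth, &
                   ': skip bandred and trans_ev_band_to_full'
        else
          print *, 'dense input: run bandred ... trans_ev_band_to_full'
        end if
      end subroutine plan_stages
    end program banded_branch_sketch
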
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
#include "config-f90.h"
!>
!> Fortran test program that demonstrates the use of
!> the ELPA 2 real case library.
!> If "HAVE_REDIRECT" was defined at build time
!> the stdout and stderr output of each MPI task
!> can be redirected to files if the environment
!> variable "REDIRECT_ELPA_TEST_OUTPUT" is set
!> to "true".
!>
!> By calling executable [arg1] [arg2] [arg3] [arg4]
!> one can define the size (arg1), the number of
!> eigenvectors to compute (arg2), and the blocking (arg3).
!> If these values are not set, the default values (4000, 1500, 16)
!> are chosen.
!> If these values are set, the 4th argument can be
!> "output", which specifies that the EVs are written to
!> an ASCII file.
!>
!> The real ELPA 2 kernel is set as the default kernel.
!> However, this can be overridden by setting
!> the environment variable "REAL_ELPA_KERNEL" to an
!> appropriate value.
!>
program test_real2_double_precision
!-------------------------------------------------------------------------------
! Standard eigenvalue problem - REAL version
!
! This program demonstrates the use of the ELPA module
! together with standard scalapack routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
!-------------------------------------------------------------------------------
use precision
use ELPA1
use ELPA2
use mod_check_for_gpu, only : check_for_gpu
use elpa_utilities, only : error_unit
#ifdef WITH_OPENMP
use test_util
#endif
use mod_read_input_parameters
use mod_check_correctness
use mod_setup_mpi
use mod_blacs_infrastructure
use mod_prepare_matrix
use elpa_mpi
#ifdef HAVE_REDIRECT
use redirect
#endif
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
use output_types
implicit none
!-------------------------------------------------------------------------------
! Please set system size parameters below!
! na: System size
! nev: Number of eigenvectors to be calculated
! nblk: Blocking factor in block cyclic distribution
!-------------------------------------------------------------------------------
integer(kind=ik) :: nblk
integer(kind=ik) :: na, nev
integer(kind=ik) :: np_rows, np_cols, na_rows, na_cols
integer(kind=ik) :: myid, nprocs, my_prow, my_pcol, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: i, mpierr, my_blacs_ctxt, sc_desc(9), info, nprow, npcol
integer(kind=ik), external :: numroc
real(kind=rk8), allocatable :: a(:,:), z(:,:), tmp1(:,:), tmp2(:,:), as(:,:), ev(:)
integer(kind=ik) :: iseed(4096) ! Random seed, size should be sufficient for every generator
integer(kind=ik) :: STATUS
#ifdef WITH_OPENMP
integer(kind=ik) :: omp_get_max_threads, required_mpi_thread_level, provided_mpi_thread_level
#endif
logical :: successELPA, success
integer(kind=ik) :: numberOfDevices
logical :: gpuAvailable
type(output_t) :: write_to_file
character(len=8) :: task_suffix
integer(kind=ik) :: j
integer(kind=ik) :: global_row, global_col, local_row, local_col
integer(kind=ik) :: bandwidth
#define DOUBLE_PRECISION_REAL 1
successELPA = .true.
gpuAvailable = .false.
call read_input_parameters(na, nev, nblk, write_to_file)
!-------------------------------------------------------------------------------
! MPI Initialization
call setup_mpi(myid, nprocs)
gpuAvailable = check_for_gpu(myid, numberOfDevices)
STATUS = 0
#define REALCASE
#include "elpa_print_headers.X90"
#ifdef HAVE_DETAILED_TIMINGS
! initialise the timing functionality
#ifdef HAVE_LIBPAPI
call timer%measure_flops(.true.)
#endif
call timer%measure_allocated_memory(.true.)
call timer%measure_virtual_memory(.true.)
call timer%measure_max_allocated_memory(.true.)
call timer%set_print_options(&
#ifdef HAVE_LIBPAPI
print_flop_count=.true., &
print_flop_rate=.true., &
#endif
print_allocated_memory = .true. , &
print_virtual_memory=.true., &
print_max_allocated_memory=.true.)
call timer%enable()
call timer%start("program: test_real2_double_precision")
#endif
!-------------------------------------------------------------------------------
! Selection of number of processor rows/columns
! We try to set up the grid square-like, i.e. start the search for possible
! divisors of nprocs with a number next to the square root of nprocs
! and decrement it until a divisor is found.
do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
if(mod(nprocs,np_cols) == 0 ) exit
enddo
! at the end of the above loop, nprocs is always divisible by np_cols
np_rows = nprocs/np_cols
if(myid==0) then
print *
print '(a)','Standard eigenvalue problem - REAL version'
print *
print '(3(a,i0))','Matrix size=',na,', Number of eigenvectors=',nev,', Block size=',nblk
print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
print *
endif
!-------------------------------------------------------------------------------
! Set up BLACS context and MPI communicators
!
! The BLACS context is only necessary for using Scalapack.
!
! For ELPA, the MPI communicators along rows/cols are sufficient,
! and the grid setup may be done in an arbitrary way as long as it is
! consistent (i.e. 0<=my_prow<np_rows, 0<=my_pcol<np_cols and every
! process has a unique (my_prow,my_pcol) pair).
call set_up_blacsgrid(mpi_comm_world, my_blacs_ctxt, np_rows, np_cols, &
nprow, npcol, my_prow, my_pcol)
if (myid==0) then
print '(a)','| Past BLACS_Gridinfo.'
end if
! All ELPA routines need MPI communicators for communicating within
! rows or columns of processes, these are set in elpa_get_communicators.
mpierr = elpa_get_communicators(mpi_comm_world, my_prow, my_pcol, &
mpi_comm_rows, mpi_comm_cols)
if (myid==0) then
print '(a)','| Past split communicator setup for rows and columns.'
end if
call set_up_blacs_descriptor(na ,nblk, my_prow, my_pcol, np_rows, np_cols, &
na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
if (myid==0) then
print '(a)','| Past scalapack descriptor setup.'
end if
!-------------------------------------------------------------------------------
! Allocate matrices and set up a test matrix for the eigenvalue problem
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("set up matrix")
#endif
allocate(a (na_rows,na_cols))
allocate(z (na_rows,na_cols))
allocate(as(na_rows,na_cols))
allocate(ev(na))
call prepare_matrix_double(na, myid, sc_desc, iseed, a, z, as)
! set values outside of the bandwidth to zero
bandwidth = nblk
do local_row = 1, na_rows
global_row = index_l2g( local_row, nblk, my_prow, np_rows )
do local_col = 1, na_cols
global_col = index_l2g( local_col, nblk, my_pcol, np_cols )
if (ABS(global_row-global_col) > bandwidth) then
a(local_row, local_col) = 0.0
as(local_row, local_col) = 0.0
end if
end do
end do
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("set up matrix")
#endif
! set print flag in elpa1
elpa_print_times = .true.
!-------------------------------------------------------------------------------
! Calculate eigenvalues/eigenvectors
if (myid==0) then
print '(a)','| Entering two-stage ELPA solver ... '
print *
end if
#ifdef WITH_MPI
call mpi_barrier(mpi_comm_world, mpierr) ! for correct timings only
#endif
successELPA = elpa_solve_evp_real_2stage_double(na, nev, a, na_rows, ev, z, na_rows, nblk, na_cols, &
mpi_comm_rows, mpi_comm_cols, mpi_comm_world, bandwidth = bandwidth)
if (.not.(successELPA)) then
write(error_unit,*) "solve_evp_real_2stage produced an error! Aborting..."
#ifdef WITH_MPI
call MPI_ABORT(mpi_comm_world, 1, mpierr)
#endif
endif
if (myid==0) then
print '(a)','| Two-stage ELPA solver complete.'
print *
end if
if(myid == 0) print *,'Time transform to tridi :',time_evp_fwd
if(myid == 0) print *,'Time solve tridi :',time_evp_solve
if(myid == 0) print *,'Time transform back EVs :',time_evp_back
if(myid == 0) print *,'Total time (sum above) :',time_evp_back+time_evp_solve+time_evp_fwd
if(write_to_file%eigenvectors) then
write(unit = task_suffix, fmt = '(i8.8)') myid
open(17,file="EVs_real2_out_task_"//task_suffix(1:8)//".txt",form='formatted',status='new')
write(17,*) "Part of eigenvectors: na_rows=",na_rows,"of na=",na," na_cols=",na_cols," of na=",na
do i=1,na_rows
do j=1,na_cols
write(17,*) "row=",i," col=",j," element of eigenvector=",z(i,j)
enddo
enddo
close(17)
endif
if(write_to_file%eigenvalues) then
if (myid == 0) then
open(17,file="Eigenvalues_real2_out.txt",form='formatted',status='new')
do i=1,na
write(17,*) i,ev(i)
enddo
close(17)
endif
endif
!-------------------------------------------------------------------------------
! Test correctness of result (using plain scalapack routines)
allocate(tmp1(na_rows,na_cols))
allocate(tmp2(na_rows,na_cols))
status = check_correctness(na, nev, as, z, ev, sc_desc, myid, tmp1, tmp2)
deallocate(a)
deallocate(as)
deallocate(z)
deallocate(tmp1)
deallocate(tmp2)
deallocate(ev)
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("program: test_real2_double_precision")
print *," "
print *,"Timings program: test_real2_double_precision"
call timer%print("program: test_real2_double_precision")
print *," "
print *,"End timings program: test_real2_double_precision"
#endif
#ifdef WITH_MPI
call blacs_gridexit(my_blacs_ctxt)
call mpi_finalize(mpierr)
#endif
call EXIT(STATUS)
end
!-------------------------------------------------------------------------------
@@ -156,4 +156,9 @@ module mod_blacs_infrastructure
end subroutine
integer function index_l2g(idx_loc, nblk, iproc, nprocs)
index_l2g = nprocs * nblk * ((idx_loc-1) / nblk) + mod(idx_loc-1,nblk) + mod(nprocs+iproc, nprocs)*nblk + 1
return
end
end module
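
The new helper index_l2g converts a local (block-cyclic) index into the corresponding global index; the banded test uses it to zero all entries with abs(global_row - global_col) > bandwidth. A standalone sketch (my reimplementation of the same one-line formula, not part of the commit) for 2 processes and nblk = 4; process 1 should map local indices 1..8 to global indices 5..8 and 13..16:

    program demo_index_l2g
      implicit none
      integer :: idx_loc
      do idx_loc = 1, 8
        print '(a,i0,a,i0)', 'local ', idx_loc, ' -> global ', l2g(idx_loc, 4, 1, 2)
      end do
    contains
      integer function l2g(idx_loc, nblk, iproc, nprocs)
        integer, intent(in) :: idx_loc, nblk, iproc, nprocs
        ! same formula as index_l2g above
        l2g = nprocs*nblk*((idx_loc-1)/nblk) + mod(idx_loc-1, nblk) + mod(nprocs+iproc, nprocs)*nblk + 1
      end function l2g
    end program demo_index_l2g
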