Commit 88156480 authored by Lorenz Huedepohl

Merge remote-tracking branch 'origin/master' into elpa_interface

parents a314ad0b 68dc5464
@@ -4,7 +4,7 @@ AC_PREREQ([2.69])
 AC_INIT([elpa],m4_esyscmd_s([awk '/^ *Version:/ {print $2;}' elpa.spec]), [elpa-library@mpcdf.mpg.de])
 AC_SUBST([PACKAGE_VERSION])
-AC_CONFIG_SRCDIR([src/elpa1.F90])
+AC_CONFIG_SRCDIR([src/elpa.F90])
 AM_INIT_AUTOMAKE([foreign -Wall subdir-objects])
@@ -19,7 +19,7 @@ elpa/elpa_generated.h: $(top_srcdir)/src/elpa_c_interface.F90 | elpa
 test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.F90) | test/shared
 	$(call extract_interface,!c>)
-elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2_kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2_kernels/*.s) | elpa
+elpa/elpa_generated_fortran_interfaces.h: $(wildcard $(top_srcdir)/src/elpa2/kernels/*.c) $(wildcard $(top_srcdir)/src/elpa2/kernels/*.s) | elpa
 	$(call extract_interface,!f>)
 	$(call extract_interface,#!f>)
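The extract_interface helper used in these rules scans the listed sources for comment lines that begin with the given marker (!c>, !f>, or #!f>) and copies them, marker stripped, into the generated header, so a Fortran source can carry its own C prototypes inline. A hedged illustration (the prototype shown is the one documented in the man pages below; the exact extraction command is defined elsewhere in this Makefile):

/* A Fortran source embeds its C prototype behind "!c>" markers, e.g.
 *
 *   !c> int elpa_solve_evp_real_2stage_double(int na, int nev, double *a, ...);
 *
 * $(call extract_interface,!c>) strips the marker and emits the line
 * verbatim into the generated header, yielding a normal C declaration: */
int elpa_solve_evp_real_2stage_double(int na, int nev, double *a, int lda,
        double *ev, double *q, int ldq, int nblk, int matrixCols,
        int mpi_comm_rows, int mpi_comm_cols, int mpi_comm_all,
        int THIS_ELPA_REAL_KERNEL, int useQR, int useGPU);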
@@ -12,7 +12,7 @@ use elpa1
 use elpa2
 .br
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
+.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
 .br
 .RI " "
 .br
@@ -47,8 +47,6 @@ use elpa2
 .br
 .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
-.br
 .RI "logical \fBsuccess\fP: return value indicating success or failure"
 .br
 .SS C INTERFACE
@@ -57,7 +55,7 @@ use elpa2
 #include <complex.h>
 .br
-.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
+.RI "success = \fBelpa_solve_evp_complex_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double complex *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble complex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
 .br
 .RI " "
 .br
@@ -92,8 +90,6 @@ use elpa2
 .br
 .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
-.br
 .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
 .SH DESCRIPTION
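For orientation, a minimal C sketch of a call with the new signature (the trailing bandwidth argument is gone). The communicator setup is an assumption for illustration only: a real program obtains mpi_comm_rows/mpi_comm_cols via elpa_get_communicators and distributes the matrix block-cyclically beforehand.

#include <complex.h>
#include <mpi.h>
#include "elpa.h"

/* Sketch only: arguments follow the C INTERFACE above; values are
   placeholders and the kernel constant is an assumption. */
int solve_complex(int na, int nev, double complex *a, int lda, double *ev,
                  double complex *q, int ldq, int nblk, int matrixCols,
                  int mpi_comm_rows, int mpi_comm_cols)
{
    int mpi_comm_all = (int) MPI_Comm_c2f(MPI_COMM_WORLD); /* Fortran handle, as ELPA expects */
    int kernel = 1;   /* one of the THIS_ELPA_COMPLEX_KERNEL constants */
    int useGPU = 0;   /* 1 requests the GPU kernels */
    return elpa_solve_evp_complex_2stage_double(na, nev, a, lda, ev, q, ldq,
                                                nblk, matrixCols, mpi_comm_rows,
                                                mpi_comm_cols, mpi_comm_all,
                                                kernel, useGPU);
}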
@@ -12,7 +12,7 @@ use elpa1
 use elpa2
 .br
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU, bandwidth)"
+.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_COMPLEX_ELPA_KERNEL, useGPU)"
 .br
 .RI " "
 .br
@@ -47,8 +47,6 @@ use elpa2
 .br
 .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
-.br
 .RI "logical \fBsuccess\fP: return value indicating success or failure"
 .br
 .SS C INTERFACE
@@ -57,7 +55,7 @@ use elpa2
 #include <complex.h>
 .br
-.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU, \fBint\fP bandwidth);"
+.RI "success = \fBelpa_solve_evp_complex_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB complex *\fPa, \fBint\fP lda, \fB float *\fPev, \fBcomplex *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_COMPLEX_KERNEL, \fBint\fP useGPU);"
 .br
 .RI " "
 .br
@@ -92,8 +90,6 @@ use elpa2
 .br
 .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
-.br
 .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
 .SH DESCRIPTION
@@ -12,7 +12,7 @@ use elpa1
 use elpa2
 .br
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
+.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
 .br
 .RI " "
 .br
@@ -49,15 +49,13 @@ use elpa2
 .br
 .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
-.br
 .RI "logical \fBsuccess\fP: return value indicating success or failure"
 .br
 .SS C INTERFACE
 #include "elpa.h"
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
+.RI "success = \fBelpa_solve_evp_real_2stage_double\fP (\fBint\fP na, \fBint\fP nev, \fB double *\fPa, \fBint\fP lda, \fB double *\fPev, \fBdouble *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
 .br
 .RI " "
 .br
@@ -94,8 +92,6 @@ use elpa2
 .br
 .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
-.br
 .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
 .SH DESCRIPTION
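The real-valued driver differs from the complex sketch above only in the useQR flag, which requests a QR-decomposition-based reduction step; a matching sketch under the same assumptions:

#include <mpi.h>
#include "elpa.h"

/* Sketch only: same conventions and placeholder values as the complex
   example above. */
int solve_real(int na, int nev, double *a, int lda, double *ev, double *q,
               int ldq, int nblk, int matrixCols,
               int mpi_comm_rows, int mpi_comm_cols)
{
    int mpi_comm_all = (int) MPI_Comm_c2f(MPI_COMM_WORLD);
    int kernel = 1;  /* one of the THIS_ELPA_REAL_KERNEL constants */
    int useQR  = 0;  /* 1 enables the QR-based reduction */
    int useGPU = 0;
    return elpa_solve_evp_real_2stage_double(na, nev, a, lda, ev, q, ldq,
                                             nblk, matrixCols, mpi_comm_rows,
                                             mpi_comm_cols, mpi_comm_all,
                                             kernel, useQR, useGPU);
}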
@@ -12,7 +12,7 @@ use elpa1
 use elpa2
 .br
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU, bandwidth)"
+.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (na, nev, a(lda,matrixCols), ev(nev), q(ldq, matrixCols), ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, THIS_REAL_ELPA_KERNEL, useQR, useGPU)"
 .br
 .RI " "
 .br
@@ -49,15 +49,13 @@ use elpa2
 .br
 .RI "logical, intent(in), optional: \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "integer, intent(in), optional: \fBbandwidth\fP: bandwidth of an already banded matrix"
-.br
 .RI "logical \fBsuccess\fP: return value indicating success or failure"
 .br
 .SS C INTERFACE
 #include "elpa.h"
 .br
-.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU, \fBint\fP bandwidth);"
+.RI "success = \fBelpa_solve_evp_real_2stage_single\fP (\fBint\fP na, \fBint\fP nev, \fB float *\fPa, \fBint\fP lda, \fB float *\fPev, \fBfloat *\fPq, \fBint\fP ldq, \fBint\fP nblk, \fBint\fP matrixCols, \fBint\fP mpi_comm_rows, \fBint\fP mpi_comm_cols, \fBint\fP mpi_comm_all, \fBint\fP THIS_ELPA_REAL_KERNEL, \fBint\fP useQR, \fBint\fP useGPU);"
 .br
 .RI " "
 .br
@@ -94,8 +92,6 @@ use elpa2
 .br
 .RI "int \fBuseGPU\fP: decide whether GPUs should be used or not"
 .br
-.RI "int \fBbandwidth\fP: bandwidth of an already banded matrix (-1 = matrix is not banded)"
-.br
 .RI "int \fBsuccess\fP: return value indicating success (1) or failure (0)
 .SH DESCRIPTION
@@ -49,7 +49,8 @@
 ! consortium. The copyright of any additional modifications shall rest
 ! with their original authors, but shall adhere to the licensing terms
 ! distributed along with the original code in the file "COPYING".
 !
+! This file has been rewritten by A. Marek, MPCDF
 #include "config-f90.h"
 !> \brief Fortran module which provides helper routines for matrix calculations
@@ -282,7 +283,7 @@ module ELPA1_AUXILIARY
 #define REALCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 function elpa_cholesky_real_double(na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
@@ -294,7 +295,7 @@ module ELPA1_AUXILIARY
 #ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief cholesky_real_single: Cholesky factorization of a single-precision real symmetric matrix
 !> \details
@@ -323,7 +324,7 @@ module ELPA1_AUXILIARY
 #define REALCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_invert_trm_real_double: Inverts a double-precision real upper triangular matrix
 !> \details
 !> \param na Order of matrix
@@ -345,7 +346,7 @@ module ELPA1_AUXILIARY
 #if WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_invert_trm_real_single: Inverts a single-precision real upper triangular matrix
 !> \details
@@ -370,7 +371,7 @@ module ELPA1_AUXILIARY
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_cholesky_complex_double: Cholesky factorization of a double-precision complex hermitian matrix
 !> \details
@@ -397,7 +398,7 @@ module ELPA1_AUXILIARY
 #ifdef WANT_SINGLE_PRECISION_COMPLEX
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_cholesky_complex_single: Cholesky factorization of a single-precision complex hermitian matrix
 !> \details
@@ -424,7 +425,7 @@ module ELPA1_AUXILIARY
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_invert_trm_complex_double: Inverts a double-precision complex upper triangular matrix
 !> \details
@@ -448,7 +449,7 @@ module ELPA1_AUXILIARY
 #ifdef WANT_SINGLE_PRECISION_COMPLEX
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_invert_trm_complex_single: Inverts a single-precision complex upper triangular matrix
 !> \details
@@ -473,7 +474,7 @@ module ELPA1_AUXILIARY
 #define REALCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief mult_at_b_real_double: Performs C : = A**T * B
 !>        where A is a square matrix (na,na) which is optionally upper or lower triangular
 !>        B is a (na,ncb) matrix
@@ -514,7 +515,7 @@ module ELPA1_AUXILIARY
 #if WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_mult_at_b_real_single: Performs C : = A**T * B
 !>        where A is a square matrix (na,na) which is optionally upper or lower triangular
@@ -560,7 +561,7 @@ module ELPA1_AUXILIARY
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_mult_ah_b_complex_double: Performs C : = A**H * B
 !>        where A is a square matrix (na,na) which is optionally upper or lower triangular
@@ -605,7 +606,7 @@ module ELPA1_AUXILIARY
 #ifdef WANT_SINGLE_PRECISION_COMPLEX
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_mult_ah_b_complex_single: Performs C : = A**H * B
 !>        where A is a square matrix (na,na) which is optionally upper or lower triangular
@@ -652,7 +653,7 @@ module ELPA1_AUXILIARY
 #define REALCASE 1
 #define DOUBLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_solve_tridi_double: Solve tridiagonal eigensystem for a double-precision matrix with divide and conquer method
 !> \details
@@ -682,7 +683,7 @@ module ELPA1_AUXILIARY
 #ifdef WANT_SINGLE_PRECISION_REAL
 #define REALCASE 1
 #define SINGLE_PRECISION
-#include "precision_macros.h"
+#include "../precision_macros.h"
 !> \brief elpa_solve_tridi_single: Solve tridiagonal eigensystem for a single-precision matrix with divide and conquer method
 !> \details
@@ -157,7 +157,7 @@ module ELPA1_COMPUTE
 #define DOUBLE_PRECISION_REAL 1
 #define REALCASE 1
 #define DOUBLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa_transpose_vectors.X90"
 #include "elpa_reduce_add_vectors.X90"
@@ -168,7 +168,7 @@ module ELPA1_COMPUTE
 #define REALCASE 1
 #define SINGLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa_transpose_vectors.X90"
 #include "elpa_reduce_add_vectors.X90"
@@ -180,7 +180,7 @@ module ELPA1_COMPUTE
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa_transpose_vectors.X90"
 #include "elpa_reduce_add_vectors.X90"
 #undef COMPLEXCASE
@@ -190,7 +190,7 @@ module ELPA1_COMPUTE
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa_transpose_vectors.X90"
 #include "elpa_reduce_add_vectors.X90"
 #undef COMPLEXCASE
@@ -201,7 +201,7 @@ module ELPA1_COMPUTE
 ! real double precision
 #define REALCASE 1
 #define DOUBLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa1_compute_template.X90"
 #undef REALCASE
@@ -213,7 +213,7 @@ module ELPA1_COMPUTE
 #define REALCASE 1
 #define SINGLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa1_compute_template.X90"
 #undef REALCASE
@@ -224,7 +224,7 @@ module ELPA1_COMPUTE
 #define COMPLEXCASE 1
 #define DOUBLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa1_compute_template.X90"
 #undef COMPLEXCASE
@@ -235,7 +235,7 @@ module ELPA1_COMPUTE
 #define COMPLEXCASE 1
 #define SINGLE_PRECISION 1
-#include "precision_macros.h"
+#include "../precision_macros.h"
 #include "elpa1_compute_template.X90"
 #undef COMPLEXCASE
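The pattern above is textual templating via the preprocessor: the same template file is included once per (number type, precision) combination, with macros selecting the concrete type and mangling the generated names. A self-contained C analogue of the idea (hypothetical names; ELPA re-includes a template file, while token pasting gives the same effect in one file):

#include <stdio.h>

/* One template body, stamped out once per precision. */
#define DEFINE_SCALE(SUFFIX, TYPE)                      \
    static void scale_##SUFFIX(TYPE *x, int n, TYPE s)  \
    {                                                   \
        for (int i = 0; i < n; i++)                     \
            x[i] *= s;                                  \
    }

DEFINE_SCALE(double, double) /* the DOUBLE_PRECISION instance */
DEFINE_SCALE(single, float)  /* the SINGLE_PRECISION instance */

int main(void)
{
    double xd[2] = {1.0, 2.0};
    float  xs[2] = {1.0f, 2.0f};
    scale_double(xd, 2, 3.0);
    scale_single(xs, 2, 3.0f);
    printf("%g %g\n", xd[0], xs[1]); /* prints: 3 6 */
    return 0;
}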
@@ -50,8 +50,12 @@
 ! consortium. The copyright of any additional modifications shall rest
 ! with their original authors, but shall adhere to the licensing terms
 ! distributed along with the original code in the file "COPYING".
 !
+! Author: Andreas Marek, MPCDF
 #endif
+
+#include "../sanity.X90"
+
 #if REALCASE == 1
 !cannot use __FILE__ because filename with path can be too long for gfortran (max line length)
@@ -52,6 +52,8 @@
 ! distributed along with the original code in the file "COPYING".
 #endif
+
+#include "../sanity.X90"
 subroutine merge_systems_&
 &PRECISION &
 ( na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
@@ -52,6 +52,8 @@
 ! distributed along with the original code in the file "COPYING".
 #endif
+
+#include "../sanity.X90"
 subroutine solve_tridi_&
 &PRECISION &
 ( na, nev, d, e, q, ldq, nblk, matrixCols, mpi_comm_rows, &
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
#include "../sanity.X90"
function elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
&PRECISION&
& (na, nev, a, lda, ev, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm_all, &
   useGPU) result(success)

  use precision
  use cuda_functions
  use mod_check_for_gpu
#ifdef HAVE_DETAILED_TIMINGS
  use timings
#else
  use timings_dummy
#endif
  use iso_c_binding
  use elpa_mpi
  use elpa1_compute

  implicit none

  integer(kind=c_int), intent(in)              :: na, nev, lda, ldq, nblk, matrixCols, mpi_comm_rows, &
                                                  mpi_comm_cols, mpi_comm_all
  real(kind=REAL_DATATYPE), intent(out)        :: ev(na)
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
  real(kind=C_DATATYPE_KIND), intent(inout)    :: a(lda,*)
  real(kind=C_DATATYPE_KIND), intent(out)      :: q(ldq,*)
#else
  real(kind=C_DATATYPE_KIND), intent(inout)    :: a(lda,matrixCols)
  real(kind=C_DATATYPE_KIND), intent(out)      :: q(ldq,matrixCols)
#endif
  real(kind=C_DATATYPE_KIND), allocatable      :: tau(:)
#endif /* REALCASE */
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
  complex(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,*)
  complex(kind=C_DATATYPE_KIND), intent(out)   :: q(ldq,*)
#else
  complex(kind=C_DATATYPE_KIND), intent(inout) :: a(lda,matrixCols)
  complex(kind=C_DATATYPE_KIND), intent(out)   :: q(ldq,matrixCols)
#endif
  real(kind=REAL_DATATYPE), allocatable        :: q_real(:,:)
  complex(kind=C_DATATYPE_KIND), allocatable   :: tau(:)
  integer(kind=c_int)                          :: l_cols, l_rows, l_cols_nev, np_rows, np_cols
#endif /* COMPLEXCASE */
  logical, intent(in), optional                :: useGPU
  logical                                      :: success

  logical                                      :: do_useGPU
  integer(kind=ik)                             :: numberOfGPUDevices

  integer(kind=c_int)                          :: my_pe, n_pes, my_prow, my_pcol, mpierr
  real(kind=C_DATATYPE_KIND), allocatable      :: e(:)
  real(kind=c_double)                          :: ttt0, ttt1 ! MPI_WTIME always needs double
  logical, save                                :: firstCall = .true.
  logical                                      :: wantDebug
  integer(kind=c_int)                          :: istat
  character(200)                               :: errorMessage
  call timer%start("elpa_solve_evp_&
                   &MATH_DATATYPE&
                   &_1stage_&
                   &PRECISION&
                   &")

  call timer%start("mpi_communication")
  call mpi_comm_rank(mpi_comm_all,my_pe,mpierr)
  call mpi_comm_size(mpi_comm_all,n_pes,mpierr)
  call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
  call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
#if COMPLEXCASE == 1
  call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
  call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
#endif
  call timer%stop("mpi_communication")

  success   = .true.
  wantDebug = .false.

  if (firstCall) then
    ! are debug messages desired?
    wantDebug = debug_messages_via_environment_variable()
    firstCall = .false.
  endif

  do_useGPU = .false.
  if (present(useGPU)) then
    ! user defined GPU usage via the optional argument in the API call
    if (useGPU) then
      if (check_for_gpu(my_pe,numberOfGPUDevices, wantDebug=wantDebug)) then
        do_useGPU = .true.
        ! set the necessary parameters
        cudaMemcpyHostToDevice   = cuda_memcpyHostToDevice()
        cudaMemcpyDeviceToHost   = cuda_memcpyDeviceToHost()
        cudaMemcpyDeviceToDevice = cuda_memcpyDeviceToDevice()
        cudaHostRegisterPortable = cuda_hostRegisterPortable()
        cudaHostRegisterMapped   = cuda_hostRegisterMapped()
      else
        print *,"GPUs are requested but not detected! Aborting..."
        success = .false.
        return
      endif
    endif
  else
    ! check whether set by environment variable
    do_useGPU = gpu_usage_via_environment_variable()
  endif
#if COMPLEXCASE == 1
  l_rows     = local_index(na, my_prow, np_rows, nblk, -1)  ! Local rows of a and q
  l_cols     = local_index(na, my_pcol, np_cols, nblk, -1)  ! Local columns of q
  l_cols_nev = local_index(nev, my_pcol, np_cols, nblk, -1) ! Local columns corresponding to nev

  allocate(q_real(l_rows,l_cols), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_evp_&
            &MATH_DATATYPE&
            &_1stage_&
            &PRECISION&
            &" // ": error when allocating q_real "//errorMessage
    stop 1
  endif
#endif

  allocate(e(na), tau(na), stat=istat, errmsg=errorMessage)
  if (istat .ne. 0) then
    print *,"solve_evp_&
            &MATH_DATATYPE&
            &_1stage_&
            &PRECISION&
            &" // ": error when allocating e, tau "//errorMessage
    stop 1
  endif
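
  ! Step 1: reduce the matrix to tridiagonal form via Householder transformations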
  ttt0 = MPI_Wtime()
  call tridiag_&
       &MATH_DATATYPE&
       &_&
       &PRECISION&
       & (na, a, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, ev, e, tau, do_useGPU)
  ttt1 = MPI_Wtime()
  if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time tridiag_real :',ttt1-ttt0
  time_evp_fwd = ttt1-ttt0
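
  ! Step 2: solve the eigenvalue problem of the tridiagonal matrix (divide and conquer)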
  ttt0 = MPI_Wtime()
  call solve_tridi_&
       &PRECISION&
       & (na, nev, ev, e, &
#if REALCASE == 1
          q, ldq, &
#endif
#if COMPLEXCASE == 1
          q_real, l_rows, &
#endif
          nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, wantDebug, success)
  if (.not.(success)) return

  ttt1 = MPI_Wtime()
  if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time solve_tridi :',ttt1-ttt0
  time_evp_solve = ttt1-ttt0
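
  ! Step 3: back-transform the eigenvectors of the tridiagonal matrix to those of the full matrix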
  ttt0 = MPI_Wtime()
#if COMPLEXCASE == 1
  q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#endif
  call trans_ev_&
       &MATH_DATATYPE&
       &_&
       &PRECISION&
       & (na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU)
  ttt1 = MPI_Wtime()
  if (my_prow==0 .and. my_pcol==0 .and. elpa_print_times) write(error_unit,*) 'Time trans_ev_real:',ttt1-ttt0
  time_evp_back = ttt1-ttt0
#if COMPLEXCASE == 1
  deallocate(q_real, stat=istat, errmsg=errorMessage)