Commit 071ba6be authored by Andreas Marek's avatar Andreas Marek

Unify template for real, complex simple kernel

parent 9e2c7b80
......@@ -52,8 +52,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_simple_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_simple_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90
lib_LTLIBRARIES = libelpa@SUFFIX@.la
......@@ -775,8 +774,7 @@ EXTRA_DIST = \
src/elpa2_compute_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
src/elpa2_kernels/elpa2_kernels_real_simple_template.X90 \
src/elpa2_kernels/elpa2_kernels_complex_simple_template.X90 \
src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
src/redist_band.X90 \
src/elpa_qr/elpa_qrkernels.X90 \
src/ev_tridi_band_gpu_c_v2_complex_template.Xcu \
......
......@@ -10,7 +10,7 @@
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
......
......@@ -10,7 +10,7 @@
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
......@@ -68,18 +68,22 @@ module complex_generic_simple_kernel
contains
#define COMPLEXCASE 1
#define DOUBLE_PRECISION_COMPLEX 1
#define COMPLEX_DATATYPE ck8
#include "elpa2_kernels_complex_simple_template.X90"
#define DATATYPE ck8
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
#undef DATATYPE
#undef COMPLEXCASE
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#define COMPLEXCASE 1
#undef DOUBLE_PRECISION_COMPLEX
#define COMPLEX_DATATYPE ck4
#include "elpa2_kernels_complex_simple_template.X90"
#define DATATYPE ck4
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_COMPLEX
#undef COMPLEX_DATATYPE
#undef DATATYPE
#undef COMPLEXCASE
#endif
......
......@@ -10,7 +10,7 @@
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
......
......@@ -10,7 +10,7 @@
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaftrn,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
......@@ -69,19 +69,22 @@ module real_generic_simple_kernel
contains
#define REALCASE 1
#define DOUBLE_PRECISION_REAL 1
#define REAL_DATATYPE rk8
#include "elpa2_kernels_real_simple_template.X90"
#define DATATYPE rk8
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef DATATYPE
#undef REALCASE
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#undef DOUBLE_PRECISION_REAL
#define REAL_DATATYPE rk4
#include "elpa2_kernels_real_simple_template.X90"
#define DATATYPE rk4
#include "elpa2_kernels_simple_template.X90"
#undef DOUBLE_PRECISION_REAL
#undef REAL_DATATYPE
#undef DATATYPE
#undef REALCASE
#endif
end module real_generic_simple_kernel
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#ifdef DOUBLE_PRECISION_REAL
subroutine double_hh_trafo_generic_simple_double(q, hh, nb, nq, ldq, ldh)
#else
subroutine double_hh_trafo_generic_simple_single(q, hh, nb, nq, ldq, ldh)
#endif
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
#endif
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
real(kind=REAL_DATATYPE), intent(in) :: hh(ldh,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,1:nb+1)
real(kind=REAL_DATATYPE), intent(in) :: hh(ldh,2)
#endif
real(kind=REAL_DATATYPE) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
integer(kind=ik) :: i
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic simple: double_hh_trafo_generic_simple_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel generic simple: double_hh_trafo_generic_simple_single")
#endif
#endif
! Calculate dot product of the two Householder vectors
s = hh(2,2)*1
do i=3,nb
s = s+hh(i,2)*hh(i-1,1)
enddo
! Do the Householder transformations
x(1:nq) = q(1:nq,2)
y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2)
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
x(1:nq) = x(1:nq) + q(1:nq,i)*h1
y(1:nq) = y(1:nq) + q(1:nq,i)*h2
enddo
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1)
tau1 = hh(1,1)
tau2 = hh(1,2)
h1 = -tau1
x(1:nq) = x(1:nq)*h1
h1 = -tau2
h2 = -tau2*s
y(1:nq) = y(1:nq)*h1 + x(1:nq)*h2
q(1:nq,1) = q(1:nq,1) + y(1:nq)
q(1:nq,2) = q(1:nq,2) + x(1:nq) + y(1:nq)*hh(2,2)
do i=3,nb
h1 = hh(i-1,1)
h2 = hh(i,2)
q(1:nq,i) = q(1:nq,i) + x(1:nq)*h1 + y(1:nq)*h2
enddo
q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1)
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic simple: double_hh_trafo_generic_simple_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel generic simple: double_hh_trafo_generic_simple_single")
#endif
#endif
#ifdef DOUBLE_PRECISION_REAL
end subroutine double_hh_trafo_generic_simple_double
#else
end subroutine double_hh_trafo_generic_simple_single
#endif
......@@ -58,6 +58,7 @@
! --------------------------------------------------------------------------------------------------
#endif
#if COMPLEXCASE==1
! the intel compiler creates a temp copy of array q
! this should be avoided without using assumed size arrays
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -73,14 +74,14 @@
integer(kind=ik), intent(in) :: nb, nq, ldq
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*)
complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(*)
complex(kind=DATATYPE), intent(inout) :: q(ldq,*)
complex(kind=DATATYPE), intent(in) :: hh(*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(1:ldq,1:nb)
complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:nb)
complex(kind=DATATYPE), intent(inout) :: q(1:ldq,1:nb)
complex(kind=DATATYPE), intent(in) :: hh(1:nb)
#endif
integer(kind=ik) :: i
complex(kind=COMPLEX_DATATYPE) :: h1, tau1, x(nq)
complex(kind=DATATYPE) :: h1, tau1, x(nq)
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -88,13 +89,14 @@
call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple_double")
#endif
#else
#else /* DOUBLE_PRECISION_COMPLEX */
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel complex generic simple: single_hh_trafo_complex_generic_simple_single")
#endif
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
! Just one Householder transformation
x(1:nq) = q(1:nq,1)
......@@ -117,26 +119,43 @@
call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple_double")
#endif
#else
#else /* DOUBLE_PRECISION_COMPLEX */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel complex generic simple: single_hh_trafo_complex_generic_simple_single")
#endif
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine single_hh_trafo_complex_generic_simple_double
#else
end subroutine single_hh_trafo_complex_generic_simple_single
#endif
#endif /* COMPLEXCASE == 1 */
! --------------------------------------------------------------------------------------------------
#if REALCASE==1
#ifdef DOUBLE_PRECISION_REAL
subroutine double_hh_trafo_generic_simple_double(q, hh, nb, nq, ldq, ldh)
#else
subroutine double_hh_trafo_generic_simple_single(q, hh, nb, nq, ldq, ldh)
#endif
#endif /* REALCASE == 1 */
#if COMPLEXCASE==1
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine double_hh_trafo_complex_generic_simple_double(q, hh, nb, nq, ldq, ldh)
#else
subroutine double_hh_trafo_complex_generic_simple_single(q, hh, nb, nq, ldq, ldh)
#endif
#endif /* COMPLEXCASE==1 */
use precision
#ifdef HAVE_DETAILED_TIMINGS
use timings
......@@ -144,16 +163,51 @@
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#if REALCASE==1
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(ldq,*)
complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(ldh,*)
real(kind=DATATYPE), intent(inout) :: q(ldq,*)
real(kind=DATATYPE), intent(in) :: hh(ldh,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(inout) :: q(1:ldq,1:nb+1)
complex(kind=COMPLEX_DATATYPE), intent(in) :: hh(1:ldh,1:2)
real(kind=DATATYPE), intent(inout) :: q(1:ldq,1:nb+1)
real(kind=DATATYPE), intent(in) :: hh(1:ldh,1:6)
#endif
complex(kind=COMPLEX_DATATYPE) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
real(kind=DATATYPE) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
#endif /* REALCASE==1 */
#if COMPLEXCASE==1
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=DATATYPE), intent(inout) :: q(ldq,*)
complex(kind=DATATYPE), intent(in) :: hh(ldh,*)
#else
complex(kind=DATATYPE), intent(inout) :: q(1:ldq,1:nb+1)
complex(kind=DATATYPE), intent(in) :: hh(1:ldh,1:2)
#endif
complex(kind=DATATYPE) :: s, h1, h2, tau1, tau2, x(nq), y(nq)
#endif /* COMPLEXCASE==1 */
integer(kind=ik) :: i
#if REALCASE==1
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel real generic simple: double_hh_trafo_real_generic_simple_double")
#endif
#else
#ifdef HAVE_DETAILED_TIMINGS
call timer%start("kernel real generic simple: double_hh_trafo_real_generic_simple_single")
#endif
#endif
#endif /* REALCASE==1 */
#if COMPLEXCASE==1
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef HAVE_DETAILED_TIMINGS
......@@ -168,28 +222,55 @@
#endif
#endif /* COMPLEXCASE==1 */
! Calculate dot product of the two Householder vectors
#if REALCASE==1
s = hh(2,2)*1.0
do i=3,nb
s = s+hh(i,2)*hh(i-1,1)
enddo
#endif
s = conjg(hh(2,2))*1
#if COMPLEXCASE==1
s = conjg(hh(2,2))*1.0
do i=3,nb
s = s+(conjg(hh(i,2))*hh(i-1,1))
enddo
#endif
! Do the Householder transformations
x(1:nq) = q(1:nq,2)
#if REALCASE==1
y(1:nq) = q(1:nq,1) + q(1:nq,2)*hh(2,2)
#endif
#if COMPLEXCASE==1
y(1:nq) = q(1:nq,1) + q(1:nq,2)*conjg(hh(2,2))
#endif
do i=3,nb
#if REALCASE==1
h1 = hh(i-1,1)
h2 = hh(i,2)
#endif
#if COMPLEXCASE==1
h1 = conjg(hh(i-1,1))
h2 = conjg(hh(i,2))
#endif
x(1:nq) = x(1:nq) + q(1:nq,i)*h1
y(1:nq) = y(1:nq) + q(1:nq,i)*h2
enddo
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1))
#if REALCASE==1
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*hh(nb,1)
#endif
#if COMPLEXCASE==1
x(1:nq) = x(1:nq) + q(1:nq,nb+1)*conjg(hh(nb,1))
#endif
tau1 = hh(1,1)
tau2 = hh(1,2)
......@@ -210,22 +291,58 @@
q(1:nq,nb+1) = q(1:nq,nb+1) + x(1:nq)*hh(nb,1)
#if REALCASE==1
#ifdef DOUBLE_PRECISION_REAL
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel real generic simple: double_hh_trafo_complex_generic_simple_double")
#endif
#else /* DOUBLE_PRECISION_REAL */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel real generic simple: double_hh_trafo_complex_generic_simple_single")
#endif
#endif /* DOUBLE_PRECISION_REAL */
#endif /* REALCASE==1 */
#if COMPLEXCASE==1
#ifdef DOUBLE_PRECISION_COMPLEX
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple_double")
#endif
#else
#else /* DOUBLE_PRECISION_COMPLEX */
#ifdef HAVE_DETAILED_TIMINGS
call timer%stop("kernel complex generic simple: double_hh_trafo_complex_generic_simple_single")
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
#endif /* COMPLEXCASE==1 */
#if REALCASE==1
#ifdef DOUBLE_PRECISION_REAL
end subroutine double_hh_trafo_generic_simple_double
#else
end subroutine double_hh_trafo_generic_simple_single
#endif
#endif /* REALCASE==1 */
#if COMPLEXCASE==1
#ifdef DOUBLE_PRECISION_COMPLEX
end subroutine double_hh_trafo_complex_generic_simple_double
#else
end subroutine double_hh_trafo_complex_generic_simple_single
#endif
#endif /* COMPLEXCASE==1 */
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment