Scheduled maintenance on Monday 2019-06-24 between 10:00-11:00 CEST

...
 
Commits (3)
......@@ -198,6 +198,11 @@ endif
#if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
#endif
if WITH_REAL_BLAS_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_blas_block4.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......
......@@ -615,6 +615,7 @@ m4_define(elpa_m4_generic_kernels, [
real_generic_simple_block4
complex_generic
complex_generic_simple
real_blas_block4
])
m4_define(elpa_m4_sse_assembly_kernels, [
......
......@@ -47,7 +47,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BLAS_BLOCK4, 26, @ELPA_2STAGE_REAL_BLAS_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
......@@ -85,6 +85,10 @@
! use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
#if defined(WITH_REAL_BLAS_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_blas_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
......@@ -1432,6 +1436,144 @@
#endif /* REALCASE */
!====================================================================================
#if REALCASE == 1
! BLAS block4 real kernel
#if defined(WITH_REAL_BLAS_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_BLAS_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_BLAS_BLOCK6_KERNEL))
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
!#ifdef WITH_OPENMP
!
!#ifdef USE_ASSUMED_SIZE
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_4hv_&
! &PRECISION&
! & (a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_4hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, &
! stripe_width, nbw)
!#endif
!
!#else
#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
#endif
!#endif
enddo
! do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
!!#ifdef WITH_OPENMP
!!
!!#ifdef USE_ASSUMED_SIZE
!! call double_hh_trafo_&
!! &MATH_DATATYPE&
!! &_blas_&
!! &PRECISION&
!! & (a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!!#else
!! call double_hh_trafo_&
!! &MATH_DATATYPE&
!! &_blas_&
!! &PRECISION&
!! & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, &
!! nl, stripe_width, nbw)
!!#endif
!!
!!#else
!!
!#ifdef USE_ASSUMED_SIZE
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_&
! &PRECISION&
! & (a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#else
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_&
! &PRECISION&
! & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), &
! nbw, nl, stripe_width, nbw)
!#endif
!
!!#endif
! enddo
!#ifdef WITH_OPENMP
!
! if (jj==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_cpu_openmp_&
! &PRECISION&
! & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
!
!#else
!!!! if (jj==1) call single_hh_trafo_&
!!!! &MATH_DATATYPE&
!!!! &_cpu_&
!!!! &PRECISION&
!!!! & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), &
!!!! nbw, nl, stripe_width)
! transform more of the remaining vectors by single kernel before the double is introduced
do jj = j, 1, -1
call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off:jj+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+jj), &
nbw, nl, stripe_width)
enddo
!#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_BLAS_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_BLAS_BLOCK4_KERNEL */
#endif /* REALCASE */
!====================================================================================
#if REALCASE == 1
!real generic simple block6 kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)
......
......@@ -62,13 +62,14 @@
#if REALCASE==1
subroutine quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_blas_4hv_&
&_blas_4hv_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
use precision
use elpa_abstract_impl
implicit none
#include "../../general/precision_kinds.F90"
!class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
......@@ -90,12 +91,12 @@
! Calculate dot product of the two Householder vectors
h_mat(:,:) = 0.0
h_mat(:,:) = 0.0_rk
h_mat(1,4) = -1.0
h_mat(2,3) = -1.0
h_mat(3,2) = -1.0
h_mat(4,1) = -1.0
h_mat(1,4) = -1.0_rk
h_mat(2,3) = -1.0_rk
h_mat(3,2) = -1.0_rk
h_mat(4,1) = -1.0_rk
h_mat(1,5:nb+3) = -hh(2:nb, 1)
h_mat(2,4:nb+2) = -hh(2:nb, 2)
......@@ -104,26 +105,32 @@
! TODO we do not need the diagonal, but how to do it with BLAS?
!s_mat = - matmul(h_mat, transpose(h_mat))
call DSYRK('L', 'N', 4, nb+3, &
-1.0_rk8, h_mat, 4, &
0.0_rk8, s_mat, 4)
call PRECISION_SYRK('L', 'N', 4, nb+3, &
-ONE, h_mat, 4, &
ZERO, s_mat, 4)
! w_comb = - matmul(q(1:nq, 1:nb+3), transpose(h_mat))
call DGEMM('N', 'T', nq, 4, nb+3, &
-1.0_rk8, q, ldq, &
h_mat, 4, &
0.0_rk8, w_comb, nq)
call PRECISION_GEMM('N', 'T', nq, 4, nb+3, &
-ONE, q, ldq, &
h_mat, 4, &
ZERO, w_comb, nq)
! Rank-1 update
do i = 1, 4
w_comb(1:nq,i) = matmul(w_comb(1:nq,1:i-1), hh(1,i) * s_mat(i,1:i-1)) + hh(1,i) * w_comb(1:nq, i)
!w_comb(1:nq,1) = hh(1,1) * w_comb(1:nq, 1)
call PRECISION_SCAL(nq, hh(1,1), w_comb(1:nq, 1), 1)
do i = 2, 4
! w_comb(1:nq,i) = matmul(w_comb(1:nq,1:i-1), hh(1,i) * s_mat(i,1:i-1)) + hh(1,i) * w_comb(1:nq, i)
call PRECISION_GEMV('N', nq, i-1, &
hh(1,i), w_comb(1:nq, 1:i-1), nq, &
s_mat(i,1:i-1), 1, &
hh(1,i), w_comb(1:nq,i), 1)
enddo
!q(1:nq, 1:nb+3) = matmul(w_comb, h_mat) + q(1:nq, 1:nb+3)
call DGEMM('N', 'N', nq, nb+3, 4, &
1.0_rk8, w_comb, nq, &
h_mat, 4, &
1.0_rk8, q, ldq)
call PRECISION_GEMM('N', 'N', nq, nb+3, 4, &
ONE, w_comb, nq, &
h_mat, 4, &
ONE, q, ldq)
end subroutine
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#include "config-f90.h"
!TODO why this ifndef???
#ifndef USE_ASSUMED_SIZE
module real_blas_block4_kernel
private
public quad_hh_trafo_real_blas_4hv_double
#ifdef WANT_SINGLE_PRECISION_REAL
public quad_hh_trafo_real_blas_4hv_single
#endif
contains
#endif
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "blas_block4_template.F90"
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "blas_block4_template.F90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
#ifndef USE_ASSUMED_SIZE
end module real_blas_block4_kernel
#endif
! --------------------------------------------------------------------------------------------------
......@@ -80,9 +80,9 @@
real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
#endif
!TODO remove
real(kind=C_DATATYPE_KIND) :: q_copy(1:ldq,1:nb+3)
real(kind=C_DATATYPE_KIND) :: diff
! !TODO remove
! real(kind=C_DATATYPE_KIND) :: q_copy(1:ldq,1:nb+3)
! real(kind=C_DATATYPE_KIND) :: diff
real(kind=C_DATATYPE_KIND) :: s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4
real(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4
......@@ -112,22 +112,22 @@
#endif /* COMPLEXCASE==1 */
integer(kind=ik) :: i
!TODO remove
print *, "SIMPLE BLOCK4, nb, nq, ldq, ldh ", nb, nq, ldq, ldh
!print *, "Q:", q(1:ldq,1:nb+3)
!print *, "HH:", hh(1:ldh,1:6)
! !TODO remove
! print *, "SIMPLE BLOCK4, nb, nq, ldq, ldh ", nb, nq, ldq, ldh
! !print *, "Q:", q(1:ldq,1:nb+3)
! !print *, "HH:", hh(1:ldh,1:6)
! call the blas kernel for future comparison
! TODO remove
#if REALCASE==1
q_copy(:,:) = q(:,1:nb+3)
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_blas_4hv_&
&PRECISION&
& (q_copy, hh, nb, nq, ldq, ldh)
#endif
!
! ! call the blas kernel for future comparison
! ! TODO remove
!#if REALCASE==1
! q_copy(:,:) = q(:,1:nb+3)
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_blas_4hv_&
! &PRECISION&
! & (q_copy, hh, nb, nq, ldq, ldh)
!#endif
! Calculate dot product of the two Householder vectors
......@@ -332,12 +332,12 @@
q(1:nq,nb+3) = - (x(1:nq) * h1) + q(1:nq,nb+3)
!TODO remove
diff = maxval(abs(q(:,1:nb+3) - q_copy(:, 1:nb+3)))
print *, "DIFFERENCE: ", diff
! !TODO remove
! diff = maxval(abs(q(:,1:nb+3) - q_copy(:, 1:nb+3)))
! print *, "DIFFERENCE: ", diff
end subroutine
! TODO remove
#include "blas_block4_template.F90"
!! TODO remove
!#include "blas_block4_template.F90"
......@@ -16,6 +16,7 @@
#undef C_PLACPY
#undef C_PTRAN
#undef PRECISION_SCAL
#undef PRECISION_TRTRI
#undef PRECISION_POTRF
#undef PRECISION_TRSM
......@@ -75,6 +76,7 @@
#define BLAS_CHAR_AND_SY_OR_HE DSY
#define SPECIAL_COMPLEX_DATATYPE ck8
#define PRECISION_SCAL DSCAL
#define PRECISION_TRTRI DTRTRI
#define PRECISION_POTRF DPOTRF
#define PRECISION_TRSM DTRSM
......@@ -135,6 +137,7 @@
#define BLAS_CHAR_AND_SY_OR_HE SSY
#define SPECIAL_COMPLEX_DATATYPE ck4
#define PRECISION_SCAL SSCAL
#define PRECISION_TRTRI STRTRI
#define PRECISION_POTRF SPOTRF
#define PRECISION_TRSM STRSM
......@@ -202,6 +205,7 @@
#undef C_PLACPY
#undef C_PTRAN
#undef PRECISION_SCAL
#undef PRECISION_TRTRI
#undef PRECISION_POTRF
#undef PRECISION_TRSM
......@@ -273,6 +277,7 @@
#define C_PLACPY pzlacpy_
#define C_PTRAN pztranc_
#define PRECISION_SCAL ZSCAL
#define PRECISION_TRTRI ZTRTRI
#define PRECISION_POTRF ZPOTRF
#define PRECISION_TRSM ZTRSM
......@@ -337,6 +342,7 @@
#define C_PLACPY pclacpy_
#define C_PTRAN pctranc_
#define PRECISION_SCAL CSCAL
#define PRECISION_TRTRI CTRTRI
#define PRECISION_POTRF CPOTRF
#define PRECISION_TRSM CTRSM
......