Commit f7af9173 authored by Pavel Kus's avatar Pavel Kus

the BLAS kernel incorporated to the infrastructure

parent 8000ba03
......@@ -200,6 +200,10 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
endif
if WITH_REAL_BLAS_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_blas_block4.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......
......@@ -618,6 +618,7 @@ m4_define(elpa_m4_generic_kernels, [
real_generic_simple_block6
complex_generic
complex_generic_simple
real_blas_block4
])
m4_define(elpa_m4_sse_assembly_kernels, [
......
......@@ -51,7 +51,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 26, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 27, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 28, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 29, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_BLAS_BLOCK4, 30, @ELPA_2STAGE_REAL_BLAS_BLOCK4_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
......@@ -85,6 +85,10 @@
! use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
#if defined(WITH_REAL_BLAS_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_blas_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
......@@ -1470,6 +1474,144 @@
#endif /* REALCASE */
!====================================================================================
#if REALCASE == 1
! BLAS block4 real kernel
#if defined(WITH_REAL_BLAS_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_BLAS_BLOCK4) then
#endif /* not WITH_FIXED_REAL_KERNEL */
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_BLAS_BLOCK6_KERNEL))
do j = ncols, 4, -4
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
!#ifdef WITH_OPENMP
!
!#ifdef USE_ASSUMED_SIZE
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_4hv_&
! &PRECISION&
! & (a(1,j+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!#else
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_4hv_&
! &PRECISION&
! & (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe,my_thread), w(1:nbw,1:6), nbw, nl, &
! stripe_width, nbw)
!#endif
!
!#else
#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1,j+off+a_off-3,istripe), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_blas_4hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-3:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
#endif
!#endif
enddo
! do jj = j, 2, -2
! w(:,1) = bcast_buffer(1:nbw,jj+off)
! w(:,2) = bcast_buffer(1:nbw,jj+off-1)
!!#ifdef WITH_OPENMP
!!
!!#ifdef USE_ASSUMED_SIZE
!! call double_hh_trafo_&
!! &MATH_DATATYPE&
!! &_blas_&
!! &PRECISION&
!! & (a(1,jj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
!!#else
!! call double_hh_trafo_&
!! &MATH_DATATYPE&
!! &_blas_&
!! &PRECISION&
!! & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, &
!! nl, stripe_width, nbw)
!!#endif
!!
!!#else
!!
!#ifdef USE_ASSUMED_SIZE
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_&
! &PRECISION&
! & (a(1,jj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#else
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_blas_&
! &PRECISION&
! & (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), &
! nbw, nl, stripe_width, nbw)
!#endif
!
!!#endif
! enddo
!#ifdef WITH_OPENMP
!
! if (jj==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_cpu_openmp_&
! &PRECISION&
! & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
! bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
!
!#else
!!!! if (jj==1) call single_hh_trafo_&
!!!! &MATH_DATATYPE&
!!!! &_cpu_&
!!!! &PRECISION&
!!!! & (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), &
!!!! nbw, nl, stripe_width)
! transform more of the remaining vectors by single kernel before the double is introduced
do jj = j, 1, -1
call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off:jj+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+jj), &
nbw, nl, stripe_width)
enddo
!#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_BLAS_BLOCK6_KERNEL)) */
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_BLAS_BLOCK4_KERNEL */
#endif /* REALCASE */
!====================================================================================
#if REALCASE == 1
!real generic simple block6 kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)
......
......@@ -62,7 +62,7 @@
#if REALCASE==1
subroutine quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_blas_4hv_&
&_blas_4hv_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! --------------------------------------------------------------------------------------------------
#endif
#include "config-f90.h"
!TODO why this ifndef???
#ifndef USE_ASSUMED_SIZE
module real_blas_block4_kernel
private
public quad_hh_trafo_real_blas_4hv_double
#ifdef WANT_SINGLE_PRECISION_REAL
public quad_hh_trafo_real_blas_4hv_single
#endif
contains
#endif
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "blas_block4_template.F90"
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "blas_block4_template.F90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
#ifndef USE_ASSUMED_SIZE
end module real_blas_block4_kernel
#endif
! --------------------------------------------------------------------------------------------------
......@@ -80,9 +80,9 @@
real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
#endif
!TODO remove
real(kind=C_DATATYPE_KIND) :: q_copy(1:ldq,1:nb+3)
real(kind=C_DATATYPE_KIND) :: diff
! !TODO remove
! real(kind=C_DATATYPE_KIND) :: q_copy(1:ldq,1:nb+3)
! real(kind=C_DATATYPE_KIND) :: diff
real(kind=C_DATATYPE_KIND) :: s_1_2, s_1_3, s_2_3, s_1_4, s_2_4, s_3_4
real(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4
......@@ -112,22 +112,22 @@
#endif /* COMPLEXCASE==1 */
integer(kind=ik) :: i
!TODO remove
print *, "SIMPLE BLOCK4, nb, nq, ldq, ldh ", nb, nq, ldq, ldh
!print *, "Q:", q(1:ldq,1:nb+3)
!print *, "HH:", hh(1:ldh,1:6)
! !TODO remove
! print *, "SIMPLE BLOCK4, nb, nq, ldq, ldh ", nb, nq, ldq, ldh
! !print *, "Q:", q(1:ldq,1:nb+3)
! !print *, "HH:", hh(1:ldh,1:6)
! call the blas kernel for future comparison
! TODO remove
#if REALCASE==1
q_copy(:,:) = q(:,1:nb+3)
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_blas_4hv_&
&PRECISION&
& (q_copy, hh, nb, nq, ldq, ldh)
#endif
!
! ! call the blas kernel for future comparison
! ! TODO remove
!#if REALCASE==1
! q_copy(:,:) = q(:,1:nb+3)
! call quad_hh_trafo_&
! &MATH_DATATYPE&
! &_generic_blas_4hv_&
! &PRECISION&
! & (q_copy, hh, nb, nq, ldq, ldh)
!#endif
! Calculate dot product of the two Householder vectors
......@@ -332,12 +332,12 @@
q(1:nq,nb+3) = - (x(1:nq) * h1) + q(1:nq,nb+3)
!TODO remove
diff = maxval(abs(q(:,1:nb+3) - q_copy(:, 1:nb+3)))
print *, "DIFFERENCE: ", diff
! !TODO remove
! diff = maxval(abs(q(:,1:nb+3) - q_copy(:, 1:nb+3)))
! print *, "DIFFERENCE: ", diff
end subroutine
! TODO remove
#include "blas_block4_template.F90"
!! TODO remove
!#include "blas_block4_template.F90"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment