Commit 3172d0c1 authored by Andreas Marek's avatar Andreas Marek

Real generic_simple_block6 kernel

parent 580978df
......@@ -109,6 +109,7 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/kernels/complex_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/compute_hh_trafo.F90 \
......@@ -193,6 +194,9 @@ if WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block4.F90
endif
if WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_simple_block6.F90
endif
if WITH_REAL_BGP_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_bgp.f90
endif
......@@ -786,6 +790,7 @@ EXTRA_DIST = \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
src/elpa2/kernels/simple_block6_template.F90 \
src/elpa2/pack_unpack_cpu.F90 \
src/elpa2/pack_unpack_gpu.F90 \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \
......
......@@ -550,6 +550,7 @@ m4_define(elpa_m4_generic_kernels, [
real_generic
real_generic_simple
real_generic_simple_block4
real_generic_simple_block6
complex_generic
complex_generic_simple
])
......
......@@ -47,7 +47,8 @@ enum ELPA_SOLVERS {
X(ELPA_2STAGE_REAL_VSX_BLOCK2, 22, @ELPA_2STAGE_REAL_VSX_BLOCK2_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK4, 23, @ELPA_2STAGE_REAL_VSX_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_VSX_BLOCK6, 24, @ELPA_2STAGE_REAL_VSX_BLOCK6_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__)
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4, 25, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK4_COMPILED@, __VA_ARGS__) \
X(ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6, 26, @ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6_COMPILED@, __VA_ARGS__)
#define ELPA_FOR_ALL_2STAGE_REAL_KERNELS_AND_DEFAULT(X) \
ELPA_FOR_ALL_2STAGE_REAL_KERNELS(X) \
......
......@@ -81,6 +81,10 @@
use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
#endif
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
use real_generic_kernel !, only : double_hh_trafo_generic
#endif
......@@ -133,6 +137,7 @@
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:)
! real(kind=C_DATATYPE_KIND), target :: a1(stripe_width,a_dim2,stripe_count)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count)
......@@ -144,6 +149,7 @@
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
#endif
#if COMPLEXCASE == 1
! complex(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
......@@ -170,13 +176,14 @@
integer(kind=ik) :: j, nl, jj, jjj, n_times
#if REALCASE == 1
real(kind=C_DATATYPE_KIND) :: w(nbw,6)
! real(kind=C_DATATYPE_KIND) :: w1(nbw,6)
#endif
#if COMPLEXCASE == 1
complex(kind=C_DATATYPE_KIND) :: w(nbw,2)
#endif
real(kind=c_double) :: ttt ! MPI_WTIME always needs double
integer :: k,l,m
j = -99
if (wantDebug) then
......@@ -1395,7 +1402,8 @@
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
#endif
#endif
......@@ -1415,7 +1423,8 @@
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), &
nbw, nl, stripe_width)
#endif
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)) */
......@@ -1427,7 +1436,198 @@
#endif /* REALCASE */
#if REALCASE == 1
! generic simple block4 real kernel
!real generic simple block6 kernel
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE_BLOCK6) then
#endif /* not WITH_FIXED_REAL_KERNEL */
! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
do j = ncols, 6, -6
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
w(:,3) = bcast_buffer(1:nbw,j+off-2)
w(:,4) = bcast_buffer(1:nbw,j+off-3)
w(:,5) = bcast_buffer(1:nbw,j+off-4)
w(:,6) = bcast_buffer(1:nbw,j+off-5)
! w1(:,:) = w(:,:)
! a1(:,:,:) = a(:,:,:)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (a(1,j+off+a_off-5,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-5:j+off+a_off-1,istripe,my_thread), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
#endif
#else /* WITH_OPENMP */
!print *,"j=",j," off=",off," a_off=",a_off," istripe=",istripe," nbw=",nbw," nl=",nl, &
! " stipe_width=",stripe_width,"nbw=",nbw
#ifdef USE_ASSUMED_SIZE
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (a(1,j+off+a_off-5,istripe), w, nbw, nl, stripe_width, nbw)
#else
!print *,"a=",a(1,151,1),"a1=",a1(1,151,1)
!call hexa_hh_trafo_&
! &MATH_DATATYPE&
! &_sse_6hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-5,istripe)), w, nbw, nl, stripe_width, nbw)
call hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (a(1:stripe_width,j+off+a_off-5:j+off+a_off+nbw-1,istripe), w(1:nbw,1:6), &
nbw, nl, stripe_width, nbw)
!print *," did : ",stripe_width,j+off+a_off-5,j+off+a_off+nbw-1,istripe
!do k=1,nbw
! do l=1,6
! if (w(k,l) .ne. w1(k,l)) then
! print *," w not identical ",k,l,w(k,l),w1(k,l)
! stop
! endif
! enddo
!enddo
!do k=1,stripe_width
! do l=j+off+a_off-5,j+off+a_off+nbw-1
! do m=1,istripe
! if (a(k,l,m) .ne. 0.) then
! if (abs(a(k,l,m)-a1(k,l,m))/abs(a(k,l,m)) .gt. 1e-6) then
! print *,j,k,l,m,a(k,l,m),a1(k,l,m)
! stop
! endif
! else
! if(a1(k,l,m) .ne. 0.0) then
! print *,k,l,m,a1(k,l,m)," but should be zero"
! stop
! endif
! endif
! enddo
! enddo
!enddo
!print *,"done j=",j
#endif
#endif /* WITH_OPENMP */
enddo
do jj = j, 4, -4
w(:,1) = bcast_buffer(1:nbw,jj+off)
w(:,2) = bcast_buffer(1:nbw,jj+off-1)
w(:,3) = bcast_buffer(1:nbw,jj+off-2)
w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,jj+off+a_off-3,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-3:jj+off+a_off+nbw-1,istripe,my_thread), &
w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1,jj+off+a_off-3,istripe), w, &
nbw, nl, stripe_width, nbw)
#else
call quad_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_4hv_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-3:jj+off+a_off+nbw-1,istripe), &
w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */
enddo
do jjj = jj, 2, -2
w(:,1) = bcast_buffer(1:nbw,jjj+off)
w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jjj+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe,my_thread), w(1:nbw,1:6), nbw, &
nl, stripe_width, nbw)
#endif
#else /* WITH_OPENMP */
#ifdef USE_ASSUMED_SIZE
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1,jjj+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_&
&PRECISION&
& (a(1:stripe_width,jj+off+a_off-1:jj+off+a_off-1+nbw,istripe), w(1:nbw,1:6), nbw, nl, &
stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */
enddo
#ifdef WITH_OPENMP
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_openmp_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, istripe,my_thread), &
bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
if (jjj==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_cpu_&
&PRECISION&
& (a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#ifndef WITH_FIXED_REAL_KERNEL
endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL */
#endif /* REALCASE */
#if REALCASE == 1
! sparc64 block 4 real kernel
#if defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)
#ifndef WITH_FIXED_REAL_KERNEL
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: A. Marek, MPCDF
! --------------------------------------------------------------------------------------------------
#endif
#include "config-f90.h"
#ifndef USE_ASSUMED_SIZE
module real_generic_simple_block6_kernel
private
public hexa_hh_trafo_real_generic_simple_6hv_double
#ifdef WANT_SINGLE_PRECISION_REAL
public hexa_hh_trafo_real_generic_simple_6hv_single
#endif
contains
#endif
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block6_template.F90"
#undef REALCASE
#undef DOUBLE_PRECISION
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "simple_block6_template.F90"
#undef REALCASE
#undef SINGLE_PRECISION
#endif
#ifndef USE_ASSUMED_SIZE
end module real_generic_simple_block6_kernel
#endif
! --------------------------------------------------------------------------------------------------
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
!
! --------------------------------------------------------------------------------------------------
!
! This file contains the compute intensive kernels for the Householder transformations.
!
! This is the small and simple version (no hand unrolling of loops etc.) but for some
! compilers this performs better than a sophisticated version with transformed and unrolled loops.
!
! It should be compiled with the highest possible optimization level.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: A. Marek, MPCDF
! --------------------------------------------------------------------------------------------------
#endif
subroutine hexa_hh_trafo_&
&MATH_DATATYPE&
&_generic_simple_6hv_&
&PRECISION&
& (q, hh, nb, nq, ldq, ldh)
use precision
use elpa_abstract_impl
implicit none
integer(kind=ik), intent(in) :: nb, nq, ldq, ldh
#if REALCASE==1
#ifdef USE_ASSUMED_SIZE
real(kind=C_DATATYPE_KIND), intent(inout) :: q(ldq,*)
real(kind=C_DATATYPE_KIND), intent(in) :: hh(ldh,*)
#else
real(kind=C_DATATYPE_KIND), intent(inout) :: q(1:ldq,1:nb+5)
real(kind=C_DATATYPE_KIND), intent(in) :: hh(1:ldh,1:6)
#endif
real(kind=C_DATATYPE_KIND) :: scalarproduct(15)
real(kind=C_DATATYPE_KIND) :: vs_1_2, vs_1_3, vs_2_3, vs_1_4, vs_2_4, vs_3_4
real(kind=C_DATATYPE_KIND) :: vs_1_5, vs_1_6, vs_2_5, vs_2_6, vs_3_5
real(kind=C_DATATYPE_KIND) :: vs_3_6, vs_4_5, vs_4_6, vs_5_6
real(kind=C_DATATYPE_KIND) :: a_1_1(nq), a_2_1(nq), a_3_1(nq), a_4_1(nq), a_5_1(nq), a_6_1(nq)
real(kind=C_DATATYPE_KIND) :: h_6_5, h_6_4, h_6_3, h_6_2, h_6_1
real(kind=C_DATATYPE_KIND) :: h_5_4, h_5_3, h_5_2, h_5_1
real(kind=C_DATATYPE_KIND) :: h_4_3, h_4_2, h_4_1
real(kind=C_DATATYPE_KIND) :: h_2_1, h_3_2, h_3_1
real(kind=C_DATATYPE_KIND) :: h1, h2, h3, h4, h5, h6
real(kind=C_DATATYPE_KIND) :: w(nq), z(nq), x(nq), y(nq), t(nq), v(nq)
real(kind=C_DATATYPE_KIND) :: tau1, tau2, tau3, tau4, tau5, tau6
#endif /* REALCASE==1 */
integer(kind=ik) :: i, j
! Calculate dot product of the two Householder vectors
scalarproduct(1) = hh(2,2)
scalarproduct(2) = hh(3,3)
scalarproduct(3) = hh(2,3)
scalarproduct(4) = hh(4,4)
scalarproduct(5) = hh(3,4)
scalarproduct(6) = hh(2,4)
scalarproduct(7) = hh(5,5)
scalarproduct(8) = hh(4,5)
scalarproduct(9) = hh(3,5)
scalarproduct(10) = hh(2,5)
scalarproduct(11) = hh(6,6)
scalarproduct(12) = hh(5,6)
scalarproduct(13) = hh(4,6)
scalarproduct(14) = hh(3,6)
scalarproduct(15) = hh(2,6)
scalarproduct(1) = scalarproduct(1) + hh(2,1) * hh(3,2)
scalarproduct(3) = scalarproduct(3) + hh(2,2) * hh(3,3)
scalarproduct(6) = scalarproduct(6) + hh(2,3) * hh(3,4)
scalarproduct(10) = scalarproduct(10) + hh(2,4) * hh(3,5)
scalarproduct(15) = scalarproduct(15) + hh(2,5) * hh(3,6)
scalarproduct(1) = scalarproduct(1) + hh(3,1) * hh(4,2)
scalarproduct(3) = scalarproduct(3) + hh(3,2) * hh(4,3)
scalarproduct(6) = scalarproduct(6) + hh(3,3) * hh(4,4)
scalarproduct(10) = scalarproduct(10) + hh(3,4) * hh(4,5)
scalarproduct(15) = scalarproduct(15) + hh(3,5) * hh(4,6)
scalarproduct(2) = scalarproduct(2) + hh(2,1) * hh(4,3)
scalarproduct(5) = scalarproduct(5) + hh(2,2) * hh(4,4)
scalarproduct(9) = scalarproduct(9) + hh(2,3) * hh(4,5)
scalarproduct(14) = scalarproduct(14) + hh(2,4) * hh(4,6)
scalarproduct(1) = scalarproduct(1) + hh(4,1) * hh(5,2)
scalarproduct(3) = scalarproduct(3) + hh(4,2) * hh(5,3)
scalarproduct(6) = scalarproduct(6) + hh(4,3) * hh(5,4)
scalarproduct(10) = scalarproduct(10) + hh(4,4) * hh(5,5)
scalarproduct(15) = scalarproduct(15) + hh(4,5) * hh(5,6)
scalarproduct(2) = scalarproduct(2) + hh(3,1) * hh(5,3)
scalarproduct(5) = scalarproduct(5) + hh(3,2) * hh(5,4)
scalarproduct(9) = scalarproduct(9) + hh(3,3) * hh(5,5)
scalarproduct(14) = scalarproduct(14) + hh(3,4) * hh(5,6)
scalarproduct(4) = scalarproduct(4) + hh(2,1) * hh(5,4)
scalarproduct(8) = scalarproduct(8) + hh(2,2) * hh(5,5)
scalarproduct(13) = scalarproduct(13) + hh(2,3) * hh(5,6)
scalarproduct(1) = scalarproduct(1) + hh(5,1) * hh(6,2)
scalarproduct(3) = scalarproduct(3) + hh(5,2) * hh(6,3)
scalarproduct(6) = scalarproduct(6) + hh(5,3) * hh(6,4)
scalarproduct(10) = scalarproduct(10) + hh(5,4) * hh(6,5)
scalarproduct(15) = scalarproduct(15) + hh(5,5) * hh(6,6)
scalarproduct(2) = scalarproduct(2) + hh(4,1) * hh(6,3)
scalarproduct(5) = scalarproduct(5) + hh(4,2) * hh(6,4)
scalarproduct(9) = scalarproduct(9) + hh(4,3) * hh(6,5)
scalarproduct(14) = scalarproduct(14) + hh(4,4) * hh(6,6)
scalarproduct(4) = scalarproduct(4) + hh(3,1) * hh(6,4)
scalarproduct(8) = scalarproduct(8) + hh(3,2) * hh(6,5)
scalarproduct(13) = scalarproduct(13) + hh(3,3) * hh(6,6)
scalarproduct(7) = scalarproduct(7) + hh(2,1) * hh(6,5)
scalarproduct(12) = scalarproduct(12) + hh(2,2) * hh(6,6)
!DIR$ IVDEP
do i=7,nb
scalarproduct(1) = scalarproduct(1) + hh(i-1,1) * hh(i,2)
scalarproduct(3) = scalarproduct(3) + hh(i-1,2) * hh(i,3)
scalarproduct(6) = scalarproduct(6) + hh(i-1,3) * hh(i,4)
scalarproduct(10) = scalarproduct(10) + hh(i-1,4) * hh(i,5)
scalarproduct(15) = scalarproduct(15) + hh(i-1,5) * hh(i,6)
scalarproduct(2) = scalarproduct(2) + hh(i-2,1) * hh(i,3)
scalarproduct(5) = scalarproduct(5) + hh(i-2,2) * hh(i,4)
scalarproduct(9) = scalarproduct(9) + hh(i-2,3) * hh(i,5)
scalarproduct(14) = scalarproduct(14) + hh(i-2,4) * hh(i,6)
scalarproduct(4) = scalarproduct(4) + hh(i-3,1) * hh(i,4)
scalarproduct(8) = scalarproduct(8) +