Commit 26401f3b authored by Andreas Marek's avatar Andreas Marek
Browse files

Fix real block6 kernel on power8

parent e13b6e7f
......@@ -250,13 +250,13 @@ endif
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_4hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK4_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
#endif
#endif
if WITH_REAL_VSX_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_4hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK4_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_4hv_double_precision.c
......@@ -286,19 +286,19 @@ if WANT_SINGLE_PRECISION_REAL
endif
endif
#if WITH_REAL_SPARC64_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
#endif
#endif
#
#if WITH_REAL_VSX_BLOCK6_KERNEL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
#if WANT_SINGLE_PRECISION_REAL
# libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
#endif
#endif
if WITH_REAL_SPARC64_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sparc64_6hv_single_precision.c
endif
endif
if WITH_REAL_VSX_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_double_precision.c
if WANT_SINGLE_PRECISION_REAL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_vsx_6hv_single_precision.c
endif
endif
if WITH_REAL_SSE_BLOCK6_KERNEL
libelpa@SUFFIX@_private_la_SOURCES += src/elpa2/kernels/real_sse_6hv_double_precision.c
......
......@@ -670,19 +670,19 @@
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
ttt = mpi_wtime()
do j = ncols, 1, -1
#ifdef WITH_OPENMP
call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
!#ifdef WITH_OPENMP
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
! call single_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_1hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
......@@ -973,33 +973,33 @@
do j = ncols, 2, -2
w(:,1) = bcast_buffer(1:nbw,j+off)
w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
call double_hh_trafo_&
&MATH_DATATYPE&
&_vsx_2hv_&
&PRECISION&
& (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
!#ifdef WITH_OPENMP
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
! call double_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_2hv_&
! &PRECISION&
! & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
enddo
#ifdef WITH_OPENMP
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
if (j==1) call single_hh_trafo_&
&MATH_DATATYPE&
&_vsx_1hv_&
&PRECISION&
& (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
!#ifdef WITH_OPENMP
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
! if (j==1) call single_hh_trafo_&
! &MATH_DATATYPE&
! &_vsx_1hv_&
! &PRECISION&
! & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
#ifndef WITH_FIXED_COMPLEX_KERNEL
endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
......
......@@ -823,7 +823,7 @@ void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq
_SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *) &q[offset]);
q3 = _SSE_LOAD(0, (unsigned long int *) &q[2*offset]);
q3 = _SSE_ADD(q3, y3);
_SSE_STORE((__vector unsigned int) q3,0, (unsigned int *) &q[2*offset]);
_SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *) &q[2*offset]);
q4 = _SSE_LOAD(0, (unsigned long int *) &q[3*offset]);
q4 = _SSE_ADD(q4, y4);
_SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *) &q[3*offset]);
......
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
// Double-precision instantiation of the real VSX block-6 Householder kernel:
// selects the variant via the two defines below and then textually includes
// the shared template source.
#include "config-f90.h"
// Selector macros read by precision_macros.h.
// NOTE(review): presumably that header derives DOUBLE_PRECISION_REAL, which is
// what the template actually tests — confirm.
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "real_vsx_6hv_template.c"
// Drop the selectors again so they cannot leak past this instantiation.
#undef REALCASE
#undef DOUBLE_PRECISION
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF
// Single-precision instantiation of the real VSX block-6 Householder kernel:
// selects the variant via the two defines below and then textually includes
// the shared template source.
#include "config-f90.h"
// Selector macros read by precision_macros.h.
// NOTE(review): presumably that header derives SINGLE_PRECISION_REAL, which is
// what the template actually tests — confirm.
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../../general/precision_macros.h"
#include "real_vsx_6hv_template.c"
// Drop the selectors again so they cannot leak past this instantiation.
#undef REALCASE
#undef SINGLE_PRECISION
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#ifdef HAVE_VSX_SSE
#include <altivec.h>
#endif
#include <stdio.h>
#include <stdlib.h>
// Marks a function as always-inlined with internal (static) linkage.
#define __forceinline __attribute__((always_inline)) static
// The _SSE_* names below keep the spelling used by the SSE kernels but map
// onto the AltiVec/VSX intrinsics from <altivec.h> (vec_ld, vec_add, vec_sub,
// vec_mul, vec_st). Exactly one of the two precision branches is expected to
// be active per translation unit.
#ifdef DOUBLE_PRECISION_REAL
// NOTE(review): "offset" is presumably the number of elements per vector
// register (2 doubles) used as the stride between consecutive vector
// accesses — confirm against the kernels.
#define offset 2
#define __SSE_DATATYPE __vector double
// Expands to a cast followed by vec_ld, so the loaded value is converted to
// the precision's vector type.
#define _SSE_LOAD (__vector double) vec_ld
#define _SSE_ADD vec_add
#define _SSE_SUB vec_sub
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#endif
#ifdef SINGLE_PRECISION_REAL
// NOTE(review): presumably 4 floats per vector register — confirm.
#define offset 4
#define __SSE_DATATYPE __vector float
// Same pattern as the double-precision branch, with a float vector cast.
#define _SSE_LOAD (__vector float) vec_ld
#define _SSE_ADD vec_add
#define _SSE_SUB vec_sub
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#endif
// Removes the compiler-provided __AVX__ macro when HAVE_SSE_INTRINSICS is set.
// NOTE(review): the reason is not visible in this file — looks inherited from
// the SSE kernel this template was derived from; confirm it is still needed.
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
// Forward declarations.
//
// The helper kernels are called by the hexa_hh_trafo_* entry point before
// their definitions appear, so the prototypes here must spell the names
// exactly as the call sites and definitions do (upper-case "VSX"). Otherwise
// the calls fall back to implicit declarations, which C99 and later reject.
// The public entry point carries no trailing underscore, matching both its
// definition and the bind(C) name in the Fortran interface block.
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_2_VSX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_VSX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_vsx_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_VSX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_VSX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_vsx_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_vsx_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_vsx_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_vsx_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_vsx_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
/*
 * Entry point of the real VSX block-6 kernel (six Householder vectors at once).
 *
 * Parameters (all scalars are passed by pointer, as in the Fortran interface):
 *   q     - matrix data the kernels update in place; handed on in column
 *           chunks as &q[i]
 *   hh    - the six Householder vectors, addressed as hh[row + ldh*col],
 *           col = 0..5
 *   pnb   - length of the Householder vectors (nb)
 *   pnq   - not read by this routine
 *   pldq  - leading dimension of q; also used as the number of rows to process
 *           NOTE(review): nq is taken from *pldq rather than *pnq - this looks
 *           deliberate (see the "with padding" remark below), confirm.
 *   pldh  - leading dimension of hh
 *
 * The routine first accumulates the 15 pairwise scalar products of the six
 * vectors and then dispatches the row-blocked kernels over q.
 */
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_vsx_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_vsx_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
    int i;
    int nb = *pnb;
    int nq = *pldq;
    int ldq = *pldq;
    int ldh = *pldh;
    // Number of rows handed to a kernel so far; only checked under WITH_DEBUG.
    int worked_on = 0;

    // Scalar products needed to apply 6 Householder vectors simultaneously.
    //
    // For columns 0 <= a < c <= 5 of hh, entry scalarprods[c*(c-1)/2 + a]
    // accumulates   sum_i  hh[(i-(c-a)) + ldh*a] * hh[i + ldh*c].
    // The initial value hh[(c-a) + ldh*c] is the i = c-a term, in which the
    // first factor hh[0 + ldh*a] is taken as 1 rather than read.
    // NOTE(review): presumably the implicit unit leading entry of each
    // Householder vector - confirm.
#ifdef DOUBLE_PRECISION_REAL
    double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
    float scalarprods[15];
#endif
    scalarprods[0] = hh[(ldh+1)];
    scalarprods[1] = hh[(ldh*2)+2];
    scalarprods[2] = hh[(ldh*2)+1];
    scalarprods[3] = hh[(ldh*3)+3];
    scalarprods[4] = hh[(ldh*3)+2];
    scalarprods[5] = hh[(ldh*3)+1];
    scalarprods[6] = hh[(ldh*4)+4];
    scalarprods[7] = hh[(ldh*4)+3];
    scalarprods[8] = hh[(ldh*4)+2];
    scalarprods[9] = hh[(ldh*4)+1];
    scalarprods[10] = hh[(ldh*5)+5];
    scalarprods[11] = hh[(ldh*5)+4];
    scalarprods[12] = hh[(ldh*5)+3];
    scalarprods[13] = hh[(ldh*5)+2];
    scalarprods[14] = hh[(ldh*5)+1];

    // Rows 2..5 are peeled by hand: a pair with shift s = c-a only starts
    // contributing read terms at row s+1, so each step adds one more group.

    // loop counter = 2 (shift-1 pairs)
    scalarprods[0] += hh[1] * hh[(2+ldh)];
    scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
    scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
    scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
    scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

    // loop counter = 3 (shift-1 and shift-2 pairs)
    scalarprods[0] += hh[2] * hh[(3+ldh)];
    scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
    scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
    scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
    scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
    scalarprods[1] += hh[1] * hh[3+(ldh*2)];
    scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
    scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
    scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

    // loop counter = 4 (shift-1 to shift-3 pairs)
    scalarprods[0] += hh[3] * hh[(4+ldh)];
    scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
    scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
    scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
    scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
    scalarprods[1] += hh[2] * hh[4+(ldh*2)];
    scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
    scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
    scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
    scalarprods[3] += hh[1] * hh[4+(ldh*3)];
    scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
    scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

    // loop counter = 5 (shift-1 to shift-4 pairs)
    scalarprods[0] += hh[4] * hh[(5+ldh)];
    scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
    scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
    scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
    scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
    scalarprods[1] += hh[3] * hh[5+(ldh*2)];
    scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
    scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
    scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
    scalarprods[3] += hh[2] * hh[5+(ldh*3)];
    scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
    scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
    scalarprods[6] += hh[1] * hh[5+(ldh*4)];
    scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

    // From row 6 on, all 15 pairs contribute in every iteration.
    #pragma ivdep
    for (i = 6; i < nb; i++)
    {
        scalarprods[0] += hh[i-1] * hh[(i+ldh)];
        scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
        scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
        scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
        scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
        scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
        scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
        scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
    }

    // Production level kernel calls with padding.
    // The wide kernel handles full chunks; i keeps its value after the loop
    // so the remainder logic below can tell how many rows are left.
#ifdef DOUBLE_PRECISION_REAL
    for (i = 0; i < nq-2; i+=4)
    {
        hh_trafo_kernel_4_VSX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
        worked_on += 4;
    }
#endif
#ifdef SINGLE_PRECISION_REAL
    for (i = 0; i < nq-4; i+=8)
    {
        hh_trafo_kernel_8_VSX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
        worked_on += 8;
    }
#endif

    // Everything was covered by the wide kernel: nothing left to do.
    if (nq == i)
    {
        return;
    }

    // Exactly one half-width chunk remains: use the narrow kernel.
#ifdef DOUBLE_PRECISION_REAL
    if (nq -i == 2)
    {
        hh_trafo_kernel_2_VSX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
        worked_on += 2;
    }
#endif
#ifdef SINGLE_PRECISION_REAL
    if (nq -i == 4)
    {
        hh_trafo_kernel_4_VSX_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
        worked_on += 4;
    }
#endif

    // Any other remainder means rows were left untouched; only debug builds
    // detect this.
#ifdef WITH_DEBUG
    if (worked_on != nq)
    {
        printf("Error in real VSX BLOCK6 kernel \n");
        abort();
    }
#endif
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 4 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 8 rows of Q simultaneously, a
#endif
* matrix Vector product with six householder
* vectors + a rank 1 update is performed
*/
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_VSX_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_VSX_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+3] * hh
// hh contains four householder vectors
/////////////////////////////////////////////////////
int i;
__SSE_DATATYPE a1_1 = _SSE_LOAD(0, (unsigned long int *) 0, &q[ldq*5]);
__SSE_DATATYPE a2_1 = _SSE_LOAD(0, (unsigned long int *) 0, &q[ldq*4]);
__SSE_DATATYPE a3_1 = _SSE_LOAD(0, (unsigned long int *) 0, &q[ldq*3]);