Commit e04b11b2 authored by Andreas Marek's avatar Andreas Marek

Unify 128bit kernels real block2, block4, and block6

parent b6369e22
......@@ -900,7 +900,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_single_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx-avx2_6hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/complex_sse_1hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_sse_6hv_template.c \
@top_srcdir@/src/elpa2/kernels/real_128_BLOCK_template.c \
@top_srcdir@/src/elpa2/kernels/complex_template.F90 \
@top_srcdir@/src/elpa2/kernels/complex_avx-avx2_2hv_double_precision.c \
@top_srcdir@/src/elpa2/kernels/real_avx512_2hv_double_precision.c \
......
......@@ -785,7 +785,6 @@ EXTRA_DIST = \
src/elpa2/kernels/real_vsx_4hv_template.c \
src/elpa2/kernels/real_vsx_6hv_template.c \
src/elpa2/kernels/real_128bit_BLOCK_template.c \
src/elpa2/kernels/real_sse_6hv_template.c \
src/elpa2/kernels/real_template.F90 \
src/elpa2/kernels/simple_template.F90 \
src/elpa2/kernels/simple_block4_template.F90 \
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -48,7 +48,7 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK2
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
......
......@@ -48,7 +48,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK2
#define BLOCK2 1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef BLOCK2
......
......@@ -48,7 +48,7 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK4
#define BLOCK4 1
#include "../../general/precision_macros.h"
#include "real_128bit_BLOCK_template.c"
#undef BLOCK4
......
......@@ -48,8 +48,10 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_sse_6hv_template.c"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
#undef BLOCK6
#undef DOUBLE_PRECISION
......@@ -48,8 +48,10 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_sse_6hv_template.c"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
#undef BLOCK6
#undef SINGLE_PRECISION
......@@ -48,8 +48,10 @@
#define REALCASE 1
#define DOUBLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_sse_6hv_template.c"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
#undef BLOCK6
#undef DOUBLE_PRECISION
......@@ -48,8 +48,10 @@
#define REALCASE 1
#define SINGLE_PRECISION 1
#define BLOCK6 1
#include "../../general/precision_macros.h"
#include "real_sse_6hv_template.c"
#include "real_128bit_BLOCK_template.c"
#undef REALCASE
#undef BLOCK6
#undef SINGLE__PRECISION
// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen ,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file contains additions, changes and
// enhancements authored by Intel Corporation which is not part of
// the ELPA consortium.
//
// More information can be found here:
// http://elpa.mpcdf.mpg.de/
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"
#ifdef HAVE_SSE_INTRINSICS
#include <x86intrin.h>
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_SET _mm_set_pd
#define _SSE_SET1 _mm_set1_pd
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_SET _mm_set_ps
#define _SSE_SET1 _mm_set1_ps
#endif
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sse_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_2_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif
#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sparc64_6hv_double")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_double) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f> subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f> subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f> bind(C, name="hexa_hh_trafo_real_sparc64_6hv_single")
!f> use, intrinsic :: iso_c_binding
!f> integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f> type(c_ptr), value :: q
!f> real(kind=c_float) :: hh(pnb,6)
!f> end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
{
int i;
int nb = *pnb;
int nq = *pldq;
int ldq = *pldq;
int ldh = *pldh;
int worked_on ;
worked_on = 0;
// calculating scalar products to compute
// 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
float scalarprods[15];
#endif
scalarprods[0] = hh[(ldh+1)]; // 1 = hh(2,2)
scalarprods[1] = hh[(ldh*2)+2]; // 2 = hh(3,3)
scalarprods[2] = hh[(ldh*2)+1]; // 3 = hh(2,3)
scalarprods[3] = hh[(ldh*3)+3]; // 4 = hh(4,4)
scalarprods[4] = hh[(ldh*3)+2]; // 5 = hh(3,4)
scalarprods[5] = hh[(ldh*3)+1]; // 6 = hh(2,4)
scalarprods[6] = hh[(ldh*4)+4]; // 7 = hh(5,5)
scalarprods[7] = hh[(ldh*4)+3]; // 8 = hh(4,5)
scalarprods[8] = hh[(ldh*4)+2]; // 9 = hh(3,5)
scalarprods[9] = hh[(ldh*4)+1]; //10 = hh(2,5)
scalarprods[10] = hh[(ldh*5)+5]; //11 = hh(6,6)
scalarprods[11] = hh[(ldh*5)+4]; //12 = hh(5,6)
scalarprods[12] = hh[(ldh*5)+3]; //13 = hh(4,6)
scalarprods[13] = hh[(ldh*5)+2]; //14 = hh(3,6)
scalarprods[14] = hh[(ldh*5)+1]; //15 = hh(2,6)
// calculate scalar product of first and fourth householder Vector
// loop counter = 2
scalarprods[0] += hh[1] * hh[(2+ldh)]; // 1 = 1 + hh(2,1) * hh(3,2)
scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)]; // 3 = 3 + hh(2,2) * hh(3,3)
scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)]; // 6 = 6 + hh(2,3) * hh(3,4)
scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)]; //10 =10 + hh(2,4) * hh(3,5)
scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)]; //15 =15 + hh(2,5) * hh(3,6)
// loop counter = 3
scalarprods[0] += hh[2] * hh[(3+ldh)]; // 1 = 1 + hh(3,1) * hh(4,2)
scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)]; // 3 = 3 + hh(3,2) * hh(4,3)
scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)]; // 6 = 6 + hh(3,3) * hh(4,4)
scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)]; //10 =10 + hh(3,4) * hh(4,5)
scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)]; //15 =15 + hh(3,5) * hh(4,6)
scalarprods[1] += hh[1] * hh[3+(ldh*2)]; // 2 = 2 + hh(2,1) * hh(4,3)
scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)]; // 5 = 5 + hh(2,2) * hh(4,4)
scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)]; // 9 = 9 + hh(2,3) * hh(4,5)
scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)]; //14 =14 + hh(2,4) * hh(4,6)
// loop counter = 4
scalarprods[0] += hh[3] * hh[(4+ldh)]; // 1 = 1 + hh(4,1) * hh(5,2)
scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)]; // 3 = 3 + hh(4,2) * hh(5,3)
scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)]; // 6 = 6 + hh(4,3) * hh(5,4)
scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)]; //10 =10 + hh(4,4) * hh(5,5)
scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; //15 =15 + hh(4,5) * hh(5,6)
scalarprods[1] += hh[2] * hh[4+(ldh*2)]; // 2 = 2 + hh(3,1) * hh(5,3)
scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)]; // 5 = 5 + hh(3,2) * hh(5,4)
scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)]; // 9 = 9 + hh(3,3) * hh(5,5)
scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; //14 =14 + hh(3,4) * hh(5,6)
scalarprods[3] += hh[1] * hh[4+(ldh*3)]; // 4 = 4 + hh(2,1) * hh(5,4)
scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)]; // 8 = 8 + hh(2,2) * hh(5,5)
scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; //13 =13 + hh(2,3) * hh(5,6)
// loop counter = 5
scalarprods[0] += hh[4] * hh[(5+ldh)]; // 1 = 1 + hh(5,1) * hh(6,2)
scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)]; // 3 = 3 + hh(5,2) * hh(6,3)
scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)]; // 6 = 6 + hh(5,3) * hh(6,4)
scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)]; //10 =10 + hh(5,4) * hh(6,5)
scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; //15 =15 + hh(5,5) * hh(6,6)
scalarprods[1] += hh[3] * hh[5+(ldh*2)]; // 2 = 2 + hh(4,1) * hh(6,3)
scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)]; // 5 = 5 + hh(4,2) * hh(6,4)
scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)]; // 9 = 9 + hh(4,3) * hh(6,5)
scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; //14 =14 + hh(4,4) * hh(6,6)
scalarprods[3] += hh[2] * hh[5+(ldh*3)]; // 4 = 4 + hh(3,1) * hh(6,4)
scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)]; // 8 = 8 + hh(3,2) * hh(6,5)
scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; //13 =13 + hh(3,3) * hh(6,6)
scalarprods[6] += hh[1] * hh[5+(ldh*4)]; // 7 = 7 + hh(2,1) * hh(6,5)
scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)]; //12 =12 + hh(2,2) * hh(6,6)
#pragma ivdep
for (i = 6; i < nb; i++) // do i = 7, nb
{
scalarprods[0] += hh[i-1] * hh[(i+ldh)]; // 1 = 1 + hh(i-1,1) * hh(i,2)
scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)]; // 3 = 3 + hh(i-1,2) * hh(i,3)
scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)]; // 6 = 6 + hh(i-1,3) * hh(i,4)
scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)]; //10 =10 + hh(i-1,4) * hh(i,5)
scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)]; //15 =15 + hh(i-1,5) * hh(i,6)
scalarprods[1] += hh[i-2] * hh[i+(ldh*2)]; // 2 = 2 + hh(i-2,1) * hh(i,3)
scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)]; // 5 = 5 + hh(i-2,2) * hh(i,4)
scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)]; // 9 = 9 + hh(i-2,3) * hh(i,5)
scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; //14 =14 + hh(i-2,4) * hh(i,6)
scalarprods[3] += hh[i-3] * hh[i+(ldh*3)]; // 4 = 4 + hh(i-3,1) * hh(i,4)
scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)]; // 8 = 8 + hh(i-3,2) * hh(i,5)
scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; //13 =13 + hh(i-3,3) * hh(i,6)
scalarprods[6] += hh[i-4] * hh[i+(ldh*4)]; // 7 = 7 + hh(i-4,1) * hh(i,5)
scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)]; //12 =12 + hh(i-4,2) * hh(i,6)
scalarprods[10] += hh[i-5] * hh[i+(ldh*5)]; //11 =11 + hh(i-5,1) * hh(i,6)
}
// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
for (i = 0; i < nq-2; i+=4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
worked_on += 4;
}
#endif
#ifdef SINGLE_PRECISION_REAL
for (i = 0; i < nq-4; i+=8)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_8_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
worked_on += 8;
}
#endif
if (nq == i)
{
return;
}
#ifdef DOUBLE_PRECISION_REAL
if (nq -i == 2)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_2_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
worked_on += 2;
}
#endif
#ifdef SINGLE_PRECISION_REAL
if (nq -i == 4)
{
#ifdef HAVE_SSE_INTRINSICS
hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
#ifdef HAVE_SPARC64_SSE
hh_trafo_kernel_4_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
#endif
worked_on += 4;
}
#endif
#ifdef WITH_DEBUG
if (worked_on != nq)
{
#ifdef HAVE_SSE_INTRINSICS
printf("Error in real SSE BLOCK6 kernel \n");
#endif
#ifdef HAVE_SPARC64_SSE
printf("Error in real SPARC64 BLOCK6 kernel \n");
#endif
abort();
}
#endif
}
/**
* Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
* 4 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
* 8 rows of Q simultaneously, a
#endif
* matrix Vector product with two householder
* vectors + a rank 1 update is performed
*/
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
#endif
{
/////////////////////////////////////////////////////
// Matrix Vector Multiplication, Q [4 x nb+3] * hh
// hh contains four householder vectors
/////////////////////////////////////////////////////
int i;
__SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*5]); // a_1_1 = q(1:nq,6)
__SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*4]); // a_2_1 = q(1:nq,5)
__SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq*3]); // a_3_1 = q(1:nq,4)
__SSE_DATATYPE a4_1 = _SSE_LOAD(&q[ldq*2]); // a_4_1 = q(1:nq,3)
__SSE_DATATYPE a5_1 = _SSE_LOAD(&q[ldq]); // a_5_1 = q(1,nq,2)
__SSE_DATATYPE a6_1 = _SSE_LOAD(&q[0]); // a_6_1 = q(1:nq,1)
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE h_6_5 = _SSE_SET1(hh[(ldh*5)+1]); // h_6_5 = hh(2,6)
__SSE_DATATYPE h_6_4 = _SSE_SET1(hh[(ldh*5)+2]); // h_6_4 = hh(3,6)
__SSE_DATATYPE h_6_3 = _SSE_SET1(hh[(ldh*5)+3]); // h_6_3 = hh(4,6)
__SSE_DATATYPE h_6_2 = _SSE_SET1(hh[(ldh*5)+4]); // h_6_2 = hh(5,6)
__SSE_DATATYPE h_6_1 = _SSE_SET1(hh[(ldh*5)+5]); // h_6_1 = hh(6,6)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE h_6_5 = _SSE_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]);
__SSE_DATATYPE h_6_4 = _SSE_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]);
__SSE_DATATYPE h_6_3 = _SSE_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]);
__SSE_DATATYPE h_6_2 = _SSE_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]);
__SSE_DATATYPE h_6_1 = _SSE_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]);
#endif
register __SSE_DATATYPE t1 = _SSE_ADD(a6_1, _SSE_MUL(a5_1, h_6_5)); // t1 = a_6_1 + a_5_1 * h_6_5
t1 = _SSE_ADD(t1, _SSE_MUL(a4_1, h_6_4)); // t1 = t1 + a_4_1 * h_6_4
t1 = _SSE_ADD(t1, _SSE_MUL(a3_1, h_6_3)); // t1 = t1 + a_3_1 * h_6_3
t1 = _SSE_ADD(t1, _SSE_MUL(a2_1, h_6_2)); // t1 = t1 + a_2_1 * h_6_2
t1 = _SSE_ADD(t1, _SSE_MUL(a1_1, h_6_1)); // t1 = t1 + a_1_1 * h_6_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE h_5_4 = _SSE_SET1(hh[(ldh*4)+1]); // h_5_4 = hh(2,5)
__SSE_DATATYPE h_5_3 = _SSE_SET1(hh[(ldh*4)+2]); // h_5_3 = hh(3,5)
__SSE_DATATYPE h_5_2 = _SSE_SET1(hh[(ldh*4)+3]); // h_5_2 = hh(4,5)
__SSE_DATATYPE h_5_1 = _SSE_SET1(hh[(ldh*4)+4]); // h_5_1 = hh(5,5)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE h_5_4 = _SSE_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]);
__SSE_DATATYPE h_5_3 = _SSE_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]);
__SSE_DATATYPE h_5_2 = _SSE_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]);
__SSE_DATATYPE h_5_1 = _SSE_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]);
#endif
register __SSE_DATATYPE v1 = _SSE_ADD(a5_1, _SSE_MUL(a4_1, h_5_4)); // v1 = a_5_1 + a_4_1 * h_5_4
v1 = _SSE_ADD(v1, _SSE_MUL(a3_1, h_5_3)); // v1 = v1 + a_3_1 * h_5_3
v1 = _SSE_ADD(v1, _SSE_MUL(a2_1, h_5_2)); // v1 = v1 + a_2_1 * h_5_2
v1 = _SSE_ADD(v1, _SSE_MUL(a1_1, h_5_1)); // v1 = v1 + a_1_1 * h_5_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE h_4_3 = _SSE_SET1(hh[(ldh*3)+1]); // h_4_3 = hh(2,4)
__SSE_DATATYPE h_4_2 = _SSE_SET1(hh[(ldh*3)+2]); // h_4_2 = hh(3,4)
__SSE_DATATYPE h_4_1 = _SSE_SET1(hh[(ldh*3)+3]); // h_4_1 = hh(4,4)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE h_4_3 = _SSE_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]);
__SSE_DATATYPE h_4_2 = _SSE_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]);
__SSE_DATATYPE h_4_1 = _SSE_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]);
#endif
register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3)); // w1 = a_4_1 + a_3_1 * h_4_3
w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2)); // w1 = w1 + a_2_1 * h_4_2
w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1)); // w1 = w1 + a_1_1 * h_4_1
#ifdef HAVE_SSE_INTRINSICS
__SSE_DATATYPE h_2_1 = _SSE_SET1(hh[ldh+1]); // h_2_1 = hh(2,2)
__SSE_DATATYPE h_3_2 = _SSE_SET1(hh[(ldh*2)+1]); // h_3_2 = hh(2,3)
__SSE_DATATYPE h_3_1 = _SSE_SET1(hh[(ldh*2)+2]); // h_3_1 = hh(3,3)
#endif
#ifdef HAVE_SPARC64_SSE
__SSE_DATATYPE h_2_1 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
__SSE_DATATYPE h_3_2 = _SSE_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]);
__SSE_DATATYPE h_3_1 = _SSE_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]);
#endif
register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2)); // z1 = a_3_1 + a_2_1 * h_3_2
z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1)); // z1 = z1 + a_1_1 * h_3_1
register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1)); // y1 = a_2_1 + a_1_1 * h_2_1
register __SSE_DATATYPE x1 = a1_1; // x1 = a_1_1
__SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*5)+offset]);
__SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*4)+offset]);
__SSE_DATATYPE a3_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
__SSE_DATATYPE a4_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
__SSE_DATATYPE a5_2 = _SSE_LOAD(&q[(ldq)+offset]);
__SSE_DATATYPE a6_2 = _SSE_LOAD(&q[offset]);
register __SSE_DATATYPE t2 = _SSE_ADD(a6_2, _SSE_MUL(a5_2, h_6_5));
t2 = _SSE_ADD(t2, _SSE_MUL(a4_2, h_6_4));
t2 = _SSE_ADD(t2, _SSE_MUL(a3_2, h_6_3));
t2 = _SSE_ADD(t2, _SSE_MUL(a2_2, h_6_2));
t2 = _SSE_ADD(t2, _SSE_MUL(a1_2, h_6_1));
register __SSE_DATATYPE v2 = _SSE_ADD(a5_2, _SSE_MUL(a4_2, h_5_4));
v2 = _SSE_ADD(v2, _SSE_MUL(a3_2, h_5_3));
v2 = _SSE_ADD(v2, _SSE_MUL(a2_2, h_5_2));
v2 = _SSE_ADD(v2, _SSE_MUL(a1_2, h_5_1));
register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));
register __SSE_DATATYPE x2 = a1_2;
__SSE_DATATYPE q1;
__SSE_DATATYPE q2;
__SSE_DATATYPE h1;
__SSE_DATATYPE h2;
__SSE_DATATYPE h3;
__SSE_DATATYPE h4;
__SSE_DATATYPE h5;
__SSE_DATATYPE h6;
for(i = 6; i < nb; i++) // do i=7,nb