real_sse_6hv_template.c 72.1 KB
Newer Older
1 2 3 4 5 6
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
Andreas Marek's avatar
Andreas Marek committed
7
//        Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
Andreas Marek's avatar
Andreas Marek committed
9
//        Informatik,
10
//    - Technische Universität München, Lehrstuhl für Informatik mit
Andreas Marek's avatar
Andreas Marek committed
11
//        Schwerpunkt Wissenschaftliches Rechnen ,
12 13
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
Andreas Marek's avatar
Andreas Marek committed
14 15
//        Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//        and
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
Andreas Marek's avatar
Andreas Marek committed
36
//    along with ELPA.        If not, see <http://www.gnu.org/licenses/>
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

64
#ifdef HAVE_SSE_INTRINSICS
65
#include <x86intrin.h>
66 67 68 69 70
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
71 72
#include <stdio.h>
#include <stdlib.h>
73

74

75 76 77 78 79 80 81 82 83 84
#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
85 86
#define _SSE_SET _mm_set_pd
#define _SSE_SET1 _mm_set1_pd
87 88 89 90 91 92 93 94 95
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
96 97
#define _SSE_SET _mm_set_ps
#define _SSE_SET1 _mm_set1_ps
98 99 100 101 102 103
#endif

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

104
#ifdef HAVE_SSE_INTRINSICS
105 106 107 108 109 110 111 112 113 114 115 116
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sse_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_2_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif


135 136 137 138 139 140

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
141 142 143 144 145
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
146 147 148 149
!f>   end subroutine
!f> end interface
!f>#endif
*/
150 151 152 153
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
154 155 156 157 158
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
159 160 161 162
!f>   end subroutine
!f> end interface
!f>#endif
*/
163
#endif
164

165 166 167 168 169
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
170 171 172 173 174
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
175 176 177 178
!f>   end subroutine
!f> end interface
!f>#endif
*/
179 180 181 182
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
183 184 185 186 187
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
188 189 190 191 192
!f>   end subroutine
!f> end interface
!f>#endif
*/

193 194
#endif

195
#ifdef HAVE_SSE_INTRINSICS
196 197 198 199 200 201
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
202 203 204 205 206 207 208 209 210
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
211
{
Andreas Marek's avatar
Andreas Marek committed
212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on ;

        worked_on = 0;

        // calculating scalar products to compute
        // 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
        double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
        float scalarprods[15];
#endif

230 231 232 233 234 235 236 237 238 239 240 241 242 243 244
        scalarprods[0] = hh[(ldh+1)];      // 1 = hh(2,2)
        scalarprods[1] = hh[(ldh*2)+2];    // 2 = hh(3,3)
        scalarprods[2] = hh[(ldh*2)+1];    // 3 = hh(2,3)
        scalarprods[3] = hh[(ldh*3)+3];    // 4 = hh(4,4)
        scalarprods[4] = hh[(ldh*3)+2];    // 5 = hh(3,4)
        scalarprods[5] = hh[(ldh*3)+1];    // 6 = hh(2,4)
        scalarprods[6] = hh[(ldh*4)+4];    // 7 = hh(5,5)
        scalarprods[7] = hh[(ldh*4)+3];    // 8 = hh(4,5)
        scalarprods[8] = hh[(ldh*4)+2];    // 9 = hh(3,5)
        scalarprods[9] = hh[(ldh*4)+1];    //10 = hh(2,5) 
        scalarprods[10] = hh[(ldh*5)+5];   //11 = hh(6,6) 
        scalarprods[11] = hh[(ldh*5)+4];   //12 = hh(5,6)
        scalarprods[12] = hh[(ldh*5)+3];   //13 = hh(4,6)
        scalarprods[13] = hh[(ldh*5)+2];   //14 = hh(3,6)
        scalarprods[14] = hh[(ldh*5)+1];   //15 = hh(2,6)
245

Andreas Marek's avatar
Andreas Marek committed
246 247
        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
248 249 250 251 252
        scalarprods[0] += hh[1] * hh[(2+ldh)];             // 1 = 1 + hh(2,1) * hh(3,2)
        scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];     // 3 = 3 + hh(2,2) * hh(3,3)
        scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];   // 6 = 6 + hh(2,3) * hh(3,4)
        scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];   //10 =10 + hh(2,4) * hh(3,5) 
        scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];  //15 =15 + hh(2,5) * hh(3,6)
253

Andreas Marek's avatar
Andreas Marek committed
254
        // loop counter = 3
255 256 257 258 259
        scalarprods[0] += hh[2] * hh[(3+ldh)];             // 1 = 1 + hh(3,1) * hh(4,2)
        scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];     // 3 = 3 + hh(3,2) * hh(4,3)
        scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];   // 6 = 6 + hh(3,3) * hh(4,4)
        scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];   //10 =10 + hh(3,4) * hh(4,5)
        scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];  //15 =15 + hh(3,5) * hh(4,6)
260

261 262 263 264
        scalarprods[1] += hh[1] * hh[3+(ldh*2)];           // 2 = 2 + hh(2,1) * hh(4,3)
        scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];   // 5 = 5 + hh(2,2) * hh(4,4)
        scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];   // 9 = 9 + hh(2,3) * hh(4,5)
        scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];  //14 =14 + hh(2,4) * hh(4,6)
265

Andreas Marek's avatar
Andreas Marek committed
266
        // loop counter = 4
267 268 269 270 271
        scalarprods[0] += hh[3] * hh[(4+ldh)];            // 1 = 1 + hh(4,1) * hh(5,2)
        scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];    // 3 = 3 + hh(4,2) * hh(5,3)
        scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];  // 6 = 6 + hh(4,3) * hh(5,4)
        scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];  //10 =10 + hh(4,4) * hh(5,5)
        scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)]; //15 =15 + hh(4,5) * hh(5,6)
272

273 274 275 276
        scalarprods[1] += hh[2] * hh[4+(ldh*2)];          // 2 = 2 + hh(3,1) * hh(5,3)
        scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];  // 5 = 5 + hh(3,2) * hh(5,4)
        scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];  // 9 = 9 + hh(3,3) * hh(5,5)
        scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)]; //14 =14 + hh(3,4) * hh(5,6)
277

278 279 280
        scalarprods[3] += hh[1] * hh[4+(ldh*3)];          // 4 = 4 + hh(2,1) * hh(5,4)
        scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];    // 8 = 8 + hh(2,2) * hh(5,5)
        scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)]; //13 =13 + hh(2,3) * hh(5,6)
281

Andreas Marek's avatar
Andreas Marek committed
282
        // loop counter = 5
283 284 285 286 287
        scalarprods[0] += hh[4] * hh[(5+ldh)];            // 1 = 1 + hh(5,1) * hh(6,2)
        scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];    // 3 = 3 + hh(5,2) * hh(6,3)
        scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];  // 6 = 6 + hh(5,3) * hh(6,4)
        scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];  //10 =10 + hh(5,4) * hh(6,5)
        scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)]; //15 =15 + hh(5,5) * hh(6,6)
288

289 290 291 292
        scalarprods[1] += hh[3] * hh[5+(ldh*2)];          // 2 = 2 + hh(4,1) * hh(6,3)
        scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];  // 5 = 5 + hh(4,2) * hh(6,4)
        scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];  // 9 = 9 + hh(4,3) * hh(6,5)
        scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)]; //14 =14 + hh(4,4) * hh(6,6)
293

294 295 296
        scalarprods[3] += hh[2] * hh[5+(ldh*3)];          // 4 = 4 + hh(3,1) * hh(6,4)
        scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];    // 8 = 8 + hh(3,2) * hh(6,5)
        scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)]; //13 =13 + hh(3,3) * hh(6,6)
297

298 299
        scalarprods[6] += hh[1] * hh[5+(ldh*4)];          // 7 = 7 + hh(2,1) * hh(6,5)
        scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];   //12 =12 + hh(2,2) * hh(6,6) 
300

Andreas Marek's avatar
Andreas Marek committed
301
        #pragma ivdep
302
        for (i = 6; i < nb; i++)                                     // do i = 7, nb
Andreas Marek's avatar
Andreas Marek committed
303
        {
304 305 306 307 308
                scalarprods[0] += hh[i-1] * hh[(i+ldh)];             // 1 = 1 + hh(i-1,1) * hh(i,2)
                scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];     // 3 = 3 + hh(i-1,2) * hh(i,3)
                scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];   // 6 = 6 + hh(i-1,3) * hh(i,4)
                scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];   //10 =10 + hh(i-1,4) * hh(i,5)
                scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];  //15 =15 + hh(i-1,5) * hh(i,6)
309

310 311 312 313
                scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];          // 2 = 2 + hh(i-2,1) * hh(i,3)
                scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];  // 5 = 5 + hh(i-2,2) * hh(i,4)
                scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];  // 9 = 9 + hh(i-2,3) * hh(i,5)
                scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)]; //14 =14 + hh(i-2,4) * hh(i,6)
314

315 316 317
                scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];          // 4 = 4 + hh(i-3,1) * hh(i,4)
                scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];    // 8 = 8 + hh(i-3,2) * hh(i,5)
                scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)]; //13 =13 + hh(i-3,3) * hh(i,6)
318

319 320
                scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];          // 7 = 7 + hh(i-4,1) * hh(i,5)
                scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];   //12 =12 + hh(i-4,2) * hh(i,6)
321

322
                scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];         //11 =11 + hh(i-5,1) * hh(i,6)
Andreas Marek's avatar
Andreas Marek committed
323
        }
324

Andreas Marek's avatar
Andreas Marek committed
325
        // Production level kernel calls with padding
326
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
327 328
        for (i = 0; i < nq-2; i+=4)
        {
329
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
330
                hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
331 332
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
333
                hh_trafo_kernel_4_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
334 335
#endif

Andreas Marek's avatar
Andreas Marek committed
336 337
                worked_on += 4;
        }
338 339
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
340 341
        for (i = 0; i < nq-4; i+=8)
        {
342
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
343
                hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
344 345
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
346
                hh_trafo_kernel_8_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
347 348
#endif

Andreas Marek's avatar
Andreas Marek committed
349 350
                worked_on += 8;
        }
351
#endif
Andreas Marek's avatar
Andreas Marek committed
352 353 354 355
        if (nq == i)
        {
                return;
        }
356
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
357 358
        if (nq -i == 2)
        {
359
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
360
                hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
361 362
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
363
                hh_trafo_kernel_2_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
364 365
#endif

Andreas Marek's avatar
Andreas Marek committed
366 367
                worked_on += 2;
        }
368 369
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
370 371
        if (nq -i == 4)
        {
372
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
373
                hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
374 375
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
376
                hh_trafo_kernel_4_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
377
#endif
Andreas Marek's avatar
Andreas Marek committed
378 379
                worked_on += 4;
        }
380
#endif
381
#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
382 383
        if (worked_on != nq)
        {
384
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
385
                printf("Error in real SSE BLOCK6 kernel \n");
386 387
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
388
                printf("Error in real SPARC64 BLOCK6 kernel \n");
389 390
#endif

Andreas Marek's avatar
Andreas Marek committed
391 392
                abort();
        }
393
#endif
394 395 396 397 398 399 400 401 402 403 404 405 406
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 4 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 1 update is performed
 */
407
#ifdef HAVE_SSE_INTRINSICS
408 409 410 411 412 413
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
414 415 416 417 418 419 420 421 422 423
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
#endif

424
{
Andreas Marek's avatar
Andreas Marek committed
425 426 427 428 429
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [4 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;
430

431 432 433 434 435 436
        __SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*5]);   // a_1_1 = q(1:nq,6)
        __SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*4]);   // a_2_1 = q(1:nq,5)
        __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq*3]);   // a_3_1 = q(1:nq,4)
        __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[ldq*2]);   // a_4_1 = q(1:nq,3)
        __SSE_DATATYPE a5_1 = _SSE_LOAD(&q[ldq]);     // a_5_1 = q(1,nq,2)
        __SSE_DATATYPE a6_1 = _SSE_LOAD(&q[0]);       // a_6_1 = q(1:nq,1)
437

438
#ifdef HAVE_SSE_INTRINSICS
439 440 441 442 443
        __SSE_DATATYPE h_6_5 = _SSE_SET1(hh[(ldh*5)+1]);  // h_6_5 = hh(2,6)
        __SSE_DATATYPE h_6_4 = _SSE_SET1(hh[(ldh*5)+2]);  // h_6_4 = hh(3,6)
        __SSE_DATATYPE h_6_3 = _SSE_SET1(hh[(ldh*5)+3]);  // h_6_3 = hh(4,6)
        __SSE_DATATYPE h_6_2 = _SSE_SET1(hh[(ldh*5)+4]);  // h_6_2 = hh(5,6)
        __SSE_DATATYPE h_6_1 = _SSE_SET1(hh[(ldh*5)+5]);  // h_6_1 = hh(6,6)
444 445 446
#endif

#ifdef HAVE_SPARC64_SSE
447 448 449 450 451
        __SSE_DATATYPE h_6_5 = _SSE_SET(hh[(ldh*5)+1], hh[(ldh*5)+1]);
        __SSE_DATATYPE h_6_4 = _SSE_SET(hh[(ldh*5)+2], hh[(ldh*5)+2]);
        __SSE_DATATYPE h_6_3 = _SSE_SET(hh[(ldh*5)+3], hh[(ldh*5)+3]);
        __SSE_DATATYPE h_6_2 = _SSE_SET(hh[(ldh*5)+4], hh[(ldh*5)+4]);
        __SSE_DATATYPE h_6_1 = _SSE_SET(hh[(ldh*5)+5], hh[(ldh*5)+5]);
452
#endif
453

454 455 456 457 458
        register __SSE_DATATYPE t1 = _SSE_ADD(a6_1, _SSE_MUL(a5_1, h_6_5));  // t1 = a_6_1 + a_5_1 * h_6_5
        t1 = _SSE_ADD(t1, _SSE_MUL(a4_1, h_6_4)); // t1 = t1 + a_4_1 * h_6_4
        t1 = _SSE_ADD(t1, _SSE_MUL(a3_1, h_6_3)); // t1 = t1 + a_3_1 * h_6_3
        t1 = _SSE_ADD(t1, _SSE_MUL(a2_1, h_6_2)); // t1 = t1 + a_2_1 * h_6_2
        t1 = _SSE_ADD(t1, _SSE_MUL(a1_1, h_6_1)); // t1 = t1 + a_1_1 * h_6_1
459

460
#ifdef HAVE_SSE_INTRINSICS
461 462 463 464
        __SSE_DATATYPE h_5_4 = _SSE_SET1(hh[(ldh*4)+1]);  // h_5_4 = hh(2,5)
        __SSE_DATATYPE h_5_3 = _SSE_SET1(hh[(ldh*4)+2]);  // h_5_3 = hh(3,5)
        __SSE_DATATYPE h_5_2 = _SSE_SET1(hh[(ldh*4)+3]);  // h_5_2 = hh(4,5)
        __SSE_DATATYPE h_5_1 = _SSE_SET1(hh[(ldh*4)+4]);  // h_5_1 = hh(5,5)
465 466 467
#endif

#ifdef HAVE_SPARC64_SSE
468 469 470 471
        __SSE_DATATYPE h_5_4 = _SSE_SET(hh[(ldh*4)+1], hh[(ldh*4)+1]);
        __SSE_DATATYPE h_5_3 = _SSE_SET(hh[(ldh*4)+2], hh[(ldh*4)+2]);
        __SSE_DATATYPE h_5_2 = _SSE_SET(hh[(ldh*4)+3], hh[(ldh*4)+3]);
        __SSE_DATATYPE h_5_1 = _SSE_SET(hh[(ldh*4)+4], hh[(ldh*4)+4]);
472 473
#endif

474 475 476 477
        register __SSE_DATATYPE v1 = _SSE_ADD(a5_1, _SSE_MUL(a4_1, h_5_4)); // v1 = a_5_1 + a_4_1 * h_5_4
        v1 = _SSE_ADD(v1, _SSE_MUL(a3_1, h_5_3)); // v1 = v1 + a_3_1 * h_5_3
        v1 = _SSE_ADD(v1, _SSE_MUL(a2_1, h_5_2)); // v1 = v1 + a_2_1 * h_5_2
        v1 = _SSE_ADD(v1, _SSE_MUL(a1_1, h_5_1)); // v1 = v1 + a_1_1 * h_5_1
478

479
#ifdef HAVE_SSE_INTRINSICS
480 481 482
        __SSE_DATATYPE h_4_3 = _SSE_SET1(hh[(ldh*3)+1]);  // h_4_3 = hh(2,4)
        __SSE_DATATYPE h_4_2 = _SSE_SET1(hh[(ldh*3)+2]);  // h_4_2 = hh(3,4)
        __SSE_DATATYPE h_4_1 = _SSE_SET1(hh[(ldh*3)+3]);  // h_4_1 = hh(4,4)
483 484 485
#endif

#ifdef HAVE_SPARC64_SSE
486 487 488
        __SSE_DATATYPE h_4_3 = _SSE_SET(hh[(ldh*3)+1], hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 = _SSE_SET(hh[(ldh*3)+2], hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 = _SSE_SET(hh[(ldh*3)+3], hh[(ldh*3)+3]);
489 490
#endif

491 492 493
        register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3)); // w1 = a_4_1 + a_3_1 * h_4_3
        w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2));  // w1 = w1 + a_2_1 * h_4_2
        w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1));  // w1 = w1 + a_1_1 * h_4_1
494

495
#ifdef HAVE_SSE_INTRINSICS
496 497 498
        __SSE_DATATYPE h_2_1 = _SSE_SET1(hh[ldh+1]);     // h_2_1 = hh(2,2)
        __SSE_DATATYPE h_3_2 = _SSE_SET1(hh[(ldh*2)+1]); // h_3_2 = hh(2,3)
        __SSE_DATATYPE h_3_1 = _SSE_SET1(hh[(ldh*2)+2]); // h_3_1 = hh(3,3)
499 500 501
#endif

#ifdef HAVE_SPARC64_SSE
502 503 504
        __SSE_DATATYPE h_2_1 = _SSE_SET(hh[ldh+1], hh[ldh+1]);
        __SSE_DATATYPE h_3_2 = _SSE_SET(hh[(ldh*2)+1], hh[(ldh*2)+1]);
        __SSE_DATATYPE h_3_1 = _SSE_SET(hh[(ldh*2)+2], hh[(ldh*2)+2]);
505 506
#endif

507 508 509
        register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2));  // z1 = a_3_1 + a_2_1 * h_3_2
        z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1));  // z1 = z1 + a_1_1 * h_3_1
        register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1));  // y1 = a_2_1 + a_1_1 * h_2_1
510

511
        register __SSE_DATATYPE x1 = a1_1;  // x1 = a_1_1
512

Andreas Marek's avatar
Andreas Marek committed
513 514 515 516 517 518
        __SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*5)+offset]);
        __SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*4)+offset]);
        __SSE_DATATYPE a3_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
        __SSE_DATATYPE a4_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
        __SSE_DATATYPE a5_2 = _SSE_LOAD(&q[(ldq)+offset]);
        __SSE_DATATYPE a6_2 = _SSE_LOAD(&q[offset]);
519

Andreas Marek's avatar
Andreas Marek committed
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
        register __SSE_DATATYPE t2 = _SSE_ADD(a6_2, _SSE_MUL(a5_2, h_6_5));
        t2 = _SSE_ADD(t2, _SSE_MUL(a4_2, h_6_4));
        t2 = _SSE_ADD(t2, _SSE_MUL(a3_2, h_6_3));
        t2 = _SSE_ADD(t2, _SSE_MUL(a2_2, h_6_2));
        t2 = _SSE_ADD(t2, _SSE_MUL(a1_2, h_6_1));
        register __SSE_DATATYPE v2 = _SSE_ADD(a5_2, _SSE_MUL(a4_2, h_5_4));
        v2 = _SSE_ADD(v2, _SSE_MUL(a3_2, h_5_3));
        v2 = _SSE_ADD(v2, _SSE_MUL(a2_2, h_5_2));
        v2 = _SSE_ADD(v2, _SSE_MUL(a1_2, h_5_1));
        register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
        w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
        w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
        register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
        z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
        register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));
535

Andreas Marek's avatar
Andreas Marek committed
536
        register __SSE_DATATYPE x2 = a1_2;
537

Andreas Marek's avatar
Andreas Marek committed
538 539
        __SSE_DATATYPE q1;
        __SSE_DATATYPE q2;
540

Andreas Marek's avatar
Andreas Marek committed
541 542 543 544 545 546
        __SSE_DATATYPE h1;
        __SSE_DATATYPE h2;
        __SSE_DATATYPE h3;
        __SSE_DATATYPE h4;
        __SSE_DATATYPE h5;
        __SSE_DATATYPE h6;
547

548
        for(i = 6; i < nb; i++)             // do i=7,nb
Andreas Marek's avatar
Andreas Marek committed
549
        {
550
#ifdef HAVE_SSE_INTRINSICS
551
                h1 = _SSE_SET1(hh[i-5]);  // h1 = hh(i-5,1)
552 553
#endif
#ifdef HAVE_SPARC64_SSE
554
                h1 = _SSE_SET(hh[i-5], hh[i-5]);
555
#endif
Andreas Marek's avatar
Andreas Marek committed
556 557 558
        
                q1 = _SSE_LOAD(&q[i*ldq]);
                q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
559

560
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));  // x1 = x1 + q(1:nq,i) * h1
Andreas Marek's avatar
Andreas Marek committed
561
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
562

563
#ifdef HAVE_SSE_INTRINSICS
564
                h2 = _SSE_SET1(hh[ldh+i-4]);   // h2 = hh(i-4,2)
565 566 567
#endif

#ifdef HAVE_SPARC64_SSE
568
                h2 = _SSE_SET(hh[ldh+i-4], hh[ldh+i-4]);
569
#endif
570

571
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));  // y1 = y1 + q1(1:nq,i) * h2
Andreas Marek's avatar
Andreas Marek committed
572
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
573

574
#ifdef HAVE_SSE_INTRINSICS
575
                h3 = _SSE_SET1(hh[(ldh*2)+i-3]);  // h3 = hh(i-3,3)
576 577
#endif
#ifdef HAVE_SPARC64_SSE
578
                h3 = _SSE_SET(hh[(ldh*2)+i-3], hh[(ldh*2)+i-3]);
579
#endif
580

581
                z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));   // z1 = z1 + q(1:nq,i) * h3
Andreas Marek's avatar
Andreas Marek committed
582
                z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
583
#ifdef HAVE_SSE_INTRINSICS
584
                h4 = _SSE_SET1(hh[(ldh*3)+i-2]);    // h4 = hh(i-2,4)
585 586
#endif
#ifdef HAVE_SPARC64_SSE
587
                h4 = _SSE_SET(hh[(ldh*3)+i-2], hh[(ldh*3)+i-2]);
588 589
#endif

590
                w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));  // w1 = w1 + q1(1:nq,i) * h4
Andreas Marek's avatar
Andreas Marek committed
591
                w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
592

593
#ifdef HAVE_SSE_INTRINSICS
594
                h5 = _SSE_SET1(hh[(ldh*4)+i-1]);   // h5 = hh(i-1,5)
595 596
#endif
#ifdef HAVE_SPARC64_SSE
597
                h5 = _SSE_SET(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
598
#endif
599
                v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5)); // v1 = v1 + q1(1:nq,i) * h5
Andreas Marek's avatar
Andreas Marek committed
600
                v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
601

602
#ifdef HAVE_SSE_INTRINSICS
603
                h6 = _SSE_SET1(hh[(ldh*5)+i]);  // h6 = hh(i,6)
604 605 606
#endif

#ifdef HAVE_SPARC64_SSE
607
                h6 = _SSE_SET(hh[(ldh*5)+i], hh[(ldh*5)+i]);
608
#endif
609

610
                t1 = _SSE_ADD(t1, _SSE_MUL(q1,h6));  // t1 = t1 + q1(1:nq,i) * h6
Andreas Marek's avatar
Andreas Marek committed
611 612
                t2 = _SSE_ADD(t2, _SSE_MUL(q2,h6));
        }
613

614
#ifdef HAVE_SSE_INTRINSICS
615
        h1 = _SSE_SET1(hh[nb-5]);  // h1 = hh(nb-4,1)
616 617 618
#endif

#ifdef HAVE_SPARC64_SSE
619
        h1 = _SSE_SET(hh[nb-5], hh[nb-5]);
620 621
#endif

Andreas Marek's avatar
Andreas Marek committed
622 623
        q1 = _SSE_LOAD(&q[nb*ldq]);
        q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
624

625
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));   // x1 = x1 + q1(1:nq,nb+1) * h1
Andreas Marek's avatar
Andreas Marek committed
626
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
627

628
#ifdef HAVE_SSE_INTRINSICS
629
        h2 = _SSE_SET1(hh[ldh+nb-4]);  // h2 = hh(nb-3,2)
630 631 632
#endif

#ifdef HAVE_SPARC64_SSE
633
        h2 = _SSE_SET(hh[ldh+nb-4], hh[ldh+nb-4]);
634
#endif
635 636


637
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));  // y1 = y1 + q1(1:nq,nb+1) * h2
Andreas Marek's avatar
Andreas Marek committed
638
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
639

640
#ifdef HAVE_SSE_INTRINSICS
641
        h3 = _SSE_SET1(hh[(ldh*2)+nb-3]); // h3 = hh(nb-2,3)
642 643 644
#endif

#ifdef HAVE_SPARC64_SSE
645
        h3 = _SSE_SET(hh[(ldh*2)+nb-3], hh[(ldh*2)+nb-3]);
646
#endif
647 648


649
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); // z1 = z1 + q1(1:nq,nb+1) * h3
Andreas Marek's avatar
Andreas Marek committed
650
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
651

652
#ifdef HAVE_SSE_INTRINSICS
653
        h4 = _SSE_SET1(hh[(ldh*3)+nb-2]);  // h4 = hh(nb-1,4)
654 655 656
#endif

#ifdef HAVE_SPARC64_SSE
657
        h4 = _SSE_SET(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
658
#endif
659

660
        w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));  // w1 = w1 + q1(1:nq,nb+1) * h4
Andreas Marek's avatar
Andreas Marek committed
661
        w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
662

663
#ifdef HAVE_SSE_INTRINSICS
664
        h5 = _SSE_SET1(hh[(ldh*4)+nb-1]);  // h5 = hh(nb, 5)
665 666 667
#endif

#ifdef HAVE_SPARC64_SSE
668
        h5 = _SSE_SET(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
669 670 671
#endif


672

673
        v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));  // v1 = v1 + q1(1:nq,nb+1) * h5
Andreas Marek's avatar
Andreas Marek committed
674
        v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
675
#ifdef HAVE_SSE_INTRINSICS
676
        h1 = _SSE_SET1(hh[nb-4]);   // h1 = hh(nb-3,1)
677 678
#endif
#ifdef HAVE_SPARC64_SSE
679
        h1 = _SSE_SET(hh[nb-4], hh[nb-4]);
680 681
#endif

Andreas Marek's avatar
Andreas Marek committed
682 683
        q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);
684

685
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));  // x1 = x1 + q1(1:nq,nb+2) * h1
Andreas Marek's avatar
Andreas Marek committed
686
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
687
#ifdef HAVE_SSE_INTRINSICS
688
        h2 = _SSE_SET1(hh[ldh+nb-3]); // h2 = hh(nb-2,2)
689 690
#endif
#ifdef HAVE_SPARC64_SSE
691
        h2 = _SSE_SET(hh[ldh+nb-3], hh[ldh+nb-3]);
692 693
#endif

694
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));  // y1 = y1 + q1(1:nq,nb+2) * h2
Andreas Marek's avatar
Andreas Marek committed
695
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
696

697
#ifdef HAVE_SSE_INTRINSICS
698
        h3 = _SSE_SET1(hh[(ldh*2)+nb-2]);  // h3 = hh(nb-1,3)
699 700
#endif
#ifdef HAVE_SPARC64_SSE
701
        h3 = _SSE_SET(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
702 703
#endif

704
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));  // z1 = z1 + q1(1:nq,nb+2)  * h3
Andreas Marek's avatar
Andreas Marek committed
705
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
706

707
#ifdef HAVE_SSE_INTRINSICS
708
        h4 = _SSE_SET1(hh[(ldh*3)+nb-1]); // h4 = hh(nb,4)
709 710
#endif
#ifdef HAVE_SPARC64_SSE
711
        h4 = _SSE_SET(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]);
712 713
#endif

714

715
        w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));  // w1 = w1 + q1(1:nq,nb+2) * h4
Andreas Marek's avatar
Andreas Marek committed
716
        w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
717

718
#ifdef HAVE_SSE_INTRINSICS
719
        h1 = _SSE_SET1(hh[nb-3]); // h1 = hh(nb-2,1)
720 721
#endif
#ifdef HAVE_SPARC64_SSE
722
        h1 = _SSE_SET(hh[nb-3], hh[nb-3]);
723 724
#endif

725

Andreas Marek's avatar
Andreas Marek committed
726 727
        q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);
728

729
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));  // x1 = x1 + q1(1:nq,nb+3) * h1
Andreas Marek's avatar
Andreas Marek committed
730
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
731

732
#ifdef HAVE_SSE_INTRINSICS
733
        h2 = _SSE_SET1(hh[ldh+nb-2]); // h2 = hh(nb-1,2)
734 735
#endif

736
#ifdef HAVE_SPARC64_SSE
737
        h2 = _SSE_SET(hh[ldh+nb-2], hh[ldh+nb-2]);
738 739
#endif

740
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));  // y1 = y1 + q1(1:nq,nb+3) * h2
Andreas Marek's avatar
Andreas Marek committed
741
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
742
#ifdef HAVE_SSE_INTRINSICS
743
        h3 = _SSE_SET1(hh[(ldh*2)+nb-1]); // h3 = hh(nb,3)
744 745 746
#endif

#ifdef HAVE_SPARC64_SSE
747
        h3 = _SSE_SET(hh[(ldh*2)+nb-1], hh[(ldh*2)+nb-1]);
748
#endif
749

750
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3)); // z1 = z1 + q1(1:nq,nb+3) * h3
Andreas Marek's avatar
Andreas Marek committed
751
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
752
#ifdef HAVE_SSE_INTRINSICS
753
        h1 = _SSE_SET1(hh[nb-2]);  // h1 = hh(nb-1,1)
754 755 756
#endif

#ifdef HAVE_SPARC64_SSE
757
        h1 = _SSE_SET(hh[nb-2], hh[nb-2]);
758 759
#endif

Andreas Marek's avatar
Andreas Marek committed
760 761
        q1 = _SSE_LOAD(&q[(nb+3)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+3)*ldq)+offset]);
762

763
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));   // x1 = x1 + q1(1:nq,nb+4) * h1
Andreas Marek's avatar
Andreas Marek committed
764
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
765

766
#ifdef HAVE_SSE_INTRINSICS
767
        h2 = _SSE_SET1(hh[ldh+nb-1]);  // h2 = hh(nb,2)
768 769
#endif

770
#ifdef HAVE_SPARC64_SSE
771
        h2 = _SSE_SET(hh[ldh+nb-1], hh[ldh+nb-1]);
772
#endif
773

774
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));  // y1 = y1 + q1(1:n1,nb+4) * h2
Andreas Marek's avatar
Andreas Marek committed
775
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
776

777
#ifdef HAVE_SSE_INTRINSICS
778
        h1 = _SSE_SET1(hh[nb-1]);  // h1 = hh(nb,1)
779 780 781
#endif

#ifdef HAVE_SPARC64_SSE
782
        h1 = _SSE_SET(hh[nb-1], hh[nb-1]);
783 784
#endif

Andreas Marek's avatar
Andreas Marek committed
785 786
        q1 = _SSE_LOAD(&q[(nb+4)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+4)*ldq)+offset]);
787

788
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));  // x1 = x1 + q1(1:nq,nb+5) * h1
Andreas Marek's avatar
Andreas Marek committed
789
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
790

Andreas Marek's avatar
Andreas Marek committed
791 792 793
        /////////////////////////////////////////////////////
        // Apply tau, correct wrong calculation using pre-calculated scalar products
        /////////////////////////////////////////////////////
794

795

796
#ifdef HAVE_SSE_INTRINSICS
797
        __SSE_DATATYPE tau1 = _SSE_SET1(hh[0]);  // tau1 = hh(1,1)
798 799
#endif
#ifdef HAVE_SPARC64_SSE
800
        __SSE_DATATYPE tau1 = _SSE_SET(hh[0], hh[0]);
801
#endif
802
        x1 = _SSE_MUL(x1, tau1);  // x1 = x1 * tau1
Andreas Marek's avatar
Andreas Marek committed
803
        x2 = _SSE_MUL(x2, tau1);
804

805
#ifdef HAVE_SSE_INTRINSICS
806 807
        __SSE_DATATYPE tau2 = _SSE_SET1(hh[ldh]);            // tau2 = hh(1,2)
        __SSE_DATATYPE vs_1_2 = _SSE_SET1(scalarprods[0]);   // vs_1_2 = product(1)
808 809
#endif
#ifdef HAVE_SPARC64_SSE
810 811
        __SSE_DATATYPE tau2 = _SSE_SET(hh[ldh], hh[ldh]);
        __SSE_DATATYPE vs_1_2 = _SSE_SET(scalarprods[0], scalarprods[0]);
812 813
#endif

814
        h2 = _SSE_MUL(tau2, vs_1_2);  // h2 = tau2 * vs_1_2
Andreas Marek's avatar
Andreas Marek committed
815

816
        y1 = _SSE_SUB(_SSE_MUL(y1,tau2), _SSE_MUL(x1,h2));   // y1 = y1 * tau2 - x1 * h2
Andreas Marek's avatar
Andreas Marek committed
817 818 819
        y2 = _SSE_SUB(_SSE_MUL(y2,tau2), _SSE_MUL(x2,h2));

#ifdef HAVE_SSE_INTRINSICS
820 821 822
        __SSE_DATATYPE tau3 = _SSE_SET1(hh[ldh*2]);         // tau3 = hh(1,3)
        __SSE_DATATYPE vs_1_3 = _SSE_SET1(scalarprods[1]);  // vs_1_3 = prods(2)
        __SSE_DATATYPE vs_2_3 = _SSE_SET1(scalarprods[2]);  // vs_2_3 = prods(3)
Andreas Marek's avatar
Andreas Marek committed
823 824
#endif
#ifdef HAVE_SPARC64_SSE
825 826 827
        __SSE_DATATYPE tau3 = _SSE_SET(hh[ldh*2], hh[ldh*2]);
        __SSE_DATATYPE vs_1_3 = _SSE_SET(scalarprods[1], scalarprods[1]);
        __SSE_DATATYPE vs_2_3 = _SSE_SET(scalarprods[2], scalarprods[2]) ;
Andreas Marek's avatar
Andreas Marek committed
828 829
#endif

830 831
        h2 = _SSE_MUL(tau3, vs_1_3);   // h2 = tau3 * vs_1_3
        h3 = _SSE_MUL(tau3, vs_2_3);   // h3 = tau3 * vs_2_3
Andreas Marek's avatar
Andreas Marek committed
832

833
        z1 = _SSE_SUB(_SSE_MUL(z1,tau3), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))); // Z1 = z1 * tau3 - (y1*h3 + x1*h2)
Andreas Marek's avatar
Andreas Marek committed
834
        z2 = _SSE_SUB(_SSE_MUL(z2,tau3), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)));
835

Andreas Marek's avatar
Andreas Marek committed
836
#ifdef HAVE_SSE_INTRINSICS
837 838 839
        __SSE_DATATYPE tau4 = _SSE_SET1(hh[ldh*3]);         // tau4 = hh(1,4)
        __SSE_DATATYPE vs_1_4 = _SSE_SET1(scalarprods[3]);  // vs_1_4 = prods(4)
        __SSE_DATATYPE vs_2_4 = _SSE_SET1(scalarprods[4]);  // vs_2_4 = prods(5)
Andreas Marek's avatar
Andreas Marek committed
840 841
#endif
#ifdef HAVE_SPARC64_SSE
842 843 844
        __SSE_DATATYPE tau4 = _SSE_SET(hh[ldh*3], hh[ldh*3]);
        __SSE_DATATYPE vs_1_4 = _SSE_SET(scalarprods[3], scalarprods[3]);
        __SSE_DATATYPE vs_2_4 = _SSE_SET(scalarprods[4], scalarprods[4]);
Andreas Marek's avatar
Andreas Marek committed
845 846
#endif

847 848
        h2 = _SSE_MUL(tau4, vs_1_4);  // h2 = tau4 * vs_1_4
        h3 = _SSE_MUL(tau4, vs_2_4);  // h3 = tau4 * vs_2_4
849
#ifdef HAVE_SSE_INTRINSICS
850
        __SSE_DATATYPE vs_3_4 = _SSE_SET1(scalarprods[5]);  // vs_3_4 = prods(6)
851 852
#endif
#ifdef HAVE_SPARC64_SSE
853
        __SSE_DATATYPE vs_3_4 = _SSE_SET(scalarprods[5], scalarprods[5]);
854 855
#endif

856
        h4 = _SSE_MUL(tau4, vs_3_4); // h4 = tau4 * vs_3_4
857

Andreas Marek's avatar
Andreas Marek committed
858 859
        w1 = _SSE_SUB(_SSE_MUL(w1,tau4), _SSE_ADD(_SSE_MUL(z1,h4), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))));
        w2 = _SSE_SUB(_SSE_MUL(w2,tau4), _SSE_ADD(_SSE_MUL(z2,h4), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))));
860
        // w1 = w1 * tau4 - (z1 *h4 + y1 * h3 + x1 *h2)
Andreas Marek's avatar
Andreas Marek committed
861
#ifdef HAVE_SSE_INTRINSICS
862 863 864
        __SSE_DATATYPE tau5 = _SSE_SET1(hh[ldh*4]);         // tau5 = hh(1,5)
        __SSE_DATATYPE vs_1_5 = _SSE_SET1(scalarprods[6]);  // vs_1_5 = prods(7)
        __SSE_DATATYPE vs_2_5 = _SSE_SET1(scalarprods[7]);  // vs_2_5 = prods(8)
Andreas Marek's avatar
Andreas Marek committed
865 866
#endif
#ifdef HAVE_SPARC64_SSE
867 868 869
        __SSE_DATATYPE tau5 = _SSE_SET(hh[ldh*4], hh[ldh*4]);
        __SSE_DATATYPE vs_1_5 = _SSE_SET(scalarprods[6], scalarprods[6]);
        __SSE_DATATYPE vs_2_5 = _SSE_SET(scalarprods[7], scalarprods[7]);
Andreas Marek's avatar
Andreas Marek committed
870 871
#endif

872 873
        h2 = _SSE_MUL(tau5, vs_1_5);   // h2 = tau5 * vs_1_5
        h3 = _SSE_MUL(tau5, vs_2_5);   // h3 = tau5 * vs_2_5
Andreas Marek's avatar
Andreas Marek committed
874
#ifdef HAVE_SSE_INTRINSICS
875 876
        __SSE_DATATYPE vs_3_5 = _SSE_SET1(scalarprods[8]);  // vs_3_5 = prods(9)
        __SSE_DATATYPE vs_4_5 = _SSE_SET1(scalarprods[9]);  // vs_4_5 = prods(10)
Andreas Marek's avatar
Andreas Marek committed
877 878
#endif
#ifdef HAVE_SPARC64_SSE
879 880
        __SSE_DATATYPE vs_3_5 = _SSE_SET(scalarprods[8], scalarprods[8]);
        __SSE_DATATYPE vs_4_5 = _SSE_SET(scalarprods[9], scalarprods[9]);
Andreas Marek's avatar
Andreas Marek committed
881 882
#endif

883 884
        h4 = _SSE_MUL(tau5, vs_3_5);  // h4 = tau5 * vs_3_5
        h5 = _SSE_MUL(tau5, vs_4_5);  // h5 = tau5 * vs_4_5
885

Andreas Marek's avatar
Andreas Marek committed
886 887
        v1 = _SSE_SUB(_SSE_MUL(v1,tau5), _SSE_ADD(_SSE_ADD(_SSE_MUL(w1,h5), _SSE_MUL(z1,h4)), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))));
        v2 = _SSE_SUB(_SSE_MUL(v2,tau5), _SSE_ADD(_SSE_ADD(_SSE_MUL(w2,h5), _SSE_MUL(z2,h4)), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))));
888
       // v1 = v1 * tau5 - (w1 * h5 + z1 * h4 + y1 * h3 + x1 * h2)
Andreas Marek's avatar
Andreas Marek committed
889
#ifdef HAVE_SSE_INTRINSICS
890 891 892
        __SSE_DATATYPE tau6 = _SSE_SET1(hh[ldh*5]);         // tau6 = hh(1,6)
 	__SSE_DATATYPE vs_1_6 = _SSE_SET1(scalarprods[10]); // vs_1_6 = prods(11)
        __SSE_DATATYPE vs_2_6 = _SSE_SET1(scalarprods[11]); // vs_2_6 = prods(12)
Andreas Marek's avatar
Andreas Marek committed
893 894
#endif
#ifdef HAVE_SPARC64_SSE
895 896 897
        __SSE_DATATYPE tau6 = _SSE_SET(hh[ldh*5], hh[ldh*5]);
        __SSE_DATATYPE vs_1_6 = _SSE_SET(scalarprods[10], scalarprods[10]);
        __SSE_DATATYPE vs_2_6 = _SSE_SET(scalarprods[11], scalarprods[11]);
Andreas Marek's avatar
Andreas Marek committed
898 899
#endif

900 901
        h2 = _SSE_MUL(tau6, vs_1_6); // h2 = tau6 * vs_1_6
        h3 = _SSE_MUL(tau6, vs_2_6); // h3 = tau6 * vs_2_6
902
#ifdef HAVE_SSE_INTRINSICS
903 904 905
        __SSE_DATATYPE vs_3_6 = _SSE_SET1(scalarprods[12]); // vs_3_6 = prods(13)
        __SSE_DATATYPE vs_4_6 = _SSE_SET1(scalarprods[13]); // vs_4_6 = prods(14)
        __SSE_DATATYPE vs_5_6 = _SSE_SET1(scalarprods[14]); // vs_5_6 = prods(15)
906 907
#endif
#ifdef HAVE_SPARC64_SSE
908 909 910
        __SSE_DATATYPE vs_3_6 = _SSE_SET(scalarprods[12], scalarprods[12]);
        __SSE_DATATYPE vs_4_6 = _SSE_SET(scalarprods[13], scalarprods[13]);
        __SSE_DATATYPE vs_5_6 = _SSE_SET(scalarprods[14], scalarprods[14]);
911 912
#endif

913 914 915
        h4 = _SSE_MUL(tau6, vs_3_6); // h4 = tau6 * vs_3_6
        h5 = _SSE_MUL(tau6, vs_4_6); // h5 = tau6 * vs_4_6
        h6 = _SSE_MUL(tau6, vs_5_6); // h6 = tau6 * vs_5_6
916

Andreas Marek's avatar
Andreas Marek committed
917 918
        t1 = _SSE_SUB(_SSE_MUL(t1,tau6), _SSE_ADD( _SSE_MUL(v1,h6), _SSE_ADD(_SSE_ADD(_SSE_MUL(w1,h5), _SSE_MUL(z1,h4)), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2)))));
        t2 = _SSE_SUB(_SSE_MUL(t2,tau6), _SSE_ADD( _SSE_MUL(v2,h6), _SSE_ADD(_SSE_ADD(_SSE_MUL(w2,h5), _SSE_MUL(z2,h4)), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)))));
919
        // t1 = t1 * tau6 - ( v1 * h6 + w1*h5 + z1*h4 +y1*h3 + x1*h2)
920

Andreas Marek's avatar
Andreas Marek committed
921 922 923
        /////////////////////////////////////////////////////
        // Rank-1 update of Q [4 x nb+3]
        /////////////////////////////////////////////////////
924

Andreas Marek's avatar
Andreas Marek committed
925 926
        q1 = _SSE_LOAD(&q[0]);
        q2 = _SSE_LOAD(&q[offset]);
927
        q1 = _SSE_SUB(q1, t1);       // q1(1:n1,1) = q1(1:nq,1) - t
Andreas Marek's avatar
Andreas Marek committed
928 929 930
        q2 = _SSE_SUB(q2, t2);
        _SSE_STORE(&q[0],q1);
        _SSE_STORE(&q[offset],q2);
931