real_sse_6hv_template.c 106 KB
Newer Older
1 2 3 4 5 6
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
Andreas Marek's avatar
Andreas Marek committed
7
//        Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
Andreas Marek's avatar
Andreas Marek committed
9
//        Informatik,
10
//    - Technische Universität München, Lehrstuhl für Informatik mit
Andreas Marek's avatar
Andreas Marek committed
11
//        Schwerpunkt Wissenschaftliches Rechnen ,
12 13
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
Andreas Marek's avatar
Andreas Marek committed
14 15
//        Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//        and
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
Andreas Marek's avatar
Andreas Marek committed
36
//    along with ELPA.        If not, see <http://www.gnu.org/licenses/>
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

64
#ifdef HAVE_SSE_INTRINSICS
65
#include <x86intrin.h>
66 67 68 69 70
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
71 72
#include <stdio.h>
#include <stdlib.h>
73

74

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
// Used on the kernel definitions in this file: expands to a GCC-style
// always_inline attribute plus internal (static) linkage.
#define __forceinline __attribute__((always_inline)) static

// Precision-dependent aliases. Exactly one of DOUBLE_PRECISION_REAL /
// SINGLE_PRECISION_REAL is expected to be defined by whatever includes or
// compiles this template; if neither is defined none of the aliases exist.
#ifdef DOUBLE_PRECISION_REAL
// Number of elements in one 128-bit register (2 doubles); the kernels use it
// as the index distance between two consecutive register loads from q.
#define offset 2
#define __SSE_DATATYPE __m128d
// _mm_load_pd / _mm_store_pd are the aligned load/store intrinsics.
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#endif
#ifdef SINGLE_PRECISION_REAL
// Number of elements in one 128-bit register (4 floats).
#define offset 4
#define __SSE_DATATYPE __m128
// _mm_load_ps / _mm_store_ps are the aligned load/store intrinsics.
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#endif

// NOTE(review): __AVX__ is normally predefined by the compiler; it is removed
// here for the SSE build, presumably so that no AVX-conditional code is
// selected further down -- confirm against the rest of the file.
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

100
// Forward declarations.
//
// For each backend (x86 SSE, SPARC64) and each precision there are two
// file-local compute kernels plus one externally visible driver routine.
// The driver names must match both the definitions further down in this file
// and the bind(C, name="...") strings of the Fortran interfaces, i.e. they
// carry no trailing underscore.
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_2_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif


131 132 133 134 135 136

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
146 147 148 149
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
159
#endif
160

161 162 163 164 165
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
175 176 177 178
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/

189 190
#endif

191
#ifdef HAVE_SSE_INTRINSICS
192 193 194 195 196 197
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
198 199 200 201 202 203 204 205 206
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
207
{
Andreas Marek's avatar
Andreas Marek committed
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on ;

        worked_on = 0;

        // calculating scalar products to compute
        // 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
        double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
        float scalarprods[15];
#endif

        scalarprods[0] = hh[(ldh+1)];
        scalarprods[1] = hh[(ldh*2)+2];
        scalarprods[2] = hh[(ldh*2)+1];
        scalarprods[3] = hh[(ldh*3)+3];
        scalarprods[4] = hh[(ldh*3)+2];
        scalarprods[5] = hh[(ldh*3)+1];
        scalarprods[6] = hh[(ldh*4)+4];
        scalarprods[7] = hh[(ldh*4)+3];
        scalarprods[8] = hh[(ldh*4)+2];
        scalarprods[9] = hh[(ldh*4)+1];
        scalarprods[10] = hh[(ldh*5)+5];
        scalarprods[11] = hh[(ldh*5)+4];
        scalarprods[12] = hh[(ldh*5)+3];
        scalarprods[13] = hh[(ldh*5)+2];
        scalarprods[14] = hh[(ldh*5)+1];
241

Andreas Marek's avatar
Andreas Marek committed
242 243 244 245 246 247 248
        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
        scalarprods[0] += hh[1] * hh[(2+ldh)];
        scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
249

Andreas Marek's avatar
Andreas Marek committed
250 251 252 253 254 255
        // loop counter = 3
        scalarprods[0] += hh[2] * hh[(3+ldh)];
        scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
256

Andreas Marek's avatar
Andreas Marek committed
257 258 259 260
        scalarprods[1] += hh[1] * hh[3+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
261

Andreas Marek's avatar
Andreas Marek committed
262 263 264 265 266 267
        // loop counter = 4
        scalarprods[0] += hh[3] * hh[(4+ldh)];
        scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
268

Andreas Marek's avatar
Andreas Marek committed
269 270 271 272
        scalarprods[1] += hh[2] * hh[4+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
273

Andreas Marek's avatar
Andreas Marek committed
274 275 276
        scalarprods[3] += hh[1] * hh[4+(ldh*3)];
        scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
277

Andreas Marek's avatar
Andreas Marek committed
278 279 280 281 282 283
        // loop counter = 5
        scalarprods[0] += hh[4] * hh[(5+ldh)];
        scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
284

Andreas Marek's avatar
Andreas Marek committed
285 286 287 288
        scalarprods[1] += hh[3] * hh[5+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
289

Andreas Marek's avatar
Andreas Marek committed
290 291 292
        scalarprods[3] += hh[2] * hh[5+(ldh*3)];
        scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
293

Andreas Marek's avatar
Andreas Marek committed
294 295
        scalarprods[6] += hh[1] * hh[5+(ldh*4)];
        scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
296

Andreas Marek's avatar
Andreas Marek committed
297 298 299 300 301 302 303 304
        #pragma ivdep
        for (i = 6; i < nb; i++)
        {
                scalarprods[0] += hh[i-1] * hh[(i+ldh)];
                scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
                scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
                scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
                scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
305

Andreas Marek's avatar
Andreas Marek committed
306 307 308 309
                scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
                scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
                scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
                scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
310

Andreas Marek's avatar
Andreas Marek committed
311 312 313
                scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
                scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
                scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
314

Andreas Marek's avatar
Andreas Marek committed
315 316
                scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
                scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
317

Andreas Marek's avatar
Andreas Marek committed
318 319
                scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
        }
320

Andreas Marek's avatar
Andreas Marek committed
321
        // Production level kernel calls with padding
322
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
323 324
        for (i = 0; i < nq-2; i+=4)
        {
325
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
326
                hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
327 328
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
329
                hh_trafo_kernel_4_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
330 331
#endif

Andreas Marek's avatar
Andreas Marek committed
332 333
                worked_on += 4;
        }
334 335
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
336 337
        for (i = 0; i < nq-4; i+=8)
        {
338
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
339
                hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
340 341
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
342
                hh_trafo_kernel_8_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
343 344
#endif

Andreas Marek's avatar
Andreas Marek committed
345 346
                worked_on += 8;
        }
347
#endif
Andreas Marek's avatar
Andreas Marek committed
348 349 350 351
        if (nq == i)
        {
                return;
        }
352
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
353 354
        if (nq -i == 2)
        {
355
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
356
                hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
357 358
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
359
                hh_trafo_kernel_2_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
360 361
#endif

Andreas Marek's avatar
Andreas Marek committed
362 363
                worked_on += 2;
        }
364 365
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
366 367
        if (nq -i == 4)
        {
368
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
369
                hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
370 371
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
372
                hh_trafo_kernel_4_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
373
#endif
Andreas Marek's avatar
Andreas Marek committed
374 375
                worked_on += 4;
        }
376
#endif
377
#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
378 379
        if (worked_on != nq)
        {
380
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
381
                printf("Error in real SSE BLOCK6 kernel \n");
382 383
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
384
                printf("Error in real SPARC64 BLOCK6 kernel \n");
385 386
#endif

Andreas Marek's avatar
Andreas Marek committed
387 388
                abort();
        }
389
#endif
390 391 392 393 394 395 396 397 398 399 400 401 402
}

/**
 * Unrolled kernel that computes
 * 4 rows of Q (double precision) or
 * 8 rows of Q (single precision) simultaneously, a
 * matrix Vector product with six householder
 * vectors + a rank 1 update is performed
 */
403
#ifdef HAVE_SSE_INTRINSICS
404 405 406 407 408 409
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
410 411 412 413 414 415 416 417 418 419
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
#endif

420
{
Andreas Marek's avatar
Andreas Marek committed
421 422 423 424 425
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [4 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;
426

Andreas Marek's avatar
Andreas Marek committed
427 428 429 430 431 432
        __SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*5]);
        __SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*4]);
        __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq*3]);
        __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[ldq*2]);
        __SSE_DATATYPE a5_1 = _SSE_LOAD(&q[ldq]);
        __SSE_DATATYPE a6_1 = _SSE_LOAD(&q[0]);
433

434
#ifdef HAVE_SSE_INTRINSICS
435
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
436 437 438 439 440
        __SSE_DATATYPE h_6_5 = _mm_set1_pd(hh[(ldh*5)+1]);
        __SSE_DATATYPE h_6_4 = _mm_set1_pd(hh[(ldh*5)+2]);
        __SSE_DATATYPE h_6_3 = _mm_set1_pd(hh[(ldh*5)+3]);
        __SSE_DATATYPE h_6_2 = _mm_set1_pd(hh[(ldh*5)+4]);
        __SSE_DATATYPE h_6_1 = _mm_set1_pd(hh[(ldh*5)+5]);
441 442
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
443 444 445 446 447
        __SSE_DATATYPE h_6_5 =         _mm_set1_ps(hh[(ldh*5)+1]) ;
        __SSE_DATATYPE h_6_4 =         _mm_set1_ps(hh[(ldh*5)+2]) ;
        __SSE_DATATYPE h_6_3 =         _mm_set1_ps(hh[(ldh*5)+3]) ;
        __SSE_DATATYPE h_6_2 =         _mm_set1_ps(hh[(ldh*5)+4]) ;
        __SSE_DATATYPE h_6_1 =         _mm_set1_ps(hh[(ldh*5)+5]) ;
448
#endif
449 450 451 452
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
453 454 455 456 457
        __SSE_DATATYPE h_6_5 = _mm_set_pd(hh[(ldh*5)+1], hh[(ldh*5)+1]);
        __SSE_DATATYPE h_6_4 = _mm_set_pd(hh[(ldh*5)+2], hh[(ldh*5)+2]);
        __SSE_DATATYPE h_6_3 = _mm_set_pd(hh[(ldh*5)+3], hh[(ldh*5)+3]);
        __SSE_DATATYPE h_6_2 = _mm_set_pd(hh[(ldh*5)+4], hh[(ldh*5)+4]);
        __SSE_DATATYPE h_6_1 = _mm_set_pd(hh[(ldh*5)+5], hh[(ldh*5)+5]);
458 459
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
460 461 462 463 464
        __SSE_DATATYPE h_6_5 =         _mm_set_ps(hh[(ldh*5)+1], hh[(ldh*5)+1]) ;
        __SSE_DATATYPE h_6_4 =         _mm_set_ps(hh[(ldh*5)+2], hh[(ldh*5)+2]) ;
        __SSE_DATATYPE h_6_3 =         _mm_set_ps(hh[(ldh*5)+3], hh[(ldh*5)+3]) ;
        __SSE_DATATYPE h_6_2 =         _mm_set_ps(hh[(ldh*5)+4], hh[(ldh*5)+4]) ;
        __SSE_DATATYPE h_6_1 =         _mm_set_ps(hh[(ldh*5)+5], hh[(ldh*5)+5]) ;
465 466 467
#endif
#endif

468 469


Andreas Marek's avatar
Andreas Marek committed
470 471 472 473 474
        register __SSE_DATATYPE t1 = _SSE_ADD(a6_1, _SSE_MUL(a5_1, h_6_5));
        t1 = _SSE_ADD(t1, _SSE_MUL(a4_1, h_6_4));
        t1 = _SSE_ADD(t1, _SSE_MUL(a3_1, h_6_3));
        t1 = _SSE_ADD(t1, _SSE_MUL(a2_1, h_6_2));
        t1 = _SSE_ADD(t1, _SSE_MUL(a1_1, h_6_1));
475

476
#ifdef HAVE_SSE_INTRINSICS
477
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
478 479 480 481
        __SSE_DATATYPE h_5_4 = _mm_set1_pd(hh[(ldh*4)+1]);
        __SSE_DATATYPE h_5_3 = _mm_set1_pd(hh[(ldh*4)+2]);
        __SSE_DATATYPE h_5_2 = _mm_set1_pd(hh[(ldh*4)+3]);
        __SSE_DATATYPE h_5_1 = _mm_set1_pd(hh[(ldh*4)+4]);
482 483
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
484 485 486 487
        __SSE_DATATYPE h_5_4 =         _mm_set1_ps(hh[(ldh*4)+1]) ;
        __SSE_DATATYPE h_5_3 =         _mm_set1_ps(hh[(ldh*4)+2]) ;
        __SSE_DATATYPE h_5_2 =         _mm_set1_ps(hh[(ldh*4)+3]) ;
        __SSE_DATATYPE h_5_1 =         _mm_set1_ps(hh[(ldh*4)+4]) ;
488
#endif
489 490 491 492
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
493 494 495 496
        __SSE_DATATYPE h_5_4 = _mm_set_pd(hh[(ldh*4)+1], hh[(ldh*4)+1]);
        __SSE_DATATYPE h_5_3 = _mm_set_pd(hh[(ldh*4)+2], hh[(ldh*4)+2]);
        __SSE_DATATYPE h_5_2 = _mm_set_pd(hh[(ldh*4)+3], hh[(ldh*4)+3]);
        __SSE_DATATYPE h_5_1 = _mm_set_pd(hh[(ldh*4)+4], hh[(ldh*4)+4]);
497 498
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
499 500 501 502
        __SSE_DATATYPE h_5_4 =         _mm_set_ps(hh[(ldh*4)+1], hh[(ldh*4)+1]) ;
        __SSE_DATATYPE h_5_3 =         _mm_set_ps(hh[(ldh*4)+2], hh[(ldh*4)+2]) ;
        __SSE_DATATYPE h_5_2 =         _mm_set_ps(hh[(ldh*4)+3], hh[(ldh*4)+3]) ;
        __SSE_DATATYPE h_5_1 =         _mm_set_ps(hh[(ldh*4)+4], hh[(ldh*4)+4]) ;
503 504 505 506
#endif
#endif


507

Andreas Marek's avatar
Andreas Marek committed
508 509 510 511
        register __SSE_DATATYPE v1 = _SSE_ADD(a5_1, _SSE_MUL(a4_1, h_5_4));
        v1 = _SSE_ADD(v1, _SSE_MUL(a3_1, h_5_3));
        v1 = _SSE_ADD(v1, _SSE_MUL(a2_1, h_5_2));
        v1 = _SSE_ADD(v1, _SSE_MUL(a1_1, h_5_1));
512

513
#ifdef HAVE_SSE_INTRINSICS
514
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
515 516 517
        __SSE_DATATYPE h_4_3 = _mm_set1_pd(hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 = _mm_set1_pd(hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 = _mm_set1_pd(hh[(ldh*3)+3]);
518 519
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
520 521 522
        __SSE_DATATYPE h_4_3 =         _mm_set1_ps(hh[(ldh*3)+1]) ;
        __SSE_DATATYPE h_4_2 =         _mm_set1_ps(hh[(ldh*3)+2]) ;
        __SSE_DATATYPE h_4_1 =         _mm_set1_ps(hh[(ldh*3)+3]) ;
523
#endif
524 525 526 527
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
528 529 530
        __SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]);
531 532
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
533 534 535
        __SSE_DATATYPE h_4_3 =         _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 =         _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 =         _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]);
536 537 538
#endif
#endif

539

Andreas Marek's avatar
Andreas Marek committed
540 541 542
        register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3));
        w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2));
        w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1));
543

544
#ifdef HAVE_SSE_INTRINSICS
545
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
546 547 548
        __SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]);
        __SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]);
        __SSE_DATATYPE h_3_1 = _mm_set1_pd(hh[(ldh*2)+2]);
549 550
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
551 552 553
        __SSE_DATATYPE h_2_1 =         _mm_set1_ps(hh[ldh+1]) ;
        __SSE_DATATYPE h_3_2 =         _mm_set1_ps(hh[(ldh*2)+1]) ;
        __SSE_DATATYPE h_3_1 =         _mm_set1_ps(hh[(ldh*2)+2]) ;
554 555 556 557 558
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
559 560 561
        __SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
        __SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
        __SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
562 563
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
564 565 566
        __SSE_DATATYPE h_2_1 =         _mm_set_ps(hh[ldh+1], hh[ldh+1]) ;
        __SSE_DATATYPE h_3_2 =         _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]) ;
        __SSE_DATATYPE h_3_1 =         _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]) ;
567
#endif
568 569
#endif

Andreas Marek's avatar
Andreas Marek committed
570 571 572
        register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2));
        z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1));
        register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1));
573

Andreas Marek's avatar
Andreas Marek committed
574
        register __SSE_DATATYPE x1 = a1_1;
575

Andreas Marek's avatar
Andreas Marek committed
576 577 578 579 580 581
        __SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*5)+offset]);
        __SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*4)+offset]);
        __SSE_DATATYPE a3_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
        __SSE_DATATYPE a4_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
        __SSE_DATATYPE a5_2 = _SSE_LOAD(&q[(ldq)+offset]);
        __SSE_DATATYPE a6_2 = _SSE_LOAD(&q[offset]);
582

Andreas Marek's avatar
Andreas Marek committed
583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
        register __SSE_DATATYPE t2 = _SSE_ADD(a6_2, _SSE_MUL(a5_2, h_6_5));
        t2 = _SSE_ADD(t2, _SSE_MUL(a4_2, h_6_4));
        t2 = _SSE_ADD(t2, _SSE_MUL(a3_2, h_6_3));
        t2 = _SSE_ADD(t2, _SSE_MUL(a2_2, h_6_2));
        t2 = _SSE_ADD(t2, _SSE_MUL(a1_2, h_6_1));
        register __SSE_DATATYPE v2 = _SSE_ADD(a5_2, _SSE_MUL(a4_2, h_5_4));
        v2 = _SSE_ADD(v2, _SSE_MUL(a3_2, h_5_3));
        v2 = _SSE_ADD(v2, _SSE_MUL(a2_2, h_5_2));
        v2 = _SSE_ADD(v2, _SSE_MUL(a1_2, h_5_1));
        register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
        w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
        w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
        register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
        z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
        register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));
598

Andreas Marek's avatar
Andreas Marek committed
599
        register __SSE_DATATYPE x2 = a1_2;
600

Andreas Marek's avatar
Andreas Marek committed
601 602
        __SSE_DATATYPE q1;
        __SSE_DATATYPE q2;
603

Andreas Marek's avatar
Andreas Marek committed
604 605 606 607 608 609
        __SSE_DATATYPE h1;
        __SSE_DATATYPE h2;
        __SSE_DATATYPE h3;
        __SSE_DATATYPE h4;
        __SSE_DATATYPE h5;
        __SSE_DATATYPE h6;
610

Andreas Marek's avatar
Andreas Marek committed
611 612
        for(i = 6; i < nb; i++)
        {
613
#ifdef HAVE_SSE_INTRINSICS
614
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
615
                h1 = _mm_set1_pd(hh[i-5]);
616 617
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
618
                h1 = _mm_set1_ps(hh[i-5]);
619 620 621 622
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
623
                h1 = _mm_set_pd(hh[i-5], hh[i-5]);
624 625
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
626
                h1 = _mm_set_ps(hh[i-5], hh[i-5]);
627
#endif
628
#endif
Andreas Marek's avatar
Andreas Marek committed
629 630 631
        
                q1 = _SSE_LOAD(&q[i*ldq]);
                q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
632

Andreas Marek's avatar
Andreas Marek committed
633 634
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
635

636
#ifdef HAVE_SSE_INTRINSICS
637
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
638
                h2 = _mm_set1_pd(hh[ldh+i-4]);
639 640
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
641
                h2 = _mm_set1_ps(hh[ldh+i-4]);
642 643 644 645 646
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
647
                h2 = _mm_set_pd(hh[ldh+i-4], hh[ldh+i-4]);
648 649
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
650
                h2 = _mm_set_ps(hh[ldh+i-4], hh[ldh+i-4]);
651
#endif
652
#endif
653

Andreas Marek's avatar
Andreas Marek committed
654 655
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
656

657
#ifdef HAVE_SSE_INTRINSICS
658
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
659
                h3 = _mm_set1_pd(hh[(ldh*2)+i-3]);
660 661
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
662
                h3 = _mm_set1_ps(hh[(ldh*2)+i-3]);
663 664 665 666
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
667
                h3 = _mm_set_pd(hh[(ldh*2)+i-3], hh[(ldh*2)+i-3]);
668 669
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
670
                h3 = _mm_set_ps(hh[(ldh*2)+i-3], hh[(ldh*2)+i-3]);
671
#endif
672
#endif
673

Andreas Marek's avatar
Andreas Marek committed
674 675
                z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
                z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
676
#ifdef HAVE_SSE_INTRINSICS
677
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
678
                h4 = _mm_set1_pd(hh[(ldh*3)+i-2]);
679 680
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
681
                h4 = _mm_set1_ps(hh[(ldh*3)+i-2]);
682 683 684 685
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
686
                h4 = _mm_set_pd(hh[(ldh*3)+i-2], hh[(ldh*3)+i-2]);
687 688
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
689
                h4 = _mm_set_ps(hh[(ldh*3)+i-2], hh[(ldh*3)+i-2]);
690
#endif
691 692
#endif

Andreas Marek's avatar
Andreas Marek committed
693 694
                w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
                w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
695

696
#ifdef HAVE_SSE_INTRINSICS
697
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
698
                h5 = _mm_set1_pd(hh[(ldh*4)+i-1]);
699 700
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
701
                h5 = _mm_set1_ps(hh[(ldh*4)+i-1]);
702 703 704 705
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
706
                h5 = _mm_set_pd(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
707 708
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
709
                h5 = _mm_set_ps(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
710
#endif
711 712
#endif

Andreas Marek's avatar
Andreas Marek committed
713 714
                v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
                v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
715

716
#ifdef HAVE_SSE_INTRINSICS
717
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
718
                h6 = _mm_set1_pd(hh[(ldh*5)+i]);
719 720
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
721
                h6 = _mm_set1_ps(hh[(ldh*5)+i]);
722 723 724 725 726
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
727
                h6 = _mm_set_pd(hh[(ldh*5)+i], hh[(ldh*5)+i]);
728 729
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
730
                h6 = _mm_set_ps(hh[(ldh*5)+i], hh[(ldh*5)+i]);
731
#endif
732
#endif
733

Andreas Marek's avatar
Andreas Marek committed
734 735 736
                t1 = _SSE_ADD(t1, _SSE_MUL(q1,h6));
                t2 = _SSE_ADD(t2, _SSE_MUL(q2,h6));
        }
737

738
#ifdef HAVE_SSE_INTRINSICS
739
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
740
        h1 = _mm_set1_pd(hh[nb-5]);
741 742
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
743
        h1 = _mm_set1_ps(hh[nb-5] );
744
#endif
745 746 747 748
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
749
        h1 = _mm_set_pd(hh[nb-5], hh[nb-5]);
750 751
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
752
        h1 = _mm_set_ps(hh[nb-5], hh[nb-5]);
753 754 755
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
756 757
        q1 = _SSE_LOAD(&q[nb*ldq]);
        q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
758

Andreas Marek's avatar
Andreas Marek committed
759 760
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
761

762
#ifdef HAVE_SSE_INTRINSICS
763
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
764
        h2 = _mm_set1_pd(hh[ldh+nb-4]);
765 766
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
767
        h2 = _mm_set1_ps(hh[ldh+nb-4]);
768 769 770 771 772
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
773
        h2 = _mm_set_pd(hh[ldh+nb-4], hh[ldh+nb-4]);
774 775
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
776
        h2 = _mm_set_ps(hh[ldh+nb-4], hh[ldh+nb-4]);
777
#endif
778
#endif
779 780


Andreas Marek's avatar
Andreas Marek committed
781 782
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
783

784
#ifdef HAVE_SSE_INTRINSICS
785
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
786
        h3 = _mm_set1_pd(hh[(ldh*2)+nb-3]);
787 788
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
789
        h3 = _mm_set1_ps(hh[(ldh*2)+nb-3]);
790 791 792 793 794
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
795
        /* Both lanes receive the same hh coefficient. */
        h3 = _mm_set_pd(hh[(ldh*2)+nb-3], hh[(ldh*2)+nb-3]);
796 797
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
798
        /* Same hh coefficient passed for each argument of the set call. */
        h3 = _mm_set_ps(hh[(ldh*2)+nb-3], hh[(ldh*2)+nb-3]);
799
#endif
800
#endif
801 802


Andreas Marek's avatar
Andreas Marek committed
803 804
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
805

806
#ifdef HAVE_SSE_INTRINSICS
807
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
808
        h4 = _mm_set1_pd(hh[(ldh*3)+nb-2]);
809 810
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
811
        h4 = _mm_set1_ps(hh[(ldh*3)+nb-2]);
812 813 814 815 816
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
817
        h4 = _mm_set_pd(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
818 819
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
820
        h4 = _mm_set_ps(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
821
#endif
822
#endif
823

Andreas Marek's avatar
Andreas Marek committed
824 825
        w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
        w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
826

827
#ifdef HAVE_SSE_INTRINSICS
828
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
829
        h5 = _mm_set1_pd(hh[(ldh*4)+nb-1]);
830 831
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
832
        h5 = _mm_set1_ps(hh[(ldh*4)+nb-1]);
833 834 835 836 837
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
838
        h5 = _mm_set_pd(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
839 840
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
841
        h5 = _mm_set_ps(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
842
#endif
843 844 845
#endif


846

Andreas Marek's avatar
Andreas Marek committed
847 848
        v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
        v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
849
#ifdef HAVE_SSE_INTRINSICS
850
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
851
        h1 = _mm_set1_pd(hh[nb-4]);
852 853
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
854
        h1 = _mm_set1_ps(hh[nb-4]);
855 856 857 858
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
859
        /* Both lanes receive hh[nb-4]. */
        h1 = _mm_set_pd(hh[nb-4], hh[nb-4]);
860 861
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
862
        h1 = _mm_set_ps(hh[nb-4], hh[nb-4]);
863
#endif
864 865
#endif

Andreas Marek's avatar
Andreas Marek committed
866 867
        q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);
868

Andreas Marek's avatar
Andreas Marek committed
869 870
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
871
#ifdef HAVE_SSE_INTRINSICS
872
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
873
        h2 = _mm_set1_pd(hh[ldh+nb-3]);
874 875
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
876
        h2 = _mm_set1_ps(hh[ldh+nb-3]);
877 878 879 880
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
881
        h2 = _mm_set_pd(hh[ldh+nb-3], hh[ldh+nb-3]);
882 883
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
884
        h2 = _mm_set_ps(hh[ldh+nb-3], hh[ldh+nb-3]);
885
#endif
886 887
#endif

Andreas Marek's avatar
Andreas Marek committed
888 889
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
890

891
#ifdef HAVE_SSE_INTRINSICS
892
/* Double-precision load of h3; guarded by DOUBLE_PRECISION_REAL like the other h* loads. */
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
893
        h3 = _mm_set1_pd(hh[(ldh*2)+nb-2]);
894 895
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
896
        h3 = _mm_set1_ps(hh[(ldh*2)+nb-2]);
897 898 899 900
#endif
#endif
#ifdef HAVE_SPARC64_SSE
/* Double-precision load of h3; guarded by DOUBLE_PRECISION_REAL like the other h* loads. */
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
901
        h3 = _mm_set_pd(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
902 903
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
904
        h3 = _mm_set_ps(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
905
#endif
906 907
#endif

Andreas Marek's avatar
Andreas Marek committed
908 909
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
910

911
#ifdef HAVE_SSE_INTRINSICS
912
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
913
        h4 = _mm_set1_pd(hh[(ldh*3)+nb-1]);
914 915
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
916
        h4 = _mm_set1_ps(hh[(ldh*3)+nb-1]);
917 918 919 920
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
921