real_vsx_2hv_template.c 69.4 KB
Newer Older
1 2 3 4 5 6
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
Andreas Marek's avatar
Andreas Marek committed
7
//        Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
Andreas Marek's avatar
Andreas Marek committed
9
//        Informatik,
10
//    - Technische Universität München, Lehrstuhl für Informatik mit
Andreas Marek's avatar
Andreas Marek committed
11
//        Schwerpunkt Wissenschaftliches Rechnen ,
12 13
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
Andreas Marek's avatar
Andreas Marek committed
14 15
//        Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
//        and
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
Andreas Marek's avatar
Andreas Marek committed
36
//    along with ELPA.        If not, see <http://www.gnu.org/licenses/>
37 38 39 40 41 42 43 44 45 46 47
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//

48
#ifdef HAVE_VSX_SSE
49 50 51 52 53 54 55 56 57
#include <altivec.h>
#endif
#include <stdio.h>
#include <stdlib.h>


#define __forceinline __attribute__((always_inline)) static
#ifdef DOUBLE_PRECISION_REAL
#define __SSE_DATATYPE __vector double
58
#define _SSE_LOAD (__vector double) vec_ld
59 60 61 62 63 64 65 66
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#define offset 2
#endif

#ifdef SINGLE_PRECISION_REAL
#define __SSE_DATATYPE __vector float
67
#define _SSE_LOAD  (__vector float) vec_ld
68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
#define _SSE_ADD vec_add
#define _SSE_MUL vec_mul
#define _SSE_STORE vec_st
#define offset 4
#endif


#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

//Forward declaration
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_4_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_6_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_10_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_12_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s);
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_8_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_12_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_16_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_20_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
__forceinline void hh_trafo_kernel_24_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s);
#endif


#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_VSX_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_VSX_2hv_single_(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f>   subroutine double_hh_trafo_real_vsx_2hv_double(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
109 110 111 112 113
!f>                                bind(C, name="double_hh_trafo_real_vsx_2hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int) :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value  :: q
!f>        real(kind=c_double) :: hh(pnb,6)
114 115 116 117 118 119 120 121 122
!f>   end subroutine
!f> end interface
!f>#endif
*/

/*
!f>#ifdef HAVE_VSX_SSE
!f> interface
!f>   subroutine double_hh_trafo_real_vsx_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
Andreas Marek's avatar
Andreas Marek committed
123 124 125 126 127
!f>                                bind(C, name="double_hh_trafo_real_vsx_2hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
128 129 130 131 132 133 134 135 136 137 138 139
!f>   end subroutine
!f> end interface
!f>#endif
*/

#ifdef DOUBLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void double_hh_trafo_real_vsx_2hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
140 141 142 143 144 145
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on;
146

Andreas Marek's avatar
Andreas Marek committed
147
        worked_on = 0;
148

Andreas Marek's avatar
Andreas Marek committed
149 150
        // calculating scalar product to compute
        // 2 householder vectors simultaneously
151
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
152
        double s = hh[(ldh)+1]*1.0;
153 154
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
155
        float s = hh[(ldh)+1]*1.0;
156 157 158
#endif

#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
159
        #pragma ivdep
160
#endif
Andreas Marek's avatar
Andreas Marek committed
161 162 163 164
        for (i = 2; i < nb; i++)
        {
                s += hh[i-1] * hh[(i+ldh)];
        }
165

Andreas Marek's avatar
Andreas Marek committed
166
        // Production level kernel calls with padding
167
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
168 169 170 171 172
        for (i = 0; i < nq-10; i+=12)
        {
                hh_trafo_kernel_12_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 12;
        }
173 174
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
175 176 177 178 179
        for (i = 0; i < nq-20; i+=24)
        {
                hh_trafo_kernel_24_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 24;
        }
180 181
#endif

Andreas Marek's avatar
Andreas Marek committed
182 183 184 185
        if (nq == i)
        {
                return;
        }
186 187

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
188 189 190 191 192
        if (nq-i == 10)
        {
                hh_trafo_kernel_10_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 10;
        }
193 194 195
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
196 197 198 199 200
        if (nq-i == 20)
        {
                hh_trafo_kernel_20_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 20;
        }
201 202 203
#endif

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
204 205 206 207 208
        if (nq-i == 8)
        {
                hh_trafo_kernel_8_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
209 210 211
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
212 213 214 215 216
        if (nq-i == 16)
        {
                hh_trafo_kernel_16_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 16;
        }
217 218 219 220
#endif


#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
221 222 223 224 225
        if (nq-i == 6)
        {
                hh_trafo_kernel_6_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 6;
        }
226 227 228
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
229 230 231 232 233
        if (nq-i == 12)
        {
                hh_trafo_kernel_12_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 12;
        }
234 235 236
#endif

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
237 238 239 240 241
        if (nq-i == 4)
        {
                hh_trafo_kernel_4_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 4;
        }
242 243 244
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
245 246 247 248 249
        if (nq-i == 8)
        {
                hh_trafo_kernel_8_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 8;
        }
250 251 252
#endif

#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
253 254 255 256 257
        if (nq-i == 2)
        {
                hh_trafo_kernel_2_VSX_2hv_double(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 2;
        }
258 259 260
#endif

#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
261 262 263 264 265
        if (nq-i == 4)
        {
                hh_trafo_kernel_4_VSX_2hv_single(&q[i], hh, nb, ldq, ldh, s);
                worked_on += 4;
        }
266 267
#endif
#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
268 269 270 271 272
        if (worked_on != nq)
        {
                printf("Error in real VSX BLOCK2 kernel %d %d\n", worked_on, nq);
                abort();
        }
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293
#endif
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 12 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 24 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_12_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_24_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
294 295 296 297 298
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [12 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
299
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
300
        double mone = -1.0;
301 302
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
303
        float mone = -1.0;
304 305
#endif

306
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
307
        // Needed bit mask for floating point sign flip
308
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
309
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
310 311 312 313 314 315
#endif
#ifdef SINGLE_PRECISION_REAL
        __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
316 317 318 319 320 321
        __SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq]);
        __SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+offset]);
        __SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+2*offset]);
        __SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+3*offset]);
        __SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+4*offset]);
        __SSE_DATATYPE x6 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+5*offset]);
322 323 324

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
325
        __SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
326 327
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
328
        __SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
329 330 331 332 333 334 335 336 337 338 339
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
340
        __SSE_DATATYPE h2;
341

Andreas Marek's avatar
Andreas Marek committed
342 343 344 345 346 347 348 349 350 351 352 353 354 355
        __SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *)  &q[0]);
        __SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
        __SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        __SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
        __SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        __SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
        __SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *)  &q[3*offset]);
        __SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
        __SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *)  &q[4*offset]);
        __SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
        __SSE_DATATYPE q6 = _SSE_LOAD(0, (unsigned long int *)  &q[5*offset]);
        __SSE_DATATYPE y6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
        for(i = 2; i < nb; i++)
        {
356 357
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
358 359
                h1 = _mm_set1_pd(hh[i-1]);
                h2 = _mm_set1_pd(hh[ldh+i]);
360 361
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
362 363
                h1 = _mm_set1_ps(hh[i-1]);
                h2 = _mm_set1_ps(hh[ldh+i]);
364 365 366 367
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
#endif
#endif

                q1 = _SSE_LOAD(0, (unsigned long int *)  &q[i*ldq]);
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
                q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+offset]);
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
                q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+2*offset]);
                x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
                y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
                q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+3*offset]);
                x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
                y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
                q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+4*offset]);
                x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
                y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
                q6 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+5*offset]);
                x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
                y6 = _SSE_ADD(y6, _SSE_MUL(q6,h2));
        }
396 397
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
398
        h1 = _mm_set1_pd(hh[nb-1]);
399 400
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
401
        h1 = _mm_set1_ps(hh[nb-1]);
402 403 404 405
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
406
        h1 = vec_splats(hh[nb-1]);
407 408
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
409
        h1 = vec_splats(hh[nb-1]);
410 411 412
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[nb*ldq]);
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+offset]);
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+2*offset]);
        x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+3*offset]);
        x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+4*offset]);
        x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
        q6 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+5*offset]);
        x6 = _SSE_ADD(x6, _SSE_MUL(q6,h1));
        /////////////////////////////////////////////////////
        // Rank-2 update of Q [12 x nb+1]
        /////////////////////////////////////////////////////
428 429
#ifdef  HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
430 431 432
        __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_pd(s);
433 434
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
435 436 437
        __SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_ps(s);
438 439 440 441
#endif
#endif
#ifdef  HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
442 443 444
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
445 446
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
447 448 449
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
450 451 452 453
#endif
#endif

#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
454
        h1 = _SSE_XOR(tau1, sign);
455 456
#endif
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
457 458 459 460 461 462 463 464 465
        //h1 = vec_neg(tau1);
        h1 = vec_mul(vec_splats(mone), tau1);
#endif
        x1 = _SSE_MUL(x1, h1);
        x2 = _SSE_MUL(x2, h1);
        x3 = _SSE_MUL(x3, h1);
        x4 = _SSE_MUL(x4, h1);
        x5 = _SSE_MUL(x5, h1);
        x6 = _SSE_MUL(x6, h1);
466
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
467
        h1 = _SSE_XOR(tau2, sign);
468
#endif
469
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
        //h1 = vec_neg(tau2);
        h1 = vec_mul(vec_splats(mone), tau2);
#endif
        h2 = _SSE_MUL(h1, vs);

        y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
        y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
        y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
        y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
        y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
        y6 = _SSE_ADD(_SSE_MUL(y6,h1), _SSE_MUL(x6,h2));
        q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
        q1 = _SSE_ADD(q1, y1);
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        q2 = _SSE_ADD(q2, y2);
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        q3 = _SSE_ADD(q3, y3);
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[3*offset]);
        q4 = _SSE_ADD(q4, y4);
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[4*offset]);
        q5 = _SSE_ADD(q5, y5);
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[4*offset]);
        q6 = _SSE_LOAD(0, (unsigned long int *)  &q[5*offset]);
        q6 = _SSE_ADD(q6, y6);
        _SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *)  &q[5*offset]);
499 500 501

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
502
        h2 = _mm_set1_pd(hh[ldh+1]);
503 504
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
505
        h2 = _mm_set1_ps(hh[ldh+1]);
506 507 508 509 510
#endif
#endif

#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
511
        h2 = vec_splats(hh[ldh+1]);
512 513
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
514
        h2 = vec_splats(hh[ldh+1]);
515 516 517
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq]);
        q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *)  &q[ldq]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+offset]);
        q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[ldq+offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+2*offset]);
        q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[ldq+2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+3*offset]);
        q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[ldq+3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+4*offset]);
        q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[ldq+4*offset]);
        q6 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+5*offset]);
        q6 = _SSE_ADD(q6, _SSE_ADD(x6, _SSE_MUL(y6, h2)));
        _SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *)  &q[ldq+5*offset]);
536

Andreas Marek's avatar
Andreas Marek committed
537 538
        for (i = 2; i < nb; i++)
        {
539 540
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
541 542
                h1 = _mm_set1_pd(hh[i-1]);
                h2 = _mm_set1_pd(hh[ldh+i]);
543 544
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
545 546
                h1 = _mm_set1_ps(hh[i-1]);
                h2 = _mm_set1_ps(hh[ldh+i]);
547 548 549 550
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
#endif
#endif

                q1 = _SSE_LOAD(0, (unsigned long int *)  &q[i*ldq]);
                q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
                _SSE_STORE((__vector unsigned int) q1, 0,  (unsigned int *)  &q[i*ldq]);
                q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+offset]);
                q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
                _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[(i*ldq)+offset]);
                q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+2*offset]);
                q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
                _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[(i*ldq)+2*offset]);
                q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+3*offset]);
                q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
                _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[(i*ldq)+3*offset]);
                q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+4*offset]);
                q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
                _SSE_STORE((__vector unsigned int) q5, 0, (unsigned  int *)  &q[(i*ldq)+4*offset]);
                q6 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+5*offset]);
                q6 = _SSE_ADD(q6, _SSE_ADD(_SSE_MUL(x6,h1), _SSE_MUL(y6, h2)));
                _SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *)  &q[(i*ldq)+5*offset]);
        }
579 580
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
581
        h1 = _mm_set1_pd(hh[nb-1]);
582 583
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
584
        h1 = _mm_set1_ps(hh[nb-1]);
585 586 587 588
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
589
        h1 = vec_splats(hh[nb-1]);
590 591
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
592
        h1 = vec_splats(hh[nb-1]);
593 594 595 596
#endif
#endif


Andreas Marek's avatar
Andreas Marek committed
597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[nb*ldq]);
        q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *)  &q[nb*ldq]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+offset]);
        q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[(nb*ldq)+offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+2*offset]);
        q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[(nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+3*offset]);
        q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[(nb*ldq)+3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+4*offset]);
        q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[(nb*ldq)+4*offset]);
        q6 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+5*offset]);
        q6 = _SSE_ADD(q6, _SSE_MUL(x6, h1));
        _SSE_STORE((__vector unsigned int) q6, 0, (unsigned int *)  &q[(nb*ldq)+5*offset]);
615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636
}



/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 10 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 20 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_10_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_20_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
637 638 639 640 641
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [12 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
642
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
643
        double mone = -1.0;
644 645
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
646
        float mone = -1.0;
647 648
#endif

649
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
650
        // Needed bit mask for floating point sign flip
651
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
652
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
653 654 655 656 657 658
#endif
#ifdef SINGLE_PRECISION_REAL
        __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
659 660 661 662 663
        __SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq]);
        __SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+offset]);
        __SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+2*offset]);
        __SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+3*offset]);
        __SSE_DATATYPE x5 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+4*offset]);
664 665 666

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
667
        __SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
668 669
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
670
        __SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
671 672 673 674
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
675
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
676 677
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
678
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
679 680 681
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
682
        __SSE_DATATYPE h2;
683

Andreas Marek's avatar
Andreas Marek committed
684 685 686 687 688 689 690 691 692 693 694 695
        __SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
        __SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
        __SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        __SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
        __SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        __SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
        __SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *)  &q[3*offset]);
        __SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
        __SSE_DATATYPE q5 = _SSE_LOAD(0, (unsigned long int *)  &q[4*offset]);
        __SSE_DATATYPE y5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
        for(i = 2; i < nb; i++)
        {
696 697
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
698 699
                h1 = _mm_set1_pd(hh[i-1]);
                h2 = _mm_set1_pd(hh[ldh+i]);
700 701
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
702 703
                h1 = _mm_set1_ps(hh[i-1]);
                h2 = _mm_set1_ps(hh[ldh+i]);
704 705 706 707
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
708 709
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
710 711
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
712 713
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
714 715 716 717
#endif
#endif


Andreas Marek's avatar
Andreas Marek committed
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733
                q1 = _SSE_LOAD(0, (unsigned long int *)  &q[i*ldq]);
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
                q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+offset]);
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
                q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+2*offset]);
                x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
                y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
                q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+3*offset]);
                x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
                y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
                q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+4*offset]);
                x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
                y5 = _SSE_ADD(y5, _SSE_MUL(q5,h2));
        }
734 735 736

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
737
        h1 = _mm_set1_pd(hh[nb-1]);
738 739
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
740
        h1 = _mm_set1_ps(hh[nb-1]);
741 742 743 744
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
745
        h1 = vec_splats(hh[nb-1]);
746 747
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
748
        h1 = vec_splats(hh[nb-1]);
749 750 751
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
752 753 754 755 756 757 758 759 760 761 762 763 764
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[nb*ldq]);
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+offset]);
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+2*offset]);
        x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+3*offset]);
        x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+4*offset]);
        x5 = _SSE_ADD(x5, _SSE_MUL(q5,h1));
        /////////////////////////////////////////////////////
        // Rank-2 update of Q [12 x nb+1]
        /////////////////////////////////////////////////////
765 766
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
767 768 769
        __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_pd(s);
770 771
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
772 773 774
        __SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_ps(s);
775 776 777 778 779

#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
780 781 782
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
783 784
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
785 786 787
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
788 789 790 791 792

#endif
#endif

#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
793
        h1 = _SSE_XOR(tau1, sign);
794 795
#endif
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
796 797 798 799 800 801 802 803
        h1 = vec_mul(vec_splats(mone), tau1);
        // h1 = vec_neg(tau1);
#endif
        x1 = _SSE_MUL(x1, h1);
        x2 = _SSE_MUL(x2, h1);
        x3 = _SSE_MUL(x3, h1);
        x4 = _SSE_MUL(x4, h1);
        x5 = _SSE_MUL(x5, h1);
804
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
805
        h1 = _SSE_XOR(tau2, sign);
806 807
#endif
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832
        // h1 = vec_neg(tau2);
        h1 = vec_mul(vec_splats(mone), tau2);
#endif
        h2 = _SSE_MUL(h1, vs);

        y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
        y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
        y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
        y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
        y5 = _SSE_ADD(_SSE_MUL(y5,h1), _SSE_MUL(x5,h2));
        q1 = _SSE_LOAD(0, (unsigned int *) &q[0]);
        q1 = _SSE_ADD(q1, y1);
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        q2 = _SSE_ADD(q2, y2);
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        q3 = _SSE_ADD(q3, y3);
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[3*offset]);
        q4 = _SSE_ADD(q4, y4);
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[4*offset]);
        q5 = _SSE_ADD(q5, y5);
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[4*offset]);
833 834 835

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
836
        h2 = _mm_set1_pd(hh[ldh+1]);
837 838
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
839
        h2 = _mm_set1_ps(hh[ldh+1]);
840 841 842 843
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
844
        h2 = vec_splats(hh[ldh+1]);
845 846
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
847
        h2 = vec_splats(hh[ldh+1]);
848 849 850
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq]);
        q1 = _SSE_ADD(q1, _SSE_ADD(x1, _SSE_MUL(y1, h2)));
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *)  &q[ldq]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+offset]);
        q2 = _SSE_ADD(q2, _SSE_ADD(x2, _SSE_MUL(y2, h2)));
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[ldq+offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+2*offset]);
        q3 = _SSE_ADD(q3, _SSE_ADD(x3, _SSE_MUL(y3, h2)));
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[ldq+2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+3*offset]);
        q4 = _SSE_ADD(q4, _SSE_ADD(x4, _SSE_MUL(y4, h2)));
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[ldq+3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+4*offset]);
        q5 = _SSE_ADD(q5, _SSE_ADD(x5, _SSE_MUL(y5, h2)));
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[ldq+4*offset]);
866

Andreas Marek's avatar
Andreas Marek committed
867 868
        for (i = 2; i < nb; i++)
        {
869 870
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
871 872
                h1 = _mm_set1_pd(hh[i-1]);
                h2 = _mm_set1_pd(hh[ldh+i]);
873 874
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
875 876
                h1 = _mm_set1_ps(hh[i-1]);
                h2 = _mm_set1_ps(hh[ldh+i]);
877 878 879 880
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
881 882
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
883 884
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
885 886
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
887 888 889
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905
                q1 = _SSE_LOAD(0, (unsigned long int *)  &q[i*ldq]);
                q1 = _SSE_ADD(q1, _SSE_ADD(_SSE_MUL(x1,h1), _SSE_MUL(y1, h2)));
                _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *)  &q[i*ldq]);
                q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+offset]);
                q2 = _SSE_ADD(q2, _SSE_ADD(_SSE_MUL(x2,h1), _SSE_MUL(y2, h2)));
                _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[(i*ldq)+offset]);
                q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+2*offset]);
                q3 = _SSE_ADD(q3, _SSE_ADD(_SSE_MUL(x3,h1), _SSE_MUL(y3, h2)));
                _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[(i*ldq)+2*offset]);
                q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+3*offset]);
                q4 = _SSE_ADD(q4, _SSE_ADD(_SSE_MUL(x4,h1), _SSE_MUL(y4, h2)));
                _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[(i*ldq)+3*offset]);
                q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+4*offset]);
                q5 = _SSE_ADD(q5, _SSE_ADD(_SSE_MUL(x5,h1), _SSE_MUL(y5, h2)));
                _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[(i*ldq)+4*offset]);
        }
906 907
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
908
        h1 = _mm_set1_pd(hh[nb-1]);
909 910
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
911
        h1 = _mm_set1_ps(hh[nb-1]);
912 913 914 915
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
916
        h1 = vec_splats(hh[nb-1]);
917 918
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
919
        h1 = vec_splats(hh[nb-1]);
920 921 922
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
923 924 925 926 927 928 929 930 931 932 933 934 935 936 937
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[nb*ldq]);
        q1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *)  &q[nb*ldq]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+offset]);
        q2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[(nb*ldq)+offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+2*offset]);
        q3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[(nb*ldq)+2*offset]);
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+3*offset]);
        q4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[(nb*ldq)+3*offset]);
        q5 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+4*offset]);
        q5 = _SSE_ADD(q5, _SSE_MUL(x5, h1));
        _SSE_STORE((__vector unsigned int) q5, 0, (unsigned int *)  &q[(nb*ldq)+4*offset]);
938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 16 rows of Q simultaneously, a
#endif
 * matrix Vector product with two householder
 * vectors + a rank 2 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_8_VSX_2hv_double(double* q, double* hh, int nb, int ldq, int ldh, double s)
#endif
#ifdef SINGLE_PRECISION_REAL
 __forceinline void hh_trafo_kernel_16_VSX_2hv_single(float* q, float* hh, int nb, int ldq, int ldh, float s)
#endif
{
Andreas Marek's avatar
Andreas Marek committed
958 959 960 961 962
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [12 x nb+1] * hh
        // hh contains two householder vectors, with offset 1
        /////////////////////////////////////////////////////
        int i;
963

964
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
965
        double mone = -1.0;
966 967
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
968
        float mone = -1.0;
969 970
#endif

971
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
972
        // Needed bit mask for floating point sign flip
973
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
974
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set1_epi64x(0x8000000000000000LL);
975 976 977 978 979
#endif
#ifdef SINGLE_PRECISION_REAL
        __SSE_DATATYPE sign = _mm_castsi128_ps(_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
#endif
#endif
Andreas Marek's avatar
Andreas Marek committed
980 981 982 983
        __SSE_DATATYPE x1 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq]);
        __SSE_DATATYPE x2 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+offset]);
        __SSE_DATATYPE x3 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+2*offset]);
        __SSE_DATATYPE x4 = _SSE_LOAD(0, (unsigned long int *)  &q[ldq+3*offset]);
984 985 986

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
987
        __SSE_DATATYPE h1 = _mm_set1_pd(hh[ldh+1]);
988 989
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
990
        __SSE_DATATYPE h1 = _mm_set1_ps(hh[ldh+1]);
991 992 993 994
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
995
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
996 997
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
998
        __SSE_DATATYPE h1 = vec_splats(hh[ldh+1]);
999 1000 1001
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
1002
        __SSE_DATATYPE h2;
1003

Andreas Marek's avatar
Andreas Marek committed
1004 1005 1006 1007 1008 1009 1010 1011 1012 1013
        __SSE_DATATYPE q1 = _SSE_LOAD(0, (unsigned long int *)  &q[0]);
        __SSE_DATATYPE y1 = _SSE_ADD(q1, _SSE_MUL(x1, h1));
        __SSE_DATATYPE q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        __SSE_DATATYPE y2 = _SSE_ADD(q2, _SSE_MUL(x2, h1));
        __SSE_DATATYPE q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        __SSE_DATATYPE y3 = _SSE_ADD(q3, _SSE_MUL(x3, h1));
        __SSE_DATATYPE q4 = _SSE_LOAD(0, (unsigned long int *)  &q[3*offset]);
        __SSE_DATATYPE y4 = _SSE_ADD(q4, _SSE_MUL(x4, h1));
        for(i = 2; i < nb; i++)
        {
1014 1015
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1016 1017
                h1 = _mm_set1_pd(hh[i-1]);
                h2 = _mm_set1_pd(hh[ldh+i]);
1018 1019
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1020 1021
                h1 = _mm_set1_ps(hh[i-1]);
                h2 = _mm_set1_ps(hh[ldh+i]);
1022 1023 1024 1025
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1026 1027
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
1028 1029
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1030 1031
                h1 = vec_splats(hh[i-1]);
                h2 = vec_splats(hh[ldh+i]);
1032 1033 1034 1035
#endif
#endif


Andreas Marek's avatar
Andreas Marek committed
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
                q1 = _SSE_LOAD(0, (unsigned long int *)  &q[i*ldq]);
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
                q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+offset]);
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
                q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+2*offset]);
                x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
                y3 = _SSE_ADD(y3, _SSE_MUL(q3,h2));
                q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(i*ldq)+3*offset]);
                x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
                y4 = _SSE_ADD(y4, _SSE_MUL(q4,h2));
        }
1049 1050
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1051
        h1 = _mm_set1_pd(hh[nb-1]);
1052 1053
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1054
        h1 = _mm_set1_ps(hh[nb-1]);
1055 1056 1057 1058
#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1059
        h1 = vec_splats(hh[nb-1]);
1060 1061
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1062
        h1 = vec_splats(hh[nb-1]);
1063 1064
#endif
#endif
Andreas Marek's avatar
Andreas Marek committed
1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075
        q1 = _SSE_LOAD(0, (unsigned long int *)  &q[nb*ldq]);
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+offset]);
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+2*offset]);
        x3 = _SSE_ADD(x3, _SSE_MUL(q3,h1));
        q4 = _SSE_LOAD(0, (unsigned long int *)  &q[(nb*ldq)+3*offset]);
        x4 = _SSE_ADD(x4, _SSE_MUL(q4,h1));
        /////////////////////////////////////////////////////
        // Rank-2 update of Q [12 x nb+1]
        /////////////////////////////////////////////////////
1076 1077
#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1078 1079 1080
        __SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_pd(s);
1081 1082
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1083 1084 1085
        __SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);
        __SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
        __SSE_DATATYPE vs = _mm_set1_ps(s);
1086 1087 1088 1089 1090

#endif
#endif
#ifdef HAVE_VSX_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1091 1092 1093
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
1094 1095
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1096 1097 1098
        __SSE_DATATYPE tau1 = vec_splats(hh[0]);
        __SSE_DATATYPE tau2 = vec_splats(hh[ldh]);
        __SSE_DATATYPE vs = vec_splats(s);
1099 1100 1101 1102 1103

#endif
#endif

#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
1104
        h1 = _SSE_XOR(tau1, sign);
1105 1106
#endif
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
1107 1108
        // h1 = vec_neg(tau1);
        h1 = vec_mul(vec_splats(mone), tau1);
1109
#endif
Andreas Marek's avatar
Andreas Marek committed
1110 1111 1112 1113
        x1 = _SSE_MUL(x1, h1);
        x2 = _SSE_MUL(x2, h1);
        x3 = _SSE_MUL(x3, h1);
        x4 = _SSE_MUL(x4, h1);
1114
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
1115
        h1 = _SSE_XOR(tau2, sign);
1116 1117
#endif
#ifdef HAVE_VSX_SSE
Andreas Marek's avatar
Andreas Marek committed
1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
        //h1 = vec_neg(tau2);
        h1 = vec_mul(vec_splats(mone), tau2);
#endif
        h2 = _SSE_MUL(h1, vs);

        y1 = _SSE_ADD(_SSE_MUL(y1,h1), _SSE_MUL(x1,h2));
        y2 = _SSE_ADD(_SSE_MUL(y2,h1), _SSE_MUL(x2,h2));
        y3 = _SSE_ADD(_SSE_MUL(y3,h1), _SSE_MUL(x3,h2));
        y4 = _SSE_ADD(_SSE_MUL(y4,h1), _SSE_MUL(x4,h2));
        q1 = _SSE_LOAD(0, (unsigned long int *) &q[0]);
        q1 = _SSE_ADD(q1, y1);
        _SSE_STORE((__vector unsigned int) q1, 0, (unsigned int *) &q[0]);
        q2 = _SSE_LOAD(0, (unsigned long int *)  &q[offset]);
        q2 = _SSE_ADD(q2, y2);
        _SSE_STORE((__vector unsigned int) q2, 0, (unsigned int *)  &q[offset]);
        q3 = _SSE_LOAD(0, (unsigned long int *)  &q[2*offset]);
        q3 = _SSE_ADD(q3, y3);
        _SSE_STORE((__vector unsigned int) q3, 0, (unsigned int *)  &q[2*offset]);
        q4 = _SSE_LOAD(0,  (unsigned long int *) &q[3*offset]);
        q4 = _SSE_ADD(q4, y4);
        _SSE_STORE((__vector unsigned int) q4, 0, (unsigned int *)  &q[3*offset]);
1139 1140 1141

#ifdef HAVE_SSE_INTRINSICS
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
1142
        h2 = _mm_set1_pd(hh[ldh+1]);