real_sse_6hv_template.c 106 KB
Newer Older
1
2
3
4
5
6
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
Andreas Marek's avatar
Andreas Marek committed
7
//        Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
Andreas Marek's avatar
Andreas Marek committed
9
//        Informatik,
10
//    - Technische Universität München, Lehrstuhl für Informatik mit
Andreas Marek's avatar
Andreas Marek committed
11
//        Schwerpunkt Wissenschaftliches Rechnen ,
12
13
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
Andreas Marek's avatar
Andreas Marek committed
14
15
//        Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//        and
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
Andreas Marek's avatar
Andreas Marek committed
36
//    along with ELPA.        If not, see <http://www.gnu.org/licenses/>
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

64
#ifdef HAVE_SSE_INTRINSICS
65
#include <x86intrin.h>
66
67
68
69
70
#endif
#ifdef HAVE_SPARC64_SSE
#include <fjmfunc.h>
#include <emmintrin.h>
#endif
71
72
#include <stdio.h>
#include <stdlib.h>
73

74

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
// Marks a function as file-local and asks GCC-compatible compilers to always
// inline it (used on the compute kernels defined in this file).
#define __forceinline __attribute__((always_inline)) static

// Precision-dependent aliases: exactly one of DOUBLE_PRECISION_REAL /
// SINGLE_PRECISION_REAL is expected to be defined by the including build
// (NOTE(review): nothing here enforces that - confirm in the build system).
#ifdef DOUBLE_PRECISION_REAL
// Number of elements held by one 128-bit register (2 doubles); used as the
// element offset between the first and second register-wide load from q.
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#endif
#ifdef SINGLE_PRECISION_REAL
// Number of elements held by one 128-bit register (4 floats).
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#endif

// NOTE(review): __AVX__ is removed for the SSE build, presumably so that no
// AVX-conditional code path is selected further down in this translation
// unit - confirm the intent before changing.
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

100
#ifdef HAVE_SSE_INTRINSICS
// Forward declarations for the x86 SSE build.
// The hh_trafo_kernel_* functions are the file-local compute kernels; the
// hexa_hh_trafo_real_sse_6hv_* function is the externally visible entry
// point. Its name must match both the definition in this file and the
// Fortran bind(C) name, i.e. it carries no trailing underscore.
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif

#ifdef HAVE_SPARC64_SSE
// Forward declarations for the SPARC64 build.
// The hh_trafo_kernel_* functions are the file-local compute kernels; the
// hexa_hh_trafo_real_sparc64_6hv_* function is the externally visible entry
// point. Its name must match both the definition in this file and the
// Fortran bind(C) name, i.e. it carries no trailing underscore.
#ifdef DOUBLE_PRECISION_REAL
static void hh_trafo_kernel_2_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif
#endif


131
132
133
134
135
136

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
146
147
148
149
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_double")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_double)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
159
#endif
160

161
162
163
164
165
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
175
176
177
178
/*
!f>#ifdef HAVE_SPARC64_SSE
!f> interface
!f>   subroutine hexa_hh_trafo_real_sparc64_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                                bind(C, name="hexa_hh_trafo_real_sparc64_6hv_single")
!f>        use, intrinsic :: iso_c_binding
!f>        integer(kind=c_int)        :: pnb, pnq, pldq, pldh
!f>        type(c_ptr), value        :: q
!f>        real(kind=c_float)        :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/

189
190
#endif

191
#ifdef HAVE_SSE_INTRINSICS
192
193
194
195
196
197
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
198
199
200
201
202
203
204
205
206
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sparc64_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#endif
207
{
Andreas Marek's avatar
Andreas Marek committed
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
        int i;
        int nb = *pnb;
        int nq = *pldq;
        int ldq = *pldq;
        int ldh = *pldh;
        int worked_on ;

        worked_on = 0;

        // calculating scalar products to compute
        // 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
        double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
        float scalarprods[15];
#endif

        scalarprods[0] = hh[(ldh+1)];
        scalarprods[1] = hh[(ldh*2)+2];
        scalarprods[2] = hh[(ldh*2)+1];
        scalarprods[3] = hh[(ldh*3)+3];
        scalarprods[4] = hh[(ldh*3)+2];
        scalarprods[5] = hh[(ldh*3)+1];
        scalarprods[6] = hh[(ldh*4)+4];
        scalarprods[7] = hh[(ldh*4)+3];
        scalarprods[8] = hh[(ldh*4)+2];
        scalarprods[9] = hh[(ldh*4)+1];
        scalarprods[10] = hh[(ldh*5)+5];
        scalarprods[11] = hh[(ldh*5)+4];
        scalarprods[12] = hh[(ldh*5)+3];
        scalarprods[13] = hh[(ldh*5)+2];
        scalarprods[14] = hh[(ldh*5)+1];
241

Andreas Marek's avatar
Andreas Marek committed
242
243
244
245
246
247
248
        // calculate scalar product of first and fourth householder Vector
        // loop counter = 2
        scalarprods[0] += hh[1] * hh[(2+ldh)];
        scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];
249

Andreas Marek's avatar
Andreas Marek committed
250
251
252
253
254
255
        // loop counter = 3
        scalarprods[0] += hh[2] * hh[(3+ldh)];
        scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];
256

Andreas Marek's avatar
Andreas Marek committed
257
258
259
260
        scalarprods[1] += hh[1] * hh[3+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];
261

Andreas Marek's avatar
Andreas Marek committed
262
263
264
265
266
267
        // loop counter = 4
        scalarprods[0] += hh[3] * hh[(4+ldh)];
        scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];
268

Andreas Marek's avatar
Andreas Marek committed
269
270
271
272
        scalarprods[1] += hh[2] * hh[4+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];
273

Andreas Marek's avatar
Andreas Marek committed
274
275
276
        scalarprods[3] += hh[1] * hh[4+(ldh*3)];
        scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];
277

Andreas Marek's avatar
Andreas Marek committed
278
279
280
281
282
283
        // loop counter = 5
        scalarprods[0] += hh[4] * hh[(5+ldh)];
        scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
        scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
        scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
        scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];
284

Andreas Marek's avatar
Andreas Marek committed
285
286
287
288
        scalarprods[1] += hh[3] * hh[5+(ldh*2)];
        scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
        scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
        scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];
289

Andreas Marek's avatar
Andreas Marek committed
290
291
292
        scalarprods[3] += hh[2] * hh[5+(ldh*3)];
        scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
        scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];
293

Andreas Marek's avatar
Andreas Marek committed
294
295
        scalarprods[6] += hh[1] * hh[5+(ldh*4)];
        scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];
296

Andreas Marek's avatar
Andreas Marek committed
297
298
299
300
301
302
303
304
        #pragma ivdep
        for (i = 6; i < nb; i++)
        {
                scalarprods[0] += hh[i-1] * hh[(i+ldh)];
                scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
                scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
                scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
                scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];
305

Andreas Marek's avatar
Andreas Marek committed
306
307
308
309
                scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
                scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
                scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
                scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];
310

Andreas Marek's avatar
Andreas Marek committed
311
312
313
                scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
                scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
                scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];
314

Andreas Marek's avatar
Andreas Marek committed
315
316
                scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
                scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];
317

Andreas Marek's avatar
Andreas Marek committed
318
319
                scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
        }
320

Andreas Marek's avatar
Andreas Marek committed
321
        // Production level kernel calls with padding
322
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
323
324
        for (i = 0; i < nq-2; i+=4)
        {
325
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
326
                hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
327
328
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
329
                hh_trafo_kernel_4_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
330
331
#endif

Andreas Marek's avatar
Andreas Marek committed
332
333
                worked_on += 4;
        }
334
335
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
336
337
        for (i = 0; i < nq-4; i+=8)
        {
338
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
339
                hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
340
341
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
342
                hh_trafo_kernel_8_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
343
344
#endif

Andreas Marek's avatar
Andreas Marek committed
345
346
                worked_on += 8;
        }
347
#endif
Andreas Marek's avatar
Andreas Marek committed
348
349
350
351
        if (nq == i)
        {
                return;
        }
352
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
353
354
        if (nq -i == 2)
        {
355
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
356
                hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
357
358
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
359
                hh_trafo_kernel_2_SPARC64_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
360
361
#endif

Andreas Marek's avatar
Andreas Marek committed
362
363
                worked_on += 2;
        }
364
365
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
366
367
        if (nq -i == 4)
        {
368
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
369
                hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
370
371
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
372
                hh_trafo_kernel_4_SPARC64_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
373
#endif
Andreas Marek's avatar
Andreas Marek committed
374
375
                worked_on += 4;
        }
376
#endif
377
#ifdef WITH_DEBUG
Andreas Marek's avatar
Andreas Marek committed
378
379
        if (worked_on != nq)
        {
380
#ifdef HAVE_SSE_INTRINSICS
Andreas Marek's avatar
Andreas Marek committed
381
                printf("Error in real SSE BLOCK6 kernel \n");
382
383
#endif
#ifdef HAVE_SPARC64_SSE
Andreas Marek's avatar
Andreas Marek committed
384
                printf("Error in real SPARC64 BLOCK6 kernel \n");
385
386
#endif

Andreas Marek's avatar
Andreas Marek committed
387
388
                abort();
        }
389
#endif
390
391
392
393
394
395
396
397
398
399
400
401
402
}

/**
 * Unrolled kernel that computes
 * 4 rows of Q simultaneously (DOUBLE_PRECISION_REAL) or
 * 8 rows of Q simultaneously (SINGLE_PRECISION_REAL): a
 * matrix-vector product with six Householder
 * vectors + a rank 1 update is performed
 */
403
#ifdef HAVE_SSE_INTRINSICS
404
405
406
407
408
409
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
410
411
412
413
414
415
416
417
418
419
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SPARC64_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SPARC64_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
#endif

420
{
Andreas Marek's avatar
Andreas Marek committed
421
422
423
424
425
        /////////////////////////////////////////////////////
        // Matrix Vector Multiplication, Q [4 x nb+3] * hh
        // hh contains four householder vectors
        /////////////////////////////////////////////////////
        int i;
426

Andreas Marek's avatar
Andreas Marek committed
427
428
429
430
431
432
        __SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*5]);
        __SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*4]);
        __SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq*3]);
        __SSE_DATATYPE a4_1 = _SSE_LOAD(&q[ldq*2]);
        __SSE_DATATYPE a5_1 = _SSE_LOAD(&q[ldq]);
        __SSE_DATATYPE a6_1 = _SSE_LOAD(&q[0]);
433

434
#ifdef HAVE_SSE_INTRINSICS
435
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
436
437
438
439
440
        __SSE_DATATYPE h_6_5 = _mm_set1_pd(hh[(ldh*5)+1]);
        __SSE_DATATYPE h_6_4 = _mm_set1_pd(hh[(ldh*5)+2]);
        __SSE_DATATYPE h_6_3 = _mm_set1_pd(hh[(ldh*5)+3]);
        __SSE_DATATYPE h_6_2 = _mm_set1_pd(hh[(ldh*5)+4]);
        __SSE_DATATYPE h_6_1 = _mm_set1_pd(hh[(ldh*5)+5]);
441
442
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
443
444
445
446
447
        __SSE_DATATYPE h_6_5 =         _mm_set1_ps(hh[(ldh*5)+1]) ;
        __SSE_DATATYPE h_6_4 =         _mm_set1_ps(hh[(ldh*5)+2]) ;
        __SSE_DATATYPE h_6_3 =         _mm_set1_ps(hh[(ldh*5)+3]) ;
        __SSE_DATATYPE h_6_2 =         _mm_set1_ps(hh[(ldh*5)+4]) ;
        __SSE_DATATYPE h_6_1 =         _mm_set1_ps(hh[(ldh*5)+5]) ;
448
#endif
449
450
451
452
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
453
454
455
456
457
        __SSE_DATATYPE h_6_5 = _mm_set_pd(hh[(ldh*5)+1], hh[(ldh*5)+1]);
        __SSE_DATATYPE h_6_4 = _mm_set_pd(hh[(ldh*5)+2], hh[(ldh*5)+2]);
        __SSE_DATATYPE h_6_3 = _mm_set_pd(hh[(ldh*5)+3], hh[(ldh*5)+3]);
        __SSE_DATATYPE h_6_2 = _mm_set_pd(hh[(ldh*5)+4], hh[(ldh*5)+4]);
        __SSE_DATATYPE h_6_1 = _mm_set_pd(hh[(ldh*5)+5], hh[(ldh*5)+5]);
458
459
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
460
461
462
463
464
        __SSE_DATATYPE h_6_5 =         _mm_set_ps(hh[(ldh*5)+1], hh[(ldh*5)+1]) ;
        __SSE_DATATYPE h_6_4 =         _mm_set_ps(hh[(ldh*5)+2], hh[(ldh*5)+2]) ;
        __SSE_DATATYPE h_6_3 =         _mm_set_ps(hh[(ldh*5)+3], hh[(ldh*5)+3]) ;
        __SSE_DATATYPE h_6_2 =         _mm_set_ps(hh[(ldh*5)+4], hh[(ldh*5)+4]) ;
        __SSE_DATATYPE h_6_1 =         _mm_set_ps(hh[(ldh*5)+5], hh[(ldh*5)+5]) ;
465
466
467
#endif
#endif

468
469


Andreas Marek's avatar
Andreas Marek committed
470
471
472
473
474
        register __SSE_DATATYPE t1 = _SSE_ADD(a6_1, _SSE_MUL(a5_1, h_6_5));
        t1 = _SSE_ADD(t1, _SSE_MUL(a4_1, h_6_4));
        t1 = _SSE_ADD(t1, _SSE_MUL(a3_1, h_6_3));
        t1 = _SSE_ADD(t1, _SSE_MUL(a2_1, h_6_2));
        t1 = _SSE_ADD(t1, _SSE_MUL(a1_1, h_6_1));
475

476
#ifdef HAVE_SSE_INTRINSICS
477
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
478
479
480
481
        __SSE_DATATYPE h_5_4 = _mm_set1_pd(hh[(ldh*4)+1]);
        __SSE_DATATYPE h_5_3 = _mm_set1_pd(hh[(ldh*4)+2]);
        __SSE_DATATYPE h_5_2 = _mm_set1_pd(hh[(ldh*4)+3]);
        __SSE_DATATYPE h_5_1 = _mm_set1_pd(hh[(ldh*4)+4]);
482
483
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
484
485
486
487
        __SSE_DATATYPE h_5_4 =         _mm_set1_ps(hh[(ldh*4)+1]) ;
        __SSE_DATATYPE h_5_3 =         _mm_set1_ps(hh[(ldh*4)+2]) ;
        __SSE_DATATYPE h_5_2 =         _mm_set1_ps(hh[(ldh*4)+3]) ;
        __SSE_DATATYPE h_5_1 =         _mm_set1_ps(hh[(ldh*4)+4]) ;
488
#endif
489
490
491
492
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
493
494
495
496
        __SSE_DATATYPE h_5_4 = _mm_set_pd(hh[(ldh*4)+1], hh[(ldh*4)+1]);
        __SSE_DATATYPE h_5_3 = _mm_set_pd(hh[(ldh*4)+2], hh[(ldh*4)+2]);
        __SSE_DATATYPE h_5_2 = _mm_set_pd(hh[(ldh*4)+3], hh[(ldh*4)+3]);
        __SSE_DATATYPE h_5_1 = _mm_set_pd(hh[(ldh*4)+4], hh[(ldh*4)+4]);
497
498
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
499
500
501
502
        __SSE_DATATYPE h_5_4 =         _mm_set_ps(hh[(ldh*4)+1], hh[(ldh*4)+1]) ;
        __SSE_DATATYPE h_5_3 =         _mm_set_ps(hh[(ldh*4)+2], hh[(ldh*4)+2]) ;
        __SSE_DATATYPE h_5_2 =         _mm_set_ps(hh[(ldh*4)+3], hh[(ldh*4)+3]) ;
        __SSE_DATATYPE h_5_1 =         _mm_set_ps(hh[(ldh*4)+4], hh[(ldh*4)+4]) ;
503
504
505
506
#endif
#endif


507

Andreas Marek's avatar
Andreas Marek committed
508
509
510
511
        register __SSE_DATATYPE v1 = _SSE_ADD(a5_1, _SSE_MUL(a4_1, h_5_4));
        v1 = _SSE_ADD(v1, _SSE_MUL(a3_1, h_5_3));
        v1 = _SSE_ADD(v1, _SSE_MUL(a2_1, h_5_2));
        v1 = _SSE_ADD(v1, _SSE_MUL(a1_1, h_5_1));
512

513
#ifdef HAVE_SSE_INTRINSICS
514
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
515
516
517
        __SSE_DATATYPE h_4_3 = _mm_set1_pd(hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 = _mm_set1_pd(hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 = _mm_set1_pd(hh[(ldh*3)+3]);
518
519
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
520
521
522
        __SSE_DATATYPE h_4_3 =         _mm_set1_ps(hh[(ldh*3)+1]) ;
        __SSE_DATATYPE h_4_2 =         _mm_set1_ps(hh[(ldh*3)+2]) ;
        __SSE_DATATYPE h_4_1 =         _mm_set1_ps(hh[(ldh*3)+3]) ;
523
#endif
524
525
526
527
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
528
529
530
        __SSE_DATATYPE h_4_3 = _mm_set_pd(hh[(ldh*3)+1], hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 = _mm_set_pd(hh[(ldh*3)+2], hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 = _mm_set_pd(hh[(ldh*3)+3], hh[(ldh*3)+3]);
531
532
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
533
534
535
        __SSE_DATATYPE h_4_3 =         _mm_set_ps(hh[(ldh*3)+1], hh[(ldh*3)+1]);
        __SSE_DATATYPE h_4_2 =         _mm_set_ps(hh[(ldh*3)+2], hh[(ldh*3)+2]);
        __SSE_DATATYPE h_4_1 =         _mm_set_ps(hh[(ldh*3)+3], hh[(ldh*3)+3]);
536
537
538
#endif
#endif

539

Andreas Marek's avatar
Andreas Marek committed
540
541
542
        register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3));
        w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2));
        w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1));
543

544
#ifdef HAVE_SSE_INTRINSICS
545
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
546
547
548
        __SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]);
        __SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]);
        __SSE_DATATYPE h_3_1 = _mm_set1_pd(hh[(ldh*2)+2]);
549
550
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
551
552
553
        __SSE_DATATYPE h_2_1 =         _mm_set1_ps(hh[ldh+1]) ;
        __SSE_DATATYPE h_3_2 =         _mm_set1_ps(hh[(ldh*2)+1]) ;
        __SSE_DATATYPE h_3_1 =         _mm_set1_ps(hh[(ldh*2)+2]) ;
554
555
556
557
558
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
559
560
561
        __SSE_DATATYPE h_2_1 = _mm_set_pd(hh[ldh+1], hh[ldh+1]);
        __SSE_DATATYPE h_3_2 = _mm_set_pd(hh[(ldh*2)+1], hh[(ldh*2)+1]);
        __SSE_DATATYPE h_3_1 = _mm_set_pd(hh[(ldh*2)+2], hh[(ldh*2)+2]);
562
563
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
564
565
566
        __SSE_DATATYPE h_2_1 =         _mm_set_ps(hh[ldh+1], hh[ldh+1]) ;
        __SSE_DATATYPE h_3_2 =         _mm_set_ps(hh[(ldh*2)+1], hh[(ldh*2)+1]) ;
        __SSE_DATATYPE h_3_1 =         _mm_set_ps(hh[(ldh*2)+2], hh[(ldh*2)+2]) ;
567
#endif
568
569
#endif

Andreas Marek's avatar
Andreas Marek committed
570
571
572
        register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2));
        z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1));
        register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1));
573

Andreas Marek's avatar
Andreas Marek committed
574
        register __SSE_DATATYPE x1 = a1_1;
575

Andreas Marek's avatar
Andreas Marek committed
576
577
578
579
580
581
        __SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*5)+offset]);
        __SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*4)+offset]);
        __SSE_DATATYPE a3_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
        __SSE_DATATYPE a4_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
        __SSE_DATATYPE a5_2 = _SSE_LOAD(&q[(ldq)+offset]);
        __SSE_DATATYPE a6_2 = _SSE_LOAD(&q[offset]);
582

Andreas Marek's avatar
Andreas Marek committed
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
        register __SSE_DATATYPE t2 = _SSE_ADD(a6_2, _SSE_MUL(a5_2, h_6_5));
        t2 = _SSE_ADD(t2, _SSE_MUL(a4_2, h_6_4));
        t2 = _SSE_ADD(t2, _SSE_MUL(a3_2, h_6_3));
        t2 = _SSE_ADD(t2, _SSE_MUL(a2_2, h_6_2));
        t2 = _SSE_ADD(t2, _SSE_MUL(a1_2, h_6_1));
        register __SSE_DATATYPE v2 = _SSE_ADD(a5_2, _SSE_MUL(a4_2, h_5_4));
        v2 = _SSE_ADD(v2, _SSE_MUL(a3_2, h_5_3));
        v2 = _SSE_ADD(v2, _SSE_MUL(a2_2, h_5_2));
        v2 = _SSE_ADD(v2, _SSE_MUL(a1_2, h_5_1));
        register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
        w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
        w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
        register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
        z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
        register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));
598

Andreas Marek's avatar
Andreas Marek committed
599
        register __SSE_DATATYPE x2 = a1_2;
600

Andreas Marek's avatar
Andreas Marek committed
601
602
        __SSE_DATATYPE q1;
        __SSE_DATATYPE q2;
603

Andreas Marek's avatar
Andreas Marek committed
604
605
606
607
608
609
        __SSE_DATATYPE h1;
        __SSE_DATATYPE h2;
        __SSE_DATATYPE h3;
        __SSE_DATATYPE h4;
        __SSE_DATATYPE h5;
        __SSE_DATATYPE h6;
610

Andreas Marek's avatar
Andreas Marek committed
611
612
        for(i = 6; i < nb; i++)
        {
613
#ifdef HAVE_SSE_INTRINSICS
614
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
615
                h1 = _mm_set1_pd(hh[i-5]);
616
617
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
618
                h1 = _mm_set1_ps(hh[i-5]);
619
620
621
622
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
623
                h1 = _mm_set_pd(hh[i-5], hh[i-5]);
624
625
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
626
                h1 = _mm_set_ps(hh[i-5], hh[i-5]);
627
#endif
628
#endif
Andreas Marek's avatar
Andreas Marek committed
629
630
631
        
                q1 = _SSE_LOAD(&q[i*ldq]);
                q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
632

Andreas Marek's avatar
Andreas Marek committed
633
634
                x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
                x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
635

636
#ifdef HAVE_SSE_INTRINSICS
637
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
638
                h2 = _mm_set1_pd(hh[ldh+i-4]);
639
640
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
641
                h2 = _mm_set1_ps(hh[ldh+i-4]);
642
643
644
645
646
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
647
                h2 = _mm_set_pd(hh[ldh+i-4], hh[ldh+i-4]);
648
649
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
650
                h2 = _mm_set_ps(hh[ldh+i-4], hh[ldh+i-4]);
651
#endif
652
#endif
653

Andreas Marek's avatar
Andreas Marek committed
654
655
                y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
                y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
656

657
#ifdef HAVE_SSE_INTRINSICS
658
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
659
                h3 = _mm_set1_pd(hh[(ldh*2)+i-3]);
660
661
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
662
                h3 = _mm_set1_ps(hh[(ldh*2)+i-3]);
663
664
665
666
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
667
                h3 = _mm_set_pd(hh[(ldh*2)+i-3], hh[(ldh*2)+i-3]);
668
669
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
670
                h3 = _mm_set_ps(hh[(ldh*2)+i-3], hh[(ldh*2)+i-3]);
671
#endif
672
#endif
673

Andreas Marek's avatar
Andreas Marek committed
674
675
                z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
                z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
676
#ifdef HAVE_SSE_INTRINSICS
677
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
678
                h4 = _mm_set1_pd(hh[(ldh*3)+i-2]);
679
680
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
681
                h4 = _mm_set1_ps(hh[(ldh*3)+i-2]);
682
683
684
685
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
686
                h4 = _mm_set_pd(hh[(ldh*3)+i-2], hh[(ldh*3)+i-2]);
687
688
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
689
                h4 = _mm_set_ps(hh[(ldh*3)+i-2], hh[(ldh*3)+i-2]);
690
#endif
691
692
#endif

Andreas Marek's avatar
Andreas Marek committed
693
694
                w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
                w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
695

696
#ifdef HAVE_SSE_INTRINSICS
697
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
698
                h5 = _mm_set1_pd(hh[(ldh*4)+i-1]);
699
700
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
701
                h5 = _mm_set1_ps(hh[(ldh*4)+i-1]);
702
703
704
705
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
706
                h5 = _mm_set_pd(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
707
708
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
709
                h5 = _mm_set_ps(hh[(ldh*4)+i-1], hh[(ldh*4)+i-1]);
710
#endif
711
712
#endif

Andreas Marek's avatar
Andreas Marek committed
713
714
                v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
                v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
715

716
#ifdef HAVE_SSE_INTRINSICS
717
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
718
                h6 = _mm_set1_pd(hh[(ldh*5)+i]);
719
720
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
721
                h6 = _mm_set1_ps(hh[(ldh*5)+i]);
722
723
724
725
726
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
727
                h6 = _mm_set_pd(hh[(ldh*5)+i], hh[(ldh*5)+i]);
728
729
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
730
                h6 = _mm_set_ps(hh[(ldh*5)+i], hh[(ldh*5)+i]);
731
#endif
732
#endif
733

Andreas Marek's avatar
Andreas Marek committed
734
735
736
                t1 = _SSE_ADD(t1, _SSE_MUL(q1,h6));
                t2 = _SSE_ADD(t2, _SSE_MUL(q2,h6));
        }
737

738
#ifdef HAVE_SSE_INTRINSICS
739
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
740
        h1 = _mm_set1_pd(hh[nb-5]);
741
742
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
743
        h1 = _mm_set1_ps(hh[nb-5] );
744
#endif
745
746
747
748
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
749
        h1 = _mm_set_pd(hh[nb-5], hh[nb-5]);
750
751
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
752
        h1 = _mm_set_ps(hh[nb-5], hh[nb-5]);
753
754
755
#endif
#endif

Andreas Marek's avatar
Andreas Marek committed
756
757
        q1 = _SSE_LOAD(&q[nb*ldq]);
        q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);
758

Andreas Marek's avatar
Andreas Marek committed
759
760
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
761

762
#ifdef HAVE_SSE_INTRINSICS
763
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
764
        h2 = _mm_set1_pd(hh[ldh+nb-4]);
765
766
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
767
        h2 = _mm_set1_ps(hh[ldh+nb-4]);
768
769
770
771
772
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
773
        h2 = _mm_set_pd(hh[ldh+nb-4], hh[ldh+nb-4]);
774
775
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
776
        h2 = _mm_set_ps(hh[ldh+nb-4], hh[ldh+nb-4]);
777
#endif
778
#endif
779
780


Andreas Marek's avatar
Andreas Marek committed
781
782
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
783

784
#ifdef HAVE_SSE_INTRINSICS
785
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
786
        h3 = _mm_set1_pd(hh[(ldh*2)+nb-3]);
787
788
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
789
        h3 = _mm_set1_ps(hh[(ldh*2)+nb-3]);
790
791
792
793
794
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
795
        h3 = _mm_set_pd(hh[(ldh*2)+nb-3], hh[(ldh*2)+nb-3]);
796
797
#endif
#ifdef SINGLE_PRECISION_REAL
798
        h3 = _mm_set_ps(hh[(ldh*2)+nb-3], hh[(ldh*2)+nb-3]);
799
#endif
800
#endif
801
802


Andreas Marek's avatar
Andreas Marek committed
803
804
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
805

806
#ifdef HAVE_SSE_INTRINSICS
807
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
808
        h4 = _mm_set1_pd(hh[(ldh*3)+nb-2]);
809
810
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
811
        h4 = _mm_set1_ps(hh[(ldh*3)+nb-2]);
812
813
814
815
816
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
817
        h4 = _mm_set_pd(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
818
819
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
820
        h4 = _mm_set_ps(hh[(ldh*3)+nb-2], hh[(ldh*3)+nb-2]);
821
#endif
822
#endif
823

Andreas Marek's avatar
Andreas Marek committed
824
825
        w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
        w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
826

827
#ifdef HAVE_SSE_INTRINSICS
828
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
829
        h5 = _mm_set1_pd(hh[(ldh*4)+nb-1]);
830
831
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
832
        h5 = _mm_set1_ps(hh[(ldh*4)+nb-1]);
833
834
835
836
837
#endif
#endif

#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
838
        h5 = _mm_set_pd(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
839
840
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
841
        h5 = _mm_set_ps(hh[(ldh*4)+nb-1], hh[(ldh*4)+nb-1]);
842
#endif
843
844
845
#endif


846

Andreas Marek's avatar
Andreas Marek committed
847
848
        v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
        v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));
849
#ifdef HAVE_SSE_INTRINSICS
850
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
851
        h1 = _mm_set1_pd(hh[nb-4]);
852
853
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
854
        h1 = _mm_set1_ps(hh[nb-4]);
855
856
857
858
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
859
        h1 = _mm_set_pd(hh[nb-4], hh[nb-4]);
860
861
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
862
        h1 = _mm_set_ps(hh[nb-4], hh[nb-4]);
863
#endif
864
865
#endif

Andreas Marek's avatar
Andreas Marek committed
866
867
        q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);
868

Andreas Marek's avatar
Andreas Marek committed
869
870
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
871
#ifdef HAVE_SSE_INTRINSICS
872
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
873
        h2 = _mm_set1_pd(hh[ldh+nb-3]);
874
875
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
876
        h2 = _mm_set1_ps(hh[ldh+nb-3]);
877
878
879
880
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
881
        h2 = _mm_set_pd(hh[ldh+nb-3], hh[ldh+nb-3]);
882
883
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
884
        h2 = _mm_set_ps(hh[ldh+nb-3], hh[ldh+nb-3]);
885
#endif
886
887
#endif

Andreas Marek's avatar
Andreas Marek committed
888
889
        y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
        y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));
890

891
#ifdef HAVE_SSE_INTRINSICS
892
#ifdef DOUBLE_PRECISION
Andreas Marek's avatar
Andreas Marek committed
893
        h3 = _mm_set1_pd(hh[(ldh*2)+nb-2]);
894
895
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
896
        h3 = _mm_set1_ps(hh[(ldh*2)+nb-2]);
897
898
899
900
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION
Andreas Marek's avatar
Andreas Marek committed
901
        h3 = _mm_set_pd(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
902
903
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
904
        h3 = _mm_set_ps(hh[(ldh*2)+nb-2], hh[(ldh*2)+nb-2]);
905
#endif
906
907
#endif

Andreas Marek's avatar
Andreas Marek committed
908
909
        z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
        z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));
910

911
#ifdef HAVE_SSE_INTRINSICS
912
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
913
        h4 = _mm_set1_pd(hh[(ldh*3)+nb-1]);
914
915
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
916
        h4 = _mm_set1_ps(hh[(ldh*3)+nb-1]);
917
918
919
920
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
921
        h4 = _mm_set_pd(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]);
922
923
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
924
        h4 = _mm_set_ps(hh[(ldh*3)+nb-1], hh[(ldh*3)+nb-1]);
925
#endif
926
927
#endif

928

Andreas Marek's avatar
Andreas Marek committed
929
930
        w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
        w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));
931

932
#ifdef HAVE_SSE_INTRINSICS
933
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
934
        h1 = _mm_set1_pd(hh[nb-3]);
935
936
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
937
        h1 = _mm_set1_ps(hh[nb-3]);
938
939
940
941
#endif
#endif
#ifdef HAVE_SPARC64_SSE
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
942
        h1 = _mm_set_pd(hh[nb-3], hh[nb-3]);
943
944
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
945
        h1 = _mm_set_ps(hh[nb-3], hh[nb-3]);
946
#endif
947
948
#endif

949

Andreas Marek's avatar
Andreas Marek committed
950
951
        q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
        q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);
952

Andreas Marek's avatar
Andreas Marek committed
953
954
        x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
        x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));
955

956
#ifdef HAVE_SSE_INTRINSICS
957
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
958
        h2 = _mm_set1_pd(hh[ldh+nb-2]);
959
960
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
961
        h2 = _mm_set1_ps(hh[ldh+nb-2]);
962
#endif
963
964
#endif

965
#ifdef HAVE_SPARC64_SSE
966
#ifdef DOUBLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
967
        h2 = _mm_set_pd(hh[ldh+nb-2], hh[ldh+nb-2]);
968
969
#endif
#ifdef SINGLE_PRECISION_REAL
Andreas Marek's avatar
Andreas Marek committed
970
        h2 = _mm_set_ps(hh[ldh+nb-2], hh[ldh+nb-2]);
Andreas Marek's avatar