//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//	Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//	Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//	Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//	Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//	and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de), based on Alexander Heinecke (alexander.heinecke@mytum.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>
#include <stdio.h>
#include <stdlib.h>

#define __forceinline __attribute__((always_inline)) static

#ifdef DOUBLE_PRECISION_REAL
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_SUB _mm_sub_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_STORE _mm_store_pd
#endif
#ifdef SINGLE_PRECISION_REAL
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_SUB _mm_sub_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_STORE _mm_store_ps
#endif

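// When the SSE kernels are requested, drop any compiler-provided __AVX__
// definition so that no AVX code path can be selected further down.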
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

#ifdef DOUBLE_PRECISION_REAL
//Forward declaration
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef SINGLE_PRECISION_REAL
static void hh_trafo_kernel_4_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);
static void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods);

void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh);
#endif

#ifdef DOUBLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>				bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_double)	:: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
#ifdef SINGLE_PRECISION_REAL
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>				bind(C, name="hexa_hh_trafo_real_sse_6hv_single")
!f>	use, intrinsic :: iso_c_binding
!f>	integer(kind=c_int)	:: pnb, pnq, pldq, pldh
!f>	type(c_ptr), value	:: q
!f>	real(kind=c_float)	:: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif
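
/*
   Minimal usage sketch from C (illustrative only, not part of ELPA): all
   size arguments are passed by reference to match the Fortran bindings
   above, and q must be 16-byte aligned because the kernels use aligned
   SSE loads and stores. The concrete sizes (nb, nq, ldq, ldh) are made
   up for this example:

     double q[13*4] __attribute__((aligned(16)));  // (nb+5)*ldq elements
     double hh[6*8];                               // six vectors, ldh >= nb
     int nb = 8, nq = 4, ldq = 4, ldh = 8;
     hexa_hh_trafo_real_sse_6hv_double(q, hh, &nb, &nq, &ldq, &ldh);
*/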

#ifdef DOUBLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
#ifdef SINGLE_PRECISION_REAL
void hexa_hh_trafo_real_sse_6hv_single(float* q, float* hh, int* pnb, int* pnq, int* pldq, int* pldh)
#endif
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;
	int worked_on = 0;

	// calculate the scalar products needed to apply
	// 6 householder vectors simultaneously
#ifdef DOUBLE_PRECISION_REAL
	double scalarprods[15];
#endif
#ifdef SINGLE_PRECISION_REAL
	float scalarprods[15];
#endif

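	// scalarprods packs the 15 pairwise dot products vs_j_k of the six
	// householder vectors (j < k), grouped by the later vector k:
	//   [0]        = vs_1_2
	//   [1]..[2]   = vs_1_3, vs_2_3
	//   [3]..[5]   = vs_1_4, vs_2_4, vs_3_4
	//   [6]..[9]   = vs_1_5 .. vs_4_5
	//   [10]..[14] = vs_1_6 .. vs_5_6
	// The assignments below pick up the term coming from the implicit
	// leading 1 of the earlier vector; all other terms follow below.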
	scalarprods[0] = hh[(ldh+1)];
	scalarprods[1] = hh[(ldh*2)+2];
	scalarprods[2] = hh[(ldh*2)+1];
	scalarprods[3] = hh[(ldh*3)+3];
	scalarprods[4] = hh[(ldh*3)+2];
	scalarprods[5] = hh[(ldh*3)+1];
	scalarprods[6] = hh[(ldh*4)+4];
	scalarprods[7] = hh[(ldh*4)+3];
	scalarprods[8] = hh[(ldh*4)+2];
	scalarprods[9] = hh[(ldh*4)+1];
	scalarprods[10] = hh[(ldh*5)+5];
	scalarprods[11] = hh[(ldh*5)+4];
	scalarprods[12] = hh[(ldh*5)+3];
	scalarprods[13] = hh[(ldh*5)+2];
	scalarprods[14] = hh[(ldh*5)+1];

	// accumulate the remaining product terms, unrolled for rows 2 to 5
	// loop counter = 2
	scalarprods[0] += hh[1] * hh[(2+ldh)];
	scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

	// loop counter = 3
	scalarprods[0] += hh[2] * hh[(3+ldh)];
	scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

	scalarprods[1] += hh[1] * hh[3+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

	// loop counter = 4
	scalarprods[0] += hh[3] * hh[(4+ldh)];
	scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

	scalarprods[1] += hh[2] * hh[4+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

	scalarprods[3] += hh[1] * hh[4+(ldh*3)];
	scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

	// loop counter = 5
	scalarprods[0] += hh[4] * hh[(5+ldh)];
	scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

	scalarprods[1] += hh[3] * hh[5+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

	scalarprods[3] += hh[2] * hh[5+(ldh*3)];
	scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

	scalarprods[6] += hh[1] * hh[5+(ldh*4)];
	scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

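	// remaining rows: householder vector k is stored shifted by k-1 rows
	// relative to the first one, hence the staggered indices i-1 .. i-5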
	#pragma ivdep
	for (i = 6; i < nb; i++)
	{
		scalarprods[0] += hh[i-1] * hh[(i+ldh)];
		scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
		scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
		scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
		scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

		scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
		scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
		scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
		scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

		scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
		scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
		scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

		scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
		scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

		scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
	}

	// Production level kernel calls with padding
#ifdef DOUBLE_PRECISION_REAL
	for (i = 0; i < nq-2; i+=4)
	{
		hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 4;
	}
#endif
#ifdef SINGLE_PRECISION_REAL
	for (i = 0; i < nq-4; i+=8)
	{
		hh_trafo_kernel_8_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 8;
	}
#endif
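	// The loops above cover nq in blocks of 4 (double) or 8 (single) rows;
	// a remainder of 2 or 4 rows is handled by the narrower kernel below,
	// and any other remainder is a hard error.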
	if (nq == i)
	{
		return;
	}
#ifdef DOUBLE_PRECISION_REAL
	if (nq - i == 2)
	{
		hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 2;
	}
#endif
#ifdef SINGLE_PRECISION_REAL
	if (nq - i == 4)
	{
		hh_trafo_kernel_4_SSE_6hv_single(&q[i], hh, nb, ldq, ldh, scalarprods);
		worked_on += 4;
	}
#endif
	if (worked_on != nq)
	{
		printf("Error in real SSE BLOCK6 kernel \n");
		abort();
	}
}

/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 4 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 8 rows of Q simultaneously, a
#endif
 * matrix Vector product with six householder
 * vectors + a rank 6 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
#endif
#ifdef SINGLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_8_SSE_6hv_single(float* q, float* hh, int nb, int ldq, int ldh, float* scalarprods)
#endif
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [4 x nb+5] * hh
	// hh contains six householder vectors
	/////////////////////////////////////////////////////
	int i;

	__SSE_DATATYPE a1_1 = _SSE_LOAD(&q[ldq*5]);
	__SSE_DATATYPE a2_1 = _SSE_LOAD(&q[ldq*4]);
	__SSE_DATATYPE a3_1 = _SSE_LOAD(&q[ldq*3]);
	__SSE_DATATYPE a4_1 = _SSE_LOAD(&q[ldq*2]);
	__SSE_DATATYPE a5_1 = _SSE_LOAD(&q[ldq]);
	__SSE_DATATYPE a6_1 = _SSE_LOAD(&q[0]);

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE h_6_5 = _mm_set1_pd(hh[(ldh*5)+1]);
	__SSE_DATATYPE h_6_4 = _mm_set1_pd(hh[(ldh*5)+2]);
	__SSE_DATATYPE h_6_3 = _mm_set1_pd(hh[(ldh*5)+3]);
	__SSE_DATATYPE h_6_2 = _mm_set1_pd(hh[(ldh*5)+4]);
	__SSE_DATATYPE h_6_1 = _mm_set1_pd(hh[(ldh*5)+5]);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE h_6_5 = _mm_set1_ps(hh[(ldh*5)+1]);
	__SSE_DATATYPE h_6_4 = _mm_set1_ps(hh[(ldh*5)+2]);
	__SSE_DATATYPE h_6_3 = _mm_set1_ps(hh[(ldh*5)+3]);
	__SSE_DATATYPE h_6_2 = _mm_set1_ps(hh[(ldh*5)+4]);
	__SSE_DATATYPE h_6_1 = _mm_set1_ps(hh[(ldh*5)+5]);
#endif


	register __SSE_DATATYPE t1 = _SSE_ADD(a6_1, _SSE_MUL(a5_1, h_6_5));
	t1 = _SSE_ADD(t1, _SSE_MUL(a4_1, h_6_4));
	t1 = _SSE_ADD(t1, _SSE_MUL(a3_1, h_6_3));
	t1 = _SSE_ADD(t1, _SSE_MUL(a2_1, h_6_2));
	t1 = _SSE_ADD(t1, _SSE_MUL(a1_1, h_6_1));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE h_5_4 = _mm_set1_pd(hh[(ldh*4)+1]);
	__SSE_DATATYPE h_5_3 = _mm_set1_pd(hh[(ldh*4)+2]);
	__SSE_DATATYPE h_5_2 = _mm_set1_pd(hh[(ldh*4)+3]);
	__SSE_DATATYPE h_5_1 = _mm_set1_pd(hh[(ldh*4)+4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE h_5_4 = _mm_set1_ps(hh[(ldh*4)+1]);
	__SSE_DATATYPE h_5_3 = _mm_set1_ps(hh[(ldh*4)+2]);
	__SSE_DATATYPE h_5_2 = _mm_set1_ps(hh[(ldh*4)+3]);
	__SSE_DATATYPE h_5_1 = _mm_set1_ps(hh[(ldh*4)+4]);
#endif

	register __SSE_DATATYPE v1 = _SSE_ADD(a5_1, _SSE_MUL(a4_1, h_5_4));
	v1 = _SSE_ADD(v1, _SSE_MUL(a3_1, h_5_3));
	v1 = _SSE_ADD(v1, _SSE_MUL(a2_1, h_5_2));
	v1 = _SSE_ADD(v1, _SSE_MUL(a1_1, h_5_1));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE h_4_3 = _mm_set1_pd(hh[(ldh*3)+1]);
	__SSE_DATATYPE h_4_2 = _mm_set1_pd(hh[(ldh*3)+2]);
	__SSE_DATATYPE h_4_1 = _mm_set1_pd(hh[(ldh*3)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE h_4_3 = _mm_set1_ps(hh[(ldh*3)+1]);
	__SSE_DATATYPE h_4_2 = _mm_set1_ps(hh[(ldh*3)+2]);
	__SSE_DATATYPE h_4_1 = _mm_set1_ps(hh[(ldh*3)+3]);
#endif

	register __SSE_DATATYPE w1 = _SSE_ADD(a4_1, _SSE_MUL(a3_1, h_4_3));
	w1 = _SSE_ADD(w1, _SSE_MUL(a2_1, h_4_2));
	w1 = _SSE_ADD(w1, _SSE_MUL(a1_1, h_4_1));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE h_2_1 = _mm_set1_pd(hh[ldh+1]);
	__SSE_DATATYPE h_3_2 = _mm_set1_pd(hh[(ldh*2)+1]);
	__SSE_DATATYPE h_3_1 = _mm_set1_pd(hh[(ldh*2)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE h_2_1 = _mm_set1_ps(hh[ldh+1]);
	__SSE_DATATYPE h_3_2 = _mm_set1_ps(hh[(ldh*2)+1]);
	__SSE_DATATYPE h_3_1 = _mm_set1_ps(hh[(ldh*2)+2]);
#endif

	register __SSE_DATATYPE z1 = _SSE_ADD(a3_1, _SSE_MUL(a2_1, h_3_2));
	z1 = _SSE_ADD(z1, _SSE_MUL(a1_1, h_3_1));
	register __SSE_DATATYPE y1 = _SSE_ADD(a2_1, _SSE_MUL(a1_1, h_2_1));

	register __SSE_DATATYPE x1 = a1_1;

	__SSE_DATATYPE a1_2 = _SSE_LOAD(&q[(ldq*5)+offset]);
	__SSE_DATATYPE a2_2 = _SSE_LOAD(&q[(ldq*4)+offset]);
	__SSE_DATATYPE a3_2 = _SSE_LOAD(&q[(ldq*3)+offset]);
	__SSE_DATATYPE a4_2 = _SSE_LOAD(&q[(ldq*2)+offset]);
	__SSE_DATATYPE a5_2 = _SSE_LOAD(&q[(ldq)+offset]);
	__SSE_DATATYPE a6_2 = _SSE_LOAD(&q[offset]);

	register __SSE_DATATYPE t2 = _SSE_ADD(a6_2, _SSE_MUL(a5_2, h_6_5));
	t2 = _SSE_ADD(t2, _SSE_MUL(a4_2, h_6_4));
	t2 = _SSE_ADD(t2, _SSE_MUL(a3_2, h_6_3));
	t2 = _SSE_ADD(t2, _SSE_MUL(a2_2, h_6_2));
	t2 = _SSE_ADD(t2, _SSE_MUL(a1_2, h_6_1));
	register __SSE_DATATYPE v2 = _SSE_ADD(a5_2, _SSE_MUL(a4_2, h_5_4));
	v2 = _SSE_ADD(v2, _SSE_MUL(a3_2, h_5_3));
	v2 = _SSE_ADD(v2, _SSE_MUL(a2_2, h_5_2));
	v2 = _SSE_ADD(v2, _SSE_MUL(a1_2, h_5_1));
	register __SSE_DATATYPE w2 = _SSE_ADD(a4_2, _SSE_MUL(a3_2, h_4_3));
	w2 = _SSE_ADD(w2, _SSE_MUL(a2_2, h_4_2));
	w2 = _SSE_ADD(w2, _SSE_MUL(a1_2, h_4_1));
	register __SSE_DATATYPE z2 = _SSE_ADD(a3_2, _SSE_MUL(a2_2, h_3_2));
	z2 = _SSE_ADD(z2, _SSE_MUL(a1_2, h_3_1));
	register __SSE_DATATYPE y2 = _SSE_ADD(a2_2, _SSE_MUL(a1_2, h_2_1));

	register __SSE_DATATYPE x2 = a1_2;

	__SSE_DATATYPE q1;
	__SSE_DATATYPE q2;

	__SSE_DATATYPE h1;
	__SSE_DATATYPE h2;
	__SSE_DATATYPE h3;
	__SSE_DATATYPE h4;
	__SSE_DATATYPE h5;
	__SSE_DATATYPE h6;

	for(i = 6; i < nb; i++)
	{
#ifdef DOUBLE_PRECISION_REAL
		h1 = _mm_set1_pd(hh[i-5]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h1 = _mm_set1_ps(hh[i-5]);
#endif
		q1 = _SSE_LOAD(&q[i*ldq]);
		q2 = _SSE_LOAD(&q[(i*ldq)+offset]);

		x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
		x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

#ifdef DOUBLE_PRECISION_REAL
		h2 = _mm_set1_pd(hh[ldh+i-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h2 = _mm_set1_ps(hh[ldh+i-4]);
#endif
		y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
		y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));

#ifdef DOUBLE_PRECISION_REAL
		h3 = _mm_set1_pd(hh[(ldh*2)+i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h3 = _mm_set1_ps(hh[(ldh*2)+i-3]);
#endif
		z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
		z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));

#ifdef DOUBLE_PRECISION_REAL
		h4 = _mm_set1_pd(hh[(ldh*3)+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h4 = _mm_set1_ps(hh[(ldh*3)+i-2]);
#endif
		w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
		w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));

#ifdef DOUBLE_PRECISION_REAL
		h5 = _mm_set1_pd(hh[(ldh*4)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h5 = _mm_set1_ps(hh[(ldh*4)+i-1]);
#endif
		v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
		v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));

#ifdef DOUBLE_PRECISION_REAL
		h6 = _mm_set1_pd(hh[(ldh*5)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h6 = _mm_set1_ps(hh[(ldh*5)+i]);
#endif
		t1 = _SSE_ADD(t1, _SSE_MUL(q1,h6));
		t2 = _SSE_ADD(t2, _SSE_MUL(q2,h6));
	}

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-5]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-5]);
#endif
	q1 = _SSE_LOAD(&q[nb*ldq]);
	q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-4]);
#endif
	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-3]);
#endif
	z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
	z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+nb-2]);
#endif
	w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
	w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+nb-1]);
#endif

	v1 = _SSE_ADD(v1, _SSE_MUL(q1,h5));
	v2 = _SSE_ADD(v2, _SSE_MUL(q2,h5));

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-4]);
#endif
	q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-3]);
#endif

	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-2]);
#endif

	z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
	z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+nb-1]);
#endif

	w1 = _SSE_ADD(w1, _SSE_MUL(q1,h4));
	w2 = _SSE_ADD(w2, _SSE_MUL(q2,h4));

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-3]);
#endif

	q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-2]);
#endif

	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]);
#endif

	z1 = _SSE_ADD(z1, _SSE_MUL(q1,h3));
	z2 = _SSE_ADD(z2, _SSE_MUL(q2,h3));

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-2]);
#endif

	q1 = _SSE_LOAD(&q[(nb+3)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+3)*ldq)+offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-1]);
#endif


	y1 = _SSE_ADD(y1, _SSE_MUL(q1,h2));
	y2 = _SSE_ADD(y2, _SSE_MUL(q2,h2));

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-1]);
#endif

	q1 = _SSE_LOAD(&q[(nb+4)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+4)*ldq)+offset]);

	x1 = _SSE_ADD(x1, _SSE_MUL(q1,h1));
	x2 = _SSE_ADD(x2, _SSE_MUL(q2,h1));

	/////////////////////////////////////////////////////
	// Apply tau, correct wrong calculation using pre-calculated scalar products
	/////////////////////////////////////////////////////
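	// Each accumulated vector is scaled by its tau; the later vectors then
	// subtract their projections onto the earlier ones via
	// h_j = tau_k * vs_j_k, e.g. y <- tau2*y - (tau2*vs_1_2)*x.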

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau1 = _mm_set1_pd(hh[0]);

	x1 = _SSE_MUL(x1, tau1);
	x2 = _SSE_MUL(x2, tau1);

	__SSE_DATATYPE tau2 = _mm_set1_pd(hh[ldh]);
	__SSE_DATATYPE vs_1_2 = _mm_set1_pd(scalarprods[0]);

	h2 = _SSE_MUL(tau2, vs_1_2);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE tau1 = _mm_set1_ps(hh[0]);

	x1 = _SSE_MUL(x1, tau1);
	x2 = _SSE_MUL(x2, tau1);

	__SSE_DATATYPE tau2 = _mm_set1_ps(hh[ldh]);
	__SSE_DATATYPE vs_1_2 = _mm_set1_ps(scalarprods[0]);

	h2 = _SSE_MUL(tau2, vs_1_2);
#endif

	y1 = _SSE_SUB(_SSE_MUL(y1,tau2), _SSE_MUL(x1,h2));
	y2 = _SSE_SUB(_SSE_MUL(y2,tau2), _SSE_MUL(x2,h2));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau3 = _mm_set1_pd(hh[ldh*2]);
	__SSE_DATATYPE vs_1_3 = _mm_set1_pd(scalarprods[1]);
	__SSE_DATATYPE vs_2_3 = _mm_set1_pd(scalarprods[2]);

	h2 = _SSE_MUL(tau3, vs_1_3);
	h3 = _SSE_MUL(tau3, vs_2_3);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE tau3 = _mm_set1_ps(hh[ldh*2]);
	__SSE_DATATYPE vs_1_3 = _mm_set1_ps(scalarprods[1]);
	__SSE_DATATYPE vs_2_3 = _mm_set1_ps(scalarprods[2]);

	h2 = _SSE_MUL(tau3, vs_1_3);
	h3 = _SSE_MUL(tau3, vs_2_3);
#endif

	z1 = _SSE_SUB(_SSE_MUL(z1,tau3), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2)));
	z2 = _SSE_SUB(_SSE_MUL(z2,tau3), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau4 = _mm_set1_pd(hh[ldh*3]);
	__SSE_DATATYPE vs_1_4 = _mm_set1_pd(scalarprods[3]);
	__SSE_DATATYPE vs_2_4 = _mm_set1_pd(scalarprods[4]);

	h2 = _SSE_MUL(tau4, vs_1_4);
	h3 = _SSE_MUL(tau4, vs_2_4);

	__SSE_DATATYPE vs_3_4 = _mm_set1_pd(scalarprods[5]);

	h4 = _SSE_MUL(tau4, vs_3_4);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE tau4 = _mm_set1_ps(hh[ldh*3]);
	__SSE_DATATYPE vs_1_4 = _mm_set1_ps(scalarprods[3]);
	__SSE_DATATYPE vs_2_4 = _mm_set1_ps(scalarprods[4]);

	h2 = _SSE_MUL(tau4, vs_1_4);
	h3 = _SSE_MUL(tau4, vs_2_4);

	__SSE_DATATYPE vs_3_4 = _mm_set1_ps(scalarprods[5]);

	h4 = _SSE_MUL(tau4, vs_3_4);
#endif

	w1 = _SSE_SUB(_SSE_MUL(w1,tau4), _SSE_ADD(_SSE_MUL(z1,h4), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))));
	w2 = _SSE_SUB(_SSE_MUL(w2,tau4), _SSE_ADD(_SSE_MUL(z2,h4), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau5 = _mm_set1_pd(hh[ldh*4]);
	__SSE_DATATYPE vs_1_5 = _mm_set1_pd(scalarprods[6]);
	__SSE_DATATYPE vs_2_5 = _mm_set1_pd(scalarprods[7]);

	h2 = _SSE_MUL(tau5, vs_1_5);
	h3 = _SSE_MUL(tau5, vs_2_5);

	__SSE_DATATYPE vs_3_5 = _mm_set1_pd(scalarprods[8]);
	__SSE_DATATYPE vs_4_5 = _mm_set1_pd(scalarprods[9]);

	h4 = _SSE_MUL(tau5, vs_3_5);
	h5 = _SSE_MUL(tau5, vs_4_5);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE tau5 = _mm_set1_ps(hh[ldh*4]);
	__SSE_DATATYPE vs_1_5 = _mm_set1_ps(scalarprods[6]);
	__SSE_DATATYPE vs_2_5 = _mm_set1_ps(scalarprods[7]);

	h2 = _SSE_MUL(tau5, vs_1_5);
	h3 = _SSE_MUL(tau5, vs_2_5);

	__SSE_DATATYPE vs_3_5 = _mm_set1_ps(scalarprods[8]);
	__SSE_DATATYPE vs_4_5 = _mm_set1_ps(scalarprods[9]);

	h4 = _SSE_MUL(tau5, vs_3_5);
	h5 = _SSE_MUL(tau5, vs_4_5);
#endif

	v1 = _SSE_SUB(_SSE_MUL(v1,tau5), _SSE_ADD(_SSE_ADD(_SSE_MUL(w1,h5), _SSE_MUL(z1,h4)), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2))));
	v2 = _SSE_SUB(_SSE_MUL(v2,tau5), _SSE_ADD(_SSE_ADD(_SSE_MUL(w2,h5), _SSE_MUL(z2,h4)), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2))));

#ifdef DOUBLE_PRECISION_REAL
	__SSE_DATATYPE tau6 = _mm_set1_pd(hh[ldh*5]);
	__SSE_DATATYPE vs_1_6 = _mm_set1_pd(scalarprods[10]);
	__SSE_DATATYPE vs_2_6 = _mm_set1_pd(scalarprods[11]);

	h2 = _SSE_MUL(tau6, vs_1_6);
	h3 = _SSE_MUL(tau6, vs_2_6);

	__SSE_DATATYPE vs_3_6 = _mm_set1_pd(scalarprods[12]);
	__SSE_DATATYPE vs_4_6 = _mm_set1_pd(scalarprods[13]);
	__SSE_DATATYPE vs_5_6 = _mm_set1_pd(scalarprods[14]);

	h4 = _SSE_MUL(tau6, vs_3_6);
	h5 = _SSE_MUL(tau6, vs_4_6);
	h6 = _SSE_MUL(tau6, vs_5_6);
#endif
#ifdef SINGLE_PRECISION_REAL
	__SSE_DATATYPE tau6 = _mm_set1_ps(hh[ldh*5]);
	__SSE_DATATYPE vs_1_6 = _mm_set1_ps(scalarprods[10]);
	__SSE_DATATYPE vs_2_6 = _mm_set1_ps(scalarprods[11]);

	h2 = _SSE_MUL(tau6, vs_1_6);
	h3 = _SSE_MUL(tau6, vs_2_6);

	__SSE_DATATYPE vs_3_6 = _mm_set1_ps(scalarprods[12]);
	__SSE_DATATYPE vs_4_6 = _mm_set1_ps(scalarprods[13]);
	__SSE_DATATYPE vs_5_6 = _mm_set1_ps(scalarprods[14]);

	h4 = _SSE_MUL(tau6, vs_3_6);
	h5 = _SSE_MUL(tau6, vs_4_6);
	h6 = _SSE_MUL(tau6, vs_5_6);
#endif

	t1 = _SSE_SUB(_SSE_MUL(t1,tau6), _SSE_ADD( _SSE_MUL(v1,h6), _SSE_ADD(_SSE_ADD(_SSE_MUL(w1,h5), _SSE_MUL(z1,h4)), _SSE_ADD(_SSE_MUL(y1,h3), _SSE_MUL(x1,h2)))));
	t2 = _SSE_SUB(_SSE_MUL(t2,tau6), _SSE_ADD( _SSE_MUL(v2,h6), _SSE_ADD(_SSE_ADD(_SSE_MUL(w2,h5), _SSE_MUL(z2,h4)), _SSE_ADD(_SSE_MUL(y2,h3), _SSE_MUL(x2,h2)))));

	/////////////////////////////////////////////////////
	// Rank-6 update of Q [4 x nb+5]
	/////////////////////////////////////////////////////
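	// Each row block of q is updated by every householder vector that
	// reaches it: row 0 only by t (vector 6), row 1 by v and t, and so on,
	// until inside the loop all six vectors contribute.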

	q1 = _SSE_LOAD(&q[0]);
	q2 = _SSE_LOAD(&q[offset]);
	q1 = _SSE_SUB(q1, t1);
	q2 = _SSE_SUB(q2, t2);
	_SSE_STORE(&q[0],q1);
	_SSE_STORE(&q[offset],q2);


#ifdef DOUBLE_PRECISION_REAL
	h6 = _mm_set1_pd(hh[(ldh*5)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h6 = _mm_set1_ps(hh[(ldh*5)+1]);
#endif


	q1 = _SSE_LOAD(&q[ldq]);
	q2 = _SSE_LOAD(&q[(ldq+offset)]);
	q1 = _SSE_SUB(q1, v1);
	q2 = _SSE_SUB(q2, v2);

	q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
	q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

	_SSE_STORE(&q[ldq],q1);
	_SSE_STORE(&q[(ldq+offset)],q2);

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+1]);
#endif
	q1 = _SSE_LOAD(&q[ldq*2]);
	q2 = _SSE_LOAD(&q[(ldq*2)+offset]);
	q1 = _SSE_SUB(q1, w1);
	q2 = _SSE_SUB(q2, w2);
	q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
	q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

#ifdef DOUBLE_PRECISION_REAL
	h6 = _mm_set1_pd(hh[(ldh*5)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h6 = _mm_set1_ps(hh[(ldh*5)+2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
	q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

	_SSE_STORE(&q[ldq*2],q1);
	_SSE_STORE(&q[(ldq*2)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+1]);
#endif

	q1 = _SSE_LOAD(&q[ldq*3]);
	q2 = _SSE_LOAD(&q[(ldq*3)+offset]);
	q1 = _SSE_SUB(q1, z1);
	q2 = _SSE_SUB(q2, z2);

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
	q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

#ifdef DOUBLE_PRECISION_REAL
	h6 = _mm_set1_pd(hh[(ldh*5)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h6 = _mm_set1_ps(hh[(ldh*5)+3]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
	q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

	_SSE_STORE(&q[ldq*3],q1);
	_SSE_STORE(&q[(ldq*3)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+1]);
#endif

	q1 = _SSE_LOAD(&q[ldq*4]);
	q2 = _SSE_LOAD(&q[(ldq*4)+offset]);
	q1 = _SSE_SUB(q1, y1);
	q2 = _SSE_SUB(q2, y2);

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+3]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
	q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

#ifdef DOUBLE_PRECISION_REAL
	h6 = _mm_set1_pd(hh[(ldh*5)+4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h6 = _mm_set1_ps(hh[(ldh*5)+4]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
	q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

	_SSE_STORE(&q[ldq*4],q1);
	_SSE_STORE(&q[(ldq*4)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[(ldh)+1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[(ldh)+1]);
#endif

	q1 = _SSE_LOAD(&q[ldq*5]);
	q2 = _SSE_LOAD(&q[(ldq*5)+offset]);
	q1 = _SSE_SUB(q1, x1);
	q2 = _SSE_SUB(q2, x2);

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+3]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+4]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
	q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

#ifdef DOUBLE_PRECISION_REAL
	h6 = _mm_set1_pd(hh[(ldh*5)+5]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h6 = _mm_set1_ps(hh[(ldh*5)+5]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
	q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

	_SSE_STORE(&q[ldq*5],q1);
	_SSE_STORE(&q[(ldq*5)+offset],q2);

	for (i = 6; i < nb; i++)
	{
		q1 = _SSE_LOAD(&q[i*ldq]);
		q2 = _SSE_LOAD(&q[(i*ldq)+offset]);
#ifdef DOUBLE_PRECISION_REAL
		h1 = _mm_set1_pd(hh[i-5]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h1 = _mm_set1_ps(hh[i-5]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
		q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

#ifdef DOUBLE_PRECISION_REAL
		h2 = _mm_set1_pd(hh[ldh+i-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h2 = _mm_set1_ps(hh[ldh+i-4]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
		q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

#ifdef DOUBLE_PRECISION_REAL
		h3 = _mm_set1_pd(hh[(ldh*2)+i-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h3 = _mm_set1_ps(hh[(ldh*2)+i-3]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
		q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

#ifdef DOUBLE_PRECISION_REAL
		h4 = _mm_set1_pd(hh[(ldh*3)+i-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h4 = _mm_set1_ps(hh[(ldh*3)+i-2]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
		q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

#ifdef DOUBLE_PRECISION_REAL
		h5 = _mm_set1_pd(hh[(ldh*4)+i-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h5 = _mm_set1_ps(hh[(ldh*4)+i-1]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
		q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

#ifdef DOUBLE_PRECISION_REAL
		h6 = _mm_set1_pd(hh[(ldh*5)+i]);
#endif
#ifdef SINGLE_PRECISION_REAL
		h6 = _mm_set1_ps(hh[(ldh*5)+i]);
#endif

		q1 = _SSE_SUB(q1, _SSE_MUL(t1, h6));
		q2 = _SSE_SUB(q2, _SSE_MUL(t2, h6));

		_SSE_STORE(&q[i*ldq],q1);
		_SSE_STORE(&q[(i*ldq)+offset],q2);
	}

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-5]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-5]);
#endif

	q1 = _SSE_LOAD(&q[nb*ldq]);
	q2 = _SSE_LOAD(&q[(nb*ldq)+offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-4]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-3]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+nb-2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

#ifdef DOUBLE_PRECISION_REAL
	h5 = _mm_set1_pd(hh[(ldh*4)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h5 = _mm_set1_ps(hh[(ldh*4)+nb-1]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(v1, h5));
	q2 = _SSE_SUB(q2, _SSE_MUL(v2, h5));

	_SSE_STORE(&q[nb*ldq],q1);
	_SSE_STORE(&q[(nb*ldq)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-4]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-4]);
#endif

	q1 = _SSE_LOAD(&q[(nb+1)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+1)*ldq)+offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-3]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

#ifdef DOUBLE_PRECISION_REAL
	h4 = _mm_set1_pd(hh[(ldh*3)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h4 = _mm_set1_ps(hh[(ldh*3)+nb-1]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(w1, h4));
	q2 = _SSE_SUB(q2, _SSE_MUL(w2, h4));

	_SSE_STORE(&q[(nb+1)*ldq],q1);
	_SSE_STORE(&q[((nb+1)*ldq)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-3]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-3]);
#endif

	q1 = _SSE_LOAD(&q[(nb+2)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+2)*ldq)+offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-2]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

#ifdef DOUBLE_PRECISION_REAL
	h3 = _mm_set1_pd(hh[(ldh*2)+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h3 = _mm_set1_ps(hh[(ldh*2)+nb-1]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(z1, h3));
	q2 = _SSE_SUB(q2, _SSE_MUL(z2, h3));

	_SSE_STORE(&q[(nb+2)*ldq],q1);
	_SSE_STORE(&q[((nb+2)*ldq)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-2]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-2]);
#endif

	q1 = _SSE_LOAD(&q[(nb+3)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+3)*ldq)+offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

#ifdef DOUBLE_PRECISION_REAL
	h2 = _mm_set1_pd(hh[ldh+nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h2 = _mm_set1_ps(hh[ldh+nb-1]);
#endif

	q1 = _SSE_SUB(q1, _SSE_MUL(y1, h2));
	q2 = _SSE_SUB(q2, _SSE_MUL(y2, h2));

	_SSE_STORE(&q[(nb+3)*ldq],q1);
	_SSE_STORE(&q[((nb+3)*ldq)+offset],q2);

#ifdef DOUBLE_PRECISION_REAL
	h1 = _mm_set1_pd(hh[nb-1]);
#endif
#ifdef SINGLE_PRECISION_REAL
	h1 = _mm_set1_ps(hh[nb-1]);
#endif

	q1 = _SSE_LOAD(&q[(nb+4)*ldq]);
	q2 = _SSE_LOAD(&q[((nb+4)*ldq)+offset]);

	q1 = _SSE_SUB(q1, _SSE_MUL(x1, h1));
	q2 = _SSE_SUB(q2, _SSE_MUL(x2, h1));

	_SSE_STORE(&q[(nb+4)*ldq],q1);
	_SSE_STORE(&q[((nb+4)*ldq)+offset],q2);
}
/**
 * Unrolled kernel that computes
#ifdef DOUBLE_PRECISION_REAL
 * 2 rows of Q simultaneously, a
#endif
#ifdef SINGLE_PRECISION_REAL
 * 4 rows of Q simultaneously, a
#endif
 * matrix Vector product with six householder
 * vectors + a rank 6 update is performed
 */
#ifdef DOUBLE_PRECISION_REAL
__forceinline void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)