//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>

#define __forceinline __attribute__((always_inline)) static

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

// Forward declaration
static void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
static void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods);
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine hexa_hh_trafo_real_sse_6hv_double(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="hexa_hh_trafo_real_sse_6hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     type(c_ptr), value      :: q
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/
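
/* Illustrative sketch only (not from the original file): one way such a kernel could
 * be driven directly from C. The concrete sizes, the fill step and the helper name
 * `example_call' are assumptions; in ELPA the routine is called from Fortran via the
 * interface above. q needs nb+5 rows of ldq doubles and 16-byte alignment (the
 * kernels use aligned SSE loads/stores); hh holds the 6 householder vectors column
 * by column with leading dimension ldh.
 */
#if 0
static void example_call(void)
{
	int nb  = 32;   /* length of one householder vector     */
	int nq  = 8;    /* number of columns of q to transform  */
	int ldq = nq;   /* leading dimension of q (kept even)   */
	int ldh = nb;   /* leading dimension of hh              */

	double* q  = _mm_malloc(sizeof(double) * ldq * (nb+5), 16);
	double* hh = _mm_malloc(sizeof(double) * ldh * 6, 16);

	/* ... fill q and hh with the data produced by the band reduction ... */

	hexa_hh_trafo_real_sse_6hv_double(q, hh, &nb, &nq, &ldq, &ldh);

	_mm_free(q);
	_mm_free(hh);
}
#endif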
void hexa_hh_trafo_real_sse_6hv_double(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	// calculating scalar products to compute
	// 6 householder vectors simultaneously
	double scalarprods[15];

//	scalarprods[0] = s_1_2;
//	scalarprods[1] = s_1_3;
//	scalarprods[2] = s_2_3;
//	scalarprods[3] = s_1_4;
//	scalarprods[4] = s_2_4;
//	scalarprods[5] = s_3_4;
//	scalarprods[6] = s_1_5;
//	scalarprods[7] = s_2_5;
//	scalarprods[8] = s_3_5;
//	scalarprods[9] = s_4_5;
//	scalarprods[10] = s_1_6;
//	scalarprods[11] = s_2_6;
//	scalarprods[12] = s_3_6;
//	scalarprods[13] = s_4_6;
//	scalarprods[14] = s_5_6;

	scalarprods[0] = hh[(ldh+1)];
	scalarprods[1] = hh[(ldh*2)+2];
	scalarprods[2] = hh[(ldh*2)+1];
	scalarprods[3] = hh[(ldh*3)+3];
	scalarprods[4] = hh[(ldh*3)+2];
	scalarprods[5] = hh[(ldh*3)+1];
	scalarprods[6] = hh[(ldh*4)+4];
	scalarprods[7] = hh[(ldh*4)+3];
	scalarprods[8] = hh[(ldh*4)+2];
	scalarprods[9] = hh[(ldh*4)+1];
	scalarprods[10] = hh[(ldh*5)+5];
	scalarprods[11] = hh[(ldh*5)+4];
	scalarprods[12] = hh[(ldh*5)+3];
	scalarprods[13] = hh[(ldh*5)+2];
	scalarprods[14] = hh[(ldh*5)+1];
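	// each product above is seeded with the term that pairs the implicit leading 1
	// of the first vector of the pair with the matching entry of the second vector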

	// calculate scalar products of all pairs of householder vectors
	// loop counter = 2
	scalarprods[0] += hh[1] * hh[(2+ldh)];
	scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

	// loop counter = 3
	scalarprods[0] += hh[2] * hh[(3+ldh)];
	scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

	scalarprods[1] += hh[1] * hh[3+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

	// loop counter = 4
	scalarprods[0] += hh[3] * hh[(4+ldh)];
	scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

	scalarprods[1] += hh[2] * hh[4+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

	scalarprods[3] += hh[1] * hh[4+(ldh*3)];
	scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

	// loop counter = 5
	scalarprods[0] += hh[4] * hh[(5+ldh)];
	scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

	scalarprods[1] += hh[3] * hh[5+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

	scalarprods[3] += hh[2] * hh[5+(ldh*3)];
	scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

	scalarprods[6] += hh[1] * hh[5+(ldh*4)];
	scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

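	// i = 6 ... nb-1: every one of the 15 scalar products receives one contribution per iteration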
	#pragma ivdep
	for (i = 6; i < nb; i++)
	{
		scalarprods[0] += hh[i-1] * hh[(i+ldh)];
		scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
		scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
		scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
		scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

		scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
		scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
		scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
		scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

		scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
		scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
		scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

		scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
		scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

		scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
	}

//	printf("s_1_2: %f\n", scalarprods[0]);
//	printf("s_1_3: %f\n", scalarprods[1]);
//	printf("s_2_3: %f\n", scalarprods[2]);
//	printf("s_1_4: %f\n", scalarprods[3]);
//	printf("s_2_4: %f\n", scalarprods[4]);
//	printf("s_3_4: %f\n", scalarprods[5]);
//	printf("s_1_5: %f\n", scalarprods[6]);
//	printf("s_2_5: %f\n", scalarprods[7]);
//	printf("s_3_5: %f\n", scalarprods[8]);
//	printf("s_4_5: %f\n", scalarprods[9]);
//	printf("s_1_6: %f\n", scalarprods[10]);
//	printf("s_2_6: %f\n", scalarprods[11]);
//	printf("s_3_6: %f\n", scalarprods[12]);
//	printf("s_4_6: %f\n", scalarprods[13]);
//	printf("s_5_6: %f\n", scalarprods[14]);

	// Production level kernel calls with padding
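	// four columns of q are processed per call; a remainder of two columns (nq = 4k+2)
	// falls through to the 2-wide kernel below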
	for (i = 0; i < nq-2; i+=4)
	{
		hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
	}
	if (nq == i)
	{
		return;
	}
	else
	{
		hh_trafo_kernel_2_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
	}
}

#if 0
void hexa_hh_trafo_fast_(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	// calculating scalar products to compute
	// 6 householder vectors simultaneously
	double scalarprods[15];

//	scalarprods[0] = s_1_2;
//	scalarprods[1] = s_1_3;
//	scalarprods[2] = s_2_3;
//	scalarprods[3] = s_1_4;
//	scalarprods[4] = s_2_4;
//	scalarprods[5] = s_3_4;
//	scalarprods[6] = s_1_5;
//	scalarprods[7] = s_2_5;
//	scalarprods[8] = s_3_5;
//	scalarprods[9] = s_4_5;
//	scalarprods[10] = s_1_6;
//	scalarprods[11] = s_2_6;
//	scalarprods[12] = s_3_6;
//	scalarprods[13] = s_4_6;
//	scalarprods[14] = s_5_6;

	scalarprods[0] = hh[(ldh+1)];
	scalarprods[1] = hh[(ldh*2)+2];
	scalarprods[2] = hh[(ldh*2)+1];
	scalarprods[3] = hh[(ldh*3)+3];
	scalarprods[4] = hh[(ldh*3)+2];
	scalarprods[5] = hh[(ldh*3)+1];
	scalarprods[6] = hh[(ldh*4)+4];
	scalarprods[7] = hh[(ldh*4)+3];
	scalarprods[8] = hh[(ldh*4)+2];
	scalarprods[9] = hh[(ldh*4)+1];
	scalarprods[10] = hh[(ldh*5)+5];
	scalarprods[11] = hh[(ldh*5)+4];
	scalarprods[12] = hh[(ldh*5)+3];
	scalarprods[13] = hh[(ldh*5)+2];
	scalarprods[14] = hh[(ldh*5)+1];

	// calculate scalar products of all pairs of householder vectors
	// loop counter = 2
	scalarprods[0] += hh[1] * hh[(2+ldh)];
	scalarprods[2] += hh[(ldh)+1] * hh[2+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+1] * hh[2+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+1] * hh[2+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+1] * hh[2+(ldh*5)];

	// loop counter = 3
	scalarprods[0] += hh[2] * hh[(3+ldh)];
	scalarprods[2] += hh[(ldh)+2] * hh[3+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+2] * hh[3+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+2] * hh[3+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+2] * hh[3+(ldh*5)];

	scalarprods[1] += hh[1] * hh[3+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+1] * hh[3+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+1] * hh[3+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+1] * hh[3+(ldh*5)];

	// loop counter = 4
	scalarprods[0] += hh[3] * hh[(4+ldh)];
	scalarprods[2] += hh[(ldh)+3] * hh[4+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+3] * hh[4+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+3] * hh[4+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+3] * hh[4+(ldh*5)];

	scalarprods[1] += hh[2] * hh[4+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+2] * hh[4+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+2] * hh[4+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+2] * hh[4+(ldh*5)];

	scalarprods[3] += hh[1] * hh[4+(ldh*3)];
	scalarprods[7] += hh[(ldh)+1] * hh[4+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+1] * hh[4+(ldh*5)];

	// loop counter = 5
	scalarprods[0] += hh[4] * hh[(5+ldh)];
	scalarprods[2] += hh[(ldh)+4] * hh[5+(ldh*2)];
	scalarprods[5] += hh[(ldh*2)+4] * hh[5+(ldh*3)];
	scalarprods[9] += hh[(ldh*3)+4] * hh[5+(ldh*4)];
	scalarprods[14] += hh[(ldh*4)+4] * hh[5+(ldh*5)];

	scalarprods[1] += hh[3] * hh[5+(ldh*2)];
	scalarprods[4] += hh[(ldh*1)+3] * hh[5+(ldh*3)];
	scalarprods[8] += hh[(ldh*2)+3] * hh[5+(ldh*4)];
	scalarprods[13] += hh[(ldh*3)+3] * hh[5+(ldh*5)];

	scalarprods[3] += hh[2] * hh[5+(ldh*3)];
	scalarprods[7] += hh[(ldh)+2] * hh[5+(ldh*4)];
	scalarprods[12] += hh[(ldh*2)+2] * hh[5+(ldh*5)];

	scalarprods[6] += hh[1] * hh[5+(ldh*4)];
	scalarprods[11] += hh[(ldh)+1] * hh[5+(ldh*5)];

	#pragma ivdep
	for (i = 6; i < nb; i++)
	{
		scalarprods[0] += hh[i-1] * hh[(i+ldh)];
		scalarprods[2] += hh[(ldh)+i-1] * hh[i+(ldh*2)];
		scalarprods[5] += hh[(ldh*2)+i-1] * hh[i+(ldh*3)];
		scalarprods[9] += hh[(ldh*3)+i-1] * hh[i+(ldh*4)];
		scalarprods[14] += hh[(ldh*4)+i-1] * hh[i+(ldh*5)];

		scalarprods[1] += hh[i-2] * hh[i+(ldh*2)];
		scalarprods[4] += hh[(ldh*1)+i-2] * hh[i+(ldh*3)];
		scalarprods[8] += hh[(ldh*2)+i-2] * hh[i+(ldh*4)];
		scalarprods[13] += hh[(ldh*3)+i-2] * hh[i+(ldh*5)];

		scalarprods[3] += hh[i-3] * hh[i+(ldh*3)];
		scalarprods[7] += hh[(ldh)+i-3] * hh[i+(ldh*4)];
		scalarprods[12] += hh[(ldh*2)+i-3] * hh[i+(ldh*5)];

		scalarprods[6] += hh[i-4] * hh[i+(ldh*4)];
		scalarprods[11] += hh[(ldh)+i-4] * hh[i+(ldh*5)];

		scalarprods[10] += hh[i-5] * hh[i+(ldh*5)];
	}

//	printf("s_1_2: %f\n", scalarprods[0]);
//	printf("s_1_3: %f\n", scalarprods[1]);
//	printf("s_2_3: %f\n", scalarprods[2]);
//	printf("s_1_4: %f\n", scalarprods[3]);
//	printf("s_2_4: %f\n", scalarprods[4]);
//	printf("s_3_4: %f\n", scalarprods[5]);
//	printf("s_1_5: %f\n", scalarprods[6]);
//	printf("s_2_5: %f\n", scalarprods[7]);
//	printf("s_3_5: %f\n", scalarprods[8]);
//	printf("s_4_5: %f\n", scalarprods[9]);
//	printf("s_1_6: %f\n", scalarprods[10]);
//	printf("s_2_6: %f\n", scalarprods[11]);
//	printf("s_3_6: %f\n", scalarprods[12]);
//	printf("s_4_6: %f\n", scalarprods[13]);
//	printf("s_5_6: %f\n", scalarprods[14]);

	// Production level kernel calls with padding
#ifdef __AVX__
	for (i = 0; i < nq; i+=8)
	{
		hh_trafo_kernel_8_AVX_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
	}
#else
	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_kernel_4_SSE_6hv_double(&q[i], hh, nb, ldq, ldh, scalarprods);
	}
#endif
}
#endif

/**
 * Unrolled kernel that computes
 * 4 rows of Q simultaneously, a
 * matrix vector product with six householder
 * vectors + a rank 1 update is performed
 */
__forceinline void hh_trafo_kernel_4_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [4 x nb+5] * hh
	// hh contains six householder vectors
	/////////////////////////////////////////////////////
	int i;

	__m128d a1_1 = _mm_load_pd(&q[ldq*5]);
	__m128d a2_1 = _mm_load_pd(&q[ldq*4]);
	__m128d a3_1 = _mm_load_pd(&q[ldq*3]);
	__m128d a4_1 = _mm_load_pd(&q[ldq*2]);
	__m128d a5_1 = _mm_load_pd(&q[ldq]);
	__m128d a6_1 = _mm_load_pd(&q[0]);

	__m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
	__m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]);
	__m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]);
	__m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]);
	__m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]);
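	// x1/y1/z1/w1/v1/t1 accumulate the dot products of this column block of q with
	// householder vectors 1..6 (x -> vector 1, ..., t -> vector 6); the vectors are
	// shifted against each other by one row and their implicit leading 1 enters via
	// the a*_1 operands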
	register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1));
	__m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
	__m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]);
	__m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]);
	__m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]);
	register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1));
	__m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
	__m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
	__m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]);
	register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1));
	__m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
	__m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
	register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2));
	z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1));
	register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1));
	register __m128d x1 = a1_1;

	__m128d a1_2 = _mm_load_pd(&q[(ldq*5)+2]);
	__m128d a2_2 = _mm_load_pd(&q[(ldq*4)+2]);
	__m128d a3_2 = _mm_load_pd(&q[(ldq*3)+2]);
	__m128d a4_2 = _mm_load_pd(&q[(ldq*2)+2]);
	__m128d a5_2 = _mm_load_pd(&q[(ldq)+2]);
	__m128d a6_2 = _mm_load_pd(&q[2]);

	register __m128d t2 = _mm_add_pd(a6_2, _mm_mul_pd(a5_2, h_6_5));
	t2 = _mm_add_pd(t2, _mm_mul_pd(a4_2, h_6_4));
	t2 = _mm_add_pd(t2, _mm_mul_pd(a3_2, h_6_3));
	t2 = _mm_add_pd(t2, _mm_mul_pd(a2_2, h_6_2));
	t2 = _mm_add_pd(t2, _mm_mul_pd(a1_2, h_6_1));
	register __m128d v2 = _mm_add_pd(a5_2, _mm_mul_pd(a4_2, h_5_4));
	v2 = _mm_add_pd(v2, _mm_mul_pd(a3_2, h_5_3));
	v2 = _mm_add_pd(v2, _mm_mul_pd(a2_2, h_5_2));
	v2 = _mm_add_pd(v2, _mm_mul_pd(a1_2, h_5_1));
	register __m128d w2 = _mm_add_pd(a4_2, _mm_mul_pd(a3_2, h_4_3));
	w2 = _mm_add_pd(w2, _mm_mul_pd(a2_2, h_4_2));
	w2 = _mm_add_pd(w2, _mm_mul_pd(a1_2, h_4_1));
	register __m128d z2 = _mm_add_pd(a3_2, _mm_mul_pd(a2_2, h_3_2));
	z2 = _mm_add_pd(z2, _mm_mul_pd(a1_2, h_3_1));
	register __m128d y2 = _mm_add_pd(a2_2, _mm_mul_pd(a1_2, h_2_1));
	register __m128d x2 = a1_2;

	__m128d q1;
	__m128d q2;

	__m128d h1;
	__m128d h2;
	__m128d h3;
	__m128d h4;
	__m128d h5;
	__m128d h6;

	for(i = 6; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-5]);
		q1 = _mm_load_pd(&q[i*ldq]);
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
		h2 = _mm_loaddup_pd(&hh[ldh+i-4]);
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
		h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]);
		z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
		z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3));
		h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]);
		w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
		w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4));
		h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]);
		v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));
		v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5));
		h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]);
		t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6));
		t2 = _mm_add_pd(t2, _mm_mul_pd(q2,h6));
	}

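	// tail of the accumulation: the six vectors end at different rows, so rows
	// nb ... nb+4 contribute to progressively fewer of the dot products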
	h1 = _mm_loaddup_pd(&hh[nb-5]);
	q1 = _mm_load_pd(&q[nb*ldq]);
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-4]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
	z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]);
	w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
	w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]);
	v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));
	v2 = _mm_add_pd(v2, _mm_mul_pd(q2,h5));

	h1 = _mm_loaddup_pd(&hh[nb-4]);
	q1 = _mm_load_pd(&q[(nb+1)*ldq]);
	q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-3]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
	z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]);
	w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
	w2 = _mm_add_pd(w2, _mm_mul_pd(q2,h4));

	h1 = _mm_loaddup_pd(&hh[nb-3]);
	q1 = _mm_load_pd(&q[(nb+2)*ldq]);
	q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
	z2 = _mm_add_pd(z2, _mm_mul_pd(q2,h3));

	h1 = _mm_loaddup_pd(&hh[nb-2]);
	q1 = _mm_load_pd(&q[(nb+3)*ldq]);
	q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	y2 = _mm_add_pd(y2, _mm_mul_pd(q2,h2));

	h1 = _mm_loaddup_pd(&hh[nb-1]);
	q1 = _mm_load_pd(&q[(nb+4)*ldq]);
	q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	x2 = _mm_add_pd(x2, _mm_mul_pd(q2,h1));

	/////////////////////////////////////////////////////
	// Apply tau, correct wrong calculation using pre-calculated scalar products
	/////////////////////////////////////////////////////
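	// each projection is scaled by its tau; the tau * s_i_j cross terms then subtract
	// the overlap with the already corrected projections, so x1/y1/z1/w1/v1/t1 (and
	// their *2 twins) become the update vectors applied below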

	__m128d tau1 = _mm_loaddup_pd(&hh[0]);
	x1 = _mm_mul_pd(x1, tau1);
	x2 = _mm_mul_pd(x2, tau1);

	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]);
	h2 = _mm_mul_pd(tau2, vs_1_2);
	y1 = _mm_sub_pd(_mm_mul_pd(y1,tau2), _mm_mul_pd(x1,h2));
	y2 = _mm_sub_pd(_mm_mul_pd(y2,tau2), _mm_mul_pd(x2,h2));

	__m128d tau3 = _mm_loaddup_pd(&hh[ldh*2]);
	__m128d vs_1_3 = _mm_loaddup_pd(&scalarprods[1]);
	__m128d vs_2_3 = _mm_loaddup_pd(&scalarprods[2]);
	h2 = _mm_mul_pd(tau3, vs_1_3);
	h3 = _mm_mul_pd(tau3, vs_2_3);
	z1 = _mm_sub_pd(_mm_mul_pd(z1,tau3), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)));
	z2 = _mm_sub_pd(_mm_mul_pd(z2,tau3), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)));

	__m128d tau4 = _mm_loaddup_pd(&hh[ldh*3]);
	__m128d vs_1_4 = _mm_loaddup_pd(&scalarprods[3]);
	__m128d vs_2_4 = _mm_loaddup_pd(&scalarprods[4]);
	h2 = _mm_mul_pd(tau4, vs_1_4);
	h3 = _mm_mul_pd(tau4, vs_2_4);
	__m128d vs_3_4 = _mm_loaddup_pd(&scalarprods[5]);
	h4 = _mm_mul_pd(tau4, vs_3_4);
	w1 = _mm_sub_pd(_mm_mul_pd(w1,tau4), _mm_add_pd(_mm_mul_pd(z1,h4), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))));
	w2 = _mm_sub_pd(_mm_mul_pd(w2,tau4), _mm_add_pd(_mm_mul_pd(z2,h4), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))));

	__m128d tau5 = _mm_loaddup_pd(&hh[ldh*4]);
	__m128d vs_1_5 = _mm_loaddup_pd(&scalarprods[6]);
	__m128d vs_2_5 = _mm_loaddup_pd(&scalarprods[7]);
	h2 = _mm_mul_pd(tau5, vs_1_5);
	h3 = _mm_mul_pd(tau5, vs_2_5);
	__m128d vs_3_5 = _mm_loaddup_pd(&scalarprods[8]);
	__m128d vs_4_5 = _mm_loaddup_pd(&scalarprods[9]);
	h4 = _mm_mul_pd(tau5, vs_3_5);
	h5 = _mm_mul_pd(tau5, vs_4_5);
	v1 = _mm_sub_pd(_mm_mul_pd(v1,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2))));
	v2 = _mm_sub_pd(_mm_mul_pd(v2,tau5), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2))));

	__m128d tau6 = _mm_loaddup_pd(&hh[ldh*5]);
	__m128d vs_1_6 = _mm_loaddup_pd(&scalarprods[10]);
	__m128d vs_2_6 = _mm_loaddup_pd(&scalarprods[11]);
	h2 = _mm_mul_pd(tau6, vs_1_6);
	h3 = _mm_mul_pd(tau6, vs_2_6);
	__m128d vs_3_6 = _mm_loaddup_pd(&scalarprods[12]);
	__m128d vs_4_6 = _mm_loaddup_pd(&scalarprods[13]);
	__m128d vs_5_6 = _mm_loaddup_pd(&scalarprods[14]);
	h4 = _mm_mul_pd(tau6, vs_3_6);
	h5 = _mm_mul_pd(tau6, vs_4_6);
	h6 = _mm_mul_pd(tau6, vs_5_6);
	t1 = _mm_sub_pd(_mm_mul_pd(t1,tau6), _mm_add_pd( _mm_mul_pd(v1,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w1,h5), _mm_mul_pd(z1,h4)), _mm_add_pd(_mm_mul_pd(y1,h3), _mm_mul_pd(x1,h2)))));
	t2 = _mm_sub_pd(_mm_mul_pd(t2,tau6), _mm_add_pd( _mm_mul_pd(v2,h6), _mm_add_pd(_mm_add_pd(_mm_mul_pd(w2,h5), _mm_mul_pd(z2,h4)), _mm_add_pd(_mm_mul_pd(y2,h3), _mm_mul_pd(x2,h2)))));

	/////////////////////////////////////////////////////
	// Rank-1 update of Q [4 x nb+5]
	/////////////////////////////////////////////////////

	q1 = _mm_load_pd(&q[0]);
	q2 = _mm_load_pd(&q[2]);
	q1 = _mm_sub_pd(q1, t1);
	q2 = _mm_sub_pd(q2, t2);
	_mm_store_pd(&q[0],q1);
	_mm_store_pd(&q[2],q2);

	h6 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
	q1 = _mm_load_pd(&q[ldq]);
	q2 = _mm_load_pd(&q[(ldq+2)]);
	q1 = _mm_sub_pd(q1, v1);
	q2 = _mm_sub_pd(q2, v2);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
	_mm_store_pd(&q[ldq],q1);
	_mm_store_pd(&q[(ldq+2)],q2);

	h5 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
	q1 = _mm_load_pd(&q[ldq*2]);
	q2 = _mm_load_pd(&q[(ldq*2)+2]);
	q1 = _mm_sub_pd(q1, w1);
	q2 = _mm_sub_pd(q2, w2);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
	h6 = _mm_loaddup_pd(&hh[(ldh*5)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
	_mm_store_pd(&q[ldq*2],q1);
	_mm_store_pd(&q[(ldq*2)+2],q2);

	h4 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
	q1 = _mm_load_pd(&q[ldq*3]);
	q2 = _mm_load_pd(&q[(ldq*3)+2]);
	q1 = _mm_sub_pd(q1, z1);
	q2 = _mm_sub_pd(q2, z2);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
	h6 = _mm_loaddup_pd(&hh[(ldh*5)+3]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
	_mm_store_pd(&q[ldq*3],q1);
	_mm_store_pd(&q[(ldq*3)+2],q2);

	h3 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
	q1 = _mm_load_pd(&q[ldq*4]);
	q2 = _mm_load_pd(&q[(ldq*4)+2]);
	q1 = _mm_sub_pd(q1, y1);
	q2 = _mm_sub_pd(q2, y2);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+3]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
	h6 = _mm_loaddup_pd(&hh[(ldh*5)+4]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
	_mm_store_pd(&q[ldq*4],q1);
	_mm_store_pd(&q[(ldq*4)+2],q2);

	h2 = _mm_loaddup_pd(&hh[(ldh)+1]);
	q1 = _mm_load_pd(&q[ldq*5]);
	q2 = _mm_load_pd(&q[(ldq*5)+2]);
	q1 = _mm_sub_pd(q1, x1);
	q2 = _mm_sub_pd(q2, x2);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+3]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+4]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
	h6 = _mm_loaddup_pd(&hh[(ldh*5)+5]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
	_mm_store_pd(&q[ldq*5],q1);
	_mm_store_pd(&q[(ldq*5)+2],q2);

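	// rows 6 ... nb-1: subtract the contribution of all six householder vectors from q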
	for (i = 6; i < nb; i++)
	{
		q1 = _mm_load_pd(&q[i*ldq]);
		q2 = _mm_load_pd(&q[(i*ldq)+2]);
		h1 = _mm_loaddup_pd(&hh[i-5]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
		h2 = _mm_loaddup_pd(&hh[ldh+i-4]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
		h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
		h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
		h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
		h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]);
		q1 = _mm_sub_pd(q1, _mm_mul_pd(t1, h6));
		q2 = _mm_sub_pd(q2, _mm_mul_pd(t2, h6));
		_mm_store_pd(&q[i*ldq],q1);
		_mm_store_pd(&q[(i*ldq)+2],q2);
	}

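	// tail rows nb ... nb+4: only the householder vectors that still reach these rows are applied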
	h1 = _mm_loaddup_pd(&hh[nb-5]);
	q1 = _mm_load_pd(&q[nb*ldq]);
	q2 = _mm_load_pd(&q[(nb*ldq)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-4]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(v1, h5));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(v2, h5));
	_mm_store_pd(&q[nb*ldq],q1);
	_mm_store_pd(&q[(nb*ldq)+2],q2);

	h1 = _mm_loaddup_pd(&hh[nb-4]);
	q1 = _mm_load_pd(&q[(nb+1)*ldq]);
	q2 = _mm_load_pd(&q[((nb+1)*ldq)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-3]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(w1, h4));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(w2, h4));
	_mm_store_pd(&q[(nb+1)*ldq],q1);
	_mm_store_pd(&q[((nb+1)*ldq)+2],q2);

	h1 = _mm_loaddup_pd(&hh[nb-3]);
	q1 = _mm_load_pd(&q[(nb+2)*ldq]);
	q2 = _mm_load_pd(&q[((nb+2)*ldq)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(z1, h3));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(z2, h3));
	_mm_store_pd(&q[(nb+2)*ldq],q1);
	_mm_store_pd(&q[((nb+2)*ldq)+2],q2);

	h1 = _mm_loaddup_pd(&hh[nb-2]);
	q1 = _mm_load_pd(&q[(nb+3)*ldq]);
	q2 = _mm_load_pd(&q[((nb+3)*ldq)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(y1, h2));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(y2, h2));
	_mm_store_pd(&q[(nb+3)*ldq],q1);
	_mm_store_pd(&q[((nb+3)*ldq)+2],q2);

	h1 = _mm_loaddup_pd(&hh[nb-1]);
	q1 = _mm_load_pd(&q[(nb+4)*ldq]);
	q2 = _mm_load_pd(&q[((nb+4)*ldq)+2]);
	q1 = _mm_sub_pd(q1, _mm_mul_pd(x1, h1));
	q2 = _mm_sub_pd(q2, _mm_mul_pd(x2, h1));
	_mm_store_pd(&q[(nb+4)*ldq],q1);
	_mm_store_pd(&q[((nb+4)*ldq)+2],q2);
}

/**
 * Unrolled kernel that computes
 * 2 rows of Q simultaneously, a
 * matrix vector product with six householder
 * vectors + a rank 1 update is performed
 */
__forceinline void hh_trafo_kernel_2_SSE_6hv_double(double* q, double* hh, int nb, int ldq, int ldh, double* scalarprods)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [2 x nb+5] * hh
	// hh contains six householder vectors
	/////////////////////////////////////////////////////
	int i;

	__m128d a1_1 = _mm_load_pd(&q[ldq*5]);
	__m128d a2_1 = _mm_load_pd(&q[ldq*4]);
	__m128d a3_1 = _mm_load_pd(&q[ldq*3]);
	__m128d a4_1 = _mm_load_pd(&q[ldq*2]);
	__m128d a5_1 = _mm_load_pd(&q[ldq]);
	__m128d a6_1 = _mm_load_pd(&q[0]);

	__m128d h_6_5 = _mm_loaddup_pd(&hh[(ldh*5)+1]);
	__m128d h_6_4 = _mm_loaddup_pd(&hh[(ldh*5)+2]);
	__m128d h_6_3 = _mm_loaddup_pd(&hh[(ldh*5)+3]);
	__m128d h_6_2 = _mm_loaddup_pd(&hh[(ldh*5)+4]);
	__m128d h_6_1 = _mm_loaddup_pd(&hh[(ldh*5)+5]);
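	// same accumulation scheme as in the 4-column kernel above, for the remaining two columns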
	register __m128d t1 = _mm_add_pd(a6_1, _mm_mul_pd(a5_1, h_6_5));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a4_1, h_6_4));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a3_1, h_6_3));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a2_1, h_6_2));
	t1 = _mm_add_pd(t1, _mm_mul_pd(a1_1, h_6_1));
	__m128d h_5_4 = _mm_loaddup_pd(&hh[(ldh*4)+1]);
	__m128d h_5_3 = _mm_loaddup_pd(&hh[(ldh*4)+2]);
	__m128d h_5_2 = _mm_loaddup_pd(&hh[(ldh*4)+3]);
	__m128d h_5_1 = _mm_loaddup_pd(&hh[(ldh*4)+4]);
	register __m128d v1 = _mm_add_pd(a5_1, _mm_mul_pd(a4_1, h_5_4));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a3_1, h_5_3));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a2_1, h_5_2));
	v1 = _mm_add_pd(v1, _mm_mul_pd(a1_1, h_5_1));
	__m128d h_4_3 = _mm_loaddup_pd(&hh[(ldh*3)+1]);
	__m128d h_4_2 = _mm_loaddup_pd(&hh[(ldh*3)+2]);
	__m128d h_4_1 = _mm_loaddup_pd(&hh[(ldh*3)+3]);
	register __m128d w1 = _mm_add_pd(a4_1, _mm_mul_pd(a3_1, h_4_3));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a2_1, h_4_2));
	w1 = _mm_add_pd(w1, _mm_mul_pd(a1_1, h_4_1));
	__m128d h_2_1 = _mm_loaddup_pd(&hh[ldh+1]);
	__m128d h_3_2 = _mm_loaddup_pd(&hh[(ldh*2)+1]);
	__m128d h_3_1 = _mm_loaddup_pd(&hh[(ldh*2)+2]);
	register __m128d z1 = _mm_add_pd(a3_1, _mm_mul_pd(a2_1, h_3_2));
	z1 = _mm_add_pd(z1, _mm_mul_pd(a1_1, h_3_1));
	register __m128d y1 = _mm_add_pd(a2_1, _mm_mul_pd(a1_1, h_2_1));
	register __m128d x1 = a1_1;

	__m128d q1;

	__m128d h1;
	__m128d h2;
	__m128d h3;
	__m128d h4;
	__m128d h5;
	__m128d h6;

	for(i = 6; i < nb; i++)
	{
		h1 = _mm_loaddup_pd(&hh[i-5]);
		q1 = _mm_load_pd(&q[i*ldq]);
		x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
		h2 = _mm_loaddup_pd(&hh[ldh+i-4]);
		y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
		h3 = _mm_loaddup_pd(&hh[(ldh*2)+i-3]);
		z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
		h4 = _mm_loaddup_pd(&hh[(ldh*3)+i-2]);
		w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
		h5 = _mm_loaddup_pd(&hh[(ldh*4)+i-1]);
		v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));
		h6 = _mm_loaddup_pd(&hh[(ldh*5)+i]);
		t1 = _mm_add_pd(t1, _mm_mul_pd(q1,h6));
	}

	h1 = _mm_loaddup_pd(&hh[nb-5]);
	q1 = _mm_load_pd(&q[nb*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-4]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-3]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-2]);
	w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));
	h5 = _mm_loaddup_pd(&hh[(ldh*4)+nb-1]);
	v1 = _mm_add_pd(v1, _mm_mul_pd(q1,h5));

	h1 = _mm_loaddup_pd(&hh[nb-4]);
	q1 = _mm_load_pd(&q[(nb+1)*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-3]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-2]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));
	h4 = _mm_loaddup_pd(&hh[(ldh*3)+nb-1]);
	w1 = _mm_add_pd(w1, _mm_mul_pd(q1,h4));

	h1 = _mm_loaddup_pd(&hh[nb-3]);
	q1 = _mm_load_pd(&q[(nb+2)*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-2]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));
	h3 = _mm_loaddup_pd(&hh[(ldh*2)+nb-1]);
	z1 = _mm_add_pd(z1, _mm_mul_pd(q1,h3));

	h1 = _mm_loaddup_pd(&hh[nb-2]);
	q1 = _mm_load_pd(&q[(nb+3)*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));
	h2 = _mm_loaddup_pd(&hh[ldh+nb-1]);
	y1 = _mm_add_pd(y1, _mm_mul_pd(q1,h2));

	h1 = _mm_loaddup_pd(&hh[nb-1]);
	q1 = _mm_load_pd(&q[(nb+4)*ldq]);
	x1 = _mm_add_pd(x1, _mm_mul_pd(q1,h1));

	/////////////////////////////////////////////////////
	// Apply tau, correct wrong calculation using pre-calculated scalar products
	/////////////////////////////////////////////////////

	__m128d tau1 = _mm_loaddup_pd(&hh[0]);
	x1 = _mm_mul_pd(x1, tau1);

	__m128d tau2 = _mm_loaddup_pd(&hh[ldh]);
	__m128d vs_1_2 = _mm_loaddup_pd(&scalarprods[0]);
	h2 = _mm_mul_pd(tau2, vs_1_2);
	y1 = _mm_sub_pd(_mm_mul_pd