//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <x86intrin.h>

#define __forceinline __attribute__((always_inline)) static

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_macc_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMA_pd(a,b,c) _mm256_fmadd_pd(a,b,c)
#endif
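
// _mm256_FMA_pd(a,b,c) evaluates a*b+c as a single fused multiply-add:
// _mm256_macc_pd on FMA4 hardware, _mm256_fmadd_pd on AVX2. When neither is
// available, __ELPA_USE_FMA__ stays undefined and the #else branches below
// fall back to separate _mm256_mul_pd/_mm256_add_pd instructions.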

#endif

// Forward declarations
__forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);
__forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s);

/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_real_avx_avx2_2hv(q, hh, pnb, pnq, pldq, pldh) bind(C, name="double_hh_trafo_real_avx_avx2_2hv")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     real(kind=c_double)     :: q(*)
!f>     real(kind=c_double)     :: hh(pnb,6)
!f>   end subroutine
!f> end interface
!f>#endif
*/

void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh);
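
/*
   Illustrative call from C (not part of the library; the sizes and array
   contents below are placeholders): all size arguments are passed by pointer
   so that the same entry point can also be called from Fortran through the
   bind(C) interface above. Because the kernels use aligned AVX loads and
   stores on q, q must be 32-byte aligned and ldq must be a multiple of 4.

     int nb = 64, nq = 24, ldq = 24, ldh = 64;
     double *q  = _mm_malloc((size_t)ldq*(nb+1)*sizeof(double), 32);
     double *hh = _mm_malloc((size_t)ldh*2*sizeof(double), 32);
     // ... fill q with the matrix block and hh with the two Householder
     // ... vectors (tau1 = hh[0], tau2 = hh[ldh]) ...
     double_hh_trafo_real_avx_avx2_2hv(q, hh, &nb, &nq, &ldq, &ldh);
     _mm_free(q);
     _mm_free(hh);
*/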

void double_hh_trafo_real_avx_avx2_2hv(double* q, double* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	// calculating the scalar product of the two Householder vectors
	// (shifted by one element against each other), needed to apply both simultaneously
	double s = hh[(ldh)+1]*1.0;

	#pragma ivdep
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * hh[(i+ldh)];
	}

	// Production level kernel calls with padding
	for (i = 0; i < nq-20; i+=24)
	{
		hh_trafo_kernel_24_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}

	if (nq == i)
	{
		return;
	}

	if (nq-i == 20)
	{
		hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
		hh_trafo_kernel_4_AVX_2hv(&q[i+16], hh, nb, ldq, ldh, s);
	}
	else if (nq-i == 16)
	{
		hh_trafo_kernel_16_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	else if (nq-i == 12)
	{
		hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
		hh_trafo_kernel_4_AVX_2hv(&q[i+8], hh, nb, ldq, ldh, s);
	}
	else if (nq-i == 8)
	{
		hh_trafo_kernel_8_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
	else
	{
		hh_trafo_kernel_4_AVX_2hv(&q[i], hh, nb, ldq, ldh, s);
	}
}
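
// The four kernels below differ only in how many rows of q they process per
// call (24, 16, 8 or 4) and hence in how many AVX registers they keep live.
// As an illustration only, the following scalar sketch (kept out of
// compilation; the helper name and the nrows parameter are ours, not part of
// ELPA) spells out the same computation row by row:
#if 0
static void hh_trafo_kernel_scalar_2hv_sketch(double* q, double* hh, int nb, int ldq, int ldh, double s, int nrows)
{
	int i, j;
	for (j = 0; j < nrows; j++)
	{
		// x = Q*v1 and y = Q*v2, where v1 and v2 are the two Householder
		// vectors stored in hh (leading element 1.0, offset of one column between them)
		double x = q[ldq+j];
		double y = q[j] + q[ldq+j]*hh[ldh+1];
		for (i = 2; i < nb; i++)
		{
			x += q[i*ldq+j]*hh[i-1];
			y += q[i*ldq+j]*hh[ldh+i];
		}
		x += q[nb*ldq+j]*hh[nb-1];

		// rank-2 update with tau1 = hh[0], tau2 = hh[ldh]
		x *= -hh[0];
		y = y*(-hh[ldh]) + x*(-hh[ldh]*s);

		q[j]       += y;
		q[ldq+j]   += x + y*hh[ldh+1];
		for (i = 2; i < nb; i++)
		{
			q[i*ldq+j] += x*hh[i-1] + y*hh[ldh+i];
		}
		q[nb*ldq+j] += x*hh[nb-1];
	}
}
#endif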
/**
 * Unrolled kernel that computes
 * 24 rows of Q simultaneously: a
 * matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
 __forceinline void hh_trafo_kernel_24_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [24 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000);

	__m256d x1 = _mm256_load_pd(&q[ldq]);
	__m256d x2 = _mm256_load_pd(&q[ldq+4]);
	__m256d x3 = _mm256_load_pd(&q[ldq+8]);
	__m256d x4 = _mm256_load_pd(&q[ldq+12]);
	__m256d x5 = _mm256_load_pd(&q[ldq+16]);
	__m256d x6 = _mm256_load_pd(&q[ldq+20]);

	__m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]);
	__m256d h2;

#ifdef __ELPA_USE_FMA__
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_FMA_pd(x1, h1, q1);
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_FMA_pd(x2, h1, q2);
	__m256d q3 = _mm256_load_pd(&q[8]);
	__m256d y3 = _mm256_FMA_pd(x3, h1, q3);
	__m256d q4 = _mm256_load_pd(&q[12]);
	__m256d y4 = _mm256_FMA_pd(x4, h1, q4);
	__m256d q5 = _mm256_load_pd(&q[16]);
	__m256d y5 = _mm256_FMA_pd(x5, h1, q5);
	__m256d q6 = _mm256_load_pd(&q[20]);
	__m256d y6 = _mm256_FMA_pd(x6, h1, q6);
#else
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
	__m256d q3 = _mm256_load_pd(&q[8]);
	__m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1));
	__m256d q4 = _mm256_load_pd(&q[12]);
	__m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1));
	__m256d q5 = _mm256_load_pd(&q[16]);
	__m256d y5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1));
	__m256d q6 = _mm256_load_pd(&q[20]);
	__m256d y6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1));
#endif

	for(i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_FMA_pd(q1, h1, x1);
		y1 = _mm256_FMA_pd(q1, h2, y1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_FMA_pd(q2, h1, x2);
		y2 = _mm256_FMA_pd(q2, h2, y2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		x3 = _mm256_FMA_pd(q3, h1, x3);
		y3 = _mm256_FMA_pd(q3, h2, y3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		x4 = _mm256_FMA_pd(q4, h1, x4);
		y4 = _mm256_FMA_pd(q4, h2, y4);
		q5 = _mm256_load_pd(&q[(i*ldq)+16]);
		x5 = _mm256_FMA_pd(q5, h1, x5);
		y5 = _mm256_FMA_pd(q5, h2, y5);
		q6 = _mm256_load_pd(&q[(i*ldq)+20]);
		x6 = _mm256_FMA_pd(q6, h1, x6);
		y6 = _mm256_FMA_pd(q6, h2, y6);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
		y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
		y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
		y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2));
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
		y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2));
		q5 = _mm256_load_pd(&q[(i*ldq)+16]);
		x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1));
		y5 = _mm256_add_pd(y5, _mm256_mul_pd(q5,h2));
		q6 = _mm256_load_pd(&q[(i*ldq)+20]);
		x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1));
		y6 = _mm256_add_pd(y6, _mm256_mul_pd(q6,h2));
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_FMA_pd(q1, h1, x1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_FMA_pd(q2, h1, x2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	x3 = _mm256_FMA_pd(q3, h1, x3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	x4 = _mm256_FMA_pd(q4, h1, x4);
	q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
	x5 = _mm256_FMA_pd(q5, h1, x5);
	q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
	x6 = _mm256_FMA_pd(q6, h1, x6);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
	q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
	x5 = _mm256_add_pd(x5, _mm256_mul_pd(q5,h1));
	q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
	x6 = _mm256_add_pd(x6, _mm256_mul_pd(q6,h1));
#endif

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [24 x nb+1]
	/////////////////////////////////////////////////////

	__m256d tau1 = _mm256_broadcast_sd(hh);
	__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
	__m256d vs = _mm256_broadcast_sd(&s);

	h1 = _mm256_xor_pd(tau1, sign);
	x1 = _mm256_mul_pd(x1, h1);
	x2 = _mm256_mul_pd(x2, h1);
	x3 = _mm256_mul_pd(x3, h1);
	x4 = _mm256_mul_pd(x4, h1);
	x5 = _mm256_mul_pd(x5, h1);
	x6 = _mm256_mul_pd(x6, h1);
	h1 = _mm256_xor_pd(tau2, sign);
	h2 = _mm256_mul_pd(h1, vs);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2));
	y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2));
	y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2));
	y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2));
	y5 = _mm256_FMA_pd(y5, h1, _mm256_mul_pd(x5,h2));
	y6 = _mm256_FMA_pd(y6, h1, _mm256_mul_pd(x6,h2));
#else
	y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
	y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2));
	y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2));
	y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2));
	y5 = _mm256_add_pd(_mm256_mul_pd(y5,h1), _mm256_mul_pd(x5,h2));
	y6 = _mm256_add_pd(_mm256_mul_pd(y6,h1), _mm256_mul_pd(x6,h2));
#endif

	q1 = _mm256_load_pd(q);
	q1 = _mm256_add_pd(q1, y1);
	_mm256_store_pd(q,q1);
	q2 = _mm256_load_pd(&q[4]);
	q2 = _mm256_add_pd(q2, y2);
	_mm256_store_pd(&q[4],q2);
	q3 = _mm256_load_pd(&q[8]);
	q3 = _mm256_add_pd(q3, y3);
	_mm256_store_pd(&q[8],q3);
	q4 = _mm256_load_pd(&q[12]);
	q4 = _mm256_add_pd(q4, y4);
	_mm256_store_pd(&q[12],q4);
	q5 = _mm256_load_pd(&q[16]);
	q5 = _mm256_add_pd(q5, y5);
	_mm256_store_pd(&q[16],q5);
	q6 = _mm256_load_pd(&q[20]);
	q6 = _mm256_add_pd(q6, y6);
	_mm256_store_pd(&q[20],q6);

	h2 = _mm256_broadcast_sd(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2));
	_mm256_store_pd(&q[ldq+4],q2);
	q3 = _mm256_load_pd(&q[ldq+8]);
	q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3));
	_mm256_store_pd(&q[ldq+8],q3);
	q4 = _mm256_load_pd(&q[ldq+12]);
	q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4));
	_mm256_store_pd(&q[ldq+12],q4);
	q5 = _mm256_load_pd(&q[ldq+16]);
	q5 = _mm256_add_pd(q5, _mm256_FMA_pd(y5, h2, x5));
	_mm256_store_pd(&q[ldq+16],q5);
	q6 = _mm256_load_pd(&q[ldq+20]);
	q6 = _mm256_add_pd(q6, _mm256_FMA_pd(y6, h2, x6));
	_mm256_store_pd(&q[ldq+20],q6);
#else
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2)));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2)));
	_mm256_store_pd(&q[ldq+4],q2);
	q3 = _mm256_load_pd(&q[ldq+8]);
	q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2)));
	_mm256_store_pd(&q[ldq+8],q3);
	q4 = _mm256_load_pd(&q[ldq+12]);
	q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2)));
	_mm256_store_pd(&q[ldq+12],q4);
	q5 = _mm256_load_pd(&q[ldq+16]);
	q5 = _mm256_add_pd(q5, _mm256_add_pd(x5, _mm256_mul_pd(y5, h2)));
	_mm256_store_pd(&q[ldq+16],q5);
	q6 = _mm256_load_pd(&q[ldq+20]);
	q6 = _mm256_add_pd(q6, _mm256_add_pd(x6, _mm256_mul_pd(y6, h2)));
	_mm256_store_pd(&q[ldq+20],q6);
#endif

	for (i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_FMA_pd(x1, h1, q1);
		q1 = _mm256_FMA_pd(y1, h2, q1);
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_FMA_pd(x2, h1, q2);
		q2 = _mm256_FMA_pd(y2, h2, q2);
		_mm256_store_pd(&q[(i*ldq)+4],q2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		q3 = _mm256_FMA_pd(x3, h1, q3);
		q3 = _mm256_FMA_pd(y3, h2, q3);
		_mm256_store_pd(&q[(i*ldq)+8],q3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		q4 = _mm256_FMA_pd(x4, h1, q4);
		q4 = _mm256_FMA_pd(y4, h2, q4);
		_mm256_store_pd(&q[(i*ldq)+12],q4);
		q5 = _mm256_load_pd(&q[(i*ldq)+16]);
		q5 = _mm256_FMA_pd(x5, h1, q5);
		q5 = _mm256_FMA_pd(y5, h2, q5);
		_mm256_store_pd(&q[(i*ldq)+16],q5);
		q6 = _mm256_load_pd(&q[(i*ldq)+20]);
		q6 = _mm256_FMA_pd(x6, h1, q6);
		q6 = _mm256_FMA_pd(y6, h2, q6);
		_mm256_store_pd(&q[(i*ldq)+20],q6);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)));
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)));
		_mm256_store_pd(&q[(i*ldq)+4],q2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2)));
		_mm256_store_pd(&q[(i*ldq)+8],q3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2)));
		_mm256_store_pd(&q[(i*ldq)+12],q4);
		q5 = _mm256_load_pd(&q[(i*ldq)+16]);
		q5 = _mm256_add_pd(q5, _mm256_add_pd(_mm256_mul_pd(x5,h1), _mm256_mul_pd(y5, h2)));
		_mm256_store_pd(&q[(i*ldq)+16],q5);
		q6 = _mm256_load_pd(&q[(i*ldq)+20]);
		q6 = _mm256_add_pd(q6, _mm256_add_pd(_mm256_mul_pd(x6,h1), _mm256_mul_pd(y6, h2)));
		_mm256_store_pd(&q[(i*ldq)+20],q6);
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_FMA_pd(x1, h1, q1);
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_FMA_pd(x2, h1, q2);
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	q3 = _mm256_FMA_pd(x3, h1, q3);
	_mm256_store_pd(&q[(nb*ldq)+8],q3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	q4 = _mm256_FMA_pd(x4, h1, q4);
	_mm256_store_pd(&q[(nb*ldq)+12],q4);
	q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
	q5 = _mm256_FMA_pd(x5, h1, q5);
	_mm256_store_pd(&q[(nb*ldq)+16],q5);
	q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
	q6 = _mm256_FMA_pd(x6, h1, q6);
	_mm256_store_pd(&q[(nb*ldq)+20],q6);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1));
	_mm256_store_pd(&q[(nb*ldq)+8],q3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1));
	_mm256_store_pd(&q[(nb*ldq)+12],q4);
	q5 = _mm256_load_pd(&q[(nb*ldq)+16]);
	q5 = _mm256_add_pd(q5, _mm256_mul_pd(x5, h1));
	_mm256_store_pd(&q[(nb*ldq)+16],q5);
	q6 = _mm256_load_pd(&q[(nb*ldq)+20]);
	q6 = _mm256_add_pd(q6, _mm256_mul_pd(x6, h1));
	_mm256_store_pd(&q[(nb*ldq)+20],q6);
#endif
}

/**
 * Unrolled kernel that computes
 * 16 rows of Q simultaneously: a
 * matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
 __forceinline void hh_trafo_kernel_16_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [16 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000);

	__m256d x1 = _mm256_load_pd(&q[ldq]);
	__m256d x2 = _mm256_load_pd(&q[ldq+4]);
	__m256d x3 = _mm256_load_pd(&q[ldq+8]);
	__m256d x4 = _mm256_load_pd(&q[ldq+12]);

	__m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]);
	__m256d h2;

#ifdef __ELPA_USE_FMA__
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_FMA_pd(x1, h1, q1);
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_FMA_pd(x2, h1, q2);
	__m256d q3 = _mm256_load_pd(&q[8]);
	__m256d y3 = _mm256_FMA_pd(x3, h1, q3);
	__m256d q4 = _mm256_load_pd(&q[12]);
	__m256d y4 = _mm256_FMA_pd(x4, h1, q4);
#else
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
	__m256d q3 = _mm256_load_pd(&q[8]);
	__m256d y3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1));
	__m256d q4 = _mm256_load_pd(&q[12]);
	__m256d y4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1));
#endif

	for(i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_FMA_pd(q1, h1, x1);
		y1 = _mm256_FMA_pd(q1, h2, y1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_FMA_pd(q2, h1, x2);
		y2 = _mm256_FMA_pd(q2, h2, y2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		x3 = _mm256_FMA_pd(q3, h1, x3);
		y3 = _mm256_FMA_pd(q3, h2, y3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		x4 = _mm256_FMA_pd(q4, h1, x4);
		y4 = _mm256_FMA_pd(q4, h2, y4);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
		y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
		y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
		y3 = _mm256_add_pd(y3, _mm256_mul_pd(q3,h2));
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
		y4 = _mm256_add_pd(y4, _mm256_mul_pd(q4,h2));
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_FMA_pd(q1, h1, x1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_FMA_pd(q2, h1, x2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	x3 = _mm256_FMA_pd(q3, h1, x3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	x4 = _mm256_FMA_pd(q4, h1, x4);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	x3 = _mm256_add_pd(x3, _mm256_mul_pd(q3,h1));
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	x4 = _mm256_add_pd(x4, _mm256_mul_pd(q4,h1));
#endif

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [16 x nb+1]
	/////////////////////////////////////////////////////

	__m256d tau1 = _mm256_broadcast_sd(hh);
	__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
	__m256d vs = _mm256_broadcast_sd(&s);

	h1 = _mm256_xor_pd(tau1, sign);
	x1 = _mm256_mul_pd(x1, h1);
	x2 = _mm256_mul_pd(x2, h1);
	x3 = _mm256_mul_pd(x3, h1);
	x4 = _mm256_mul_pd(x4, h1);
	h1 = _mm256_xor_pd(tau2, sign);
	h2 = _mm256_mul_pd(h1, vs);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2));
	y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2));
	y3 = _mm256_FMA_pd(y3, h1, _mm256_mul_pd(x3,h2));
	y4 = _mm256_FMA_pd(y4, h1, _mm256_mul_pd(x4,h2));
#else
	y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
	y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2));
	y3 = _mm256_add_pd(_mm256_mul_pd(y3,h1), _mm256_mul_pd(x3,h2));
	y4 = _mm256_add_pd(_mm256_mul_pd(y4,h1), _mm256_mul_pd(x4,h2));
#endif

	q1 = _mm256_load_pd(q);
	q1 = _mm256_add_pd(q1, y1);
	_mm256_store_pd(q,q1);
	q2 = _mm256_load_pd(&q[4]);
	q2 = _mm256_add_pd(q2, y2);
	_mm256_store_pd(&q[4],q2);
	q3 = _mm256_load_pd(&q[8]);
	q3 = _mm256_add_pd(q3, y3);
	_mm256_store_pd(&q[8],q3);
	q4 = _mm256_load_pd(&q[12]);
	q4 = _mm256_add_pd(q4, y4);
	_mm256_store_pd(&q[12],q4);

	h2 = _mm256_broadcast_sd(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2));
	_mm256_store_pd(&q[ldq+4],q2);
	q3 = _mm256_load_pd(&q[ldq+8]);
	q3 = _mm256_add_pd(q3, _mm256_FMA_pd(y3, h2, x3));
	_mm256_store_pd(&q[ldq+8],q3);
	q4 = _mm256_load_pd(&q[ldq+12]);
	q4 = _mm256_add_pd(q4, _mm256_FMA_pd(y4, h2, x4));
	_mm256_store_pd(&q[ldq+12],q4);
#else
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2)));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2)));
	_mm256_store_pd(&q[ldq+4],q2);
	q3 = _mm256_load_pd(&q[ldq+8]);
	q3 = _mm256_add_pd(q3, _mm256_add_pd(x3, _mm256_mul_pd(y3, h2)));
	_mm256_store_pd(&q[ldq+8],q3);
	q4 = _mm256_load_pd(&q[ldq+12]);
	q4 = _mm256_add_pd(q4, _mm256_add_pd(x4, _mm256_mul_pd(y4, h2)));
	_mm256_store_pd(&q[ldq+12],q4);
#endif

	for (i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_FMA_pd(x1, h1, q1);
		q1 = _mm256_FMA_pd(y1, h2, q1);
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_FMA_pd(x2, h1, q2);
		q2 = _mm256_FMA_pd(y2, h2, q2);
		_mm256_store_pd(&q[(i*ldq)+4],q2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		q3 = _mm256_FMA_pd(x3, h1, q3);
		q3 = _mm256_FMA_pd(y3, h2, q3);
		_mm256_store_pd(&q[(i*ldq)+8],q3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		q4 = _mm256_FMA_pd(x4, h1, q4);
		q4 = _mm256_FMA_pd(y4, h2, q4);
		_mm256_store_pd(&q[(i*ldq)+12],q4);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)));
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)));
		_mm256_store_pd(&q[(i*ldq)+4],q2);
		q3 = _mm256_load_pd(&q[(i*ldq)+8]);
		q3 = _mm256_add_pd(q3, _mm256_add_pd(_mm256_mul_pd(x3,h1), _mm256_mul_pd(y3, h2)));
		_mm256_store_pd(&q[(i*ldq)+8],q3);
		q4 = _mm256_load_pd(&q[(i*ldq)+12]);
		q4 = _mm256_add_pd(q4, _mm256_add_pd(_mm256_mul_pd(x4,h1), _mm256_mul_pd(y4, h2)));
		_mm256_store_pd(&q[(i*ldq)+12],q4);
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_FMA_pd(x1, h1, q1);
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_FMA_pd(x2, h1, q2);
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	q3 = _mm256_FMA_pd(x3, h1, q3);
	_mm256_store_pd(&q[(nb*ldq)+8],q3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	q4 = _mm256_FMA_pd(x4, h1, q4);
	_mm256_store_pd(&q[(nb*ldq)+12],q4);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
	q3 = _mm256_load_pd(&q[(nb*ldq)+8]);
	q3 = _mm256_add_pd(q3, _mm256_mul_pd(x3, h1));
	_mm256_store_pd(&q[(nb*ldq)+8],q3);
	q4 = _mm256_load_pd(&q[(nb*ldq)+12]);
	q4 = _mm256_add_pd(q4, _mm256_mul_pd(x4, h1));
	_mm256_store_pd(&q[(nb*ldq)+12],q4);
#endif
}

/**
 * Unrolled kernel that computes
 * 8 rows of Q simultaneously: a
 * matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
 __forceinline void hh_trafo_kernel_8_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [8 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000);

	__m256d x1 = _mm256_load_pd(&q[ldq]);
	__m256d x2 = _mm256_load_pd(&q[ldq+4]);

	__m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]);
	__m256d h2;

#ifdef __ELPA_USE_FMA__
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_FMA_pd(x1, h1, q1);
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_FMA_pd(x2, h1, q2);
#else
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	__m256d q2 = _mm256_load_pd(&q[4]);
	__m256d y2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
#endif

	for(i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_FMA_pd(q1, h1, x1);
		y1 = _mm256_FMA_pd(q1, h2, y1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_FMA_pd(q2, h1, x2);
		y2 = _mm256_FMA_pd(q2, h2, y2);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
		y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
		y2 = _mm256_add_pd(y2, _mm256_mul_pd(q2,h2));
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_FMA_pd(q1, h1, x1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_FMA_pd(q2, h1, x2);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	x2 = _mm256_add_pd(x2, _mm256_mul_pd(q2,h1));
#endif

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [8 x nb+1]
	/////////////////////////////////////////////////////

	__m256d tau1 = _mm256_broadcast_sd(hh);
	__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
	__m256d vs = _mm256_broadcast_sd(&s);

	h1 = _mm256_xor_pd(tau1, sign);
	x1 = _mm256_mul_pd(x1, h1);
	x2 = _mm256_mul_pd(x2, h1);
	h1 = _mm256_xor_pd(tau2, sign);
	h2 = _mm256_mul_pd(h1, vs);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2));
	y2 = _mm256_FMA_pd(y2, h1, _mm256_mul_pd(x2,h2));
#else
	y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
	y2 = _mm256_add_pd(_mm256_mul_pd(y2,h1), _mm256_mul_pd(x2,h2));
#endif

	q1 = _mm256_load_pd(q);
	q1 = _mm256_add_pd(q1, y1);
	_mm256_store_pd(q,q1);
	q2 = _mm256_load_pd(&q[4]);
	q2 = _mm256_add_pd(q2, y2);
	_mm256_store_pd(&q[4],q2);

	h2 = _mm256_broadcast_sd(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_FMA_pd(y2, h2, x2));
	_mm256_store_pd(&q[ldq+4],q2);
#else
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2)));
	_mm256_store_pd(&q[ldq],q1);
	q2 = _mm256_load_pd(&q[ldq+4]);
	q2 = _mm256_add_pd(q2, _mm256_add_pd(x2, _mm256_mul_pd(y2, h2)));
	_mm256_store_pd(&q[ldq+4],q2);
#endif

	for (i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_FMA_pd(x1, h1, q1);
		q1 = _mm256_FMA_pd(y1, h2, q1);
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_FMA_pd(x2, h1, q2);
		q2 = _mm256_FMA_pd(y2, h2, q2);
		_mm256_store_pd(&q[(i*ldq)+4],q2);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)));
		_mm256_store_pd(&q[i*ldq],q1);
		q2 = _mm256_load_pd(&q[(i*ldq)+4]);
		q2 = _mm256_add_pd(q2, _mm256_add_pd(_mm256_mul_pd(x2,h1), _mm256_mul_pd(y2, h2)));
		_mm256_store_pd(&q[(i*ldq)+4],q2);
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_FMA_pd(x1, h1, q1);
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_FMA_pd(x2, h1, q2);
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	_mm256_store_pd(&q[nb*ldq],q1);
	q2 = _mm256_load_pd(&q[(nb*ldq)+4]);
	q2 = _mm256_add_pd(q2, _mm256_mul_pd(x2, h1));
	_mm256_store_pd(&q[(nb*ldq)+4],q2);
#endif
}

/**
 * Unrolled kernel that computes
 * 4 rows of Q simultaneously: a
 * matrix-vector product with two Householder
 * vectors plus a rank-2 update is performed.
 */
 __forceinline void hh_trafo_kernel_4_AVX_2hv(double* q, double* hh, int nb, int ldq, int ldh, double s)
{
	/////////////////////////////////////////////////////
	// Matrix Vector Multiplication, Q [4 x nb+1] * hh
	// hh contains two householder vectors, with offset 1
	/////////////////////////////////////////////////////
	int i;
	// Needed bit mask for floating point sign flip
	__m256d sign = (__m256d)_mm256_set1_epi64x(0x8000000000000000);

	__m256d x1 = _mm256_load_pd(&q[ldq]);

	__m256d h1 = _mm256_broadcast_sd(&hh[ldh+1]);
	__m256d h2;

#ifdef __ELPA_USE_FMA__
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_FMA_pd(x1, h1, q1);
#else
	__m256d q1 = _mm256_load_pd(q);
	__m256d y1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
#endif

	for(i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_FMA_pd(q1, h1, x1);
		y1 = _mm256_FMA_pd(q1, h2, y1);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
		y1 = _mm256_add_pd(y1, _mm256_mul_pd(q1,h2));
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_FMA_pd(q1, h1, x1);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	x1 = _mm256_add_pd(x1, _mm256_mul_pd(q1,h1));
#endif

	/////////////////////////////////////////////////////
	// Rank-2 update of Q [4 x nb+1]
	/////////////////////////////////////////////////////

	__m256d tau1 = _mm256_broadcast_sd(hh);
	__m256d tau2 = _mm256_broadcast_sd(&hh[ldh]);
	__m256d vs = _mm256_broadcast_sd(&s);

	h1 = _mm256_xor_pd(tau1, sign);
	x1 = _mm256_mul_pd(x1, h1);
	h1 = _mm256_xor_pd(tau2, sign);
	h2 = _mm256_mul_pd(h1, vs);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMA_pd(y1, h1, _mm256_mul_pd(x1,h2));
#else
	y1 = _mm256_add_pd(_mm256_mul_pd(y1,h1), _mm256_mul_pd(x1,h2));
#endif

	q1 = _mm256_load_pd(q);
	q1 = _mm256_add_pd(q1, y1);
	_mm256_store_pd(q,q1);

	h2 = _mm256_broadcast_sd(&hh[ldh+1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_FMA_pd(y1, h2, x1));
	_mm256_store_pd(&q[ldq],q1);
#else
	q1 = _mm256_load_pd(&q[ldq]);
	q1 = _mm256_add_pd(q1, _mm256_add_pd(x1, _mm256_mul_pd(y1, h2)));
	_mm256_store_pd(&q[ldq],q1);
#endif

	for (i = 2; i < nb; i++)
	{
		h1 = _mm256_broadcast_sd(&hh[i-1]);
		h2 = _mm256_broadcast_sd(&hh[ldh+i]);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_FMA_pd(x1, h1, q1);
		q1 = _mm256_FMA_pd(y1, h2, q1);
		_mm256_store_pd(&q[i*ldq],q1);
#else
		q1 = _mm256_load_pd(&q[i*ldq]);
		q1 = _mm256_add_pd(q1, _mm256_add_pd(_mm256_mul_pd(x1,h1), _mm256_mul_pd(y1, h2)));
		_mm256_store_pd(&q[i*ldq],q1);
#endif
	}

	h1 = _mm256_broadcast_sd(&hh[nb-1]);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_FMA_pd(x1, h1, q1);
	_mm256_store_pd(&q[nb*ldq],q1);
#else
	q1 = _mm256_load_pd(&q[nb*ldq]);
	q1 = _mm256_add_pd(q1, _mm256_mul_pd(x1, h1));
	_mm256_store_pd(&q[nb*ldq],q1);
#endif
}