//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#endif

//Forward declaration
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
//static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
//static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);

/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     complex(kind=c_float)   :: q(*)
!f>     complex(kind=c_float)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
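
/*
 * C entry point (see the Fortran interface above): applies the two Householder
 * reflectors stored in hh (columns of length nb, leading dimension ldh) to q
 * (leading dimension ldq), processing nq values in chunks of 8 complex
 * single-precision numbers with the 8-wide kernel and a 4-wide kernel for the
 * remainder.
 */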


void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;
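	// s accumulates the overlap of the two (shifted) Householder vectors,
	// s = conj(hh(2,2)) + sum_i hh(i,1)*conj(hh(i+1,2)); it couples the two
	// reflections so that both can be applied in one sweep below.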

	float complex s = conj(hh[(ldh)+1])*1.0f;
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}

	for (i = 0; i < nq-4; i+=8)
	{
		hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s , s);
	}
	if (nq-i > 0)
	{
		hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
	}

}
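
/*
 * 8-wide kernel: updates 8 consecutive complex single-precision values of q
 * (held in two 256-bit registers) with the two Householder reflectors in hh.
 */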

static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

	__m256 x1, x2, x3, x4;
	__m256 y1, y2, y3, y4;
	__m256 q1, q2, q3, q4;
	__m256 h1_real, h1_imag, h2_real, h2_imag;
	__m256 tmp1, tmp2, tmp3, tmp4;
	volatile int i=0;
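	// mask with only the sign bit set in each 32-bit lane; xor-ing a vector
	// with it flips the sign of every float (used for conjugation below)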
	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[(2*ldq)+0]);
	x2 = _mm256_load_ps(&q_dbl[(2*ldq)+8]);
//	x3 = _mm256_load_ps(&q_dbl[(2*ldq)+8]);
//	x4 = _mm256_load_ps(&q_dbl[(2*ldq)+12]);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);
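
	// Complex multiply-accumulate pattern used throughout this kernel:
	//   tmp = h_imag * v;  res += addsub(h_real * v, swap_re_im(tmp))
	// accumulates res += h * v.  With FMSUBADD (or with h_imag sign-flipped
	// in the non-FMA branch) it accumulates res += conj(h) * v instead.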
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

	y1 = _mm256_load_ps(&q_dbl[0]);
	y2 = _mm256_load_ps(&q_dbl[8]);
//	y3 = _mm256_load_pd(&q_dbl[8]);
//	y4 = _mm256_load_pd(&q_dbl[12]);

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_add_ps(y2, _mm256_FMSUBADD_ps(h2_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
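
	// Main loop: accumulate x += conj(hh(:,1)) . q and y += conj(hh(:,2)) . q
	// row by row over the remaining rows of this block.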

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm256_add_ps(y2, _mm256_FMSUBADD_ps(h2_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
//		y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
//		y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
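
	// Apply the tau factors: x <- (-tau1)*x, then y <- (-tau2)*y + (-tau2*s)*x,
	// where tau1 = hh(1,1), tau2 = hh(1,2) and s is the coupling scalar.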

	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#else
//	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#else
//	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#endif

	h1_real = _mm256_broadcast_ss(&hh_dbl[ldh*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[(ldh*2)+1]);
	h2_real = _mm256_broadcast_ss(&hh_dbl[ldh*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[(ldh*2)+1]);

	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);
	h2_real = _mm256_xor_ps(h2_real, sign);
	h2_imag = _mm256_xor_ps(h2_imag, sign);

	// careful here: s_dbl holds one complex value (2 floats); load it, broadcast
	// it across the register and multiply it by h2 (= -tau2)
//	__m128u tmp_s_128 = _mm_loadu_pd(s_dbl);         // double precision variant kept for reference
	__m128 tmp_s_128 = _mm_loadu_ps(s_dbl);          // load s (only the first 2 floats are used)
//	tmp2 = _mm256_broadcast_pd(&tmp_s_128);
	tmp2 = _mm256_broadcast_ps(&tmp_s_128);          // broadcast the single complex value
	tmp1 = _mm256_mul_ps(h2_imag, tmp2);             // multiply h2_imag with the broadcast value
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm256_FMADDSUB_ps(h2_real, tmp2, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	tmp2 = _mm256_addsub_ps( _mm256_mul_ps(h2_real, tmp2), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	// careful here: store the product s*(-tau2) back and re-broadcast its real
	// and imaginary parts for the y update below
//	_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
	_mm_storeu_ps(s_dbl, _mm256_castps256_ps128(tmp2));
	h2_real = _mm256_broadcast_ss(&s_dbl[0]);
	h2_imag = _mm256_broadcast_ss(&s_dbl[1]);

	tmp1 = _mm256_mul_ps(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMADDSUB_ps(h1_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	y1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_FMADDSUB_ps(h1_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	y2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#else
//	y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#else
//	y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#endif

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMADDSUB_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_add_ps(y2, _mm256_FMADDSUB_ps(h2_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
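
	// Write-back phase: add the rank-2 update to q, row by row, using the
	// scaled vectors x and y and the Householder coefficients in hh.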

	q1 = _mm256_load_ps(&q_dbl[0]);
	q2 = _mm256_load_ps(&q_dbl[8]);
//	q3 = _mm256_load_pd(&q_dbl[8]);
//	q4 = _mm256_load_pd(&q_dbl[12]);

	q1 = _mm256_add_ps(q1, y1);
	q2 = _mm256_add_ps(q2, y2);
//	q3 = _mm256_add_pd(q3, y3);
//	q4 = _mm256_add_pd(q4, y4);

	_mm256_store_ps(&q_dbl[0], q1);
	_mm256_store_ps(&q_dbl[8], q2);
//	_mm256_store_pd(&q_dbl[8], q3);
//	_mm256_store_pd(&q_dbl[12], q4);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);

	q1 = _mm256_load_ps(&q_dbl[(ldq*2)+0]);
	q2 = _mm256_load_ps(&q_dbl[(ldq*2)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]);

	q1 = _mm256_add_ps(q1, x1);
	q2 = _mm256_add_ps(q2, x2);
//	q3 = _mm256_add_pd(q3, x3);
//	q4 = _mm256_add_pd(q4, x4);

	tmp1 = _mm256_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h2_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h2_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	_mm256_store_ps(&q_dbl[(ldq*2)+0], q1);
	_mm256_store_ps(&q_dbl[(ldq*2)+8], q2);
//	_mm256_store_pd(&q_dbl[(ldq*2)+8], q3);
//	_mm256_store_pd(&q_dbl[(ldq*2)+12], q4);
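
	// Remaining rows: add the x and y contributions scaled by the
	// corresponding Householder coefficients.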

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);

		tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
							// ##################### //
#endif
		tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);

		tmp1 = _mm256_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h2_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h2_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		_mm256_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_ps(&q_dbl[(2*i*ldq)+8], q2);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
	}
	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	_mm256_store_ps(&q_dbl[(2*nb*ldq)+0], q1);
	_mm256_store_ps(&q_dbl[(2*nb*ldq)+8], q2);
//	_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
//	_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}
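
/*
 * 4-wide kernel: same algorithm as above, for a remainder of 4 complex
 * single-precision values (one 256-bit register).
 */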

static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

	__m256 x1, x2;
	__m256 y1, y2;
	__m256 q1, q2;
	__m256 h1_real, h1_imag, h2_real, h2_imag;
	__m256 tmp1, tmp2;
	int i=0;

	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[(2*ldq)+0]);
//	x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

	y1 = _mm256_load_ps(&q_dbl[0]);
//	y2 = _mm256_load_pd(&q_dbl[4]);

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
//	y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif
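
	// Accumulate x and y over the remaining rows (same pattern as the 8-wide kernel).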

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
//		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
//		y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//		y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
//	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);

	tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else