//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#endif
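
// With FMA4 or AVX2 available, __ELPA_USE_FMA__ is defined and the macros
// _mm256_FMADDSUB_ps / _mm256_FMSUBADD_ps map to the corresponding fused
// multiply-add intrinsics; without it the kernels below fall back to a
// separate _mm256_mul_ps followed by _mm256_addsub_ps.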

// Forward declarations
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
//static __forceinline void hh_trafo_complex_kernel_6_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);
//static __forceinline void hh_trafo_complex_kernel_2_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);

/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine double_hh_trafo_complex_avx_avx2_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_avx_avx2_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value                :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/

void double_hh_trafo_complex_avx_avx2_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	float complex s = conj(hh[(ldh)+1])*1.0f;
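	// the loop below completes the overlap s of the two Householder vectors;
	// the term for the implicit leading 1 of the first vector is the
	// initialization above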
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}

	for (i = 0; i < nq-4; i+=8)
	{
		hh_trafo_complex_kernel_8_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s , s);
	}
	if (nq-i > 0)
	{
		hh_trafo_complex_kernel_4_AVX_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
	}

}

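/*
   For orientation only: a rough scalar sketch of the rank-2 update that the
   AVX kernels below perform, as read from the vectorized code. It is kept out
   of the build via #if 0; the function name is made up and nothing calls it.
*/
#if 0
static void hh_trafo_complex_scalar_sketch_single(float complex* q, float complex* hh,
                                                  int nb, int ldq, int ldh,
                                                  float complex s)
{
	int i, j;
	for (j = 0; j < 4; j++)           // one lane; the kernels below do 8 or 4 lanes at once
	{
		float complex x = q[ldq+j];
		float complex y = q[j] + q[ldq+j] * conjf(hh[ldh+1]);

		for (i = 2; i < nb; i++)
		{
			x += q[i*ldq+j] * conjf(hh[i-1]);
			y += q[i*ldq+j] * conjf(hh[ldh+i]);
		}
		x += q[nb*ldq+j] * conjf(hh[nb-1]);

		x = -hh[0] * x;                          // scale by -hh[0]
		y = -hh[ldh] * y + x * (-hh[ldh] * s);   // scale by -hh[ldh] and couple via s

		q[j]     += y;
		q[ldq+j] += x + y * hh[ldh+1];
		for (i = 2; i < nb; i++)
		{
			q[i*ldq+j] += x * hh[i-1] + y * hh[ldh+i];
		}
		q[nb*ldq+j] += x * hh[nb-1];
	}
}
#endif
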
static __forceinline void hh_trafo_complex_kernel_8_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{

	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

	__m256 x1, x2, x3, x4;
	__m256 y1, y2, y3, y4;
	__m256 q1, q2, q3, q4;
	__m256 h1_real, h1_imag, h2_real, h2_imag;
	__m256 tmp1, tmp2, tmp3, tmp4;
	volatile int i=0;
	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

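	// q_dbl/hh_dbl view the complex arrays as interleaved re/im floats, so one
	// __m256 holds four single precision complex numbers. Multiplying a vector
	// by a broadcast Householder coefficient h uses the usual trick:
	//   tmp = h_imag*v;  FMSUBADD(h_real, v, swap_pairs(tmp))
	// which yields v*conj(h); FMADDSUB with the same operands yields the plain
	// product v*h. The non-FMA path negates h_imag ("conjugate") and uses
	// _mm256_addsub_ps instead.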
	x1 = _mm256_load_ps(&q_dbl[(2*ldq)+0]);
	x2 = _mm256_load_ps(&q_dbl[(2*ldq)+8]);
//	x3 = _mm256_load_ps(&q_dbl[(2*ldq)+8]);
//	x4 = _mm256_load_ps(&q_dbl[(2*ldq)+12]);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

	y1 = _mm256_load_ps(&q_dbl[0]);
	y2 = _mm256_load_ps(&q_dbl[8]);
//	y3 = _mm256_load_pd(&q_dbl[8]);
//	y4 = _mm256_load_pd(&q_dbl[12]);

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_add_ps(y2, _mm256_FMSUBADD_ps(h2_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm256_add_ps(y2, _mm256_FMSUBADD_ps(h2_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
//		y3 = _mm256_add_pd(y3, _mm256_FMSUBADD_pd(h2_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
//		y4 = _mm256_add_pd(y4, _mm256_FMSUBADD_pd(h2_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

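	// scale x by -hh[0]: both parts of hh[0] are sign-flipped below and a
	// plain complex product is formed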
	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#else
//	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#else
//	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#endif

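	// the same sign-flip trick applies -hh[ldh] (the second Householder
	// coefficient) to y below, and also rescales the coupling scalar s by it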
	h1_real = _mm256_broadcast_ss(&hh_dbl[ldh*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[(ldh*2)+1]);
	h2_real = _mm256_broadcast_ss(&hh_dbl[ldh*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[(ldh*2)+1]);

	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);
	h2_real = _mm256_xor_ps(h2_real, sign);
	h2_imag = _mm256_xor_ps(h2_imag, sign);

	// careful here
//	__m128u tmp_s_128 = _mm_loadu_pd(s_dbl);         // load 1 complex value (2 double, i.e. 16 bytes)
	__m128 tmp_s_128 = _mm_loadu_ps(s_dbl);          // load the complex scalar s (2 floats, i.e. 8 bytes)
//	tmp2 = _mm256_broadcast_pd(&tmp_s_128);          // broad cast the 1 complex , i.e. double it
	tmp2 = _mm256_broadcast_ps(&tmp_s_128);          // broadcast the single complex value, i.e. duplicate it
	tmp1 = _mm256_mul_ps(h2_imag, tmp2);             // multiply h2_imag with the broadcast value
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm256_FMADDSUB_ps(h2_real, tmp2, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	tmp2 = _mm256_addsub_ps( _mm256_mul_ps(h2_real, tmp2), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
// careful here

//	_mm_storeu_pd(s_dbl, _mm256_castpd256_pd128(tmp2));
	_mm_storeu_ps(s_dbl, _mm256_castps256_ps128(tmp2));
	h2_real = _mm256_broadcast_ss(&s_dbl[0]);
	h2_imag = _mm256_broadcast_ss(&s_dbl[1]);

	tmp1 = _mm256_mul_ps(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_FMADDSUB_ps(h1_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	y1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_FMADDSUB_ps(h1_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	y2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_FMADDSUB_pd(h1_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#else
//	y3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_FMADDSUB_pd(h1_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#else
//	y4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1));
#endif

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMADDSUB_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm256_add_ps(y2, _mm256_FMADDSUB_ps(h2_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm256_add_ps(y2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm256_add_pd(y3, _mm256_FMADDSUB_pd(h2_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	y3 = _mm256_add_pd(y3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm256_add_pd(y4, _mm256_FMADDSUB_pd(h2_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	y4 = _mm256_add_pd(y4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	q1 = _mm256_load_ps(&q_dbl[0]);
	q2 = _mm256_load_ps(&q_dbl[8]);
//	q3 = _mm256_load_pd(&q_dbl[8]);
//	q4 = _mm256_load_pd(&q_dbl[12]);

	q1 = _mm256_add_ps(q1, y1);
	q2 = _mm256_add_ps(q2, y2);
//	q3 = _mm256_add_pd(q3, y3);
//	q4 = _mm256_add_pd(q4, y4);

	_mm256_store_ps(&q_dbl[0], q1);
	_mm256_store_ps(&q_dbl[8], q2);
//	_mm256_store_pd(&q_dbl[8], q3);
//	_mm256_store_pd(&q_dbl[12], q4);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);

	q1 = _mm256_load_ps(&q_dbl[(ldq*2)+0]);
	q2 = _mm256_load_ps(&q_dbl[(ldq*2)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(ldq*2)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(ldq*2)+12]);

	q1 = _mm256_add_ps(q1, x1);
	q2 = _mm256_add_ps(q2, x2);
//	q3 = _mm256_add_pd(q3, x3);
//	q4 = _mm256_add_pd(q4, x4);

	tmp1 = _mm256_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h2_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h2_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	_mm256_store_ps(&q_dbl[(ldq*2)+0], q1);
	_mm256_store_ps(&q_dbl[(ldq*2)+8], q2);
//	_mm256_store_pd(&q_dbl[(ldq*2)+8], q3);
//	_mm256_store_pd(&q_dbl[(ldq*2)+12], q4);

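	// remaining rows: q(i,:) += x*hh[i-1] + y*hh[ldh+i]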
	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);

		tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
							// ##################### //
#endif
		tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);

		tmp1 = _mm256_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h2_real, y1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h2_real, y2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h2_real, y2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h2_real, y3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h2_real, y4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h2_real, y4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

		_mm256_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_ps(&q_dbl[(2*i*ldq)+8], q2);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+8]);
//	q3 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+8]);
//	q4 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+12]);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#else
//	q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0xb1)));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//	q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif

	_mm256_store_ps(&q_dbl[(2*nb*ldq)+0], q1);
	_mm256_store_ps(&q_dbl[(2*nb*ldq)+8], q2);
//	_mm256_store_pd(&q_dbl[(2*nb*ldq)+8], q3);
//	_mm256_store_pd(&q_dbl[(2*nb*ldq)+12], q4);
}

static __forceinline void hh_trafo_complex_kernel_4_AVX_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

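	// same scheme as the 8-wide kernel above, but on a single __m256
	// (four single precision complex values) per row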
	__m256 x1, x2;
	__m256 y1, y2;
	__m256 q1, q2;
	__m256 h1_real, h1_imag, h2_real, h2_imag;
	__m256 tmp1, tmp2;
	int i=0;

	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[(2*ldq)+0]);
//	x2 = _mm256_load_pd(&q_dbl[(2*ldq)+4]);

	h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+1)*2]);
	h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

	y1 = _mm256_load_ps(&q_dbl[0]);
//	y2 = _mm256_load_pd(&q_dbl[4]);

	tmp1 = _mm256_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//	tmp2 = _mm256_mul_pd(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
//	y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//	y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif

	for (i = 2; i < nb; i++)
	{
		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		h1_real = _mm256_broadcast_ss(&hh_dbl[(i-1)*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[((i-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
//		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif

		h2_real = _mm256_broadcast_ss(&hh_dbl[(ldh+i)*2]);
		h2_imag = _mm256_broadcast_ss(&hh_dbl[((ldh+i)*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm256_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm256_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm256_add_ps(y1, _mm256_FMSUBADD_ps(h2_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm256_add_ps(y1, _mm256_addsub_ps( _mm256_mul_ps(h2_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
//		y2 = _mm256_add_pd(y2, _mm256_FMSUBADD_pd(h2_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//		y2 = _mm256_add_pd(y2, _mm256_addsub_pd( _mm256_mul_pd(h2_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[(nb-1)*2]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[((nb-1)*2)+1]);
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

	q1 = _mm256_load_ps(&q_dbl[(2*nb*ldq)+0]);
//	q2 = _mm256_load_pd(&q_dbl[(2*nb*ldq)+4]);

	tmp1 = _mm256_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//	tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
//	x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#else
//	x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0xb1)));
#endif

	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__