//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>

#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_maddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_msubadd_ps(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_ps(a,b,c) _mm256_fmaddsub_ps(a,b,c)
#define _mm256_FMSUBADD_ps(a,b,c) _mm256_fmsubadd_ps(a,b,c)
#endif

#endif
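
// The _mm256_FMADDSUB_ps / _mm256_FMSUBADD_ps macros above map the shared kernel
// code either to the AMD FMA4 intrinsics (_mm256_maddsub_ps / _mm256_msubadd_ps)
// or to the AVX2/FMA3 intrinsics (_mm256_fmaddsub_ps / _mm256_fmsubadd_ps).
// If neither is available, __ELPA_USE_FMA__ stays undefined and the kernels fall
// back to separate _mm256_mul_ps / _mm256_addsub_ps instructions.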

// Forward declarations
static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq);

/*
!f>#ifdef HAVE_AVX
!f> interface
!f>   subroutine single_hh_trafo_complex_avx_avx2_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_avx_avx2_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     complex(kind=c_float)   :: q(*)
!f>     complex(kind=c_float)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
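
// Called from Fortran through the interface above; the scalar arguments arrive by
// reference, as usual for Fortran/C interoperability. Inside the kernels the complex
// arrays q and hh are reinterpreted as interleaved real/imaginary floats (q_dbl,
// hh_dbl), so one 256-bit AVX register holds four single-precision complex numbers.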

void single_hh_trafo_complex_avx_avx2_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	//int ldh = *pldh;

	// careful here: the main loop processes 12 of the nq rows of q per iteration;
	// the remainder of 8 or 4 rows is handled by the smaller kernels below
	for (i = 0; i < nq-8; i+=12)
	{
		hh_trafo_complex_kernel_12_AVX_1hv_single(&q[i], hh, nb, ldq);
	}
	if (nq == i)
	{
		return;
	}
//	if (nq-i == 20)
//	{
//		hh_trafo_complex_kernel_16_AVX_1hv_single(&q[i], hh, nb, ldq);
//		hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
//	}
//	if (nq-i == 16)
//	{
//		hh_trafo_complex_kernel_16_AVX_1hv_single(&q[i], hh, nb, ldq);
//	}
//	if (nq-i == 12)
//	{
//		hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq);
//		hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
//	}
	if (nq-i == 8)
	{
		hh_trafo_complex_kernel_8_AVX_1hv_single(&q[i], hh, nb, ldq);
	}
	else
	{
		hh_trafo_complex_kernel_4_AVX_1hv_single(&q[i], hh, nb, ldq);
	}
}
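
/*
   Reference note (plain-C sketch, not compiled; loop variables are illustrative only):
   each kernel below applies one Householder reflector along the hh direction, i.e.
   q(r,:) := q(r,:) - tau * (q(r,:) * conjg(v)) * v  for every processed row r,
   with v = (1, hh(2), ..., hh(nb)) and tau = hh(1). Reading the vectorized loops
   back into scalar form gives roughly:

     for (int r = 0; r < nrows; r++)             // nrows = 12, 8 or 4
     {
        float complex x = q[r];                  // q(r+1,1)
        for (int i = 1; i < nb; i++)
           x += conjf(hh[i]) * q[i*ldq + r];     // accumulate conjg(hh)*q
        x = -hh[0] * x;                          // scale by -tau
        q[r] += x;
        for (int i = 1; i < nb; i++)
           q[i*ldq + r] += hh[i] * x;            // rank-1 update with hh
     }
*/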

static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m256 x1, x2, x3, x4, x5, x6;
	__m256 q1, q2, q3, q4, q5, q6;
	__m256 h1_real, h1_imag;
	__m256 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	int i=0;

	// careful here: single-precision sign mask, one 0x80000000 per 32-bit lane
//	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[0]);      // load four complex q(1,1) | q(2,1) | q(3,1) | q(4,1)
	x2 = _mm256_load_ps(&q_dbl[8]);      // load four complex q(5,1) | q(6,1) | q(7,1) | q(8,1)
	x3 = _mm256_load_ps(&q_dbl[16]);     // load four complex q(9,1) | q(10,1) | q(11,1) | q(12,1)
//	x4 = _mm256_load_pd(&q_dbl[24]);     // load four complex q(13,1) | q(14,1) | q(15,1) | q(16,1)
//	x5 = _mm256_load_pd(&q_dbl[32]);     // load four complex q(17,1) | q(18,1) | q(19,1) | q(20,1)
//	x6 = _mm256_load_pd(&q_dbl[40]);     // load four complex q(21,1) | q(22,1) | q(23,1) | q(24,1)
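
	// Complex arithmetic idiom used below: with the interleaved re/im layout,
	// _mm256_shuffle_ps(t, t, 0xb1) swaps the real and imaginary slot of every
	// complex number. FMSUBADD (add in the real lanes, subtract in the imaginary
	// lanes) then gives a conjugated complex multiply-accumulate, while FMADDSUB
	// gives the plain complex multiply. Without FMA the conjugated variant is
	// obtained by negating h1_imag first and using _mm256_addsub_ps, which on its
	// own already implements the plain complex multiply.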


	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);       // h1_imag = - h1_imag
#endif

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm256_load_ps(&q_dbl[(2*i*ldq)+16]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
//		q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
//		q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

		tmp1 = _mm256_mul_ps(h1_imag, q1);          // tmp1 = -h1_imag * q1
		// careful here: we want x1 = x1 + q(1,i) * conjg(hh(i))
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));                    // x1 = x1 + conjg(hh(i))*q1 (add in the real lanes, subtract in the imaginary lanes)
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
		tmp3 = _mm256_mul_ps(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
		x3 = _mm256_add_ps(x3, _mm256_FMSUBADD_ps(h1_real, q3, _mm256_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
		x3 = _mm256_add_ps(x3, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q3), _mm256_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
//		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
//		tmp5 = _mm256_mul_pd(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
//		x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
//		x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
//		tmp6 = _mm256_mul_pd(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
//		x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
//		x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
	// careful here: scale x by -tau, tau = hh(1) (plain complex multiply, no conjugation)
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
	tmp3 = _mm256_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm256_FMADDSUB_ps(h1_real, x3, _mm256_shuffle_ps(tmp3, tmp3, 0xb1));
#else
	x3 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x3), _mm256_shuffle_ps(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
//	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif
//	tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
//	x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#else
//	x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#endif
//	tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
//	x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#else
//	x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#endif

	q1 = _mm256_load_ps(&q_dbl[0]);
	q2 = _mm256_load_ps(&q_dbl[8]);
	q3 = _mm256_load_ps(&q_dbl[16]);
//	q4 = _mm256_load_pd(&q_dbl[12]);
//	q5 = _mm256_load_pd(&q_dbl[16]);
//	q6 = _mm256_load_pd(&q_dbl[20]);

	q1 = _mm256_add_ps(q1, x1);
	q2 = _mm256_add_ps(q2, x2);
	q3 = _mm256_add_ps(q3, x3);
//	q4 = _mm256_add_pd(q4, x4);
//	q5 = _mm256_add_pd(q5, x5);
//	q6 = _mm256_add_pd(q6, x6);

	_mm256_store_ps(&q_dbl[0], q1);
	_mm256_store_ps(&q_dbl[8], q2);
	_mm256_store_ps(&q_dbl[16], q3);
//	_mm256_store_pd(&q_dbl[12], q4);
//	_mm256_store_pd(&q_dbl[16], q5);
//	_mm256_store_pd(&q_dbl[20], q6);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
		q3 = _mm256_load_ps(&q_dbl[(2*i*ldq)+16]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
//		q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
//		q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

		tmp1 = _mm256_mul_ps(h1_imag, x1);
		// careful here: q1 = q1 + hh(i) * x1 (plain complex multiply, no conjugation)
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
		tmp3 = _mm256_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm256_add_ps(q3, _mm256_FMADDSUB_ps(h1_real, x3, _mm256_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
		q3 = _mm256_add_ps(q3, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x3), _mm256_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
//		tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
//		q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
//		q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
//		tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
//		q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
//		q6 = _mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif

		_mm256_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_ps(&q_dbl[(2*i*ldq)+8], q2);
		_mm256_store_ps(&q_dbl[(2*i*ldq)+16], q3);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6);
	}
}

static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m256 x1, x2, x3, x4;
	__m256 q1, q2, q3, q4;
	__m256 h1_real, h1_imag;
	__m256 tmp1, tmp2, tmp3, tmp4;
	int i=0;

	// careful here: single-precision sign mask, one 0x80000000 per 32-bit lane
//	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[0]);   // load q(1,1) | q(2,1) | q(3,1) | q(4,1)
	x2 = _mm256_load_ps(&q_dbl[8]);   // load q(5,1) | q(6,1) | q(7,1) | q(8,1)
//	x3 = _mm256_load_pd(&q_dbl[8]);
//	x4 = _mm256_load_pd(&q_dbl[12]);
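
	// Same algorithm as the 12-row kernel above, operating on 8 rows of q
	// (two AVX registers per vector operation instead of three).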

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		tmp1 = _mm256_mul_ps(h1_imag, q1);
		// careful here: x1 = x1 + conjg(hh(i)) * q1
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_ps(x2, _mm256_FMSUBADD_ps(h1_real, q2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm256_add_ps(x2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
//		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
//		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
	// careful here: scale x by -tau, tau = hh(1)
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
//	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
//	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
//	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

	q1 = _mm256_load_ps(&q_dbl[0]);
	q2 = _mm256_load_ps(&q_dbl[8]);
//	q3 = _mm256_load_pd(&q_dbl[8]);
//	q4 = _mm256_load_pd(&q_dbl[12]);

	q1 = _mm256_add_ps(q1, x1);
	q2 = _mm256_add_ps(q2, x2);
//	q3 = _mm256_add_pd(q3, x3);
//	q4 = _mm256_add_pd(q4, x4);

	_mm256_store_ps(&q_dbl[0], q1);
	_mm256_store_ps(&q_dbl[8], q2);
//	_mm256_store_pd(&q_dbl[8], q3);
//	_mm256_store_pd(&q_dbl[12], q4);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		tmp1 = _mm256_mul_ps(h1_imag, x1);
		// careful here: q1 = q1 + hh(i) * x1
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm256_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_ps(q2, _mm256_FMADDSUB_ps(h1_real, x2, _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm256_add_ps(q2, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x2), _mm256_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
//		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
//		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
//		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

		_mm256_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_ps(&q_dbl[(2*i*ldq)+8], q2);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
	}
}

static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m256 x1, x2;
	__m256 q1, q2;
	__m256 h1_real, h1_imag;
	__m256 tmp1, tmp2;
	int i=0;

	// careful here: single-precision sign mask, one 0x80000000 per 32-bit lane
//	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);
	__m256 sign = (__m256)_mm256_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm256_load_ps(&q_dbl[0]);    // load q(1,1) | q(2,1) | q(3,1) | q(4,1)
//	x2 = _mm256_load_pd(&q_dbl[4]);
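
	// Same algorithm as the kernels above, operating on 4 rows of q
	// (a single AVX register per vector operation).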

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_ps(h1_imag, sign);
#endif

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		tmp1 = _mm256_mul_ps(h1_imag, q1);
		// careful here: x1 = x1 + conjg(hh(i)) * q1
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_ps(x1, _mm256_FMSUBADD_ps(h1_real, q1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm256_add_ps(x1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, q1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
//		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
//		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_ss(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_ss(&hh_dbl[1]);
	h1_real = _mm256_xor_ps(h1_real, sign);
	h1_imag = _mm256_xor_ps(h1_imag, sign);

	tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
//	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
//	x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
//	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif

	q1 = _mm256_load_ps(&q_dbl[0]);
//	q2 = _mm256_load_pd(&q_dbl[4]);

	q1 = _mm256_add_ps(q1, x1);
//	q2 = _mm256_add_pd(q2, x2);

	_mm256_store_ps(&q_dbl[0], q1);
//	_mm256_store_pd(&q_dbl[4], q2);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_ss(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_ss(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		tmp1 = _mm256_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_ps(q1, _mm256_FMADDSUB_ps(h1_real, x1, _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm256_add_ps(q1, _mm256_addsub_ps( _mm256_mul_ps(h1_real, x1), _mm256_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
//		q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
//		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif

		_mm256_store_ps(&q_dbl[(2*i*ldq)+0], q1);
//		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
	}
}