//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of version 3 of the GNU Lesser General Public
//    License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
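// For the AVX2/FMA code path guarded by HAVE_AVX2 below, flags along the lines of
// -O3 -mavx2 -mfma (GCC/Clang), or -mfma4 on AMD Bulldozer-class CPUs, are typically
// needed as well; the actual flags are chosen by the build system.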
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_AVX2

#ifdef __FMA4__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_maddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_msubadd_pd(a,b,c)
#endif

#ifdef __AVX2__
#define __ELPA_USE_FMA__
#define _mm256_FMADDSUB_pd(a,b,c) _mm256_fmaddsub_pd(a,b,c)
#define _mm256_FMSUBADD_pd(a,b,c) _mm256_fmsubadd_pd(a,b,c)
#endif

#endif

// Forward declarations
static  __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
static  __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq);
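
// Each kernel applies one Householder transform to a block of 12, 8 or 4 rows of q:
// x = q*conj(v) for those rows, then x := -tau*x, then q := q + x*v^T, i.e.
// q := q*(I - tau*conj(v)*v^T), with v = (1, hh[1], ..., hh[nb-1]) and tau = hh[0]
// (0-based C indexing; tau is the Householder scalar stored in the first entry of hh).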

/*
!f>#if defined(HAVE_AVX) || defined(HAVE_AVX2)
!f> interface
!f>   subroutine single_hh_trafo_complex_avx_avx2_1hv(q, hh, pnb, pnq, pldq) bind(C, name="single_hh_trafo_complex_avx_avx2_1hv")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     complex(kind=c_double)     :: q(*)
!f>     complex(kind=c_double)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
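
/*
   Illustrative C-side call (a sketch only, not part of ELPA): the scalar
   arguments are passed by address to match the Fortran binding above, and
   q and hh are assumed to be 32-byte aligned because the kernels use
   aligned AVX loads and stores.

     int nb = 64, nq = 48, ldq = 48;
     double complex *q  = _mm_malloc((size_t)ldq * nb * sizeof(double complex), 32);
     double complex *hh = _mm_malloc((size_t)nb * 2 * sizeof(double complex), 32);
     // ... fill q (ldq x nb, column-major) and hh (nb x 2) ...
     single_hh_trafo_complex_avx_avx2_1hv(q, hh, &nb, &nq, &ldq);
     _mm_free(hh);
     _mm_free(q);
*/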

void single_hh_trafo_complex_avx_avx2_1hv(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	//int ldh = *pldh;
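
	// nq, the number of rows of q to update, is taken from *pldq (pnq is not used
	// here). Rows are processed in blocks of 12; a remainder of more than 4 rows
	// goes to the 8-row kernel, a remainder of 1 to 4 rows to the 4-row kernel.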

	for (i = 0; i < nq-8; i+=12)
	{
		hh_trafo_complex_kernel_12_AVX_1hv(&q[i], hh, nb, ldq);
	}
	if (nq-i > 4)
	{
		hh_trafo_complex_kernel_8_AVX_1hv(&q[i], hh, nb, ldq);
	}
	else if (nq-i > 0)
	{
		hh_trafo_complex_kernel_4_AVX_1hv(&q[i], hh, nb, ldq);
	}
}

static __forceinline void hh_trafo_complex_kernel_12_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m256d x1, x2, x3, x4, x5, x6;
	__m256d q1, q2, q3, q4, q5, q6;
	__m256d h1_real, h1_imag;
	__m256d tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	int i=0;

	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm256_load_pd(&q_dbl[0]);
	x2 = _mm256_load_pd(&q_dbl[4]);
	x3 = _mm256_load_pd(&q_dbl[8]);
	x4 = _mm256_load_pd(&q_dbl[12]);
	x5 = _mm256_load_pd(&q_dbl[16]);
	x6 = _mm256_load_pd(&q_dbl[20]);
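
	// x(j), j = 0..11, starts as q(j,0); the loop adds conj(hh[i]) * q(j,i) for
	// i = 1..nb-1 (the first element of the Householder vector is implicitly 1).
	// The shuffle + FMSUBADD (or addsub with h1_imag conjugated) pattern forms the
	// conjugated complex product per re/im pair.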

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
		q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
		q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

		tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
		x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
		tmp5 = _mm256_mul_pd(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
		x5 = _mm256_add_pd(x5, _mm256_FMSUBADD_pd(h1_real, q5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
		x5 = _mm256_add_pd(x5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
		tmp6 = _mm256_mul_pd(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
		x6 = _mm256_add_pd(x6, _mm256_FMSUBADD_pd(h1_real, q6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
		x6 = _mm256_add_pd(x6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif
	}
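
	// x now holds the dot products with conj(v); scale it by -hh[0] (minus the
	// Householder scalar tau): both parts of hh[0] are negated and the
	// FMADDSUB/addsub pattern then forms the ordinary complex product.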

	h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif
	tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
	x5 = _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#else
	x5 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5));
#endif
	tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
	x6 = _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#else
	x6 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5));
#endif
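
	// Update column 0 of the block: q(j,0) += x(j), since the first element of
	// the Householder vector is 1.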

	q1 = _mm256_load_pd(&q_dbl[0]);
	q2 = _mm256_load_pd(&q_dbl[4]);
	q3 = _mm256_load_pd(&q_dbl[8]);
	q4 = _mm256_load_pd(&q_dbl[12]);
	q5 = _mm256_load_pd(&q_dbl[16]);
	q6 = _mm256_load_pd(&q_dbl[20]);

	q1 = _mm256_add_pd(q1, x1);
	q2 = _mm256_add_pd(q2, x2);
	q3 = _mm256_add_pd(q3, x3);
	q4 = _mm256_add_pd(q4, x4);
	q5 = _mm256_add_pd(q5, x5);
	q6 = _mm256_add_pd(q6, x6);

	_mm256_store_pd(&q_dbl[0], q1);
	_mm256_store_pd(&q_dbl[4], q2);
	_mm256_store_pd(&q_dbl[8], q3);
	_mm256_store_pd(&q_dbl[12], q4);
	_mm256_store_pd(&q_dbl[16], q5);
	_mm256_store_pd(&q_dbl[20], q6);
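
	// Update the remaining columns: q(j,i) += hh[i] * x(j) for i = 1..nb-1, this time
	// with the plain (non-conjugated) product, hence FMADDSUB instead of FMSUBADD.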

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);
		q5 = _mm256_load_pd(&q_dbl[(2*i*ldq)+16]);
		q6 = _mm256_load_pd(&q_dbl[(2*i*ldq)+20]);

		tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
		tmp5 = _mm256_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
		q5 = _mm256_add_pd(q5, _mm256_FMADDSUB_pd(h1_real, x5, _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#else
		q5 = _mm256_add_pd(q5, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x5), _mm256_shuffle_pd(tmp5, tmp5, 0x5)));
#endif
		tmp6 = _mm256_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
		q6 = _mm256_add_pd(q6, _mm256_FMADDSUB_pd(h1_real, x6, _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#else
		q6 = _mm256_add_pd(q6, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x6), _mm256_shuffle_pd(tmp6, tmp6, 0x5)));
#endif

		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+16], q5);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+20], q6);
	}
}
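
// The 8-row variant below applies the same transform as the 12-row kernel above,
// using four AVX registers (8 complex values) per step.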

static __forceinline void hh_trafo_complex_kernel_8_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m256d x1, x2, x3, x4;
	__m256d q1, q2, q3, q4;
	__m256d h1_real, h1_imag;
	__m256d tmp1, tmp2, tmp3, tmp4;
	int i=0;

	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm256_load_pd(&q_dbl[0]);
	x2 = _mm256_load_pd(&q_dbl[4]);
	x3 = _mm256_load_pd(&q_dbl[8]);
	x4 = _mm256_load_pd(&q_dbl[12]);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
		x3 = _mm256_add_pd(x3, _mm256_FMSUBADD_pd(h1_real, q3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		x3 = _mm256_add_pd(x3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
		x4 = _mm256_add_pd(x4, _mm256_FMSUBADD_pd(h1_real, q4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		x4 = _mm256_add_pd(x4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif
	tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#else
	x3 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5));
#endif
	tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
	x4 = _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#else
	x4 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5));
#endif

	q1 = _mm256_load_pd(&q_dbl[0]);
	q2 = _mm256_load_pd(&q_dbl[4]);
	q3 = _mm256_load_pd(&q_dbl[8]);
	q4 = _mm256_load_pd(&q_dbl[12]);

	q1 = _mm256_add_pd(q1, x1);
	q2 = _mm256_add_pd(q2, x2);
	q3 = _mm256_add_pd(q3, x3);
	q4 = _mm256_add_pd(q4, x4);

	_mm256_store_pd(&q_dbl[0], q1);
	_mm256_store_pd(&q_dbl[4], q2);
	_mm256_store_pd(&q_dbl[8], q3);
	_mm256_store_pd(&q_dbl[12], q4);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm256_load_pd(&q_dbl[(2*i*ldq)+8]);
		q4 = _mm256_load_pd(&q_dbl[(2*i*ldq)+12]);

		tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
		tmp3 = _mm256_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm256_add_pd(q3, _mm256_FMADDSUB_pd(h1_real, x3, _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#else
		q3 = _mm256_add_pd(q3, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x3), _mm256_shuffle_pd(tmp3, tmp3, 0x5)));
#endif
		tmp4 = _mm256_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
		q4 = _mm256_add_pd(q4, _mm256_FMADDSUB_pd(h1_real, x4, _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#else
		q4 = _mm256_add_pd(q4, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x4), _mm256_shuffle_pd(tmp4, tmp4, 0x5)));
#endif

		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+8], q3);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+12], q4);
	}
}
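
// The 4-row variant applies the same transform with two AVX registers
// (4 complex values) per step.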

static __forceinline void hh_trafo_complex_kernel_4_AVX_1hv(double complex* q, double complex* hh, int nb, int ldq)
{
	double* q_dbl = (double*)q;
	double* hh_dbl = (double*)hh;

	__m256d x1, x2;
	__m256d q1, q2;
	__m256d h1_real, h1_imag;
	__m256d tmp1, tmp2;
	int i=0;

	__m256d sign = (__m256d)_mm256_set_epi64x(0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000);

	x1 = _mm256_load_pd(&q_dbl[0]);
	x2 = _mm256_load_pd(&q_dbl[4]);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm256_xor_pd(h1_imag, sign);
#endif

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		tmp1 = _mm256_mul_pd(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm256_add_pd(x1, _mm256_FMSUBADD_pd(h1_real, q1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		x1 = _mm256_add_pd(x1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm256_add_pd(x2, _mm256_FMSUBADD_pd(h1_real, q2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		x2 = _mm256_add_pd(x2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, q2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif
	}

	h1_real = _mm256_broadcast_sd(&hh_dbl[0]);
	h1_imag = _mm256_broadcast_sd(&hh_dbl[1]);
	h1_real = _mm256_xor_pd(h1_real, sign);
	h1_imag = _mm256_xor_pd(h1_imag, sign);

	tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#else
	x1 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5));
#endif
	tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#else
	x2 = _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5));
#endif

	q1 = _mm256_load_pd(&q_dbl[0]);
	q2 = _mm256_load_pd(&q_dbl[4]);

	q1 = _mm256_add_pd(q1, x1);
	q2 = _mm256_add_pd(q2, x2);

	_mm256_store_pd(&q_dbl[0], q1);
	_mm256_store_pd(&q_dbl[4], q2);

	for (i = 1; i < nb; i++)
	{
		h1_real = _mm256_broadcast_sd(&hh_dbl[i*2]);
		h1_imag = _mm256_broadcast_sd(&hh_dbl[(i*2)+1]);

		q1 = _mm256_load_pd(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm256_load_pd(&q_dbl[(2*i*ldq)+4]);

		tmp1 = _mm256_mul_pd(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm256_add_pd(q1, _mm256_FMADDSUB_pd(h1_real, x1, _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#else
		q1 = _mm256_add_pd(q1, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x1), _mm256_shuffle_pd(tmp1, tmp1, 0x5)));
#endif
		tmp2 = _mm256_mul_pd(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm256_add_pd(q2, _mm256_FMADDSUB_pd(h1_real, x2, _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#else
		q2 = _mm256_add_pd(q2, _mm256_addsub_pd( _mm256_mul_pd(h1_real, x2), _mm256_shuffle_pd(tmp2, tmp2, 0x5)));
#endif

		_mm256_store_pd(&q_dbl[(2*i*ldq)+0], q1);
		_mm256_store_pd(&q_dbl[(2*i*ldq)+4], q2);
	}
}