//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif


// Forward declarations
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
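
/*
 * Note on the arithmetic used by all kernels below (explanatory comment):
 * two single-precision complex numbers are packed into one __m128 as
 * interleaved re/im pairs. With a scalar h broadcast into
 * h_real = {hr,hr,hr,hr} and h_imag = {hi,hi,hi,hi}, the product h*q is
 * formed in the non-FMA path as
 *
 *   tmp = _mm_mul_ps(h_imag, q);
 *   res = _mm_addsub_ps(_mm_mul_ps(h_real, q),
 *                       _mm_shuffle_ps(tmp, tmp, 0xb1));
 *
 * i.e. {hr*qr - hi*qi, hr*qi + hi*qr, ...}; flipping the sign bits of h_imag
 * beforehand yields conj(h)*q instead. The FMA path fuses the same pattern
 * via _mm_maddsub_ps / _mm_msubadd_ps.
 */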

/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine single_hh_trafo_complex_sse_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_sse_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     complex(kind=c_float_complex)   :: q(*)
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
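
/*
 * A minimal usage sketch from C (illustrative only; the sizes are
 * placeholders). All size arguments are passed by pointer to match the
 * Fortran interface above; note that the driver below takes the number of
 * updated rows from *pldq.
 *
 *   int nb = 32, nq = 64, ldq = 64;   // Householder vector length, rows, leading dimension
 *   single_hh_trafo_complex_sse_1hv_single(q, hh, &nb, &nq, &ldq);
 */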

void single_hh_trafo_complex_sse_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	//int ldh = *pldh;

	for (i = 0; i < nq-4; i+=6)
	{
		hh_trafo_complex_kernel_6_SSE_1hv_single(&q[i], hh, nb, ldq);
	}
	if (nq-i > 2)
	{
		hh_trafo_complex_kernel_4_SSE_1hv_single(&q[i], hh, nb, ldq);
	}
	else if (nq-i > 0)
	{
		hh_trafo_complex_kernel_2_SSE_1hv_single(&q[i], hh, nb, ldq);
	}
}
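
/*
 * For reference, a plain-C sketch (illustrative only, not compiled here;
 * the helper name is made up) of the update the SIMD kernels below
 * implement: with v = (1, hh[1], ..., hh[nb-1]) and tau = hh[0], each of
 * the nq strided vectors y = (q[j], q[j+ldq], ..., q[j+(nb-1)*ldq]) is
 * transformed as y := y - tau * v * (v^H y).
 *
 *   static void hh_trafo_complex_reference_1hv_single(float complex* q, float complex* hh,
 *                                                      int nb, int nq, int ldq)
 *   {
 *       for (int j = 0; j < nq; j++)
 *       {
 *           float complex x = q[j];                  // v(0) = 1
 *           for (int i = 1; i < nb; i++)
 *               x += conjf(hh[i]) * q[(i*ldq)+j];    // x = v^H * y
 *
 *           x *= -hh[0];                             // scale by -tau
 *
 *           q[j] += x;
 *           for (int i = 1; i < nb; i++)
 *               q[(i*ldq)+j] += hh[i] * x;           // y += v * x
 *       }
 *   }
 */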

static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m128 x1, x2, x3, x4, x5, x6;
	__m128 q1, q2, q3, q4, q5, q6;
	__m128 h1_real, h1_imag;
	__m128 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
	int i=0;

//	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm_load_ps(&q_dbl[0]);
	x2 = _mm_load_ps(&q_dbl[4]);
	x3 = _mm_load_ps(&q_dbl[8]);
//	x4 = _mm_load_pd(&q_dbl[6]);
//	x5 = _mm_load_pd(&q_dbl[8]);
//	x6 = _mm_load_pd(&q_dbl[10]);

	for (i = 1; i < nb; i++)
	{
		// broadcast real and imaginary part of hh[i] into all four lanes
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);
//		q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]);

		// x += conj(hh[i]) * q(i,:)
		tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
		tmp3 = _mm_mul_ps(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
		x3 = _mm_add_ps(x3, _mm_msubadd_ps(h1_real, q3, _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
		x3 = _mm_add_ps(x3, _mm_addsub_ps( _mm_mul_ps(h1_real, q3), _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
//		tmp5 = _mm_mul_pd(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
//		x5 = _mm_add_pd(x5, _mm_msubadd_pd(h1_real, q5, _mm_shuffle_pd(tmp5, tmp5, 0xb1)));
#else
//		x5 = _mm_add_pd(x5, _mm_addsub_pd( _mm_mul_pd(h1_real, q5), _mm_shuffle_pd(tmp5, tmp5, 0xb1)));
#endif
//		tmp6 = _mm_mul_pd(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
//		x6 = _mm_add_pd(x6, _mm_msubadd_pd(h1_real, q6, _mm_shuffle_pd(tmp6, tmp6, 0xb1)));
#else
//		x6 = _mm_add_pd(x6, _mm_addsub_pd( _mm_mul_pd(h1_real, q6), _mm_shuffle_pd(tmp6, tmp6, 0xb1)));
#endif
	}

	// x = x * (-tau), with tau = hh[0]
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
	// careful
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
	tmp3 = _mm_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
	x3 = _mm_maddsub_ps(h1_real, x3, _mm_shuffle_ps(tmp3, tmp3, 0xb1));
#else
	x3 = _mm_addsub_ps( _mm_mul_ps(h1_real, x3), _mm_shuffle_ps(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, 0xb1));
#else
//	x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, 0xb1));
#endif
//	tmp5 = _mm_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
//	x5 = _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, 0xb1));
#else
//	x5 = _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, 0xb1));
#endif
//	tmp6 = _mm_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
//	x6 = _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, 0xb1));
#else
//	x6 = _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, 0xb1));
#endif

	// q(0,:) += x
	q1 = _mm_load_ps(&q_dbl[0]);
	q2 = _mm_load_ps(&q_dbl[4]);
	q3 = _mm_load_ps(&q_dbl[8]);
//	q4 = _mm_load_pd(&q_dbl[6]);
//	q5 = _mm_load_pd(&q_dbl[8]);
//	q6 = _mm_load_pd(&q_dbl[10]);

	q1 = _mm_add_ps(q1, x1);
	q2 = _mm_add_ps(q2, x2);
	q3 = _mm_add_ps(q3, x3);
//	q4 = _mm_add_pd(q4, x4);
//	q5 = _mm_add_pd(q5, x5);
//	q6 = _mm_add_pd(q6, x6);

	_mm_store_ps(&q_dbl[0], q1);
	_mm_store_ps(&q_dbl[4], q2);
	_mm_store_ps(&q_dbl[8], q3);
//	_mm_store_pd(&q_dbl[6], q4);
//	_mm_store_pd(&q_dbl[8], q5);
//	_mm_store_pd(&q_dbl[10], q6);

	for (i = 1; i < nb; i++)
	{
		// careful: q(i,:) += hh[i] * x
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
		q3 = _mm_load_ps(&q_dbl[(2*i*ldq)+8]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);
//		q5 = _mm_load_pd(&q_dbl[(2*i*ldq)+8]);
//		q6 = _mm_load_pd(&q_dbl[(2*i*ldq)+10]);

		tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
		tmp3 = _mm_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
		q3 = _mm_add_ps(q3, _mm_maddsub_ps(h1_real, x3, _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
		q3 = _mm_add_ps(q3, _mm_addsub_ps( _mm_mul_ps(h1_real, x3), _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, 0xb1)));
#endif
//		tmp5 = _mm_mul_pd(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
//		q5 = _mm_add_pd(q5, _mm_maddsub_pd(h1_real, x5, _mm_shuffle_pd(tmp5, tmp5, 0xb1)));
#else
//		q5 = _mm_add_pd(q5, _mm_addsub_pd( _mm_mul_pd(h1_real, x5), _mm_shuffle_pd(tmp5, tmp5, 0xb1)));
#endif
//		tmp6 = _mm_mul_pd(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
//		q6 = _mm_add_pd(q6, _mm_maddsub_pd(h1_real, x6, _mm_shuffle_pd(tmp6, tmp6, 0xb1)));
#else
//		q6 = _mm_add_pd(q6, _mm_addsub_pd( _mm_mul_pd(h1_real, x6), _mm_shuffle_pd(tmp6, tmp6, 0xb1)));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q2);
		_mm_store_ps(&q_dbl[(2*i*ldq)+8], q3);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+6], q4);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+8], q5);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+10], q6);
	}
}

static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m128 x1, x2, x3, x4;
	__m128 q1, q2, q3, q4;
	__m128 h1_real, h1_imag;
	__m128 tmp1, tmp2, tmp3, tmp4;
	int i=0;
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm_load_ps(&q_dbl[0]);
	x2 = _mm_load_ps(&q_dbl[4]);
//	x3 = _mm_load_pd(&q_dbl[4]);
//	x4 = _mm_load_pd(&q_dbl[6]);

	for (i = 1; i < nb; i++)
	{
		// careful: x += conj(hh[i]) * q(i,:)
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		tmp1 = _mm_mul_ps(h1_imag, q1);
		// careful
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_ps(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm_add_ps(x3, _mm_msubadd_ps(h1_real, q3, _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
//		x3 = _mm_add_ps(x3, _mm_addsub_ps( _mm_mul_ps(h1_real, q3), _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm_mul_ps(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm_add_ps(x4, _mm_msubadd_ps(h1_real, q4, _mm_shuffle_ps(tmp4, tmp4, 0xb1)));
#else
//		x4 = _mm_add_ps(x4, _mm_addsub_ps( _mm_mul_ps(h1_real, q4), _mm_shuffle_ps(tmp4, tmp4, 0xb1)));
#endif
	}

	// careful: x = x * (-tau), with tau = hh[0]
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
	// careful
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_maddsub_ps(h1_real, x3, _mm_shuffle_ps(tmp3, tmp3, 0xb1));
#else
//	x3 = _mm_addsub_ps( _mm_mul_ps(h1_real, x3), _mm_shuffle_ps(tmp3, tmp3, 0xb1));
#endif
//	tmp4 = _mm_mul_ps(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_maddsub_ps(h1_real, x4, _mm_shuffle_ps(tmp4, tmp4, 0xb1));
#else
//	x4 = _mm_addsub_ps( _mm_mul_ps(h1_real, x4), _mm_shuffle_ps(tmp4, tmp4, 0xb1));
#endif

	// q(0,:) += x
	q1 = _mm_load_ps(&q_dbl[0]);
	q2 = _mm_load_ps(&q_dbl[4]);
//	q3 = _mm_load_ps(&q_dbl[4]);
//	q4 = _mm_load_ps(&q_dbl[6]);

	q1 = _mm_add_ps(q1, x1);
	q2 = _mm_add_ps(q2, x2);
//	q3 = _mm_add_ps(q3, x3);
//	q4 = _mm_add_ps(q4, x4);

	_mm_store_ps(&q_dbl[0], q1);
	_mm_store_ps(&q_dbl[4], q2);
//	_mm_store_ps(&q_dbl[4], q3);
//	_mm_store_ps(&q_dbl[6], q4);

	for (i = 1; i < nb; i++)
	{
		// careful: q(i,:) += hh[i] * x
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_ps(&q_dbl[(2*i*ldq)+6]);

		tmp1 = _mm_mul_ps(h1_imag, x1);
		// careful
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_ps(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_ps(q3, _mm_maddsub_ps(h1_real, x3, _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#else
//		q3 = _mm_add_ps(q3, _mm_addsub_ps( _mm_mul_ps(h1_real, x3), _mm_shuffle_ps(tmp3, tmp3, 0xb1)));
#endif
//		tmp4 = _mm_mul_ps(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_ps(q4, _mm_maddsub_ps(h1_real, x4, _mm_shuffle_ps(tmp4, tmp4, 0xb1)));
#else
//		q4 = _mm_add_ps(q4, _mm_addsub_ps( _mm_mul_ps(h1_real, x4), _mm_shuffle_ps(tmp4, tmp4, 0xb1)));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q2);
//		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q3);
//		_mm_store_ps(&q_dbl[(2*i*ldq)+6], q4);
	}
}

static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;

	__m128 x1, x2;
	__m128 q1, q2;
	__m128 h1_real, h1_imag;
	__m128 tmp1, tmp2;
	int i=0;

//	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);

	// careful: just load two complex values (one register)
	x1 = _mm_load_ps(&q_dbl[0]);
//	x2 = _mm_load_ps(&q_dbl[2]);

	for (i = 1; i < nb; i++)
	{
		// careful: x += conj(hh[i]) * q(i,:)
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+2]);

		tmp1 = _mm_mul_ps(h1_imag, q1);
		// careful
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
//		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
//		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
	}

	// careful: x = x * (-tau), with tau = hh[0]
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
	// careful
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
//	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
//	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
//	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif

	// q(0,:) += x
	q1 = _mm_load_ps(&q_dbl[0]);
//	q2 = _mm_load_ps(&q_dbl[2]);

	q1 = _mm_add_ps(q1, x1);
//	q2 = _mm_add_ps(q2, x2);

	_mm_store_ps(&q_dbl[0], q1);
//	_mm_store_ps(&q_dbl[2], q2);

	for (i = 1; i < nb; i++)
	{
		// careful: q(i,:) += hh[i] * x
		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));

		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
//		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+2]);

		tmp1 = _mm_mul_ps(h1_imag, x1);
		// careful
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
//		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
//		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
//		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
//		_mm_store_ps(&q_dbl[(2*i*ldq)+2], q2);
	}
}