//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

// Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);

/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value                :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
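// The "!f>" lines above are extracted during the ELPA build to generate the
// Fortran interface for this C kernel.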
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	float complex s = conj(hh[(ldh)+1])*1.0f;
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}
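	// At this point s = conj(hh[ldh+1]) + sum_{i=2..nb-1} hh[i-1]*conj(hh[ldh+i]),
	// the scalar product that couples the two Householder vectors in the rank-2
	// update. The kernel below processes the q block four rows at a time.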

	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
	}
}

static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

	__m128 x1, x2, x3, x4;
	__m128 y1, y2, y3, y4;
	__m128 q1, q2, q3, q4;
	__m128 h1_real, h1_imag, h2_real, h2_imag;
	__m128 tmp1, tmp2, tmp3, tmp4;
	int i=0;

//	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
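	// Complex arithmetic scheme used throughout this kernel: for a broadcast scalar
	// h = (h_real, h_imag) and a register of packed complex numbers q,
	//     tmp    = h_imag * q
	//     result = addsub(h_real * q, swap_re_im(tmp))      (swap_re_im = shuffle 0xb1)
	// yields h*q. Conjugating h is done either by flipping the sign of h_imag with
	// the mask above (non-FMA path) or by using msubadd instead of maddsub (FMA path).
	// The s1 argument is currently unused; the driver passes s twice.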

	x1 = _mm_load_ps(&q_dbl[(2*ldq)+0]);  // q(1,2) ... q(2,2)
	x2 = _mm_load_ps(&q_dbl[(2*ldq)+4]);  // q(3,2) ... q(4,2)
//	x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
//	x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

	y1 = _mm_load_ps(&q_dbl[0]);
	y2 = _mm_load_ps(&q_dbl[4]);
//	y3 = _mm_load_pd(&q_dbl[4]);
//	y4 = _mm_load_pd(&q_dbl[6]);

	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	for (i = 2; i < nb; i++)
	{
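		// Accumulate x += conj(hh[i-1]) * q(:,i) and y += conj(hh[ldh+i]) * q(:,i)
		// for the current four-row block (two complex numbers per __m128 register).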
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
//		y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
//		y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
	}
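
	// Tail of the x accumulation: x += conj(hh[nb-1]) * q(:,nb); the second
	// Householder vector does not contribute to this last column.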
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
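
	// Scale x by -tau1 (tau1 = hh[0], the first Householder factor); xor-ing the
	// sign mask into both the real and imaginary broadcasts negates tau1.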

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif
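
	// Load tau2 = hh[ldh] (the head of the second Householder vector) negated into
	// h1 and h2, then replace s by -tau2*s in place; the broadcast of the new s
	// becomes the h2 factor used in the y update below.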

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));

	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);
	h2_real = _mm_xor_ps(h2_real, sign);
	h2_imag = _mm_xor_ps(h2_imag, sign);

	// s is a single float complex: load and store only its two floats so that
	// nothing is written past the end of s
	tmp2 = _mm_loadl_pi(_mm_setzero_ps(), (__m64 const *) s_dbl);
	tmp1 = _mm_mul_ps(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm_maddsub_ps(h2_real, tmp2, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	tmp2 = _mm_addsub_ps( _mm_mul_ps(h2_real, tmp2), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	_mm_storel_pi((__m64 *) s_dbl, tmp2);
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));
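
	// y <- (-tau2)*y + (-tau2*s)*x : apply the second Householder factor together
	// with the coupling term computed from s.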

	tmp1 = _mm_mul_ps(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_maddsub_ps(h1_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	y1 = _mm_addsub_ps( _mm_mul_ps(h1_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_maddsub_ps(h1_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	y2 = _mm_addsub_ps( _mm_mul_ps(h1_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif

	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_maddsub_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_maddsub_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
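
	// Rank-2 update of the q block, column by column. First column: q += y.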

	q1 = _mm_load_ps(&q_dbl[0]);
	q2 = _mm_load_ps(&q_dbl[4]);
//	q3 = _mm_load_pd(&q_dbl[4]);
//	q4 = _mm_load_pd(&q_dbl[6]);

	q1 = _mm_add_ps(q1, y1);
	q2 = _mm_add_ps(q2, y2);
//	q3 = _mm_add_pd(q3, y3);
//	q4 = _mm_add_pd(q4, y4);

	_mm_store_ps(&q_dbl[0], q1);
	_mm_store_ps(&q_dbl[4], q2);
//	_mm_store_pd(&q_dbl[4], q3);
//	_mm_store_pd(&q_dbl[6], q4);
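
	// Second column: q += x + hh[ldh+1]*y.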

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(ldq*2)+0]);
	q2 = _mm_load_ps(&q_dbl[(ldq*2)+4]);
//	q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]);
//	q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]);

	q1 = _mm_add_ps(q1, x1);
	q2 = _mm_add_ps(q2, x2);
//	q3 = _mm_add_pd(q3, x3);
//	q4 = _mm_add_pd(q4, x4);

	tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(ldq*2)+0], q1);
	_mm_store_ps(&q_dbl[(ldq*2)+4], q2);
//	_mm_store_pd(&q_dbl[(ldq*2)+4], q3);
//	_mm_store_pd(&q_dbl[(ldq*2)+6], q4);

	for (i = 2; i < nb; i++)
	{
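		// Columns i = 2..nb-1: q(:,i) += hh[i-1]*x + hh[ldh+i]*y (no conjugation
		// in the update phase).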
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));

		tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));

		tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q2);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+4], q3);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+6], q4);
	}
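
	// Last column of the block: add hh[nb-1]*x; the second Householder vector
	// does not reach this column.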

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(2*nb*ldq)+0], q1);
	_mm_store_ps(&q_dbl[(2*nb*ldq)+4], q2);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
}
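
// The block below is NOT part of ELPA and is never compiled (it is guarded by
// #if 0). It is a minimal, self-contained sketch that demonstrates the
// shuffle/addsub complex-multiply scheme described above and checks it against
// the scalar complex product; the chosen test values are illustrative only.
#if 0
#include <stdio.h>
#include <complex.h>
#include <x86intrin.h>

int main(void)
{
	// Two single-precision complex numbers packed as [re0, im0, re1, im1].
	float q_mem[4] __attribute__((aligned(16))) = { 1.0f, 2.0f, -3.0f, 0.5f };
	float complex h = 0.75f - 1.25f*I;

	__m128 q      = _mm_load_ps(q_mem);
	__m128 h_real = _mm_set1_ps(crealf(h));   // broadcast real part of h
	__m128 h_imag = _mm_set1_ps(cimagf(h));   // broadcast imaginary part of h

	// tmp = imag(h)*q; swap re/im within each pair (shuffle 0xb1) and combine with
	// addsub: even lanes get re(h)*re(q) - im(h)*im(q), odd lanes re(h)*im(q) + im(h)*re(q).
	__m128 tmp = _mm_mul_ps(h_imag, q);
	__m128 res = _mm_addsub_ps(_mm_mul_ps(h_real, q), _mm_shuffle_ps(tmp, tmp, 0xb1));

	float out[4] __attribute__((aligned(16)));
	_mm_store_ps(out, res);

	// Compare against the scalar complex product.
	for (int k = 0; k < 2; k++)
	{
		float complex ref = h * (q_mem[2*k] + q_mem[2*k+1]*I);
		printf("lane %d: simd = (%g, %g), scalar = (%g, %g)\n",
		       k, out[2*k], out[2*k+1], crealf(ref), cimagf(ref));
	}
	return 0;
}
#endif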