//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
// Author: Andreas Marek, MPCDF, based on the double precision case of A. Heinecke
//
#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

// Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1);

/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine double_hh_trafo_complex_sse_2hv_single(q, hh, pnb, pnq, pldq, pldh) &
!f>                             bind(C, name="double_hh_trafo_complex_sse_2hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq, pldh
!f>     complex(kind=c_float)   :: q(*)
!f>     complex(kind=c_float)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
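// Driver routine: applies the pair of Householder reflectors stored in the two
// columns of hh to the q block passed in from Fortran, handing 4 single-precision
// complex values (two 128-bit SSE registers) to the kernel per call.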
void double_hh_trafo_complex_sse_2hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;
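
	// s accumulates the overlap of the two Householder vectors,
	// s = conj(hh(2,2)) + sum_{i=2..nb-1} hh(i,1)*conj(hh(i+1,2)),
	// which is needed later to combine the two rank-1 updates (editorial comment,
	// the interpretation is not stated in the original source).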

	float complex s = conj(hh[(ldh)+1])*1.0f;
	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}
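
	// Apply the kernel to the q block in chunks of 4 complex values (two __m128
	// registers).  There is no remainder loop, so nq is presumably expected to
	// be a multiple of 4.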

	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
	}
}

static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(float complex* q, float complex* hh, int nb, int ldq, int ldh, float complex s, float complex s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);
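
	// q, hh and s are reinterpreted as interleaved float arrays: complex element k
	// lives at float indices 2k (real) and 2k+1 (imaginary), hence the factors of 2
	// in the addressing below.  Each __m128 register holds two complex numbers as
	// [re0, im0, re1, im1].  The h*_real/h*_imag loads broadcast the real and
	// imaginary part of one hh coefficient to all four lanes (_mm_loaddup_pd
	// duplicates the 8-byte re/im pair, _mm_moveldup_ps then duplicates the
	// selected part).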

	__m128 x1, x2, x3, x4;
	__m128 y1, y2, y3, y4;
	__m128 q1, q2, q3, q4;
	__m128 h1_real, h1_imag, h2_real, h2_imag;
	__m128 tmp1, tmp2, tmp3, tmp4;
	int i=0;

//	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm_load_ps(&q_dbl[(2*ldq)+0]);  // q(1,1) ... q(2,1)
	x2 = _mm_load_ps(&q_dbl[(2*ldq)+4]);  // q(3,1) ... q(4,1)
//	x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
//	x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

	y1 = _mm_load_ps(&q_dbl[0]);
	y2 = _mm_load_ps(&q_dbl[4]);
//	y3 = _mm_load_pd(&q_dbl[4]);
//	y4 = _mm_load_pd(&q_dbl[6]);
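
	// Complex multiply-accumulate pattern used throughout this kernel: with
	// tmp = h_imag*v and the re/im swap _mm_shuffle_ps(tmp, tmp, 0xb1), the
	// addsub/maddsub combination evaluates acc += h*v for two packed complex
	// numbers.  In this accumulation phase h_imag is sign-flipped beforehand
	// (or msubadd is used in the FMA branch), so the accumulated value is the
	// conjugated product; here: y += conj(h2) * x.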
	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	for (i = 2; i < nb; i++)
	{
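		// Accumulate the two projections: x gathers the q blocks weighted by
		// conj(hh(:,1)), y gathers them weighted by conj(hh(:,2)) (offset by one,
		// as the index arithmetic below shows).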
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
//		y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
//		y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
	}
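
	// hh(:,1) extends one element further than hh(:,2), so the block at offset
	// nb*ldq contributes to x only.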
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
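
	// hh[0] holds the scalar factor of the first reflector (tau1); negating both
	// its real and imaginary part below yields -tau1, so the following block
	// computes x = -tau1 * x (sign convention read off the code itself).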

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif
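
	// Same treatment for the second reflector: hh[ldh] supplies tau2, negated via
	// the sign mask.  The next block also rescales the coupling scalar s by -tau2
	// and re-broadcasts it, so that y can be formed as -tau2 * (y + s * x).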

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));

	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);
	h2_real = _mm_xor_ps(h2_real, sign);
	h2_imag = _mm_xor_ps(h2_imag, sign);

	tmp2 = _mm_loadu_ps(s_dbl);
	tmp1 = _mm_mul_ps(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm_maddsub_ps(h2_real, tmp2, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	tmp2 = _mm_addsub_ps( _mm_mul_ps(h2_real, tmp2), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	// store only the low two floats (re/im of s); a full 16-byte _mm_storeu_ps
	// would write past the 8-byte float complex that s_dbl points to
	_mm_storel_pi((__m64 *) s_dbl, tmp2);
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));

	tmp1 = _mm_mul_ps(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_maddsub_ps(h1_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	y1 = _mm_addsub_ps( _mm_mul_ps(h1_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_maddsub_ps(h1_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	y2 = _mm_addsub_ps( _mm_mul_ps(h1_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif

	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_maddsub_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_maddsub_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
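
	// Write-back phase: the first q block gets +y, the block at offset ldq gets
	// +x plus the h2-weighted y, the interior blocks (loop below) get h1-weighted x
	// plus h2-weighted y, and the final block at offset nb*ldq gets only the
	// h1-weighted x.  These multiplications use the non-conjugated coefficients.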

	q1 = _mm_load_ps(&q_dbl[0]);
	q2 = _mm_load_ps(&q_dbl[4]);
//	q3 = _mm_load_pd(&q_dbl[4]);
//	q4 = _mm_load_pd(&q_dbl[6]);

	q1 = _mm_add_ps(q1, y1);
	q2 = _mm_add_ps(q2, y2);
//	q3 = _mm_add_pd(q3, y3);
//	q4 = _mm_add_pd(q4, y4);

	_mm_store_ps(&q_dbl[0], q1);
	_mm_store_ps(&q_dbl[4], q2);
//	_mm_store_pd(&q_dbl[4], q3);
//	_mm_store_pd(&q_dbl[6], q4);

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(ldq*2)+0]);
	q2 = _mm_load_ps(&q_dbl[(ldq*2)+4]);
//	q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]);
//	q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]);

	q1 = _mm_add_ps(q1, x1);
	q2 = _mm_add_ps(q2, x2);
//	q3 = _mm_add_pd(q3, x3);
//	q4 = _mm_add_pd(q4, x4);

	tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(ldq*2)+0], q1);
	_mm_store_ps(&q_dbl[(ldq*2)+4], q2);
//	_mm_store_pd(&q_dbl[(ldq*2)+4], q3);
//	_mm_store_pd(&q_dbl[(ldq*2)+6], q4);

	for (i = 2; i < nb; i++)
	{
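		// Update block i of q with the contributions from both reflectors,
		// q(:, block i) += hh(i,1) * x + hh(i+1,2) * y  (1-based hh indexing
		// as in the Fortran interface above; editorial comment).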
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));

		tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));

		tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q2);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+4], q3);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+6], q4);
	}
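
	// Final block: only the first reflector reaches this far, so just add the
	// hh(nb,1)-weighted x.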

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(2*nb*ldq)+0], q1);
	_mm_store_ps(&q_dbl[(2*nb*ldq)+4], q2);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
}