//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute-intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------
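
// Example of the flags mentioned above for a standalone compile of this file with GCC
// (assumes config-f90.h is on the include path):
//
//   g++ -O3 -msse3 -c elpa2_kernels_complex_sse_2hv_single_precision.cpp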
#include "config-f90.h"

#include <complex>
#include <x86intrin.h>

#define __forceinline __attribute__((always_inline))

#ifdef HAVE_SSE
#undef __AVX__
#endif

extern "C" {

// Forward declaration
static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq, int ldh, std::complex<float> s, std::complex<float> s1);

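// Driver: builds the coupling scalar s = conj(hh[ldh+1]) + sum_{i=2..nb-1} hh[i-1]*conj(hh[ldh+i])
// between the two Householder vectors, then applies the combined transformation to q,
// four complex entries per kernel call.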
void double_hh_trafo_complex_sse_2hv_single_(std::complex<float>* q, std::complex<float>* hh, int* pnb, int* pnq, int* pldq, int* pldh)
{
	int i;
	int nb = *pnb;
	int nq = *pldq;
	int ldq = *pldq;
	int ldh = *pldh;

	std::complex<float> s = conj(hh[(ldh)+1])*1.0f;

	for (i = 2; i < nb; i++)
	{
		s += hh[i-1] * conj(hh[(i+ldh)]);
	}

	for (i = 0; i < nq; i+=4)
	{
		hh_trafo_complex_kernel_4_SSE_2hv_single(&q[i], hh, nb, ldq, ldh, s, s);
	}
}
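
// Kernel: applies the two Householder reflectors stored in hh (second vector at offset ldh) to
// four complex entries per row of q. Each __m128 holds two std::complex<float> values as
// interleaved (real, imag) lanes; a complex product is built from two broadcast multiplies,
// an _mm_shuffle_ps(..., 0xb1) that swaps the (real, imag) lanes, and _mm_addsub_ps
// (or _mm_maddsub_ps/_mm_msubadd_ps when the FMA macros are available).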

static __forceinline void hh_trafo_complex_kernel_4_SSE_2hv_single(std::complex<float>* q, std::complex<float>* hh, int nb, int ldq, int ldh, std::complex<float> s, std::complex<float> s1)
{
	float* q_dbl = (float*)q;
	float* hh_dbl = (float*)hh;
	float* s_dbl = (float*)(&s);

	__m128 x1, x2, x3, x4;
	__m128 y1, y2, y3, y4;
	__m128 q1, q2, q3, q4;
	__m128 h1_real, h1_imag, h2_real, h2_imag;
	__m128 tmp1, tmp2, tmp3, tmp4;
	int i=0;

//	__m128d sign = (__m128d)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
	__m128 sign = (__m128)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);

	x1 = _mm_load_ps(&q_dbl[(2*ldq)+0]);  // q(1,1) ... q(2,1)
	x2 = _mm_load_ps(&q_dbl[(2*ldq)+4]);  // q(3,1) ... q(4,1)
//	x3 = _mm_load_pd(&q_dbl[(2*ldq)+4]);
//	x4 = _mm_load_pd(&q_dbl[(2*ldq)+6]);

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

	y1 = _mm_load_ps(&q_dbl[0]);
	y2 = _mm_load_ps(&q_dbl[4]);
//	y3 = _mm_load_pd(&q_dbl[4]);
//	y4 = _mm_load_pd(&q_dbl[6]);

	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
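
	// First pass over rows 2..nb-1: accumulate x += conj(hh[i-1]) * q(i,:) and
	// y += conj(hh[ldh+i]) * q(i,:); the sign mask (or msubadd in the FMA path) supplies
	// the conjugation of the Householder coefficients.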

	for (i = 2; i < nb; i++)
	{
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
		x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
		x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//		x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//		x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
		// conjugate
		h2_imag = _mm_xor_ps(h2_imag, sign);
#endif

		tmp1 = _mm_mul_ps(h2_imag, q1);
#ifdef __ELPA_USE_FMA__
		y1 = _mm_add_ps(y1, _mm_msubadd_ps(h2_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1 )));
#else
		y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, q2);
#ifdef __ELPA_USE_FMA__
		y2 = _mm_add_ps(y2, _mm_msubadd_ps(h2_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, q3);
#ifdef __ELPA_USE_FMA__
//		y3 = _mm_add_pd(y3, _mm_msubadd_pd(h2_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, q4);
#ifdef __ELPA_USE_FMA__
//		y4 = _mm_add_pd(y4, _mm_msubadd_pd(h2_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
	}
	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));
#ifndef __ELPA_USE_FMA__
	// conjugate
	h1_imag = _mm_xor_ps(h1_imag, sign);
#endif

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_add_ps(x1, _mm_msubadd_ps(h1_real, q1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	x1 = _mm_add_ps(x1, _mm_addsub_ps( _mm_mul_ps(h1_real, q1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_add_ps(x2, _mm_msubadd_ps(h1_real, q2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	x2 = _mm_add_ps(x2, _mm_addsub_ps( _mm_mul_ps(h1_real, q2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_add_pd(x3, _mm_msubadd_pd(h1_real, q3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	x3 = _mm_add_pd(x3, _mm_addsub_pd( _mm_mul_pd(h1_real, q3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_add_pd(x4, _mm_msubadd_pd(h1_real, q4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	x4 = _mm_add_pd(x4, _mm_addsub_pd( _mm_mul_pd(h1_real, q4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
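
	// Scale x by -hh[0]: both the broadcast real and imaginary parts of the first Householder
	// coefficient are sign-flipped, then x is multiplied by that complex value.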

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	x1 = _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	x1 = _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	x2 = _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	x2 = _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	x3 = _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	x3 = _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	x4 = _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	x4 = _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif
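
	// Scale by the second coefficient hh[ldh]: s becomes -hh[ldh]*s, y becomes -hh[ldh]*y, and the
	// following block adds (-hh[ldh]*s)*x to y, i.e. y := -hh[ldh]*(y + s*x).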

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[ldh*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh*2)+1]) )));

	h1_real = _mm_xor_ps(h1_real, sign);
	h1_imag = _mm_xor_ps(h1_imag, sign);
	h2_real = _mm_xor_ps(h2_real, sign);
	h2_imag = _mm_xor_ps(h2_imag, sign);

	tmp2 = _mm_loadu_ps(s_dbl);
	tmp1 = _mm_mul_ps(h2_imag, tmp2);
#ifdef __ELPA_USE_FMA__
	tmp2 = _mm_maddsub_ps(h2_real, tmp2, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	tmp2 = _mm_addsub_ps( _mm_mul_ps(h2_real, tmp2), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif

	_mm_storeu_ps(s_dbl, tmp2);
	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[0]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&s_dbl[1]) )));

	tmp1 = _mm_mul_ps(h1_imag, y1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_maddsub_ps(h1_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#else
	y1 = _mm_addsub_ps( _mm_mul_ps(h1_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1));
#endif
	tmp2 = _mm_mul_ps(h1_imag, y2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_maddsub_ps(h1_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#else
	y2 = _mm_addsub_ps( _mm_mul_ps(h1_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, y3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_maddsub_pd(h1_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#else
//	y3 = _mm_addsub_pd( _mm_mul_pd(h1_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1)));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, y4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_maddsub_pd(h1_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#else
//	y4 = _mm_addsub_pd( _mm_mul_pd(h1_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1)));
#endif

	tmp1 = _mm_mul_ps(h2_imag, x1);
#ifdef __ELPA_USE_FMA__
	y1 = _mm_add_ps(y1, _mm_maddsub_ps(h2_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	y1 = _mm_add_ps(y1, _mm_addsub_ps( _mm_mul_ps(h2_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, x2);
#ifdef __ELPA_USE_FMA__
	y2 = _mm_add_ps(y2, _mm_maddsub_ps(h2_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	y2 = _mm_add_ps(y2, _mm_addsub_ps( _mm_mul_ps(h2_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, x3);
#ifdef __ELPA_USE_FMA__
//	y3 = _mm_add_pd(y3, _mm_maddsub_pd(h2_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	y3 = _mm_add_pd(y3, _mm_addsub_pd( _mm_mul_pd(h2_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, x4);
#ifdef __ELPA_USE_FMA__
//	y4 = _mm_add_pd(y4, _mm_maddsub_pd(h2_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	y4 = _mm_add_pd(y4, _mm_addsub_pd( _mm_mul_pd(h2_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif
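
	// Second pass: write the transformation back into q. The first row receives y; the second row
	// (offset ldq) receives x plus hh[ldh+1]*y; the remaining rows are handled below.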

	q1 = _mm_load_ps(&q_dbl[0]);
	q2 = _mm_load_ps(&q_dbl[4]);
//	q3 = _mm_load_pd(&q_dbl[4]);
//	q4 = _mm_load_pd(&q_dbl[6]);

	q1 = _mm_add_ps(q1, y1);
	q2 = _mm_add_ps(q2, y2);
//	q3 = _mm_add_pd(q3, y3);
//	q4 = _mm_add_pd(q4, y4);

	_mm_store_ps(&q_dbl[0], q1);
	_mm_store_ps(&q_dbl[4], q2);
//	_mm_store_pd(&q_dbl[4], q3);
//	_mm_store_pd(&q_dbl[6], q4);

	h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+1)*2]) )));
	h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(ldq*2)+0]);
	q2 = _mm_load_ps(&q_dbl[(ldq*2)+4]);
//	q3 = _mm_load_pd(&q_dbl[(ldq*2)+4]);
//	q4 = _mm_load_pd(&q_dbl[(ldq*2)+6]);

	q1 = _mm_add_ps(q1, x1);
	q2 = _mm_add_ps(q2, x2);
//	q3 = _mm_add_pd(q3, x3);
//	q4 = _mm_add_pd(q4, x4);

	tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(ldq*2)+0], q1);
	_mm_store_ps(&q_dbl[(ldq*2)+4], q2);
//	_mm_store_pd(&q_dbl[(ldq*2)+4], q3);
//	_mm_store_pd(&q_dbl[(ldq*2)+6], q4);
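
	// Interior rows i = 2..nb-1: q(i,:) += hh[i-1]*x + hh[ldh+i]*y (no conjugation in this
	// direction).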

	for (i = 2; i < nb; i++)
	{
		q1 = _mm_load_ps(&q_dbl[(2*i*ldq)+0]);
		q2 = _mm_load_ps(&q_dbl[(2*i*ldq)+4]);
//		q3 = _mm_load_pd(&q_dbl[(2*i*ldq)+4]);
//		q4 = _mm_load_pd(&q_dbl[(2*i*ldq)+6]);

		h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i-1)*2]) )));
		h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((i-1)*2)+1]) )));

		tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		h2_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(ldh+i)*2]) )));
		h2_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((ldh+i)*2)+1]) )));

		tmp1 = _mm_mul_ps(h2_imag, y1);
#ifdef __ELPA_USE_FMA__
		q1 = _mm_add_ps(q1, _mm_maddsub_ps(h2_real, y1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
		q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h2_real, y1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
		tmp2 = _mm_mul_ps(h2_imag, y2);
#ifdef __ELPA_USE_FMA__
		q2 = _mm_add_ps(q2, _mm_maddsub_ps(h2_real, y2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
		q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h2_real, y2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//		tmp3 = _mm_mul_pd(h2_imag, y3);
#ifdef __ELPA_USE_FMA__
//		q3 = _mm_add_pd(q3, _mm_maddsub_pd(h2_real, y3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//		q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h2_real, y3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//		tmp4 = _mm_mul_pd(h2_imag, y4);
#ifdef __ELPA_USE_FMA__
//		q4 = _mm_add_pd(q4, _mm_maddsub_pd(h2_real, y4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//		q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h2_real, y4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

		_mm_store_ps(&q_dbl[(2*i*ldq)+0], q1);
		_mm_store_ps(&q_dbl[(2*i*ldq)+4], q2);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+4], q3);
//		_mm_store_pd(&q_dbl[(2*i*ldq)+6], q4);
	}
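
	// Last row (index nb): only the first Householder vector contributes, q(nb,:) += hh[nb-1]*x.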

	h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(nb-1)*2]) )));
	h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[((nb-1)*2)+1]) )));

	q1 = _mm_load_ps(&q_dbl[(2*nb*ldq)+0]);
	q2 = _mm_load_ps(&q_dbl[(2*nb*ldq)+4]);
//	q3 = _mm_load_pd(&q_dbl[(2*nb*ldq)+4]);
//	q4 = _mm_load_pd(&q_dbl[(2*nb*ldq)+6]);

	tmp1 = _mm_mul_ps(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
	q1 = _mm_add_ps(q1, _mm_maddsub_ps(h1_real, x1, _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#else
	q1 = _mm_add_ps(q1, _mm_addsub_ps( _mm_mul_ps(h1_real, x1), _mm_shuffle_ps(tmp1, tmp1, 0xb1)));
#endif
	tmp2 = _mm_mul_ps(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
	q2 = _mm_add_ps(q2, _mm_maddsub_ps(h1_real, x2, _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#else
	q2 = _mm_add_ps(q2, _mm_addsub_ps( _mm_mul_ps(h1_real, x2), _mm_shuffle_ps(tmp2, tmp2, 0xb1)));
#endif
//	tmp3 = _mm_mul_pd(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
//	q3 = _mm_add_pd(q3, _mm_maddsub_pd(h1_real, x3, _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#else
//	q3 = _mm_add_pd(q3, _mm_addsub_pd( _mm_mul_pd(h1_real, x3), _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0,1))));
#endif
//	tmp4 = _mm_mul_pd(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
//	q4 = _mm_add_pd(q4, _mm_maddsub_pd(h1_real, x4, _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#else
//	q4 = _mm_add_pd(q4, _mm_addsub_pd( _mm_mul_pd(h1_real, x4), _mm_shuffle_pd(tmp4, tmp4, _MM_SHUFFLE2(0,1))));
#endif

	_mm_store_ps(&q_dbl[(2*nb*ldq)+0], q1);
	_mm_store_ps(&q_dbl[(2*nb*ldq)+4], q2);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+4], q3);
//	_mm_store_pd(&q_dbl[(2*nb*ldq)+6], q4);
}
} // extern C