// complex_sse_1hv_template.c
//    This file is part of ELPA.
//
//    The ELPA library was originally created by the ELPA consortium,
//    consisting of the following organizations:
//
//    - Max Planck Computing and Data Facility (MPCDF), formerly known as
//      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
//    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
//      Informatik,
//    - Technische Universität München, Lehrstuhl für Informatik mit
//      Schwerpunkt Wissenschaftliches Rechnen ,
//    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
//    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
//      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
//      and
//    - IBM Deutschland GmbH
//
//    This particular source code file contains additions, changes and
//    enhancements authored by Intel Corporation which is not part of
//    the ELPA consortium.
//
//    More information can be found here:
//    http://elpa.mpcdf.mpg.de/
//
//    ELPA is free software: you can redistribute it and/or modify
//    it under the terms of the version 3 of the license of the
//    GNU Lesser General Public License as published by the Free
//    Software Foundation.
//
//    ELPA is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//    GNU Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public License
//    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
//
//    ELPA reflects a substantial effort on the part of the original
//    ELPA consortium, and we ask you to respect the spirit of the
//    license that we chose: i.e., please contribute any changes you
//    may have back to the original ELPA library distribution, and keep
//    any derivatives of ELPA under the same license that we chose for
//    the original distribution, the GNU Lesser General Public License.
//
//
// --------------------------------------------------------------------------------------------------
//
// This file contains the compute intensive kernels for the Householder transformations.
// It should be compiled with the highest possible optimization level.
//
// On Intel Nehalem or Intel Westmere or AMD Magny Cours use -O3 -msse3
// On Intel Sandy Bridge use -O3 -mavx
//
// Copyright of the original code rests with the authors inside the ELPA
// consortium. The copyright of any additional modifications shall rest
// with their original authors, but shall adhere to the licensing terms
// distributed along with the original code in the file "COPYING".
//
// Author: Alexander Heinecke (alexander.heinecke@mytum.de)
// Adapted for building a shared-library by Andreas Marek, MPCDF (andreas.marek@mpcdf.mpg.de)
// --------------------------------------------------------------------------------------------------

#include "config-f90.h"

#include <complex.h>
#include <x86intrin.h>

/*
 * Precision-dependent aliases used by the kernels in this file.
 *
 *   offset          number of scalar elements held by one 128-bit register
 *                   (2 doubles or 4 floats); used as the stride, in scalars,
 *                   between consecutive register loads/stores.
 *   __SSE_DATATYPE  the 128-bit vector type for the selected precision.
 *   _SSE_*          the matching packed intrinsic for each operation.
 *   _SHUFFLE        shuffle immediate that swaps the two scalars of every
 *                   adjacent pair, i.e. exchanges the real and imaginary
 *                   slot of each complex value in a register.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
#define offset 2
#define __SSE_DATATYPE __m128d
#define _SSE_LOAD _mm_load_pd
#define _SSE_STORE _mm_store_pd
#define _SSE_MUL _mm_mul_pd
#define _SSE_ADD _mm_add_pd
#define _SSE_XOR _mm_xor_pd
#define _SSE_MADDSUB _mm_maddsub_pd
#define _SSE_ADDSUB _mm_addsub_pd
#define _SSE_SHUFFLE _mm_shuffle_pd
#define _SHUFFLE _MM_SHUFFLE2(0,1)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define offset 4
#define __SSE_DATATYPE __m128
#define _SSE_LOAD _mm_load_ps
#define _SSE_STORE _mm_store_ps
#define _SSE_MUL _mm_mul_ps
#define _SSE_ADD _mm_add_ps
#define _SSE_XOR _mm_xor_ps
#define _SSE_MADDSUB _mm_maddsub_ps
#define _SSE_ADDSUB _mm_addsub_ps
#define _SSE_SHUFFLE _mm_shuffle_ps
#define _SHUFFLE 0xb1
#endif

// Map the MSVC-style __forceinline keyword onto the GNU always_inline attribute
// so the static kernels below are requested to be inlined into their caller.
#define __forceinline __attribute__((always_inline))

// NOTE(review): __AVX__ is dropped when the build enables the SSE intrinsics
// kernels; presumably this keeps AVX-only code paths out of this SSE template.
// It happens after <x86intrin.h> has been included - confirm the intent.
#ifdef HAVE_SSE_INTRINSICS
#undef __AVX__
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
// Forward declarations of the double-precision strip kernels (6, 4 and 2
// complex rows per call). The entry point below calls them before their
// definitions appear.
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq);
#endif

#ifdef SINGLE_PRECISION_COMPLEX
// Forward declarations of the single-precision strip kernels (same roles as
// the double-precision ones, operating on float complex data).
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq);
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
/*
!f>#ifdef WITH_COMPLEX_SSE_BLOCK1_KERNEL
!f> interface
!f>   subroutine single_hh_trafo_complex_sse_1hv_double(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_sse_1hv_double")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_double_complex)     :: q(*)
!f>     type(c_ptr), value                   :: q
!f>     complex(kind=c_double_complex)     :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

#ifdef SINGLE_PRECISION_COMPLEX
/*
!f>#ifdef HAVE_SSE_INTRINSICS
!f> interface
!f>   subroutine single_hh_trafo_complex_sse_1hv_single(q, hh, pnb, pnq, pldq) &
!f>                             bind(C, name="single_hh_trafo_complex_sse_1hv_single")
!f>     use, intrinsic :: iso_c_binding
!f>     integer(kind=c_int)     :: pnb, pnq, pldq
!f>     ! complex(kind=c_float_complex)   :: q(*)
!f>     type(c_ptr), value                :: q
!f>     complex(kind=c_float_complex)   :: hh(pnb,2)
!f>   end subroutine
!f> end interface
!f>#endif
*/
#endif

/*
 * Fortran-callable entry point (all scalar arguments are passed by reference).
 *
 * Walks over the rows of q in strips and hands each strip to one of the
 * static kernels: strips of 6 complex rows while more than 4 rows remain,
 * then a single 4-row or 2-row kernel call for what is left.
 *
 *   q     matrix data; strip i is passed to the kernels as &q[i]
 *   hh    Householder vector data, forwarded unchanged to the kernels
 *   pnb   number of entries of hh / columns of q touched by each kernel
 *   pnq   not read by this routine (see note below)
 *   pldq  leading dimension of q, in complex elements
 */
#ifdef DOUBLE_PRECISION_COMPLEX
void single_hh_trafo_complex_sse_1hv_double(double complex* q, double complex* hh, int* pnb, int* pnq, int* pldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
void single_hh_trafo_complex_sse_1hv_single(float complex* q, float complex* hh, int* pnb, int* pnq, int* pldq)
#endif
{
        int i;
        int nb = *pnb;
        // NOTE(review): the row count is taken from *pldq, not from *pnq, so the
        // full leading dimension is processed - confirm this is intended.
        int nq = *pldq;
        int ldq = *pldq;
        int remaining;

        (void)pnq;

        // Main part: strips of 6 complex rows.
        for (i = 0; i < nq-4; i+=6)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_6_SSE_1hv_double(&q[i], hh, nb, ldq);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_6_SSE_1hv_single(&q[i], hh, nb, ldq);
#endif
        }

        // The last 6-row strip may already have covered (or overshot) nq, which
        // leaves a zero or negative remainder. Nothing is left to do then; a
        // test for "== 0" only would run the 2-row kernel past the end.
        remaining = nq - i;
        if (remaining <= 0)
        {
                return;
        }

        if (remaining > 2)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_1hv_double(&q[i], hh, nb, ldq);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_4_SSE_1hv_single(&q[i], hh, nb, ldq);
#endif
        }
        else
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_2_SSE_1hv_double(&q[i], hh, nb, ldq);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                hh_trafo_complex_kernel_2_SSE_1hv_single(&q[i], hh, nb, ldq);
#endif
        }
}

/*
 * Precision-matched "multiply, then add in even lanes / subtract in odd
 * lanes" intrinsic, only needed on the __ELPA_USE_FMA__ code path. Defined
 * here (guarded) so the single-precision build does not call the _pd variant.
 */
#ifdef __ELPA_USE_FMA__
#ifndef _SSE_MSUBADD
#ifdef DOUBLE_PRECISION_COMPLEX
#define _SSE_MSUBADD _mm_msubadd_pd
#endif
#ifdef SINGLE_PRECISION_COMPLEX
#define _SSE_MSUBADD _mm_msubadd_ps
#endif
#endif
#endif

/*
 * Strip kernel for 6 complex rows of q (six registers with one complex value
 * each in double precision, three registers with two complex values each in
 * single precision).
 *
 * With q(r,i) denoting q[i*ldq + r] for the rows r of this strip:
 *   1. x(r)   = q(r,0) + sum_{i=1..nb-1} conj(hh[i]) * q(r,i)
 *   2. x(r)   = -hh[0] * x(r)
 *   3. q(r,0) += x(r);   q(r,i) += hh[i] * x(r)   for i = 1..nb-1
 *
 *   q    first element of the strip; accessed with aligned loads and stores,
 *        so every touched address must be 16-byte aligned
 *   hh   nb complex coefficients
 *   nb   number of columns of q that are read and updated
 *   ldq  distance between consecutive columns of q, in complex elements
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_6_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // View the complex arrays as interleaved (real, imag) scalars.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __SSE_DATATYPE x1, x2, x3;
        __SSE_DATATYPE q1, q2, q3;
        __SSE_DATATYPE h1_real, h1_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3;
#ifdef DOUBLE_PRECISION_COMPLEX
        // Registers 4..6 are only needed when a register holds one complex value.
        __SSE_DATATYPE x4, x5, x6;
        __SSE_DATATYPE q4, q5, q6;
        __SSE_DATATYPE tmp4, tmp5, tmp6;
#endif
        int i=0;

        // Mask with only the sign bit set in every lane; XOR with it negates.
#ifdef DOUBLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        // Step 1: start the accumulators with column 0 of the strip.
        x1 = _SSE_LOAD(&q_dbl[0]);
        x2 = _SSE_LOAD(&q_dbl[offset]);
        x3 = _SSE_LOAD(&q_dbl[2*offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        x4 = _SSE_LOAD(&q_dbl[3*offset]);
        x5 = _SSE_LOAD(&q_dbl[4*offset]);
        x6 = _SSE_LOAD(&q_dbl[5*offset]);
#endif

        // x += conj(hh[i]) * q(:,i)
        for (i = 1; i < nb; i++)
        {
                // Broadcast real and imaginary part of hh[i] to all lanes.
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                // Scalar broadcast: reads exactly one float and needs no pointer cast.
                h1_real = _mm_set1_ps(hh_dbl[i*2]);
                h1_imag = _mm_set1_ps(hh_dbl[(i*2)+1]);
#endif
#ifndef __ELPA_USE_FMA__
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
#endif

                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _SSE_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _SSE_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
#endif

                tmp1 = _SSE_MUL(h1_imag, q1);
#ifdef __ELPA_USE_FMA__
                x1 = _SSE_ADD(x1, _SSE_MSUBADD(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _SSE_ADD(x2, _SSE_MSUBADD(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
                tmp3 = _SSE_MUL(h1_imag, q3);
#ifdef __ELPA_USE_FMA__
                x3 = _SSE_ADD(x3, _SSE_MSUBADD(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp4 = _SSE_MUL(h1_imag, q4);
#ifdef __ELPA_USE_FMA__
                x4 = _SSE_ADD(x4, _SSE_MSUBADD(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
                tmp5 = _SSE_MUL(h1_imag, q5);
#ifdef __ELPA_USE_FMA__
                x5 = _SSE_ADD(x5, _SSE_MSUBADD(h1_real, q5, _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
                x5 = _SSE_ADD(x5, _SSE_ADDSUB( _SSE_MUL(h1_real, q5), _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
                tmp6 = _SSE_MUL(h1_imag, q6);
#ifdef __ELPA_USE_FMA__
                x6 = _SSE_ADD(x6, _SSE_MSUBADD(h1_real, q6, _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
                x6 = _SSE_ADD(x6, _SSE_ADDSUB( _SSE_MUL(h1_real, q6), _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
        }

        // Step 2: x = -hh[0] * x (both parts of hh[0] are negated before the multiply).
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_set1_ps(hh_dbl[0]);
        h1_imag = _mm_set1_ps(hh_dbl[1]);
#endif
        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);

        tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
        x1 = _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
        tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
        x3 = _SSE_MADDSUB(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#else
        x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
        x4 = _SSE_MADDSUB(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#else
        x4 = _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
#endif
        tmp5 = _SSE_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
        x5 = _SSE_MADDSUB(h1_real, x5, _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#else
        x5 = _SSE_ADDSUB( _SSE_MUL(h1_real, x5), _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE));
#endif
        tmp6 = _SSE_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
        x6 = _SSE_MADDSUB(h1_real, x6, _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#else
        x6 = _SSE_ADDSUB( _SSE_MUL(h1_real, x6), _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        // Step 3a: column 0 of the strip gets x added directly.
        q1 = _SSE_LOAD(&q_dbl[0]);
        q2 = _SSE_LOAD(&q_dbl[offset]);
        q3 = _SSE_LOAD(&q_dbl[2*offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q4 = _SSE_LOAD(&q_dbl[3*offset]);
        q5 = _SSE_LOAD(&q_dbl[4*offset]);
        q6 = _SSE_LOAD(&q_dbl[5*offset]);
#endif

        q1 = _SSE_ADD(q1, x1);
        q2 = _SSE_ADD(q2, x2);
        q3 = _SSE_ADD(q3, x3);
#ifdef DOUBLE_PRECISION_COMPLEX
        q4 = _SSE_ADD(q4, x4);
        q5 = _SSE_ADD(q5, x5);
        q6 = _SSE_ADD(q6, x6);
#endif

        _SSE_STORE(&q_dbl[0], q1);
        _SSE_STORE(&q_dbl[offset], q2);
        _SSE_STORE(&q_dbl[2*offset], q3);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[3*offset], q4);
        _SSE_STORE(&q_dbl[4*offset], q5);
        _SSE_STORE(&q_dbl[5*offset], q6);
#endif

        // Step 3b: q(:,i) += hh[i] * x for the remaining columns.
        for (i = 1; i < nb; i++)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                h1_real = _mm_set1_ps(hh_dbl[i*2]);
                h1_imag = _mm_set1_ps(hh_dbl[(i*2)+1]);
#endif

                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
                q5 = _SSE_LOAD(&q_dbl[(2*i*ldq)+4*offset]);
                q6 = _SSE_LOAD(&q_dbl[(2*i*ldq)+5*offset]);
#endif

                tmp1 = _SSE_MUL(h1_imag, x1);
#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
                tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
                tmp3 = _SSE_MUL(h1_imag, x3);
#ifdef __ELPA_USE_FMA__
                q3 = _SSE_ADD(q3, _SSE_MADDSUB(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#else
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp4 = _SSE_MUL(h1_imag, x4);
#ifdef __ELPA_USE_FMA__
                q4 = _SSE_ADD(q4, _SSE_MADDSUB(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#else
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
#endif
                tmp5 = _SSE_MUL(h1_imag, x5);
#ifdef __ELPA_USE_FMA__
                q5 = _SSE_ADD(q5, _SSE_MADDSUB(h1_real, x5, _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#else
                q5 = _SSE_ADD(q5, _SSE_ADDSUB( _SSE_MUL(h1_real, x5), _SSE_SHUFFLE(tmp5, tmp5, _SHUFFLE)));
#endif
                tmp6 = _SSE_MUL(h1_imag, x6);
#ifdef __ELPA_USE_FMA__
                q6 = _SSE_ADD(q6, _SSE_MADDSUB(h1_real, x6, _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#else
                q6 = _SSE_ADD(q6, _SSE_ADDSUB( _SSE_MUL(h1_real, x6), _SSE_SHUFFLE(tmp6, tmp6, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

                _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
                _SSE_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
#ifdef DOUBLE_PRECISION_COMPLEX
                _SSE_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
                _SSE_STORE(&q_dbl[(2*i*ldq)+4*offset], q5);
                _SSE_STORE(&q_dbl[(2*i*ldq)+5*offset], q6);
#endif
        }
}

#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_4_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
457 458
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
459 460
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
461 462
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
463
#endif
Andreas Marek's avatar
Andreas Marek committed
464 465 466 467 468
        __SSE_DATATYPE x1, x2, x3, x4;
        __SSE_DATATYPE q1, q2, q3, q4;
        __SSE_DATATYPE h1_real, h1_imag;
        __SSE_DATATYPE tmp1, tmp2, tmp3, tmp4;
        int i=0;
469
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
470
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
471 472
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
473
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
474 475
#endif

Andreas Marek's avatar
Andreas Marek committed
476 477
        x1 = _SSE_LOAD(&q_dbl[0]);
        x2 = _SSE_LOAD(&q_dbl[offset]);
478
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
479 480
        x3 = _SSE_LOAD(&q_dbl[2*offset]);
        x4 = _SSE_LOAD(&q_dbl[3*offset]);
481
#endif
Andreas Marek's avatar
Andreas Marek committed
482 483
        for (i = 1; i < nb; i++)
        {
484
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
485 486
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
487 488
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
489 490
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));
491 492
#endif
#ifndef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
493 494
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
495 496
#endif

Andreas Marek's avatar
Andreas Marek committed
497 498
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
499
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
500 501
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
502
#endif
Andreas Marek's avatar
Andreas Marek committed
503
                tmp1 = _SSE_MUL(h1_imag, q1);
504 505

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
506
                x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
507
#else
Andreas Marek's avatar
Andreas Marek committed
508
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
509 510
#endif

Andreas Marek's avatar
Andreas Marek committed
511
                tmp2 = _SSE_MUL(h1_imag, q2);
512
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
513
                x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
514
#else
Andreas Marek's avatar
Andreas Marek committed
515
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
516 517 518
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
519
                tmp3 = _SSE_MUL(h1_imag, q3);
520
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
521
                x3 = _SSE_ADD(x3, _mm_msubadd_pd(h1_real, q3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
522
#else
Andreas Marek's avatar
Andreas Marek committed
523
                x3 = _SSE_ADD(x3, _SSE_ADDSUB( _SSE_MUL(h1_real, q3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
524
#endif
Andreas Marek's avatar
Andreas Marek committed
525
                tmp4 = _SSE_MUL(h1_imag, q4);
526
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
527
                x4 = _SSE_ADD(x4, _mm_msubadd_pd(h1_real, q4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
528
#else
Andreas Marek's avatar
Andreas Marek committed
529
                x4 = _SSE_ADD(x4, _SSE_ADDSUB( _SSE_MUL(h1_real, q4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
530 531
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
Andreas Marek's avatar
Andreas Marek committed
532
        }
533 534

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
535 536
        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
537 538
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
539 540
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
        h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[1]) )));
541
#endif
Andreas Marek's avatar
Andreas Marek committed
542 543
        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);
544

Andreas Marek's avatar
Andreas Marek committed
545
        tmp1 = _SSE_MUL(h1_imag, x1);
546 547

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
548
        x1 = _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
549
#else
Andreas Marek's avatar
Andreas Marek committed
550
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
551
#endif
Andreas Marek's avatar
Andreas Marek committed
552
        tmp2 = _SSE_MUL(h1_imag, x2);
553
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
554
        x2 = _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
555
#else
Andreas Marek's avatar
Andreas Marek committed
556
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
557 558 559
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
560
        tmp3 = _SSE_MUL(h1_imag, x3);
561
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
562
        x3 = _SSE_MADDSUB(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
563
#else
Andreas Marek's avatar
Andreas Marek committed
564
        x3 = _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE));
565
#endif
Andreas Marek's avatar
Andreas Marek committed
566
        tmp4 = _SSE_MUL(h1_imag, x4);
567
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
568
        x4 = _SSE_MADDSUB(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
569
#else
Andreas Marek's avatar
Andreas Marek committed
570
        x4 = _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE));
571 572 573
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

Andreas Marek's avatar
Andreas Marek committed
574 575
        q1 = _SSE_LOAD(&q_dbl[0]);
        q2 = _SSE_LOAD(&q_dbl[offset]);
576
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
577 578
        q3 = _SSE_LOAD(&q_dbl[2*offset]);
        q4 = _SSE_LOAD(&q_dbl[3*offset]);
579
#endif
Andreas Marek's avatar
Andreas Marek committed
580 581
        q1 = _SSE_ADD(q1, x1);
        q2 = _SSE_ADD(q2, x2);
582
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
583 584
        q3 = _SSE_ADD(q3, x3);
        q4 = _SSE_ADD(q4, x4);
585
#endif
Andreas Marek's avatar
Andreas Marek committed
586 587
        _SSE_STORE(&q_dbl[0], q1);
        _SSE_STORE(&q_dbl[offset], q2);
588
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
589 590
        _SSE_STORE(&q_dbl[2*offset], q3);
        _SSE_STORE(&q_dbl[3*offset], q4);
591
#endif
Andreas Marek's avatar
Andreas Marek committed
592 593
        for (i = 1; i < nb; i++)
        {
594
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
595 596
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
597 598
#endif
#ifdef SINGLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
599 600
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
                h1_imag = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[(i*2)+1]) )));
601
#endif
Andreas Marek's avatar
Andreas Marek committed
602 603
                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
604
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
605 606
                q3 = _SSE_LOAD(&q_dbl[(2*i*ldq)+2*offset]);
                q4 = _SSE_LOAD(&q_dbl[(2*i*ldq)+3*offset]);
607
#endif
Andreas Marek's avatar
Andreas Marek committed
608
                tmp1 = _SSE_MUL(h1_imag, x1);
609 610

#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
611
                q1 = _SSE_ADD(q1, _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
612
#else
Andreas Marek's avatar
Andreas Marek committed
613
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
614
#endif
Andreas Marek's avatar
Andreas Marek committed
615
                tmp2 = _SSE_MUL(h1_imag, x2);
616
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
617
                q2 = _SSE_ADD(q2, _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
618
#else
Andreas Marek's avatar
Andreas Marek committed
619
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
620 621 622
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
623
                tmp3 = _SSE_MUL(h1_imag, x3);
624
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
625
                q3 = _SSE_ADD(q3, _SSE_MADDSUB(h1_real, x3, _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
626
#else
Andreas Marek's avatar
Andreas Marek committed
627
                q3 = _SSE_ADD(q3, _SSE_ADDSUB( _SSE_MUL(h1_real, x3), _SSE_SHUFFLE(tmp3, tmp3, _SHUFFLE)));
628
#endif
Andreas Marek's avatar
Andreas Marek committed
629
                tmp4 = _SSE_MUL(h1_imag, x4);
630
#ifdef __ELPA_USE_FMA__
Andreas Marek's avatar
Andreas Marek committed
631
                q4 = _SSE_ADD(q4, _SSE_MADDSUB(h1_real, x4, _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
632
#else
Andreas Marek's avatar
Andreas Marek committed
633
                q4 = _SSE_ADD(q4, _SSE_ADDSUB( _SSE_MUL(h1_real, x4), _SSE_SHUFFLE(tmp4, tmp4, _SHUFFLE)));
634 635 636
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

Andreas Marek's avatar
Andreas Marek committed
637 638
                _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
                _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
639
#ifdef DOUBLE_PRECISION_COMPLEX
Andreas Marek's avatar
Andreas Marek committed
640 641
                _SSE_STORE(&q_dbl[(2*i*ldq)+2*offset], q3);
                _SSE_STORE(&q_dbl[(2*i*ldq)+3*offset], q4);
642
#endif
Andreas Marek's avatar
Andreas Marek committed
643
        }
644 645 646 647 648 649 650 651 652 653 654
}

/*
 * Apply one complex Householder vector, stored in hh, to a narrow strip of q.
 *
 * The strip is one SSE register wide in the single-precision build (x1 only)
 * and two registers wide in the double-precision build (x1 and x2, the second
 * one `offset` scalars further on; `offset` is a macro defined elsewhere in
 * this file).
 *
 * Steps, as implemented below:
 *   1. x  = q[row 0] + sum_{i=1}^{nb-1} conj(hh[i]) * q[row i]
 *   2. x  = -hh[0] * x
 *   3. q[row 0] += x
 *      q[row i] += hh[i] * x            for i = 1 .. nb-1
 * The complex products assume _SSE_SHUFFLE/_SHUFFLE swap the real and
 * imaginary lanes of each complex number -- NOTE(review): those macros are
 * defined outside this block; confirm.
 *
 * q    in/out; row i starts at scalar index 2*i*ldq, i.e. ldq is counted in
 *      complex elements.
 * hh   coefficients; entries 0 .. nb-1 are read.
 * nb   number of rows of q / entries of hh that are processed.
 * ldq  row stride of q in complex elements.
 */
#ifdef DOUBLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_double(double complex* q, double complex* hh, int nb, int ldq)
#endif
#ifdef SINGLE_PRECISION_COMPLEX
static __forceinline void hh_trafo_complex_kernel_2_SSE_1hv_single(float complex* q, float complex* hh, int nb, int ldq)
#endif
{

        // Scalar views of the complex arrays: element k has its real part at
        // index 2*k and its imaginary part at index 2*k+1.
#ifdef DOUBLE_PRECISION_COMPLEX
        double* q_dbl = (double*)q;
        double* hh_dbl = (double*)hh;
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        float* q_dbl = (float*)q;
        float* hh_dbl = (float*)hh;
#endif
        __SSE_DATATYPE x1, x2;
        __SSE_DATATYPE q1, q2;
        __SSE_DATATYPE h1_real, h1_imag;
        __SSE_DATATYPE tmp1, tmp2;
        int i=0;

        // Mask with only the sign bit set in every lane; XOR with it negates.
#ifdef DOUBLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi64x(0x8000000000000000, 0x8000000000000000);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        __SSE_DATATYPE sign = (__SSE_DATATYPE)_mm_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000);
#endif

        // Step 1: start the accumulator with row 0 of q.
        x1 = _SSE_LOAD(&q_dbl[0]);
#ifdef DOUBLE_PRECISION_COMPLEX
        x2 = _SSE_LOAD(&q_dbl[offset]);
#endif

        for (i = 1; i < nb; i++)
        {
                // Broadcast real and imaginary part of hh[i] to all lanes.
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                // Load the (re, im) pair of hh[i] once as 8 bytes and pick the
                // even lane for the real part and the odd lane for the
                // imaginary part. Loading 8 bytes from &hh_dbl[2*i+1] instead
                // would also touch the float following hh[i].
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
                h1_imag = _mm_movehdup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
#endif
#ifndef __ELPA_USE_FMA__
                // conjugate
                h1_imag = _SSE_XOR(h1_imag, sign);
#endif

                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#endif
                tmp1 = _SSE_MUL(h1_imag, q1);

#ifdef __ELPA_USE_FMA__
                // The FMA path does not negate h1_imag above; msubadd is used
                // in place of the sign flip + addsub of the non-FMA path.
                // The intrinsic must match the register type of the build.
#ifdef DOUBLE_PRECISION_COMPLEX
                x1 = _SSE_ADD(x1, _mm_msubadd_pd(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                x1 = _SSE_ADD(x1, _mm_msubadd_ps(h1_real, q1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif
#else
                x1 = _SSE_ADD(x1, _SSE_ADDSUB( _SSE_MUL(h1_real, q1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp2 = _SSE_MUL(h1_imag, q2);
#ifdef __ELPA_USE_FMA__
                x2 = _SSE_ADD(x2, _mm_msubadd_pd(h1_real, q2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                x2 = _SSE_ADD(x2, _SSE_ADDSUB( _SSE_MUL(h1_real, q2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
        }

        // Step 2: scale the accumulator by -hh[0] (both parts are negated).
#ifdef DOUBLE_PRECISION_COMPLEX
        h1_real = _mm_loaddup_pd(&hh_dbl[0]);
        h1_imag = _mm_loaddup_pd(&hh_dbl[1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
        h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
        h1_imag = _mm_movehdup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[0]) )));
#endif
        h1_real = _SSE_XOR(h1_real, sign);
        h1_imag = _SSE_XOR(h1_imag, sign);

        tmp1 = _SSE_MUL(h1_imag, x1);

#ifdef __ELPA_USE_FMA__
        x1 = _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#else
        x1 = _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
        tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
        x2 = _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#else
        x2 = _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */

        // Step 3a: row 0 of q receives the scaled accumulator unchanged.
        q1 = _SSE_LOAD(&q_dbl[0]);
#ifdef DOUBLE_PRECISION_COMPLEX
        q2 = _SSE_LOAD(&q_dbl[offset]);
#endif
        q1 = _SSE_ADD(q1, x1);
#ifdef DOUBLE_PRECISION_COMPLEX
        q2 = _SSE_ADD(q2, x2);
#endif
        _SSE_STORE(&q_dbl[0], q1);
#ifdef DOUBLE_PRECISION_COMPLEX
        _SSE_STORE(&q_dbl[offset], q2);
#endif

        // Step 3b: every further row i receives hh[i] * x (no conjugate here).
        for (i = 1; i < nb; i++)
        {
#ifdef DOUBLE_PRECISION_COMPLEX
                h1_real = _mm_loaddup_pd(&hh_dbl[i*2]);
                h1_imag = _mm_loaddup_pd(&hh_dbl[(i*2)+1]);
#endif
#ifdef SINGLE_PRECISION_COMPLEX
                // Same single 8-byte load of hh[i] as in the first loop.
                h1_real = _mm_moveldup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
                h1_imag = _mm_movehdup_ps(_mm_castpd_ps(_mm_loaddup_pd( (double *)(&hh_dbl[i*2]) )));
#endif

                q1 = _SSE_LOAD(&q_dbl[(2*i*ldq)+0]);
#ifdef DOUBLE_PRECISION_COMPLEX
                q2 = _SSE_LOAD(&q_dbl[(2*i*ldq)+offset]);
#endif
                tmp1 = _SSE_MUL(h1_imag, x1);

#ifdef __ELPA_USE_FMA__
                q1 = _SSE_ADD(q1, _SSE_MADDSUB(h1_real, x1, _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#else
                q1 = _SSE_ADD(q1, _SSE_ADDSUB( _SSE_MUL(h1_real, x1), _SSE_SHUFFLE(tmp1, tmp1, _SHUFFLE)));
#endif

#ifdef DOUBLE_PRECISION_COMPLEX
                tmp2 = _SSE_MUL(h1_imag, x2);
#ifdef __ELPA_USE_FMA__
                q2 = _SSE_ADD(q2, _SSE_MADDSUB(h1_real, x2, _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#else
                q2 = _SSE_ADD(q2, _SSE_ADDSUB( _SSE_MUL(h1_real, x2), _SSE_SHUFFLE(tmp2, tmp2, _SHUFFLE)));
#endif
#endif /* DOUBLE_PRECISION_COMPLEX */
                _SSE_STORE(&q_dbl[(2*i*ldq)+0], q1);
#ifdef DOUBLE_PRECISION_COMPLEX
                _SSE_STORE(&q_dbl[(2*i*ldq)+offset], q2);
#endif
        }
}