!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF

module compute_hh_trafo_complex
! Driver routines that forward blocks of the stripe array to the complex
! hh_trafo compute kernels. Which routine names exist depends on the
! preprocessor switches tested below (OpenMP or not, single precision or not).
#include "config-f90.h"
  use elpa_mpi
  implicit none

  ! Double-precision driver: always exported; the name depends on OpenMP.
#ifdef WITH_OPENMP
  public :: compute_hh_trafo_complex_cpu_openmp_double
#else
  public :: compute_hh_trafo_complex_cpu_double
#endif

  ! Single-precision driver: only exported when single precision is requested.
#ifdef WANT_SINGLE_PRECISION_COMPLEX

#ifdef WITH_OPENMP
  public :: compute_hh_trafo_complex_cpu_openmp_single
#else
  public :: compute_hh_trafo_complex_cpu_single
#endif

#endif

  contains

!> Double-precision complex driver: hands columns of one stripe of a to the
!> selected hh_trafo compute kernel and accumulates flop and time counters.
!> (Going by the kernel names these are presumably Householder
!> transformations; the kernels themselves are defined outside this routine.)
!>
!> Arguments
!>   a                 stripe array, shape (stripe_width, a_dim2, stripe_count)
!>                     plus a trailing max_threads dimension in the OpenMP
!>                     build; sections of it are passed to the kernels.
!>   stripe_width, a_dim2, stripe_count
!>                     extents of a.
!>   max_threads, l_nev, thread_width, my_thread   (OpenMP build only)
!>                     max_threads is the last extent of a, my_thread selects
!>                     the slice used; l_nev and thread_width enter the
!>                     computation of the row count for the last stripe.
!>   last_stripe_width (non-OpenMP build only) row count used when
!>                     istripe is not smaller than stripe_count.
!>   a_off             column offset added when indexing a.
!>   nbw, max_blk_size extents of bcast_buffer; nbw is also forwarded to the
!>                     kernels.
!>   bcast_buffer      columns off+1 .. off+ncols are forwarded to the kernels.
!>   kernel_flops      running flop counter, incremented here.
!>   kernel_time       running timer (seconds from mpi_wtime), incremented here.
!>   off, ncols        first-column offset and number of columns processed.
!>   istripe           index of the stripe of a that is processed.
!>   THIS_COMPLEX_ELPA_KERNEL
!>                     kernel id; only evaluated at run time when the build
!>                     defines WITH_NO_SPECIFIC_COMPLEX_KERNEL.
#ifdef WITH_OPENMP
         subroutine compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, &
                                                        max_threads, l_nev,                          &
                                                        a_off, nbw, max_blk_size, bcast_buffer,      &
                                                        kernel_flops, kernel_time,                   &
                                                        off, ncols, istripe,                         &
                                                        my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
         subroutine compute_hh_trafo_complex_cpu_double       (a, stripe_width, a_dim2, stripe_count, &
                                                        a_off, nbw, max_blk_size, bcast_buffer,      &
                                                        kernel_flops, kernel_time,                   &
                                                        off, ncols, istripe, last_stripe_width,      &
                                                        THIS_COMPLEX_ELPA_KERNEL)
#endif
           use precision
           use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
           use timings
#endif
           use iso_c_binding

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
           use kernel_interfaces
#endif
           implicit none

           real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
           integer(kind=lik), intent(inout)   :: kernel_flops
           integer(kind=ik), intent(in)       :: nbw, max_blk_size
           ! NOTE(review): bcast_buffer is only read in this routine, but it is
           ! forwarded to kernels whose interfaces are not visible here, so no
           ! intent is asserted for it.
           complex(kind=ck8)                  :: bcast_buffer(nbw,max_blk_size)
           integer(kind=ik), intent(in)       :: a_off

           integer(kind=ik), intent(in)       :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
           integer(kind=ik), intent(in)       :: last_stripe_width
           complex(kind=ck8), intent(inout)   :: a(stripe_width,a_dim2,stripe_count)
#else
           integer(kind=ik), intent(in)       :: max_threads, l_nev, thread_width
           complex(kind=ck8), intent(inout)   :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
           integer(kind=ik), intent(in)       :: THIS_COMPLEX_ELPA_KERNEL

           ! Private variables in OMP regions (my_thread) should better be in the argument list!
           integer(kind=ik), intent(in)       :: off, ncols, istripe
#ifdef WITH_OPENMP
           integer(kind=ik), intent(in)       :: my_thread
           integer(kind=ik)                   :: noff   ! row offset of this thread in the last stripe
#endif
           integer(kind=ik)                   :: j      ! column loop index
           integer(kind=ik)                   :: nl     ! number of rows handed to the kernels
           real(kind=c_double)                :: ttt    ! MPI_WTIME always needs double

           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           !        Currently (on Sandy Bridge), single is faster than double
           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

           ! Scratch copy of two neighbouring bcast_buffer columns for the
           ! two-vector (2hv) kernels.
           complex(kind=ck8)                  :: w(nbw,2)

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
           call timer%start("compute_hh_trafo_complex_cpu_openmp_double")
#else
           call timer%start("compute_hh_trafo_complex_cpu_double")
#endif
#endif

           ! Number of rows to process: a full stripe, except for the last one.
#ifdef WITH_OPENMP
           if (istripe<stripe_count) then
             nl = stripe_width
           else
             noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
             nl = min(my_thread*thread_width-noff, l_nev-noff)
             if (nl<=0) then
               ! Nothing to do for this thread; leave the counters untouched.
#ifdef HAVE_DETAILED_TIMINGS
               call timer%stop("compute_hh_trafo_complex_cpu_openmp_double")
#endif
               return
             endif
           endif
#else
           nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif

           ! Start the kernel timer once, before the dispatch. Previously ttt was
           ! only set inside the individual kernel branches, so with run-time
           ! dispatch and a kernel id matching none of them the final
           ! mpi_wtime()-ttt used an undefined value.
           ttt = mpi_wtime()

           ! ---------------------------------------------------------------
           ! Two-vector (block2) kernels: columns are consumed pairwise from
           ! the top; a left-over first column (odd ncols, j==1 after the
           ! loop) is handled by the matching one-vector kernel.
           ! ---------------------------------------------------------------

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! SSE_BLOCK2
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ( (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
                (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX_BLOCK2 or AVX2_BLOCK2
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX512_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx512_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx512_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx512_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx512_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX512_BLOCK2
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */

           ! ---------------------------------------------------------------
           ! One-vector kernels: one call per column, from column ncols down
           ! to column 1.
           ! ---------------------------------------------------------------

#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 1, -1
#ifdef WITH_OPENMP

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_double(a(1,j+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               ! Without assumed-size dummies an explicit array section is passed.
               call single_hh_trafo_complex_generic_simple_double(a(1:stripe_width, &
                                                                  j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                           bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_double(a(1,j+off+a_off,istripe), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               ! Without assumed-size dummies an explicit array section is passed.
               call single_hh_trafo_complex_generic_simple_double(a(1:stripe_width, &
                                                                  j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                          bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! GENERIC_SIMPLE
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */

#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           ! The BGP and BGQ kernel ids are served by the generic kernel as well.
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_GENERIC .or. &
               THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_BGP .or. &
               THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 1, -1
#ifdef WITH_OPENMP

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_double(a(1,j+off+a_off,istripe,my_thread), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               ! Without assumed-size dummies an explicit array section is passed.
               call single_hh_trafo_complex_generic_double(a(1:stripe_width, &
                                                           j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_double(a(1,j+off+a_off,istripe), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               ! Without assumed-size dummies an explicit array section is passed.
               call single_hh_trafo_complex_generic_double(a(1:stripe_width, &
                                                           j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! GENERIC, BGP or BGQ
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_double(a(1,j+off+a_off,istripe,my_thread), &
                                            bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_double(a(1,j+off+a_off,istripe), &
                                            bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! SSE (assembly)
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */

           ! For the block1 kernels below, a build with one specific kernel
           ! compiles the loop only if the corresponding block2 kernel (which
           ! calls the same one-vector routine for the left-over column) is
           ! not compiled in.

#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* NO_SPECIFIC || (ONE_SPECIFIC && no SSE_BLOCK2) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! SSE_BLOCK1
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ((THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
               (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* NO_SPECIFIC || (ONE_SPECIFIC && no AVX_BLOCK2 && no AVX2_BLOCK2) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX_BLOCK1 or AVX2_BLOCK1
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL == COMPLEX_ELPA_KERNEL_AVX512_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_avx512_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_avx512_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* NO_SPECIFIC || (ONE_SPECIFIC && no AVX512_BLOCK2) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX512_BLOCK1
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */

           ! Book-keeping. In the OpenMP build only thread 1 updates the shared
           ! counters. The conversions use the kind of kernel_flops instead of
           ! a hard-coded kind number; the factor 4*4 is taken over unchanged
           ! (its derivation is not visible in this routine).
#ifdef WITH_OPENMP
           if (my_thread==1) then
#endif
             kernel_flops = kernel_flops + 4*4*int(nl,kind=lik)*int(ncols,kind=lik)*int(nbw,kind=lik)
             kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
           endif
#endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
           call timer%stop("compute_hh_trafo_complex_cpu_openmp_double")
#else
           call timer%stop("compute_hh_trafo_complex_cpu_double")
#endif
#endif

#ifdef WITH_OPENMP
         end subroutine compute_hh_trafo_complex_cpu_openmp_double
#else
         end subroutine compute_hh_trafo_complex_cpu_double
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
! single precision implementation , at the moment duplicated !!

#ifdef WITH_OPENMP
         subroutine compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev,         &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe,                                               &
                                                        my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
         subroutine compute_hh_trafo_complex_cpu_single       (a, stripe_width, a_dim2, stripe_count,                             &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe, last_stripe_width,                            &
                                                        THIS_COMPLEX_ELPA_KERNEL)
#endif
           use precision
           use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
           use timings
#endif
454

455
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
456
457
           use kernel_interfaces
#endif
458
459
460
461
462
           use iso_c_binding
           implicit none
           real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
           integer(kind=lik)            :: kernel_flops
           integer(kind=ik), intent(in) :: nbw, max_blk_size
463
           complex(kind=ck4)            :: bcast_buffer(nbw,max_blk_size)
464
465
466
467
468
           integer(kind=ik), intent(in) :: a_off

           integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
           integer(kind=ik), intent(in) :: last_stripe_width
469
           complex(kind=ck4)            :: a(stripe_width,a_dim2,stripe_count)
470
471
#else
           integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
472
           complex(kind=ck4)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#endif
           integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL

           ! Private variables in OMP regions (my_thread) should better be in the argument list!

           integer(kind=ik)             :: off, ncols, istripe, j, nl, jj
#ifdef WITH_OPENMP
           integer(kind=ik)             :: my_thread, noff
#endif
           real(kind=c_double)          :: ttt  ! MPI_WTIME always needs double

           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           !        Currently (on Sandy Bridge), single is faster than double
           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

488
           complex(kind=ck4)            :: w(nbw,2)
489
490
491

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
492
          call timer%start("compute_hh_trafo_complex_cpu_openmp_single")
493
#else
494
          call timer%start("compute_hh_trafo_complex_cpu_single")
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
#endif
#endif

#ifdef WITH_OPENMP
           if (istripe<stripe_count) then
             nl = stripe_width
           else
             noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
             nl = min(my_thread*thread_width-noff, l_nev-noff)
             if(nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
               call timer%stop("compute_hh_trafo_complex_cpu_openmp_single")
#endif
               return
             endif
           endif
#else
           nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_sse_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_sse_2hv_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_sse_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_sse_1hv_single(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
538
           endif !  (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
539
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
540
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
541

542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
               (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */

571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
!#if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
!           if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2) ) then
!#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
!               call double_hh_trafo_complex_avx512_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
!                                                       w, nbw, nl, stripe_width, nbw)
!#else
!               call double_hh_trafo_complex_avx512_2hv_single(a(1,j+off+a_off-1,istripe), &
!                                                       w, nbw, nl, stripe_width, nbw)
!#endif
!             enddo
!#ifdef WITH_OPENMP
!             if (j==1) call single_hh_trafo_complex_avx512_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
!                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
!             if (j==1) call single_hh_trafo_complex_avx512_1hv_single(a(1,1+off+a_off,istripe), &
!                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
!#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
!           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2)
!#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
!#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627


#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
            if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_single(a(1,j+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_single(a(1:stripe_width, &
                                                                  j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                           bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_single(a(1,j+off+a_off,istripe), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                          bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
628
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE

              call single_hh_trafo_complex_generic_single(a(1,j+off+a_off,istripe,my_thread), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
              call single_hh_trafo_complex_generic_single(a(1,j+off+a_off,istripe), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */

            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
663
          endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ )
664
665
666
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

667
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
668
669
670
671
672
673
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
674
675
              call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
676
#else
677
678
              call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
679
#endif
680
681
            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
682
          endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE)
683
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
684
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
685
686
687


!#if defined(WITH_AVX_SANDYBRIDGE)
688
!              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
689
690
691
!#endif

!#if defined(WITH_AMD_BULLDOZER)
692
!              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
693
694
!#endif

695
696
697
698
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
699
700

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
701
702
703
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
704
              call single_hh_trafo_complex_sse_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
705
706
707
708
709
710
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_sse_1hv_single(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
711
712
713

#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */

714
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
715
          endif !  (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
716
717
718
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

719
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
720
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
721
          if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
722
               (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
723
724
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
            ttt = mpi_wtime()
725
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
726
727
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
728
              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
729
730
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
731
              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe), &
732
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
733
#endif
734
            enddo
735
736
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

737
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
738
          endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1))
739
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
740
741
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
            ttt = mpi_wtime()
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_complex_avx512_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_avx512_1hv_single(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1))
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
763

764
#ifdef WITH_OPENMP
765
766
767
768
769
770
771
772
773
774
          if (my_thread==1) then
#endif
            kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
            kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
          endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
          call timer%stop("compute_hh_trafo_complex_cpu_openmp_single")
775
#else
776
777
778
          call timer%stop("compute_hh_trafo_complex_cpu_single")
#endif
#endif
779

780
781
782
783
#ifdef WITH_OPENMP
        end subroutine compute_hh_trafo_complex_cpu_openmp_single
#else
        end subroutine compute_hh_trafo_complex_cpu_single
784
785
#endif

786
787
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

788
end module