mod_compute_hh_trafo_complex.F90 37.2 KB
Newer Older
1
2
3
4
5
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
6
7
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8
9
10
11
12
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
14
15
16
17
18
19
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
20
!    http://elpa.mpcdf.mpg.de/
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF

44
45
module compute_hh_trafo_complex
#include "config-f90.h"
46
  use elpa_mpi
47
48
  implicit none

Andreas Marek's avatar
Andreas Marek committed
49
  private
50
#ifdef WITH_OPENMP
51
  public compute_hh_trafo_complex_cpu_openmp_double
52
#else
53
  public compute_hh_trafo_complex_cpu_double
54
#endif
Andreas Marek's avatar
Andreas Marek committed
55
  public compute_hh_trafo_complex_gpu_double
56
57
58
59
60
61
62
#ifdef WANT_SINGLE_PRECISION_COMPLEX

#ifdef WITH_OPENMP
  public compute_hh_trafo_complex_cpu_openmp_single
#else
  public compute_hh_trafo_complex_cpu_single
#endif
Andreas Marek's avatar
Andreas Marek committed
63
  public compute_hh_trafo_complex_gpu_single
64
#endif
65
66
67
68

  contains

#ifdef WITH_OPENMP
         subroutine compute_hh_trafo_complex_cpu_openmp_double(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev,         &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe,                                               &
                                                        my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
         subroutine compute_hh_trafo_complex_cpu_double       (a, stripe_width, a_dim2, stripe_count,                             &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe, last_stripe_width,                            &
                                                        THIS_COMPLEX_ELPA_KERNEL)
#endif
           !> Dispatcher for the double-precision complex "hh_trafo" CPU kernels
           !> (presumably Householder transformations; confirm against the kernel sources).
           !>
           !> For one stripe of the work array a, the columns off+1 .. off+ncols of
           !> bcast_buffer are handed to the selected kernel in descending column order.
           !> The "block2" kernels consume two columns per call (copied into w) and fall
           !> back to the matching single-column kernel for a leftover first column.
           !>
           !> Which kernel runs is decided at compile time by the WITH_COMPLEX_*_KERNEL
           !> macros; when WITH_NO_SPECIFIC_COMPLEX_KERNEL is defined the choice is made
           !> at run time by comparing THIS_COMPLEX_ELPA_KERNEL with the
           !> COMPLEX_ELPA_KERNEL_* constants.
           !>
           !> Arguments:
           !>   a                  work array; indexed (row, column, stripe) and, in the
           !>                      OpenMP build, additionally by thread
           !>   stripe_width       first extent of a; also the number of rows processed (nl)
           !>                      for every stripe except the last one
           !>   a_dim2             second extent of a
           !>   stripe_count       third extent of a
           !>   max_threads        (OpenMP build) fourth extent of a
           !>   l_nev,thread_width (OpenMP build) used to compute nl for the last stripe
           !>   a_off              column offset added when indexing a
           !>   nbw                first extent of bcast_buffer; forwarded to the kernels
           !>   max_blk_size       second extent of bcast_buffer
           !>   bcast_buffer       column j+off is passed to the kernel in iteration j
           !>   kernel_flops       incremented by 4*4*nl*ncols*nbw
           !>   kernel_time        incremented by the elapsed mpi_wtime() of the dispatch
           !>   off, ncols         column offset and number of columns to process
           !>   istripe            index of the stripe to process
           !>   last_stripe_width  (non-OpenMP build) nl for the last stripe
           !>   my_thread          (OpenMP build) thread index; only thread 1 updates the
           !>                      flop and time counters
           !>   THIS_COMPLEX_ELPA_KERNEL  run-time kernel selector
           use precision
           use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
           use timings
#endif
           use iso_c_binding

#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
           use kernel_interfaces
#endif
           implicit none
           real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
           integer(kind=lik), intent(inout)   :: kernel_flops
           integer(kind=ik), intent(in)       :: nbw, max_blk_size
           complex(kind=ck8)                  :: bcast_buffer(nbw,max_blk_size)
           integer(kind=ik), intent(in)       :: a_off

           integer(kind=ik), intent(in)       :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
           integer(kind=ik), intent(in)       :: last_stripe_width
           complex(kind=ck8)                  :: a(stripe_width,a_dim2,stripe_count)
#else
           integer(kind=ik), intent(in)       :: max_threads, l_nev, thread_width
           complex(kind=ck8)                  :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
           integer(kind=ik), intent(in)       :: THIS_COMPLEX_ELPA_KERNEL

           ! Private variables in OMP regions (my_thread) should better be in the argument list!

           integer(kind=ik), intent(in)       :: off, ncols, istripe
#ifdef WITH_OPENMP
           integer(kind=ik), intent(in)       :: my_thread
           integer(kind=ik)                   :: noff
#endif
           ! j: column loop index, nl: number of rows handed to the kernels
           integer(kind=ik)                   :: j, nl
           real(kind=c_double)                :: ttt  ! MPI_WTIME always needs double

           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           !        Currently (on Sandy Bridge), single is faster than double
           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

           ! Scratch space holding the two vectors passed to the block2 kernels
           complex(kind=ck8)                  :: w(nbw,2)

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
           call timer%start("compute_hh_trafo_complex_cpu_openmp_double")
#else
           call timer%start("compute_hh_trafo_complex_cpu_double")
#endif
#endif

           ! Determine how many rows of the stripe have to be processed
#ifdef WITH_OPENMP
           if (istripe<stripe_count) then
             nl = stripe_width
           else
             noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
             nl = min(my_thread*thread_width-noff, l_nev-noff)
             if (nl<=0) then
               ! Nothing to do for this thread in the last stripe
#ifdef HAVE_DETAILED_TIMINGS
               call timer%stop("compute_hh_trafo_complex_cpu_openmp_double")
#endif
               return
             endif
           endif
#else
           nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif

           ! Start the clock before dispatching, so that ttt is defined even if the
           ! run-time selector matches none of the kernels compiled into this build.
           ! Every kernel branch below restarts the clock right before its loop.
           ttt = mpi_wtime()

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             ttt = mpi_wtime()
             ! Two columns per kernel call
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_sse_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
             ! After the loop j is 1 exactly when ncols is odd: process the leftover column
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_sse_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
                (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             ttt = mpi_wtime()
             ! Two columns per kernel call
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx_avx2_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
             ! Leftover first column when ncols is odd
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX_BLOCK2 .or. AVX2_BLOCK2
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */


#if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2)) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

             ttt = mpi_wtime()
             ! Two columns per kernel call
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx512_2hv_double(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx512_2hv_double(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
             ! Leftover first column when ncols is odd
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx512_1hv_double(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx512_1hv_double(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2))
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_double(a(1,j+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_double(a(1:stripe_width, &
                                                                  j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                           bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple_double(a(1,j+off+a_off,istripe), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_double(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                          bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE

               call single_hh_trafo_complex_generic_double(a(1,j+off+a_off,istripe,my_thread), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_double(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_double(a(1,j+off+a_off,istripe), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_double(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */

             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! GENERIC .or. BGP .or. BGQ
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_double(a(1,j+off+a_off,istripe,my_thread), &
                                            bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_double(a(1,j+off+a_off,istripe), &
                                            bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE)
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

!#if defined(WITH_AMD_BULLDOZER)
!              call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_sse_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
               (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_avx_avx2_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! AVX_BLOCK1 .or. AVX2_BLOCK1
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
             ttt = mpi_wtime()
             ! One column per kernel call
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
               call single_hh_trafo_complex_avx512_1hv_double(a(1,j+off+a_off,istripe,my_thread), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_avx512_1hv_double(a(1,j+off+a_off,istripe), &
                                                        bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
             enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1))
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */

           ! Book-keeping: in the OpenMP build only thread 1 updates the shared counters
#ifdef WITH_OPENMP
           if (my_thread==1) then
#endif
             ! Convert with the kind of the accumulator instead of a hard-coded kind number
             kernel_flops = kernel_flops + 4*4*int(nl,kind=lik)*int(ncols,kind=lik)*int(nbw,kind=lik)
             kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
           endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
           call timer%stop("compute_hh_trafo_complex_cpu_openmp_double")
#else
           call timer%stop("compute_hh_trafo_complex_cpu_double")
#endif
#endif

#ifdef WITH_OPENMP
         end subroutine compute_hh_trafo_complex_cpu_openmp_double
#else
         end subroutine compute_hh_trafo_complex_cpu_double
#endif

#ifdef WANT_SINGLE_PRECISION_COMPLEX
! single precision implementation , at the moment duplicated !!

#ifdef WITH_OPENMP
         subroutine compute_hh_trafo_complex_cpu_openmp_single(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev,         &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe,                                               &
                                                        my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
         subroutine compute_hh_trafo_complex_cpu_single       (a, stripe_width, a_dim2, stripe_count,                             &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe, last_stripe_width,                            &
                                                        THIS_COMPLEX_ELPA_KERNEL)
#endif
           use precision
           use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
           use timings
#endif
457

458
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
459
460
           use kernel_interfaces
#endif
461
462
463
464
465
           use iso_c_binding
           implicit none
           real(kind=c_double), intent(inout) :: kernel_time ! MPI_WTIME always needs double
           integer(kind=lik)            :: kernel_flops
           integer(kind=ik), intent(in) :: nbw, max_blk_size
466
           complex(kind=ck4)            :: bcast_buffer(nbw,max_blk_size)
467
468
469
470
471
           integer(kind=ik), intent(in) :: a_off

           integer(kind=ik), intent(in) :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
           integer(kind=ik), intent(in) :: last_stripe_width
472
           complex(kind=ck4)            :: a(stripe_width,a_dim2,stripe_count)
473
474
#else
           integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
475
           complex(kind=ck4)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
#endif
           integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL

           ! Private variables in OMP regions (my_thread) should better be in the argument list!

           integer(kind=ik)             :: off, ncols, istripe, j, nl, jj
#ifdef WITH_OPENMP
           integer(kind=ik)             :: my_thread, noff
#endif
           real(kind=c_double)          :: ttt  ! MPI_WTIME always needs double

           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           !        Currently (on Sandy Bridge), single is faster than double
           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

491
           complex(kind=ck4)            :: w(nbw,2)
492
493
494

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
495
          call timer%start("compute_hh_trafo_complex_cpu_openmp_single")
496
#else
497
          call timer%start("compute_hh_trafo_complex_cpu_single")
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
#endif
#endif

#ifdef WITH_OPENMP
           if (istripe<stripe_count) then
             nl = stripe_width
           else
             noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
             nl = min(my_thread*thread_width-noff, l_nev-noff)
             if(nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
               call timer%stop("compute_hh_trafo_complex_cpu_openmp_single")
#endif
               return
             endif
           endif
#else
           nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_sse_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_sse_2hv_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_sse_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_sse_1hv_single(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
541
           endif !  (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2)
542
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
543
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
544

545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
               (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2)) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx_avx2_2hv_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */

574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
#if defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2) ) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx512_2hv_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx512_2hv_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx512_1hv_single(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx512_1hv_single(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK2)
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK2_KERNEL */
601
602
603
604
605
606
607
608
609


#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
            if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
610
#ifdef USE_ASSUMED_SIZE
611
612
613
614
615
616
617
618
619
               call single_hh_trafo_complex_generic_simple_single(a(1,j+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_single(a(1:stripe_width, &
                                                                  j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                           bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
620
#ifdef USE_ASSUMED_SIZE
621
622
623
624
625
626
627
628
629
630
               call single_hh_trafo_complex_generic_simple_single(a(1,j+off+a_off,istripe), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                          bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
631
           endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE)
632
633
634
635
636
637
638
639
640
641
642
643
644
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
645
#ifdef USE_ASSUMED_SIZE
646
647
648
649
650
651
652
653
654

              call single_hh_trafo_complex_generic_single(a(1,j+off+a_off,istripe,my_thread), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
655
#ifdef USE_ASSUMED_SIZE
656
657
658
659
660
661
662
663
664
665
              call single_hh_trafo_complex_generic_single(a(1,j+off+a_off,istripe), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic_single(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */

            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
666
          endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ )
667
668
669
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

670
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
671
672
673
674
675
676
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
677
678
              call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe,my_thread), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
679
#else
680
681
              call single_hh_trafo_complex_single(a(1,j+off+a_off,istripe), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
682
#endif
683
684
            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
685
          endif ! (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE)
686
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
687
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
688
689
690


!#if defined(WITH_AVX_SANDYBRIDGE)
691
!              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
692
693
694
!#endif

!#if defined(WITH_AMD_BULLDOZER)
695
!              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
696
697
!#endif

698
699
700
701
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
702
703

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
704
705
706
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
707
              call single_hh_trafo_complex_sse_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
708
709
710
711
712
713
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_sse_1hv_single(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
714
715
716

#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */

717
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
718
          endif !  (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1)
719
720
721
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

722
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
723
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
724
          if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
725
               (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
726
727
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
            ttt = mpi_wtime()
728
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
729
730
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
731
              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
732
733
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
734
              call single_hh_trafo_complex_avx_avx2_1hv_single(a(1,j+off+a_off,istripe), &
735
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
736
#endif
737
            enddo
738
739
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

740
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
741
          endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1))
742
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
743
744
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
            ttt = mpi_wtime()
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_complex_avx512_1hv_single(a(1,j+off+a_off,istripe,my_thread), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_avx512_1hv_single(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif ! ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX512_BLOCK1))
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL */
766

767
#ifdef WITH_OPENMP
768
769
770
771
772
773
774
775
776
777
          if (my_thread==1) then
#endif
            kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
            kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
          endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
          call timer%stop("compute_hh_trafo_complex_cpu_openmp_single")
778
#else
779
780
781
          call timer%stop("compute_hh_trafo_complex_cpu_single")
#endif
#endif
782

783
784
785
786
#ifdef WITH_OPENMP
        end subroutine compute_hh_trafo_complex_cpu_openmp_single
#else
        end subroutine compute_hh_trafo_complex_cpu_single
787
788
#endif

789
790
#endif /* WANT_SINGLE_PRECISION_COMPLEX */

Andreas Marek's avatar
Andreas Marek committed
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811

  ! complex double precision
  ! The GPU variant is pulled in by textual inclusion of the template file
  ! compute_hh_trafo_complex_gpu.X90. The two switches defined below are
  ! active only for the duration of the includes and are undefined again
  ! right after, so they do not leak into the code that follows.
  ! NOTE(review): presumably precision_macros.h translates these switches
  ! into the precision-specific names and kinds used by the template;
  ! confirm against that header.
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION

 ! complex single precision
 ! Same template inclusion as above, this time with the single-precision
 ! switch selected. The whole instantiation is compiled only when
 ! WANT_SINGLE_PRECISION_COMPLEX is defined at build time.
#if defined(WANT_SINGLE_PRECISION_COMPLEX)
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "precision_macros.h"
#include "compute_hh_trafo_complex_gpu.X90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
#endif



812
end module