mod_compute_hh_trafo_real.F90 67.4 KB
Newer Older
1 2 3 4 5
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
6 7
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
8 9 10 11 12
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
13
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
14 15 16 17 18 19
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
20
!    http://elpa.mpcdf.mpg.de/
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF

44 45
module compute_hh_trafo_real
#include "config-f90.h"
46
  use elpa_mpi
47 48 49
  implicit none

#ifdef WITH_OPENMP
50 51 52 53 54 55 56 57
  public compute_hh_trafo_real_cpu_openmp_double
#else
  public compute_hh_trafo_real_cpu_double
#endif

#ifdef WANT_SINGLE_PRECISION_REAL
#ifdef WITH_OPENMP
  public compute_hh_trafo_real_cpu_openmp_single
58
#else
59 60 61 62
  public compute_hh_trafo_real_cpu_single
#endif


63 64 65 66 67
#endif

  contains

#ifdef WITH_OPENMP
68
       subroutine compute_hh_trafo_real_cpu_openmp_double(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
69
                                                   a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev,    &
70
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,              &
71
                                                   my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
72
#else
73
       subroutine compute_hh_trafo_real_cpu_double       (a, a_dev, stripe_width, a_dim2, stripe_count,                       &
74 75 76
                                                   a_off, nbw, max_blk_size, bcast_buffer,  bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,                &
                                                   last_stripe_width, THIS_REAL_ELPA_KERNEL)
77 78 79 80
#endif


         use precision
81
         use iso_c_binding
82 83
         use elpa2_utilities
         use single_hh_trafo_real
84 85 86
         use cuda_c_kernel
         use cuda_functions

87
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
88
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
89 90
#endif

91
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
92
        use real_generic_kernel !, only : double_hh_trafo_generic
93
#endif
94 95

#if defined(WITH_REAL_BGP_KERNEL)
96
         use real_bgp_kernel !, only : double_hh_trafo_bgp
97 98 99
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
100
         use real_bgq_kernel !, only : double_hh_trafo_bgq
101 102 103 104
#endif
#ifdef HAVE_DETAILED_TIMINGS
         use timings
#endif
105

106
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY) || defined(HAVE_AVX512)
107 108
         use kernel_interfaces
#endif
109
         implicit none
110
         real(kind=c_double), intent(inout) :: kernel_time  ! MPI_WTIME always needs double
111 112
         integer(kind=lik)            :: kernel_flops
         integer(kind=ik), intent(in) :: nbw, max_blk_size
113
         real(kind=rk8)                :: bcast_buffer(nbw,max_blk_size)
114 115 116 117 118 119
         integer(kind=ik), intent(in) :: a_off

         integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count

#ifndef WITH_OPENMP
         integer(kind=ik), intent(in) :: last_stripe_width
120
!         real(kind=rk8)                :: a(stripe_width,a_dim2,stripe_count)
121
         real(kind=rk8), pointer      :: a(:,:,:)
122
#else
123
         integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
124
!         real(kind=rk8)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
125
         real(kind=rk8), pointer      :: a(:,:,:,:)
126 127 128
#endif
         integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL

129 130 131 132 133
         integer(kind=c_intptr_t)     :: a_dev
         integer(kind=c_intptr_t)     :: bcast_buffer_dev
         integer(kind=c_size_t)       :: dev_offset
         integer(kind=c_intptr_t)     :: hh_dot_dev
         integer(kind=c_intptr_t)     :: hh_tau_dev
134 135 136 137 138 139
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
         integer(kind=ik)             :: off, ncols, istripe
#ifdef WITH_OPENMP
         integer(kind=ik)             :: my_thread, noff
#endif
         integer(kind=ik)             :: j, nl, jj, jjj
140
         real(kind=rk8)                :: w(nbw,6)
141
         real(kind=c_double)          :: ttt ! MPI_WTIME always needs double
142

143 144 145 146 147
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
           if (ncols < 1) return
         endif

148 149
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
150
         call timer%start("compute_hh_trafo_real_cpu_openmp_double")
151
#else
152
         call timer%start("compute_hh_trafo_real_cpu_double")
153 154 155 156 157 158 159
#endif
#endif
         ttt = mpi_wtime()

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
160 161 162 163
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           print *,"compute_hh_trafo_real GPU OPENMP: not yet implemented"
           stop
         endif
164 165 166 167 168 169 170 171 172

         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
173
             call timer%stop("compute_hh_trafo_real_cpu_openmp_double")
174
#else
175
             call timer%stop("compute_hh_trafo_real_cpu_double")
176 177 178 179 180
#endif
#endif
             return
           endif
         endif
181 182 183
#endif /* not WITH_OPENMP */

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
184 185
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_double_real_datatype
           call launch_compute_hh_trafo_c_kernel_real_double(a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, &
186 187
                                                      hh_tau_dev, nl, nbw, stripe_width, off, ncols)
         else ! not CUDA kernel
188 189 190

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
191
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 .or. &
192
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
193
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
194 195 196 197 198 199 200
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC    .or. &
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or.        &
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or.        &
             THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

201
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
202 203
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
204
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
205 206
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

207 208 209
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
210 211

#ifdef WITH_OPENMP
212
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
213
                 call double_hh_trafo_generic_double(a(1,j+off+a_off-1,istripe,my_thread), w, &
214 215
                                            nbw, nl, stripe_width, nbw)

216
#else
217
                 call double_hh_trafo_generic_double(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, &
218
                                              istripe,my_thread), w(1:nbw,1:6), &
219
                                              nbw, nl, stripe_width, nbw)
220 221 222 223 224
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
225
                 call double_hh_trafo_generic_double(a(1,j+off+a_off-1,istripe),w, &
226 227 228
                                            nbw, nl, stripe_width, nbw)

#else
229
                 call double_hh_trafo_generic_double(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), &
230
                                            nbw, nl, stripe_width, nbw)
231
#endif
232 233
#endif /* WITH_OPENMP */

234
               enddo
235 236

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
237
             endif
238 239 240 241 242 243
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_KERNEL */


#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
244
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) then
245
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
246 247 248
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
249
#ifdef WITH_OPENMP
250
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
251
                 call double_hh_trafo_generic_simple_double(a(1,j+off+a_off-1,istripe,my_thread), &
252
                                                       w, nbw, nl, stripe_width, nbw)
253
#else
254
                 call double_hh_trafo_generic_simple_double(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), &
255 256 257 258 259 260
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
261
                 call double_hh_trafo_generic_simple_double(a(1,j+off+a_off-1,istripe), &
262
                                                       w, nbw, nl, stripe_width, nbw)
263
#else
264
                 call double_hh_trafo_generic_simple_double(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), &
265 266
                                                     w, nbw, nl, stripe_width, nbw)

267
#endif
268 269 270

#endif /* WITH_OPENMP */

271
               enddo
272
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
273
             endif
274 275 276 277
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */


278
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
279
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
280
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
281
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
282 283 284
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
285
#ifdef WITH_OPENMP
286
                 call double_hh_trafo_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, &
287 288
                                      stripe_width, nbw)
#else
289
                 call double_hh_trafo_double(c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, &
290 291
                                      stripe_width, nbw)
#endif
292
               enddo
293
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
294
             endif
295
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
296
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */
297

298 299 300 301
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
302 303

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
304 305 306 307
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
308
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
309 310
                                                       w, nbw, nl, stripe_width, nbw)
#else
311
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,j+off+a_off-1,istripe)), &
312 313 314
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
315 316
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */

317 318 319 320
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */
321

322
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
323
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
324

325 326
           if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) .or. &
               (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2))  then
327
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
328 329

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
330 331 332
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
333
#ifdef WITH_OPENMP
334

335
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
336 337
                                                       w, nbw, nl, stripe_width, nbw)
#else
338
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,j+off+a_off-1,istripe)), &
339 340 341
                                                       w, nbw, nl, stripe_width, nbw)
#endif
               enddo
342 343
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */

344 345 346
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
347
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL || WITH_REAL_AVX2_BLOCK2_KERNEL */
348

349 350 351 352 353 354 355

#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)

           if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

356
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,j+off+a_off-1,istripe)), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
               enddo
#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) ... */

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK2_KERNEL */


377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418
#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_bgp_double(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
                                          stripe_width, nbw)
#else
                 call double_hh_trafo_bgp_double(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
                                          stripe_width, nbw)
#endif
               enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGP_KERNEL */

#if defined(WITH_REAL_BGQ_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_bgq_double(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
                                          stripe_width, nbw)
#else
                 call double_hh_trafo_bgq_double(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
                                          stripe_width, nbw)
#endif
               enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGQ_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
419
!              call double_hh_trafo_real_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437
!#endif

#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width, &
                                      1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#else
             if (j==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe),           &
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#endif


#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif !
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

438 439 440 441
#if defined(WITH_REAL_SSE_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
442 443

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL))
444 445 446 447 448 449 450
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
451
               call quad_hh_trafo_real_sse_4hv_double(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, &
452 453
                                                  nbw, nl, stripe_width, nbw)
#else
454
               call quad_hh_trafo_real_sse_4hv_double(c_loc(a(1,j+off+a_off-3,istripe)), w, &
455 456 457 458 459 460 461
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
462
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), &
463 464
                                                    w, nbw, nl, stripe_width, nbw)
#else
465
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe)), &
466 467 468 469
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
470 471
             if (jj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                     istripe,my_thread), &
472 473 474 475 476
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
477 478 479

#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL)) */

480 481 482 483 484 485
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */


486
#if defined(WITH_REAL_AVX_BLOCK4_KERNEL) || defined(WITH_REAL_AVX2_BLOCK4_KERNEL)
487
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
488
           if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) .or.  &
489
               (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK4)) then
490
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
491 492

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL))
493 494 495 496 497 498 499
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
500
               call quad_hh_trafo_real_avx_avx2_4hv_double(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, &
501 502
                                                  nbw, nl, stripe_width, nbw)
#else
503
               call quad_hh_trafo_real_avx_avx2_4hv_double(c_loc(a(1,j+off+a_off-3,istripe)), w, &
504 505 506 507 508 509 510
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
511
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), &
512 513
                                                    w, nbw, nl, stripe_width, nbw)
#else
514
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe)), &
515 516 517 518
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
519 520
             if (jj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                      istripe,my_thread), &
521 522 523 524 525
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
526 527 528

#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL)) */

529 530 531
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL || WITH_REAL_AVX2_BLOCK4_KERNEL */

#if defined(WITH_REAL_AVX512_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK4) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL))
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
#ifdef WITH_OPENMP
               call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,j+off+a_off-3,istripe,my_thread)), w, &
                                                  nbw, nl, stripe_width, nbw)
#else
               call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,j+off+a_off-3,istripe)), w, &
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe,my_thread)), &
                                                    w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jj+off+a_off-1,istripe)), &
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (jj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                      istripe,my_thread), &
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif

#endif /* defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) ) */

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK4_KERNEL */

581 582 583 584 585 586 587 588 589 590 591 592 593
#if defined(WITH_REAL_SSE_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK6) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
594
               call hexa_hh_trafo_real_sse_6hv_double(c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, &
595 596
                                                  nbw, nl, stripe_width, nbw)
#else
597
               call hexa_hh_trafo_real_sse_6hv_double(c_loc(a(1,j+off+a_off-5,istripe)), w, &
598 599 600 601 602 603 604 605 606
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
607
               call quad_hh_trafo_real_sse_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, &
608 609
                                                  nbw, nl, stripe_width, nbw)
#else
610
               call quad_hh_trafo_real_sse_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe)), w, &
611 612 613 614 615 616 617
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
618
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), &
619 620
                                                    w, nbw, nl, stripe_width, nbw)
#else
621
               call double_hh_trafo_real_sse_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe)), &
622 623 624 625
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
626 627
             if (jjj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                        istripe,my_thread), &
628 629 630 631 632 633 634 635 636 637
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jjj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK4_KERNEL */

638

639
#if defined(WITH_REAL_AVX_BLOCK6_KERNEL) || defined(WITH_REAL_AVX2_BLOCK6_KERNEL)
640
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
641 642
           if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) .or. &
               (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK6)) then
643 644 645 646 647 648 649 650 651 652
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
653
               call hexa_hh_trafo_real_avx_avx2_6hv_double(c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, &
654 655
                                                  nbw, nl, stripe_width, nbw)
#else
656
               call hexa_hh_trafo_real_avx_avx2_6hv_double(c_loc(a(1,j+off+a_off-5,istripe)), w, &
657 658 659 660 661 662 663 664 665
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
666
               call quad_hh_trafo_real_avx_avx2_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, &
667 668
                                                  nbw, nl, stripe_width, nbw)
#else
669
               call quad_hh_trafo_real_avx_avx2_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe)), w, &
670 671 672 673 674 675 676
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
677
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), &
678 679
                                                    w, nbw, nl, stripe_width, nbw)
#else
680
               call double_hh_trafo_real_avx_avx2_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe)), &
681 682 683 684
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
685 686
             if (jjj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                       istripe,my_thread), &
687 688 689 690 691 692 693 694
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jjj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
695
#endif /* WITH_REAL_AVX_BLOCK6_KERNEL || WITH_REAL_AVX2_BLOCK6_KERNEL */
696

697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
#if defined(WITH_REAL_AVX512_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if ((THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK6)) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
#ifdef WITH_OPENMP
               call hexa_hh_trafo_real_avx512_6hv_double(c_loc(a(1,j+off+a_off-5,istripe,my_thread)), w, &
                                                  nbw, nl, stripe_width, nbw)
#else
               call hexa_hh_trafo_real_avx512_6hv_double(c_loc(a(1,j+off+a_off-5,istripe)), w, &
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
#ifdef WITH_OPENMP
               call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe,my_thread)), w, &
                                                  nbw, nl, stripe_width, nbw)
#else
               call quad_hh_trafo_real_avx512_4hv_double(c_loc(a(1,jj+off+a_off-3,istripe)), w, &
                                                  nbw, nl, stripe_width, nbw)
#endif
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe,my_thread)), &
                                                    w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_real_avx512_2hv_double(c_loc(a(1,jjj+off+a_off-1,istripe)), &
                                                    w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#ifdef WITH_OPENMP
             if (jjj==1) call single_hh_trafo_real_cpu_openmp_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1, &
                                                                       istripe,my_thread), &
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
             if (jjj==1) call single_hh_trafo_real_cpu_double(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX512_BLOCK6_KERNEL */

754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
         endif ! GPU_KERNEL

#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
           kernel_time = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
         endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%stop("compute_hh_trafo_real_cpu_openmp_double")
#else
         call timer%stop("compute_hh_trafo_real_cpu_double")
#endif
#endif

#ifdef WITH_OPENMP
       end subroutine compute_hh_trafo_real_cpu_openmp_double
#else
       end subroutine compute_hh_trafo_real_cpu_double
#endif

#ifdef WANT_SINGLE_PRECISION_REAL
! SINGLE PRECISION IMPLEMENTATION AT THE MOMENT DUPLICATED !!!

#ifdef WITH_OPENMP
       subroutine compute_hh_trafo_real_cpu_openmp_single(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev, &
                                                   a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev,    &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,              &
                                                   my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#else
       subroutine compute_hh_trafo_real_cpu_single       (a, a_dev, stripe_width, a_dim2, stripe_count,                      &
                                                   a_off, nbw, max_blk_size, bcast_buffer,  bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,                &
                                                   last_stripe_width, THIS_REAL_ELPA_KERNEL)
#endif


         use precision
         use iso_c_binding
         use elpa2_utilities
         use single_hh_trafo_real
         use cuda_c_kernel
         use cuda_functions

#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
        use real_generic_kernel !, only : double_hh_trafo_generic
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
         use timings
#endif
819

820
#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(AVX512) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
821 822
         use kernel_interfaces
#endif
823 824 825 826 827 828 829 830 831 832 833 834
         implicit none
         real(kind=c_double), intent(inout) :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)            :: kernel_flops
         integer(kind=ik), intent(in) :: nbw, max_blk_size
         real(kind=rk4)                :: bcast_buffer(nbw,max_blk_size)
         integer(kind=ik), intent(in) :: a_off

         integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count

#ifndef WITH_OPENMP
         integer(kind=ik), intent(in) :: last_stripe_width
!         real(kind=rk4)                :: a(stripe_width,a_dim2,stripe_count)
835
         real(kind=rk4), pointer      :: a(:,:,:)
836 837 838
#else
         integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
!         real(kind=rk8)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
839
         real(kind=rk4), pointer      :: a(:,:,:,:)
840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904
#endif
         integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL

         integer(kind=c_intptr_t)     :: a_dev
         integer(kind=c_intptr_t)     :: bcast_buffer_dev
         integer(kind=c_size_t)       :: dev_offset
         integer(kind=c_intptr_t)     :: hh_dot_dev
         integer(kind=c_intptr_t)     :: hh_tau_dev
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
         integer(kind=ik)             :: off, ncols, istripe
#ifdef WITH_OPENMP
         integer(kind=ik)             :: my_thread, noff
#endif
         integer(kind=ik)             :: j, nl, jj, jjj
         real(kind=rk4)                :: w(nbw,6)
         real(kind=c_double)          :: ttt ! MPI_WTIME always needs double

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
           if (ncols < 1) return
         endif

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%start("compute_hh_trafo_real_cpu_openmp_single")
#else
         call timer%start("compute_hh_trafo_real_cpu_single")
#endif
#endif
         ttt = mpi_wtime()

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           print *,"compute_hh_trafo_real GPU OPENMP: not yet implemented"
           stop
         endif

         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
             call timer%stop("compute_hh_trafo_real_cpu_openmp_single")
#else
             call timer%stop("compute_hh_trafo_real_cpu_single")
#endif
#endif
             return
           endif
         endif
#endif /* not WITH_OPENMP */

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_single_real_datatype
           call launch_compute_hh_trafo_c_kernel_real_single(a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, &
                                                      hh_tau_dev, nl, nbw, stripe_width, off, ncols)
         else ! not CUDA kernel

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
905 906
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX2_BLOCK2 .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX512_BLOCK2 .or. &
907
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2 .or. &
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC    .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
                 call double_hh_trafo_generic_single(a(1,j+off+a_off-1,istripe,my_thread), w, &
                                            nbw, nl, stripe_width, nbw)

#else
                 call double_hh_trafo_generic_single(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, &
                                              istripe,my_thread), w(1:nbw,1:6), &
                                              nbw, nl, stripe_width, nbw)
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
                 call double_hh_trafo_generic_single(a(1,j+off+a_off-1,istripe),w, &
                                            nbw, nl, stripe_width, nbw)

#else
                 call double_hh_trafo_generic_single(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), &
                                            nbw, nl, stripe_width, nbw)
#endif
#endif /* WITH_OPENMP */

               enddo

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_KERNEL */


#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
                 call double_hh_trafo_generic_simple_single(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
                 call double_hh_trafo_generic_simple_single(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), &
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
                 call double_hh_trafo_generic_simple_single(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
                 call double_hh_trafo_generic_simple_single(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), &
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#endif /* WITH_OPENMP */

               enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */


992
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
993 994 995 996 997 998 999
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
1000
                 call double_hh_trafo_single(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, &
1001
                                      stripe_width, nbw)
1002
#else
1003
                 call double_hh_trafo_single(c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, &
1004
                                      stripe_width, nbw)
1005 1006 1007 1008 1009
#endif
               enddo
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             endif
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
1010
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */
1011

1012 1013 1014 1015
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE_BLOCK2) then
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
1016

1017
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL) || (defined(WITH_ONE_SPECIFIC_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
1018 1019 1020 1021
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
1022
                 call double_hh_trafo_real_sse_2hv_single(c_loc(a(1,j+off+a_off-1,istripe,my_thread)), &
1023 1024
                                                       w, nbw, nl, stripe_width, nbw)
#else
1025
                 call double_hh_trafo_real_sse_2hv_single(c_loc(a(1,j+off+a_off-1,istripe)), &
1026 1027 1028
                                                       w, nbw, nl, stripe_width, nbw)
#endif
               enddo