compute_hh_trafo.F90 94.4 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

       subroutine compute_hh_trafo_&
       &MATH_DATATYPE&
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
49
       &_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
       &_&
52 53
#endif
       &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
       (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
55
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
56
       l_nev, &
57
#endif
Andreas Marek's avatar
Andreas Marek committed
58
       a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
59
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
60
       hh_dot_dev, &
61
#endif
Andreas Marek's avatar
Andreas Marek committed
62
       hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
63 64 65 66 67
#ifdef WITH_OPENMP
       my_thread, thread_width, &
#else
       last_stripe_width, &
#endif
68
       kernel)
69 70

         use precision
71
         use elpa_abstract_impl
72 73
         use iso_c_binding
#if REALCASE == 1
74

75
         use single_hh_trafo_real
76 77 78 79
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

Andreas Marek's avatar
Andreas Marek committed
80 81 82
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
Andreas Marek's avatar
Andreas Marek committed
83

84 85 86
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
!         use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
87

88
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
89
         use real_generic_kernel !, only : double_hh_trafo_generic
90 91 92 93 94 95 96 97 98
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
99 100 101 102 103

#endif /* REALCASE */

#if COMPLEXCASE == 1

104
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
105 106
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
107
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
108 109 110 111 112 113 114 115
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif

#endif /* COMPLEXCASE */

         use cuda_c_kernel
         use cuda_functions

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
116
         use elpa_generated_fortran_interfaces
117

118
         implicit none
Andreas Marek's avatar
Andreas Marek committed
119
         class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
120
         logical, intent(in)                        :: useGPU, wantDebug
Andreas Marek's avatar
Andreas Marek committed
121 122 123
         real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)                          :: kernel_flops
         integer(kind=ik), intent(in)               :: nbw, max_blk_size
124
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
125
         real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
126 127
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
128
         complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
129
#endif
Andreas Marek's avatar
Andreas Marek committed
130
         integer(kind=ik), intent(in)               :: a_off
131

Andreas Marek's avatar
Andreas Marek committed
132
         integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
133

Andreas Marek's avatar
Andreas Marek committed
134
         integer(kind=ik), intent(in)               :: max_threads
135
#ifndef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
136
         integer(kind=ik), intent(in)               :: last_stripe_width
137
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
138 139
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
140 141
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
142 143
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
144 145 146
#endif

#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
147
         integer(kind=ik), intent(in)               :: l_nev, thread_width
148
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
149 150
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
151
 
152
#endif
153
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
154 155
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
156 157 158 159
#endif

#endif /* WITH_OPENMP */

Andreas Marek's avatar
Andreas Marek committed
160
         integer(kind=ik), intent(in)               :: kernel
161

Andreas Marek's avatar
Andreas Marek committed
162 163
         integer(kind=c_intptr_t)                   :: a_dev
   integer(kind=c_intptr_t)                         :: bcast_buffer_dev
Andreas Marek's avatar
Andreas Marek committed
164
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
165
         integer(kind=c_intptr_t)                   :: hh_dot_dev ! why not needed in complex case
166
#endif
Andreas Marek's avatar
Andreas Marek committed
167 168
         integer(kind=c_intptr_t)                   :: hh_tau_dev
         integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
169

170
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
Andreas Marek's avatar
Andreas Marek committed
171
         integer(kind=ik)                           :: off, ncols, istripe
172
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
173
         integer(kind=ik)                           :: my_thread, noff
174
#endif
Andreas Marek's avatar
Andreas Marek committed
175
         integer(kind=ik)                           :: j, nl, jj, jjj, n_times
176
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
177
         real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
178 179
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
180
         complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
181
#endif
Andreas Marek's avatar
Andreas Marek committed
182
         real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
183

184
         j = -99
Andreas Marek's avatar
Andreas Marek committed
185 186 187

         if (wantDebug) then
           if (useGPU .and. &
188
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
189 190 191 192 193 194
             ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
             ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
             print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
Andreas Marek's avatar
Andreas Marek committed
195 196
             stop
           endif
197
         endif
Andreas Marek's avatar
Andreas Marek committed
198 199 200

#if REALCASE == 1
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
201
#endif
Andreas Marek's avatar
Andreas Marek committed
202
#if COMPLEXCASE == 1
203
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
204
#endif
Andreas Marek's avatar
Andreas Marek committed
205
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
Andreas Marek's avatar
Andreas Marek committed
206
           if (ncols < 1) then
Andreas Marek's avatar
Andreas Marek committed
207 208 209 210 211
             if (wantDebug) then
               print *, "Returning early from compute_hh_trafo"
             endif
             return
           endif
Andreas Marek's avatar
Andreas Marek committed
212
         endif
213

214
         if (wantDebug) call obj%timer%start("compute_hh_trafo_&
215
                                              &MATH_DATATYPE&
216
#ifdef WITH_OPENMP
217
                                              &_openmp" // &
218
#else
219
                                              &" // &
220
#endif
221 222
                                              &PRECISION_SUFFIX &
                                              )
223 224 225 226 227 228 229 230 231 232


#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           ttt = mpi_wtime()
#ifdef WITH_OPENMP
         endif
#endif

233
#ifdef WITH_OPENMP
234 235

#if REALCASE == 1
236
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
237
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
238 239
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
240 241
           stop 1
         endif
Andreas Marek's avatar
Andreas Marek committed
242 243
#endif
#if COMPLEXCASE == 1
244
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
245
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
246 247
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
Andreas Marek's avatar
Andreas Marek committed
248 249
           stop 1
         endif
250
#endif
251 252 253 254 255 256
#endif /* WITH_OPENMP */

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP */

257 258 259 260 261 262
         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
263
             if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
264
                                                &MATH_DATATYPE&
265
#ifdef WITH_OPENMP
266
                                                &_openmp" // &
267
#else
268
                                                &" // &
269
#endif
270 271
                                                &PRECISION_SUFFIX &
                                                )
272 273 274 275 276 277

             return
           endif
         endif
#endif /* not WITH_OPENMP */

278
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
279
! GPU kernel real
280
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
Andreas Marek's avatar
Andreas Marek committed
281 282 283
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
284
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
285
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
286 287 288
                  &_&
                  &MATH_DATATYPE

Andreas Marek's avatar
Andreas Marek committed
289
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
290 291 292 293
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
294
#endif /* REALCASE */
Andreas Marek's avatar
Andreas Marek committed
295 296
#if COMPLEXCASE == 1
! GPU kernel complex
297
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
298 299 300
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
301 302

           dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
303
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
304 305
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
306 307

           dev_offset_1 = (0 +  (  off-1 )* nbw) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
308
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
309 310
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
311 312

           dev_offset_2 =( off-1 )* size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
313
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
314 315
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
316

Andreas Marek's avatar
Andreas Marek committed
317
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
318 319 320 321
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
Andreas Marek's avatar
Andreas Marek committed
322 323 324 325
                                                         hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)


#endif /* COMPLEXCASE */
326 327 328
           if (wantDebug) then
             call obj%timer%stop("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
329 330

         else ! not CUDA kernel
331

332 333 334
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: CPU")
           endif
335
#if REALCASE == 1
336
#ifndef WITH_FIXED_REAL_KERNEL
337 338 339 340
         if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
341
             kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
342
             kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
343
             kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
344 345 346 347 348
             kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
             kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
             kernel .eq. ELPA_2STAGE_REAL_BGQ) then
349
#endif /* not WITH_FIXED_REAL_KERNEL */
350

351 352 353
#endif /* REALCASE */
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

354
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
355 356
#if REALCASE == 1
! generic kernel real case
357
#if defined(WITH_REAL_GENERIC_KERNEL)
358
#ifndef WITH_FIXED_REAL_KERNEL
359
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
360
#endif /* not WITH_FIXED_REAL_KERNEL */
361 362 363 364 365 366 367 368

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
369
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
370 371 372 373
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
374 375

#else
Andreas Marek's avatar
Andreas Marek committed
376
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
377 378 379 380
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
381 382 383 384 385 386
                    nbw, nl, stripe_width, nbw)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
387
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
388 389 390 391
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
392 393

#else
Andreas Marek's avatar
Andreas Marek committed
394
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
395 396 397 398
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
399 400 401 402 403
#endif
#endif /* WITH_OPENMP */

               enddo

404
#ifndef WITH_FIXED_REAL_KERNEL
405
             endif
406
#endif /* not WITH_FIXED_REAL_KERNEL */
407 408
#endif /* WITH_REAL_GENERIC_KERNEL */

409 410 411 412 413
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
414
#ifndef WITH_FIXED_COMPLEX_KERNEL
415 416 417
           if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
418
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
419 420 421 422 423 424
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE

              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
425 426 427 428
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
429 430
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
431 432 433 434
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
Andreas Marek's avatar
Andreas Marek committed
435
                     bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
436
#endif
437

438 439 440 441
#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
442 443 444 445
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
446 447
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
448 449 450 451
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
452
                      nbw, nl, stripe_width)
453 454 455 456
#endif
#endif /* WITH_OPENMP */

            enddo
457
#ifndef WITH_FIXED_COMPLEX_KERNEL
458
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
459
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
460 461 462 463 464
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
465 466


467
! generic simple real kernel
468
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
469
#ifndef WITH_FIXED_REAL_KERNEL
470
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
471
#endif /* not WITH_FIXED_REAL_KERNEL */
472 473 474 475 476 477
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
478
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
479 480 481 482
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
483
#else
484
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
485 486 487 488
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
489 490 491 492 493 494

#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
495
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
496 497 498 499
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
500
#else
501
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
502 503 504 505
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
506 507 508 509 510 511

#endif

#endif /* WITH_OPENMP */

               enddo
512
#ifndef WITH_FIXED_REAL_KERNEL
513
             endif
514
#endif /* not WITH_FIXED_REAL_KERNEL */
515 516
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

517 518 519 520
#endif /* REALCASE */

#if COMPLEXCASE == 1
! generic simple complex case
521

522
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
523
#ifndef WITH_FIXED_COMPLEX_KERNEL
524
            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
525
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
526 527 528 529 530
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
531 532 533 534
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
535 536
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
537 538 539 540
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
541
                       nbw, nl, stripe_width)
542 543 544 545 546 547
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
548 549 550 551
                     &MATH_DATATYPE&
                     &_generic_simple_&
                     &PRECISION&
                     & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
552 553
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
554 555 556 557
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
558
                       nbw, nl, stripe_width)
559 560 561 562
#endif

#endif /* WITH_OPENMP */
             enddo
563
#ifndef WITH_FIXED_COMPLEX_KERNEL
564
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
565
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
566
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
567

568 569 570 571
#endif /* COMPLEXCASE */

#if REALCASE == 1
! sse assembly kernel real case
572
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
573
#ifndef WITH_FIXED_REAL_KERNEL
574
             if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
575

576
#endif /* not WITH_FIXED_REAL_KERNEL */
577 578 579 580 581
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_&
582
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
583 584 585 586
                 &_&
                 &PRECISION&
                 &_sse_assembly&
                 & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
587 588
#else
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
589 590 591 592 593
                      &MATH_DATATYPE&
                      &_&
                      &PRECISION&
                      &_sse_assembly&
                      & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
594 595
#endif
               enddo
596
#ifndef WITH_FIXED_REAL_KERNEL
597
             endif
598
#endif /* not WITH_FIXED_REAL_KERNEL */
599 600
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

601 602 603
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
604

605 606
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
607
#ifndef WITH_FIXED_COMPLEX_KERNEL
608
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
609
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
610 611 612 613
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
614 615 616 617 618
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
619 620
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
621 622 623 624 625
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
626 627
#endif
            enddo
628
#ifndef WITH_FIXED_COMPLEX_KERNEL
629
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
630
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
631 632 633 634
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
635
! no sse, vsx, sparc64 block1 real kernel
636 637
#endif

638 639 640 641
#if COMPLEXCASE == 1

! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
668 669 670 671 672
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


673 674 675 676
#if COMPLEXCASE == 1

! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
677 678 679 680 681 682 683
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
684 685 686 687 688 689 690 691 692 693 694 695 696
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
697 698 699 700 701 702
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
703 704 705 706 707
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


708
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
709

710 711
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
712
#ifndef WITH_FIXED_COMPLEX_KERNEL
713
          if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
714
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
715

716
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
717 718 719 720
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
721 722 723 724
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
725 726
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
727 728 729 730
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
731 732
#endif
            enddo
733
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
734

735
#ifndef WITH_FIXED_COMPLEX_KERNEL
736
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
737
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
738 739 740 741 742 743 744 745 746
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
!no avx block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
747

748 749
! avx block1 complex kernel
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
750
#ifndef WITH_FIXED_COMPLEX_KERNEL
751 752
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
              (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
753
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
754

755
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
756 757 758 759
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
760 761 762 763
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
764 765
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
766 767 768 769
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
770 771
#endif
            enddo
772
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
773

774
#ifndef WITH_FIXED_COMPLEX_KERNEL
775
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
776
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
777 778 779 780 781 782 783 784 785
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
! no avx512 block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
786

787 788
! avx512 block1 complex kernel
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
789
#ifndef WITH_FIXED_COMPLEX_KERNEL
790
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
791
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
792

793
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
794 795 796 797
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
798 799 800 801
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
802 803
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
804 805 806 807
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
808 809
#endif
            enddo
810
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
811

812
#ifndef WITH_FIXED_COMPLEX_KERNEL
813
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
814
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
815 816 817 818
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
#endif /* COMPLEXCASE */

#if REALCASE == 1
819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852
! implementation of sparc64 block 2 real case
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */
853

854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


891

892
#if REALCASE == 1
893 894
! implementation of vsx block 2 real case
#if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
895

896
#ifndef WITH_FIXED_REAL_KERNEL
897
           if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
Andreas Marek's avatar
Andreas Marek committed
898

899
#endif /* not WITH_FIXED_REAL_KERNEL */
900

901
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
902 903 904 905 906
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
907
                    &MATH_DATATYPE&