compute_hh_trafo.F90 99.8 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

       subroutine compute_hh_trafo_&
       &MATH_DATATYPE&
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
49
       &_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
       &_&
52 53
#endif
       &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
       (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
55
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
56
       l_nev, &
57
#endif
Andreas Marek's avatar
Andreas Marek committed
58
       a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
59
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
60
       hh_dot_dev, &
61
#endif
Andreas Marek's avatar
Andreas Marek committed
62
       hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
63 64 65 66 67
#ifdef WITH_OPENMP
       my_thread, thread_width, &
#else
       last_stripe_width, &
#endif
68
       kernel, h_dev, s_dev, q_dev, w_dev)
69 70

         use precision
71
         use elpa_abstract_impl
72 73
         use iso_c_binding
#if REALCASE == 1
74

75
         use single_hh_trafo_real
76 77 78 79
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

Andreas Marek's avatar
Andreas Marek committed
80 81 82
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
Andreas Marek's avatar
Andreas Marek committed
83

84 85 86
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
!         use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
87

88 89 90 91
#if defined(WITH_REAL_BLAS_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_blas_block4_kernel !, only : double_hh_trafo_generic_simple
#endif

92
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
93
         use real_generic_kernel !, only : double_hh_trafo_generic
94 95 96 97 98 99 100 101 102
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
103 104 105 106 107

#endif /* REALCASE */

#if COMPLEXCASE == 1

108
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
109 110
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
111
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
112 113 114 115 116 117 118 119
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif

#endif /* COMPLEXCASE */

         use cuda_c_kernel
         use cuda_functions

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
120
         use elpa_generated_fortran_interfaces
121

122
         implicit none
Andreas Marek's avatar
Andreas Marek committed
123
         class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
124
         logical, intent(in)                        :: useGPU, wantDebug
Andreas Marek's avatar
Andreas Marek committed
125 126 127
         real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)                          :: kernel_flops
         integer(kind=ik), intent(in)               :: nbw, max_blk_size
128
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
129
         real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
130 131
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
132
         complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
133
#endif
Andreas Marek's avatar
Andreas Marek committed
134
         integer(kind=ik), intent(in)               :: a_off
135

Andreas Marek's avatar
Andreas Marek committed
136
         integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
137

Andreas Marek's avatar
Andreas Marek committed
138
         integer(kind=ik), intent(in)               :: max_threads
139
#ifndef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
140
         integer(kind=ik), intent(in)               :: last_stripe_width
141
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
142 143
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
144 145
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
146 147
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
148 149 150
#endif

#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
151
         integer(kind=ik), intent(in)               :: l_nev, thread_width
152
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
153 154
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
155
 
156
#endif
157
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
158 159
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
160 161 162 163
#endif

#endif /* WITH_OPENMP */

Andreas Marek's avatar
Andreas Marek committed
164
         integer(kind=ik), intent(in)               :: kernel
165

Andreas Marek's avatar
Andreas Marek committed
166
         integer(kind=c_intptr_t)                   :: a_dev
167 168 169 170 171

         ! for the blas kernel
         integer(kind=c_intptr_t)                   :: h_dev, s_dev, q_dev, w_dev

         integer(kind=c_intptr_t)                         :: bcast_buffer_dev
Andreas Marek's avatar
Andreas Marek committed
172
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
173
         integer(kind=c_intptr_t)                   :: hh_dot_dev ! why not needed in complex case
174
#endif
Andreas Marek's avatar
Andreas Marek committed
175 176
         integer(kind=c_intptr_t)                   :: hh_tau_dev
         integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
177

178
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
Andreas Marek's avatar
Andreas Marek committed
179
         integer(kind=ik)                           :: off, ncols, istripe
180
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
181
         integer(kind=ik)                           :: my_thread, noff
182
#endif
Andreas Marek's avatar
Andreas Marek committed
183
         integer(kind=ik)                           :: j, nl, jj, jjj, n_times
184
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
185
         real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
186 187
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
188
         complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
189
#endif
Andreas Marek's avatar
Andreas Marek committed
190
         real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
191

192
         j = -99
Andreas Marek's avatar
Andreas Marek committed
193 194 195

         if (wantDebug) then
           if (useGPU .and. &
196
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
197 198 199 200 201 202
             ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
             ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
             print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
Andreas Marek's avatar
Andreas Marek committed
203 204
             stop
           endif
205
         endif
Andreas Marek's avatar
Andreas Marek committed
206 207 208

#if REALCASE == 1
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
209
#endif
Andreas Marek's avatar
Andreas Marek committed
210
#if COMPLEXCASE == 1
211
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
212
#endif
Andreas Marek's avatar
Andreas Marek committed
213
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
Andreas Marek's avatar
Andreas Marek committed
214
           if (ncols < 1) then
Andreas Marek's avatar
Andreas Marek committed
215 216 217 218 219
             if (wantDebug) then
               print *, "Returning early from compute_hh_trafo"
             endif
             return
           endif
Andreas Marek's avatar
Andreas Marek committed
220
         endif
221

222
         if (wantDebug) call obj%timer%start("compute_hh_trafo_&
223
                                              &MATH_DATATYPE&
224
#ifdef WITH_OPENMP
225
                                              &_openmp" // &
226
#else
227
                                              &" // &
228
#endif
229 230
                                              &PRECISION_SUFFIX &
                                              )
231 232 233 234 235 236 237 238 239 240


#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           ttt = mpi_wtime()
#ifdef WITH_OPENMP
         endif
#endif

241
#ifdef WITH_OPENMP
242 243

#if REALCASE == 1
244
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
245
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
246 247
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
248 249
           stop 1
         endif
Andreas Marek's avatar
Andreas Marek committed
250 251
#endif
#if COMPLEXCASE == 1
252
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
253
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
254 255
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
Andreas Marek's avatar
Andreas Marek committed
256 257
           stop 1
         endif
258
#endif
259 260 261 262 263 264
#endif /* WITH_OPENMP */

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP */

265 266 267 268 269 270
         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
271
             if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
272
                                                &MATH_DATATYPE&
273
#ifdef WITH_OPENMP
274
                                                &_openmp" // &
275
#else
276
                                                &" // &
277
#endif
278 279
                                                &PRECISION_SUFFIX &
                                                )
280 281 282 283 284 285

             return
           endif
         endif
#endif /* not WITH_OPENMP */

286
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
287
! GPU kernel real
288
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
Andreas Marek's avatar
Andreas Marek committed
289 290 291
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
292
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
293
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
294 295 296
                  &_&
                  &MATH_DATATYPE

Andreas Marek's avatar
Andreas Marek committed
297
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
298 299 300 301
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
302
#endif /* REALCASE */
Andreas Marek's avatar
Andreas Marek committed
303 304
#if COMPLEXCASE == 1
! GPU kernel complex
305
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
306 307 308
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
309 310

           dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
311
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
312 313
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
314 315

           dev_offset_1 = (0 +  (  off-1 )* nbw) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
316
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
317 318
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
319 320

           dev_offset_2 =( off-1 )* size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
321
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
322 323
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
324

Andreas Marek's avatar
Andreas Marek committed
325
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
326 327 328 329
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
Andreas Marek's avatar
Andreas Marek committed
330 331 332 333
                                                         hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)


#endif /* COMPLEXCASE */
334 335 336
           if (wantDebug) then
             call obj%timer%stop("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
337 338

         else ! not CUDA kernel
339

340 341 342
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: CPU")
           endif
343
#if REALCASE == 1
344
#ifndef WITH_FIXED_REAL_KERNEL
345 346 347 348
         if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
349
             kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
350
             kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
351
             kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
352 353 354 355 356
             kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
             kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
             kernel .eq. ELPA_2STAGE_REAL_BGQ) then
357
#endif /* not WITH_FIXED_REAL_KERNEL */
358

359 360 361
#endif /* REALCASE */
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

362
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
363 364
#if REALCASE == 1
! generic kernel real case
365
#if defined(WITH_REAL_GENERIC_KERNEL)
366
#ifndef WITH_FIXED_REAL_KERNEL
367
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
368
#endif /* not WITH_FIXED_REAL_KERNEL */
369 370 371 372 373 374 375 376

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
377
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
378 379 380 381
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
382 383

#else
Andreas Marek's avatar
Andreas Marek committed
384
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
385 386 387 388
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
389 390 391 392 393 394
                    nbw, nl, stripe_width, nbw)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
395
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
396 397 398 399
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
400 401

#else
Andreas Marek's avatar
Andreas Marek committed
402
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
403 404 405 406
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
407 408 409 410 411
#endif
#endif /* WITH_OPENMP */

               enddo

412
#ifndef WITH_FIXED_REAL_KERNEL
413
             endif
414
#endif /* not WITH_FIXED_REAL_KERNEL */
415 416
#endif /* WITH_REAL_GENERIC_KERNEL */

417 418 419 420 421
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
422
#ifndef WITH_FIXED_COMPLEX_KERNEL
423 424 425
           if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
426
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
427 428 429 430 431 432
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE

              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
433 434 435 436
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
437 438
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
439 440 441 442
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
Andreas Marek's avatar
Andreas Marek committed
443
                     bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
444
#endif
445

446 447 448 449
#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
450 451 452 453
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
454 455
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
456 457 458 459
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
460
                      nbw, nl, stripe_width)
461 462 463 464
#endif
#endif /* WITH_OPENMP */

            enddo
465
#ifndef WITH_FIXED_COMPLEX_KERNEL
466
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
467
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
468 469 470 471 472
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
473 474


475
! generic simple real kernel
476
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
477
#ifndef WITH_FIXED_REAL_KERNEL
478
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
479
#endif /* not WITH_FIXED_REAL_KERNEL */
480 481 482 483 484 485
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
486
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
487 488 489 490
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
491
#else
492
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
493 494 495 496
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
497 498 499 500 501 502

#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
503
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
504 505 506 507
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
508
#else
509
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
510 511 512 513
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
514 515 516 517 518 519

#endif

#endif /* WITH_OPENMP */

               enddo
520
#ifndef WITH_FIXED_REAL_KERNEL
521
             endif
522
#endif /* not WITH_FIXED_REAL_KERNEL */
523 524
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

525 526 527 528
#endif /* REALCASE */

#if COMPLEXCASE == 1
! generic simple complex case
529

530
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
531
#ifndef WITH_FIXED_COMPLEX_KERNEL
532
            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
533
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
534 535 536 537 538
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
539 540 541 542
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
543 544
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
545 546 547 548
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
549
                       nbw, nl, stripe_width)
550 551 552 553 554 555
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
556 557 558 559
                     &MATH_DATATYPE&
                     &_generic_simple_&
                     &PRECISION&
                     & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
560 561
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
562 563 564 565
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
566
                       nbw, nl, stripe_width)
567 568 569 570
#endif

#endif /* WITH_OPENMP */
             enddo
571
#ifndef WITH_FIXED_COMPLEX_KERNEL
572
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
573
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
574
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
575

576 577 578 579
#endif /* COMPLEXCASE */

#if REALCASE == 1
! sse assembly kernel real case
580
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
581
#ifndef WITH_FIXED_REAL_KERNEL
582
             if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
583

584
#endif /* not WITH_FIXED_REAL_KERNEL */
585 586 587 588 589
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_&
590
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
591 592 593 594
                 &_&
                 &PRECISION&
                 &_sse_assembly&
                 & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
595 596
#else
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
597 598 599 600 601
                      &MATH_DATATYPE&
                      &_&
                      &PRECISION&
                      &_sse_assembly&
                      & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
602 603
#endif
               enddo
604
#ifndef WITH_FIXED_REAL_KERNEL
605
             endif
606
#endif /* not WITH_FIXED_REAL_KERNEL */
607 608
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

609 610 611
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
612

613 614
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
615
#ifndef WITH_FIXED_COMPLEX_KERNEL
616
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
617
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
618 619 620 621
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
622 623 624 625 626
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
627 628
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
629 630 631 632 633
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
634 635
#endif
            enddo
636
#ifndef WITH_FIXED_COMPLEX_KERNEL
637
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
638
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
639 640 641 642
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
643
! no sse, vsx, sparc64 block1 real kernel
644 645
#endif

646 647 648 649
#if COMPLEXCASE == 1

! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
676 677 678 679 680
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


681 682 683 684
#if COMPLEXCASE == 1

! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
685 686 687 688 689 690 691
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
692 693 694 695 696 697 698 699 700 701 702 703 704
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
705 706 707 708 709 710
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
711 712 713 714 715
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


716
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
717

718 719
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
720
#ifndef WITH_FIXED_COMPLEX_KERNEL
721
          if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
722
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
723

724
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
725 726 727 728
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
729 730 731 732
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
733 734
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
735 736 737 738
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
739 740
#endif
            enddo
741
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
742

743
#ifndef WITH_FIXED_COMPLEX_KERNEL
744
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
745
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
746 747 748 749 750 751 752 753 754
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
!no avx block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
755

756 757
! avx block1 complex kernel
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
758
#ifndef WITH_FIXED_COMPLEX_KERNEL
759 760
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
              (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
761
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
762

763
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
764 765 766 767
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
768 769 770 771
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
772 773
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
774 775 776 777
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
778 779
#endif
            enddo
780
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
781

782
#ifndef WITH_FIXED_COMPLEX_KERNEL
783
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
784
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
785 786 787 788 789 790 791 792 793
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
! no avx512 block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
794

795 796
! avx512 block1 complex kernel
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
797
#ifndef WITH_FIXED_COMPLEX_KERNEL
798
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
799
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
800

801
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
802 803 804 805
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
806 807 808 809
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
810 811
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
812 813 814 815
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
816 817
#endif
            enddo
818
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
819

820
#ifndef WITH_FIXED_COMPLEX_KERNEL
821
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
822
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
823 824 825 826
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
#endif /* COMPLEXCASE */

#if REALCASE == 1
827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860
! implementation of sparc64 block 2 real case
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */
861

862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


899

900
#if REALCASE == 1
901 902
! implementation of vsx block 2 real case
#if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
903

904
#ifndef WITH_FIXED_REAL_KERNEL
905
           if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
Andreas Marek's avatar
Andreas Marek committed
906

907
#endif /* not WITH_FIXED_REAL_KERNEL */
908

909
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
910 911 912 913 914
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
915
                    &MATH_DATATYPE&
916
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
917 918
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
919 920
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
921
                    &MATH_DATATYPE&
922
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
923 924
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
925 926
#endif
             enddo
927
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
928

929
#ifndef WITH_FIXED_REAL_KERNEL
930
           endif
931
#endif /* not WITH_FIXED_REAL_KERNEL */
932
#endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
933

934 935
#endif /* REALCASE == 1 */

Andreas Marek's avatar
Andreas Marek committed
936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972
#if REALCASE == 1
! implementation of sse block 2 real case
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


973 974 975 976
#if COMPLEXCASE == 1
! implementation of sparc64 block 2 complex case

#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
!
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_sparc64_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_sparc64_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
!             enddo
!#ifdef WITH_OPENMP
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                       &_sparc64_1hv_&
!                       &PRECISION&
!                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                            &_sparc64_1hv_&
!                            &PRECISION&
!                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1016 1017 1018
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

1019 1020 1021 1022 1023

#if COMPLEXCASE == 1
! implementation of vsx block 2 complex case

#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
1024 1025 1026 1027 1028 1029 1030 1031
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
!
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
!#ifdef WITH_OPENMP
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_vsx_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_vsx_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
1045
!             enddo
1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058
!#ifdef WITH_OPENMP
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                       &_vsx_1hv_&
!                       &PRECISION&
!                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                            &_vsx_1hv_&
!                            &PRECISION&
!                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
1059 1060 1061 1062
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1063 1064 1065
#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

1066 1067 1068 1069
#if COMPLEXCASE == 1
! implementation of sse block 2 complex case

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
1070
#ifndef WITH_FIXED_COMPLEX_KERNEL
1071
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
1072
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1073 1074 1075 1076 1077 1078 1079

             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1080 1081 1082 1083
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1084 1085
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1086 1087 1088 1089
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1090 1091 1092 1093
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
1094
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
1095 1096 1097
                       &_sse_1hv_&
                       &PRECISION&
                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1098 1099
#else
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
1100
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
1101 1102 1103
                            &_sse_1hv_&
                            &PRECISION&
                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1104 1105
#endif

1106
#ifndef WITH_FIXED_COMPLEX_KERNEL
1107
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
1108
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1109 1110 1111 1112 1113 1114
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
! implementation of avx block 2 real case

1115
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
1116
#ifndef WITH_FIXED_REAL_KERNEL
1117

1118 1119
           if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
               (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2))  then
Andreas Marek's avatar
Andreas Marek committed
1120

1121
#endif /* not WITH_FIXED_REAL_KERNEL */
1122

1123
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
1124 1125 1126 1127 1128 1129
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1130 1131 1132 1133
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1134 1135
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1136 1137 1138 1139
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1140 1141
#endif
               enddo
1142
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
1143

1144
#ifndef WITH_FIXED_REAL_KERNEL
1145
             endif
1146