compute_hh_trafo.F90 119 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

Andreas Marek's avatar
Andreas Marek committed
46
47
subroutine compute_hh_trafo_&
&MATH_DATATYPE&
48
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
49
&_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
&_&
52
#endif
Andreas Marek's avatar
Andreas Marek committed
53
54
&PRECISION &
(obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
55
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
56
l_nev, &
57
#endif
Andreas Marek's avatar
Andreas Marek committed
58
59
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
60
#ifdef WITH_OPENMP_TRADITIONAL
61
my_thread, thread_width, kernel, last_stripe_width)
62
#else
63
last_stripe_width, kernel)
64
65
#endif

Andreas Marek's avatar
Andreas Marek committed
66
67
  use precision
  use elpa_abstract_impl
68
  use, intrinsic :: iso_c_binding
69
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
70
  use single_hh_trafo_real
71
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
72
  use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
73
74
#endif

Andreas Marek's avatar
Andreas Marek committed
75
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
76
  use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
Andreas Marek's avatar
Andreas Marek committed
77
#endif
Andreas Marek's avatar
Andreas Marek committed
78

79
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
80
!  use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
81
!#endif
82

83
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
84
  use real_generic_kernel !, only : double_hh_trafo_generic
85
86
87
#endif

#if defined(WITH_REAL_BGP_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
88
  use real_bgp_kernel !, only : double_hh_trafo_bgp
89
90
91
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
Andreas Marek's avatar
Andreas Marek committed
92
  use real_bgq_kernel !, only : double_hh_trafo_bgq
93
#endif
94
95
96
97
98

#endif /* REALCASE */

#if COMPLEXCASE == 1

99
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
100
  use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
101
#endif
102
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
Andreas Marek's avatar
Andreas Marek committed
103
  use complex_generic_kernel !, only : single_hh_trafo_complex_generic
104
105
106
107
#endif

#endif /* COMPLEXCASE */

Andreas Marek's avatar
Andreas Marek committed
108
109
110
111
  !use cuda_c_kernel
  !use cuda_functions
  !use hip_functions
  use gpu_c_kernel
112
  use elpa_gpu
113

Andreas Marek's avatar
Andreas Marek committed
114
  use elpa_generated_fortran_interfaces
115

Andreas Marek's avatar
Andreas Marek committed
116
117
118
119
120
121
  implicit none
  class(elpa_abstract_impl_t), intent(inout) :: obj
  logical, intent(in)                        :: useGPU, wantDebug
  real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
  integer(kind=lik)                          :: kernel_flops
  integer(kind=ik), intent(in)               :: nbw, max_blk_size
122
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
123
  real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
124
125
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
126
  complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
127
#endif
Andreas Marek's avatar
Andreas Marek committed
128
  integer(kind=ik), intent(in)               :: a_off
129

Andreas Marek's avatar
Andreas Marek committed
130
  integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
131

Andreas Marek's avatar
Andreas Marek committed
132
  integer(kind=ik), intent(in)               :: max_threads
133
#ifndef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
134
  integer(kind=ik), intent(in)               :: last_stripe_width
135
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
136
137
!  real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
  real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
138
139
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
140
141
!  complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
  complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
142
143
#endif

144
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
145
  integer(kind=ik), intent(in)               :: l_nev, thread_width
146
  integer(kind=ik), intent(in), optional     :: last_stripe_width
147
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
148
149
!  real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
  real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
150
#endif
151
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
152
153
!  complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
  complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
154
155
#endif

156
#endif /* WITH_OPENMP_TRADITIONAL */
157

Andreas Marek's avatar
Andreas Marek committed
158
  integer(kind=ik), intent(in)               :: kernel
159

Andreas Marek's avatar
Andreas Marek committed
160
161
162
163
  integer(kind=c_intptr_t)                   :: a_dev
  integer(kind=c_intptr_t)                   :: bcast_buffer_dev
  integer(kind=c_intptr_t)                   :: hh_tau_dev
  integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
164

Andreas Marek's avatar
Andreas Marek committed
165
166
  ! Private variables in OMP regions (my_thread) should better be in the argument list!
  integer(kind=ik)                           :: off, ncols, istripe
167
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
168
  integer(kind=ik)                           :: my_thread, noff
169
#endif
Andreas Marek's avatar
Andreas Marek committed
170
  integer(kind=ik)                           :: j, nl, jj, jjj, n_times
171
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
172
  real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
173
174
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
175
  complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
176
#endif
Andreas Marek's avatar
Andreas Marek committed
177
  real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
178

Andreas Marek's avatar
Andreas Marek committed
179
180
181
182
  integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                 &PRECISION&
                                                                 &_&
                                                                 &MATH_DATATYPE
183
184


Andreas Marek's avatar
Andreas Marek committed
185
  j = -99
Andreas Marek's avatar
Andreas Marek committed
186

Andreas Marek's avatar
Andreas Marek committed
187
188
  if (wantDebug) then
    if (useGPU .and. &
189
#if REALCASE == 1
190
      ( kernel .ne. ELPA_2STAGE_REAL_NVIDIA_GPU)) then
Andreas Marek's avatar
Andreas Marek committed
191
192
#endif
#if COMPLEXCASE == 1
193
      ( kernel .ne. ELPA_2STAGE_COMPLEX_NVIDIA_GPU)) then
Andreas Marek's avatar
Andreas Marek committed
194
#endif
Andreas Marek's avatar
Andreas Marek committed
195
196
197
198
      print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
      stop
    endif
  endif
Andreas Marek's avatar
Andreas Marek committed
199
200

#if REALCASE == 1
201
  if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
202
#endif
Andreas Marek's avatar
Andreas Marek committed
203
#if COMPLEXCASE == 1
204
  if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
Andreas Marek's avatar
Andreas Marek committed
205
#endif
Andreas Marek's avatar
Andreas Marek committed
206
207
208
209
210
211
212
213
    ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
    if (ncols < 1) then
      if (wantDebug) then
        print *, "Returning early from compute_hh_trafo"
      endif
      return
    endif
  endif
214

Andreas Marek's avatar
Andreas Marek committed
215
216
  if (wantDebug) call obj%timer%start("compute_hh_trafo_&
  &MATH_DATATYPE&
217
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
218
  &_openmp" // &
219
#else
Andreas Marek's avatar
Andreas Marek committed
220
  &" // &
221
#endif
Andreas Marek's avatar
Andreas Marek committed
222
223
  &PRECISION_SUFFIX &
  )
224
225


226
#ifdef WITH_OPENMP_TRADITIONAL
227
  if (my_thread==1) then ! in the calling routine threads go form 1 .. max_threads
228
#endif
Andreas Marek's avatar
Andreas Marek committed
229
    ttt = mpi_wtime()
230
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
231
  endif
232
233
#endif

234

235
#ifndef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
236
  nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
237
#else /* WITH_OPENMP_TRADITIONAL */
238

239
240
  if (present(last_stripe_width)) then
    nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
Andreas Marek's avatar
Andreas Marek committed
241
  else
242
243
244
245
246
247
248
249
    if (istripe<stripe_count) then
      nl = stripe_width
    else
      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
      nl = min(my_thread*thread_width-noff, l_nev-noff)
      if (nl<=0) then
        if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
        &MATH_DATATYPE&
250
#ifdef WITH_OPENMP_TRADITIONAL
251
        &_openmp" // &
252
#else
253
        &" // &
254
#endif
255
256
        &PRECISION_SUFFIX &
        )
257

258
259
        return
      endif
Andreas Marek's avatar
Andreas Marek committed
260
261
    endif
  endif
262
#endif /* not WITH_OPENMP_TRADITIONAL */
263

264
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
265
! GPU kernel real
266
  if (kernel .eq. ELPA_2STAGE_REAL_NVIDIA_GPU) then
267
#endif
Andreas Marek's avatar
Andreas Marek committed
268
269
#if COMPLEXCASE == 1
! GPU kernel complex
270
  if (kernel .eq. ELPA_2STAGE_COMPLEX_NVIDIA_GPU) then
271
#endif
Andreas Marek's avatar
Andreas Marek committed
272
273
274
    if (wantDebug) then
      call obj%timer%start("compute_hh_trafo: GPU")
    endif
Andreas Marek's avatar
Andreas Marek committed
275

276
    dev_offset = ((a_off+off)*stripe_width+(istripe-1)*stripe_width*a_dim2)*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
277

278
    dev_offset_1 = off*nbw*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
279

280
    dev_offset_2 = off*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
281

Andreas Marek's avatar
Andreas Marek committed
282
283
284
285
286
287
    call launch_compute_hh_trafo_gpu_kernel_&
         &MATH_DATATYPE&
         &_&
         &PRECISION&
         &(a_dev + dev_offset, bcast_buffer_dev + dev_offset_1, &
         hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, ncols)
Andreas Marek's avatar
Andreas Marek committed
288

Andreas Marek's avatar
Andreas Marek committed
289
290
291
    if (wantDebug) then
      call obj%timer%stop("compute_hh_trafo: GPU")
    endif
Andreas Marek's avatar
Andreas Marek committed
292

Andreas Marek's avatar
Andreas Marek committed
293
  else ! not CUDA kernel
294

Andreas Marek's avatar
Andreas Marek committed
295
296
297
    if (wantDebug) then
      call obj%timer%start("compute_hh_trafo: CPU")
    endif
298
#if REALCASE == 1
299
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
300
301
302
303
304
305
306
    if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
307
308
        kernel .eq. ELPA_2STAGE_REAL_SVE128_BLOCK2 .or. &
        kernel .eq. ELPA_2STAGE_REAL_SVE256_BLOCK2 .or. &
309
        kernel .eq. ELPA_2STAGE_REAL_SVE512_BLOCK2 .or. &
Andreas Marek's avatar
Andreas Marek committed
310
311
312
313
314
        kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
        kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
        kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
        kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
        kernel .eq. ELPA_2STAGE_REAL_BGQ) then
315
#endif /* not WITH_FIXED_REAL_KERNEL */
316

317
318
#endif /* REALCASE */

Andreas Marek's avatar
Andreas Marek committed
319
      !FORTRAN CODE / X86 INTRINSIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
320
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
321
      ! generic kernel real case
322
#if defined(WITH_REAL_GENERIC_KERNEL)
323
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
324
      if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
325
#endif /* not WITH_FIXED_REAL_KERNEL */
326

Andreas Marek's avatar
Andreas Marek committed
327
328
329
        do j = ncols, 2, -2
          w(:,1) = bcast_buffer(1:nbw,j+off)
          w(:,2) = bcast_buffer(1:nbw,j+off-1)
330

331
#ifdef WITH_OPENMP_TRADITIONAL
332
333

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
334
335
336
337
338
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_generic_&
          &PRECISION&
          & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
339
#else
Andreas Marek's avatar
Andreas Marek committed
340
341
342
343
344
345
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_generic_&
          &PRECISION&
          & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
          nbw, nl, stripe_width, nbw)
346
347
#endif

348
#else /* WITH_OPENMP_TRADITIONAL */
349
350

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
351
352
353
354
355
356
357
358
359
360
361
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_generic_&
          &PRECISION&
          & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
#else
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_generic_&
          &PRECISION&
          & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
362
#endif
363
#endif /* WITH_OPENMP_TRADITIONAL */
364

Andreas Marek's avatar
Andreas Marek committed
365
        enddo
366

367
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
368
      endif
369
#endif /* not WITH_FIXED_REAL_KERNEL */
370
371
#endif /* WITH_REAL_GENERIC_KERNEL */

372
373
374
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
375
      ! generic kernel complex case
376
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
377
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
378
379
380
      if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
          kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
381
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
382
383
          ttt = mpi_wtime()
          do j = ncols, 1, -1
384
#ifdef WITH_OPENMP_TRADITIONAL
385
386
#ifdef USE_ASSUMED_SIZE

Andreas Marek's avatar
Andreas Marek committed
387
388
389
390
391
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_&
                 &PRECISION&
                 & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
392
#else
Andreas Marek's avatar
Andreas Marek committed
393
394
395
396
397
398
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_&
                 &PRECISION&
                 & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                 bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
399
#endif
400

401
#else /* WITH_OPENMP_TRADITIONAL */
402
403

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
404
405
406
407
408
409
410
411
412
413
414
415
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_&
                 &PRECISION&
                 & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_&
                 &PRECISION&
                 & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
                 nbw, nl, stripe_width)
416
#endif
417
#endif /* WITH_OPENMP_TRADITIONAL */
418

Andreas Marek's avatar
Andreas Marek committed
419
          enddo
420
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
421
        endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
422
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
423
424
425
426
427
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
428
        ! generic simple real kernel
429
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
430
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
431
        if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
432
#endif /* not WITH_FIXED_REAL_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
433
434
435
          do j = ncols, 2, -2
            w(:,1) = bcast_buffer(1:nbw,j+off)
            w(:,2) = bcast_buffer(1:nbw,j+off-1)
436
#ifdef WITH_OPENMP_TRADITIONAL
437
438

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
439
440
441
442
443
            call double_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
444
#else
Andreas Marek's avatar
Andreas Marek committed
445
446
447
448
449
            call double_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
450
451
452

#endif

453
#else /* WITH_OPENMP_TRADITIONAL */
454
455

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
456
457
458
459
460
            call double_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
461
#else
Andreas Marek's avatar
Andreas Marek committed
462
463
464
465
466
            call double_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
467
468
#endif

469
#endif /* WITH_OPENMP_TRADITIONAL */
470

Andreas Marek's avatar
Andreas Marek committed
471
          enddo
472
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
473
        endif
474
#endif /* not WITH_FIXED_REAL_KERNEL */
475
476
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

477
478
479
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
480
        ! generic simple complex case
481

482
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
483
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
484
        if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
485
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
486
487
          ttt = mpi_wtime()
          do j = ncols, 1, -1
488
#ifdef WITH_OPENMP_TRADITIONAL
489
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
490
491
492
493
494
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
495
#else
Andreas Marek's avatar
Andreas Marek committed
496
497
498
499
500
501
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
                 nbw, nl, stripe_width)
502
503
#endif

504
#else /* WITH_OPENMP_TRADITIONAL */
505
506

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
507
508
509
510
511
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
512
#else
Andreas Marek's avatar
Andreas Marek committed
513
514
515
516
517
518
            call single_hh_trafo_&
                 &MATH_DATATYPE&
                 &_generic_simple_&
                 &PRECISION&
                 & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
                 nbw, nl, stripe_width)
519
520
#endif

521
#endif /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
522
          enddo
523
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
524
        endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
525
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
526
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
527

528
529
530
#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
531
        ! sse assembly kernel real case
532
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
533
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
534
        if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
535

536
#endif /* not WITH_FIXED_REAL_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
          do j = ncols, 2, -2
            w(:,1) = bcast_buffer(1:nbw,j+off)
            w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
            call double_hh_trafo_&
            &MATH_DATATYPE&
            &_&
            &PRECISION&
            &_sse_assembly&
            & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
            call double_hh_trafo_&
            &MATH_DATATYPE&
            &_&
            &PRECISION&
            &_sse_assembly&
            & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
          enddo
556
#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
557
        endif
558
#endif /* not WITH_FIXED_REAL_KERNEL */
559
560
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

561
562
563
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
564

Andreas Marek's avatar
Andreas Marek committed
565
        ! sse assembly kernel complex case
566
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
567
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
568
        if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
569
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
          ttt = mpi_wtime()
          do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
            call single_hh_trafo_&
            &MATH_DATATYPE&
            &_&
            &PRECISION&
            &_sse_assembly&
            & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
            call single_hh_trafo_&
            &MATH_DATATYPE&
            &_&
            &PRECISION&
            &_sse_assembly&
            & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
          enddo
588
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
589
        endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
590
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
591
592
593
594
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
595
        ! no sse, vsx, sparc64 sve block1 real kernel
596
597
#endif

598
599
#if COMPLEXCASE == 1

Andreas Marek's avatar
Andreas Marek committed
600
        ! sparc64 block1 complex kernel
601
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
602
!#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
603
!        if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
604
605
606
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
607
608
!        ttt = mpi_wtime()
!        do j = ncols, 1, -1
609
!#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
610
611
612
613
614
!          call single_hh_trafo_&
!          &MATH_DATATYPE&
!          &_sparc64_1hv_&
!          &PRECISION&
!          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
615
!#else
Andreas Marek's avatar
Andreas Marek committed
616
617
618
619
620
!          call single_hh_trafo_&
!          &MATH_DATATYPE&
!          &_sparc64_1hv_&
!          &PRECISION&
!          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
621
!#endif
Andreas Marek's avatar
Andreas Marek committed
622
!        enddo
623
624
625
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
626
!      endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
627
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
628
629
630
631
632
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


633
634
#if COMPLEXCASE == 1

Andreas Marek's avatar
Andreas Marek committed
635
      ! vsx block1 complex kernel
636
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
637
!#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
638
!      if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
639
640
641
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
642
643
!        ttt = mpi_wtime()
!        do j = ncols, 1, -1
644
!#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
645
646
647
648
649
!          call single_hh_trafo_&
!          &MATH_DATATYPE&
!          &_vsx_1hv_&
!          &PRECISION&
!          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
650
!#else
Andreas Marek's avatar
Andreas Marek committed
651
652
653
654
655
!          call single_hh_trafo_&
!          &MATH_DATATYPE&
!          &_vsx_1hv_&
!          &PRECISION&
!          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
656
!#endif
Andreas Marek's avatar
Andreas Marek committed
657
!        enddo
658
659
660
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
661
!      endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
662
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
663
664
665
666
667
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


668
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
669

Andreas Marek's avatar
Andreas Marek committed
670
      ! sse block1 complex kernel
671
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
672
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
673
      if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
674
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
675

676
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sse_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sse_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
693
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
694

695
696
697
698
699
#ifndef WITH_FIXED_COMPLEX_KERNEL
      endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
      ! neon_arch64 block1 complex kernel
      !
      ! This section is compiled only when the neon_arch64 block1 complex
      ! kernel is enabled at configure time. Without a fixed complex kernel
      ! the branch is selected at run time by comparing the variable kernel
      ! against ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1; in a fixed-kernel
      ! build that runtime test (and its matching endif) is compiled out.
#if defined(WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
      if (kernel .eq. ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */

      ! In a fixed-kernel build the loop below is dropped when the
      ! corresponding block2 kernel is enabled as well.
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL))
        ! Time stamp taken before the kernel loop; the place where ttt is
        ! consumed is outside this section.
        ttt = mpi_wtime()
        ! Walk the column index from ncols down to 1, one kernel call per j.
        ! The routine name is spliced together across the continuation lines
        ! from the datatype and precision macro fragments.
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          ! Traditional OpenMP build: a carries an extra trailing index
          ! (my_thread); the kernel receives the address of the first
          ! element of column j+off+a_off of stripe istripe.
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_neon_arch64_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          ! Same call without the my_thread index on a.
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_neon_arch64_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_NEON_ARCH64_BLOCK2_KERNEL)) */

#ifndef WITH_FIXED_COMPLEX_KERNEL
      endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_NEON_ARCH64_BLOCK1)
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_NEON_ARCH64_BLOCK1_KERNEL */

730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
      ! sve128 block1 complex kernel
#if defined(WITH_COMPLEX_SVE128_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
      if (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK1) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */

#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL))
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve128_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve128_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
753
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE128_BLOCK2_KERNEL)) */
754

755
#ifndef WITH_FIXED_COMPLEX_KERNEL
756
      endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SVE128_BLOCK1)
757
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
758
#endif /* WITH_COMPLEX_SVE128_BLOCK1_KERNEL */
759
760
761
762

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
763
      !no avx block1 real kernel
764
765
766
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
767

Andreas Marek's avatar
Andreas Marek committed
768
      ! avx block1 complex kernel
Andreas Marek's avatar
Andreas Marek committed
769
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
770
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
771
      if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1)) then
772
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
773

Andreas Marek's avatar
Andreas Marek committed
774
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) )
Andreas Marek's avatar
Andreas Marek committed
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
Andreas Marek's avatar
Andreas Marek committed
791
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)) */
792

793
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
794
      endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) )
795
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
796
797
798
799
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
800
      if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
Andreas Marek's avatar
Andreas Marek committed
801
802
803
#endif /* not WITH_FIXED_COMPLEX_KERNEL */

#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx2_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx2_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
Andreas Marek's avatar
Andreas Marek committed
820
821
822
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
823
      endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
Andreas Marek's avatar
Andreas Marek committed
824
825
826
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

827
828
829
830
#if defined(WITH_COMPLEX_SVE256_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
      if ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
831

832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE256_BLOCK2_KERNEL))
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve256_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve256_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE256_BLOCK2_KERNEL)) */

#ifndef WITH_FIXED_COMPLEX_KERNEL
      endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE256_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE256_BLOCK1_KERNEL */
855
856
857
858

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
859
      ! no avx512 block1 real kernel
860
      ! no sve512 block1 real kernel
861
862
863
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
864

Andreas Marek's avatar
Andreas Marek committed
865
      ! avx512 block1 complex kernel
866
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
867
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
868
      if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
869
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
870

871
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
Andreas Marek's avatar
Andreas Marek committed
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
        ttt = mpi_wtime()
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx512_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_avx512_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
888
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
889

890
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
891
      endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
892
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
893
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924

      ! sve512 block1 complex kernel
      !
      ! This section is compiled only when the sve512 block1 complex kernel
      ! is enabled at configure time. Without a fixed complex kernel the
      ! branch is selected at run time by comparing the variable kernel
      ! against ELPA_2STAGE_COMPLEX_SVE512_BLOCK1; in a fixed-kernel build
      ! that runtime test (and its matching endif) is compiled out.
#if defined(WITH_COMPLEX_SVE512_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
      if ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */

      ! In a fixed-kernel build the loop below is dropped when the
      ! corresponding block2 kernel is enabled as well.
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL) )
        ! Time stamp taken before the kernel loop; the place where ttt is
        ! consumed is outside this section.
        ttt = mpi_wtime()
        ! Walk the column index from ncols down to 1, one kernel call per j.
        ! The routine name is spliced together across the continuation lines
        ! from the datatype and precision macro fragments.
        do j = ncols, 1, -1
#ifdef WITH_OPENMP_TRADITIONAL
          ! Traditional OpenMP build: a carries an extra trailing index
          ! (my_thread); the kernel receives the address of the first
          ! element of column j+off+a_off of stripe istripe.
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve512_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
          ! Same call without the my_thread index on a.
          call single_hh_trafo_&
          &MATH_DATATYPE&
          &_sve512_1hv_&
          &PRECISION&
          & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
        enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SVE512_BLOCK2_KERNEL) ) */

#ifndef WITH_FIXED_COMPLEX_KERNEL
      endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_SVE512_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SVE512_BLOCK1_KERNEL  */

925
926
927
#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
928
      ! implementation of sparc64 block 2 real case
929
930
931
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
932
      if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then
933
934
935
936

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
        do j = ncols, 2, -2
          w(:,1) = bcast_buffer(1:nbw,j+off)
          w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_sparc64_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_sparc64_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
        enddo
954
955
956
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
957
      endif
958
959
960
961
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */
962

963
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
964
      ! implementation of neon_arch64 block 2 real case
965
966
967
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
968
      if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then
969
970
971
972

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
Andreas Marek's avatar
Andreas Marek committed
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
        do j = ncols, 2, -2
          w(:,1) = bcast_buffer(1:nbw,j+off)
          w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_neon_arch64_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_neon_arch64_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
        enddo
990
991
992
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
Andreas Marek's avatar
Andreas Marek committed
993
      endif
994
995
996
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */

997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
      ! implementation of sve128 block 2 real case
      !
      ! This section is compiled only when the sve128 block2 real kernel is
      ! enabled at configure time. Without a fixed real kernel the branch is
      ! selected at run time by comparing the variable kernel against
      ! ELPA_2STAGE_REAL_SVE128_BLOCK2; in a fixed-kernel build that runtime
      ! test (and its matching endif) is compiled out.
#if defined(WITH_REAL_SVE128_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
      if (kernel .eq. ELPA_2STAGE_REAL_SVE128_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

      ! In a fixed-kernel build the loop below is dropped when the sve128
      ! block6 or block4 real kernel is enabled as well.
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK4_KERNEL))
        ! Process the columns in pairs (j, j-1), counting down from ncols in
        ! steps of two.
        ! NOTE(review): when ncols is odd this loop never reaches j = 1;
        ! presumably the leftover column is handled outside this section.
        ! Confirm.
        do j = ncols, 2, -2
          ! Copy the first nbw entries of the two bcast_buffer columns of the
          ! pair into the two columns of the work array w.
          w(:,1) = bcast_buffer(1:nbw,j+off)
          w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP_TRADITIONAL
          ! Traditional OpenMP build: a carries an extra trailing index
          ! (my_thread). The kernel receives the address of the first element
          ! of the lower column of the pair (j+off+a_off-1); nbw is passed
          ! both as the third and as the last argument.
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_sve128_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
          ! Same call without the my_thread index on a.
          call double_hh_trafo_&
          &MATH_DATATYPE&
          &_sve128_2hv_&
          &PRECISION &
          & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
        enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK6_KERNEL) && !defined(WITH_REAL_SVE128_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
      endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SVE128_BLOCK2_KERNEL */