compute_hh_trafo.X90 55.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

       subroutine compute_hh_trafo_&
       &MATH_DATATYPE&
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
49
       &_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
       &_&
52
53
#endif
       &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
       (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count,  &
55
56
57
#ifdef WITH_OPENMP
       max_threads, l_nev, &
#endif
Andreas Marek's avatar
Andreas Marek committed
58
       a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
59
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
60
       hh_dot_dev, &
61
#endif
Andreas Marek's avatar
Andreas Marek committed
62
       hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
63
64
65
66
67
#ifdef WITH_OPENMP
       my_thread, thread_width, &
#else
       last_stripe_width, &
#endif
68
       kernel)
69
70

         use precision
71
         use elpa_abstract_impl
72
73
         use iso_c_binding
#if REALCASE == 1
74

75
         use single_hh_trafo_real
76
77
78
79
80
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
81
         use real_generic_kernel !, only : double_hh_trafo_generic
82
83
84
85
86
87
88
89
90
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
91
92
93
94
95

#endif /* REALCASE */

#if COMPLEXCASE == 1

96
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
97
98
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
99
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
100
101
102
103
104
105
106
107
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif

#endif /* COMPLEXCASE */

         use cuda_c_kernel
         use cuda_functions

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
108
         use elpa_generated_fortran_interfaces
109

110
         implicit none
Andreas Marek's avatar
Andreas Marek committed
111
112
113
114
115
         class(elpa_abstract_impl_t), intent(inout) :: obj
	 logical, intent(in)                        :: useGPU, wantDebug
         real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)                          :: kernel_flops
         integer(kind=ik), intent(in)               :: nbw, max_blk_size
116
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
117
         real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
118
119
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
120
         complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
121
#endif
Andreas Marek's avatar
Andreas Marek committed
122
         integer(kind=ik), intent(in)               :: a_off
123

Andreas Marek's avatar
Andreas Marek committed
124
         integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
125
126

#ifndef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
127
         integer(kind=ik), intent(in)               :: last_stripe_width
128
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
129
130
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
131
132
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
133
134
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
135
136
137
#endif

#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
138
         integer(kind=ik), intent(in)               :: max_threads, l_nev, thread_width
139
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
140
141
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
142
#endif
143
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
144
145
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
146
147
148
149
#endif

#endif /* WITH_OPENMP */

Andreas Marek's avatar
Andreas Marek committed
150
         integer(kind=ik), intent(in)               :: kernel
151

Andreas Marek's avatar
Andreas Marek committed
152
153
         integer(kind=c_intptr_t)                   :: a_dev
   integer(kind=c_intptr_t)                         :: bcast_buffer_dev
Andreas Marek's avatar
Andreas Marek committed
154
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
155
         integer(kind=c_intptr_t)                   :: hh_dot_dev ! why not needed in complex case
156
#endif
Andreas Marek's avatar
Andreas Marek committed
157
158
         integer(kind=c_intptr_t)                   :: hh_tau_dev
         integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
159

160
         ! Variables that are private in OMP regions (e.g. my_thread) are better passed via the argument list.
Andreas Marek's avatar
Andreas Marek committed
161
         integer(kind=ik)                           :: off, ncols, istripe
162
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
163
         integer(kind=ik)                           :: my_thread, noff
164
#endif
Andreas Marek's avatar
Andreas Marek committed
165
         integer(kind=ik)                           :: j, nl, jj, jjj, n_times
166
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
167
         real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
168
169
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
170
         complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
171
#endif
Andreas Marek's avatar
Andreas Marek committed
172
         real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
173

Andreas Marek's avatar
Andreas Marek committed
174

Andreas Marek's avatar
Andreas Marek committed
175
176
177

         if (wantDebug) then
           if (useGPU .and. &
178
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
179
180
181
182
183
184
185
186
             ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
             ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
             print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
	     stop
	   endif
187
         endif
Andreas Marek's avatar
Andreas Marek committed
188
189
190

#if REALCASE == 1
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
191
#endif
Andreas Marek's avatar
Andreas Marek committed
192
#if COMPLEXCASE == 1
193
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
194
#endif
Andreas Marek's avatar
Andreas Marek committed
195
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
Andreas Marek's avatar
Andreas Marek committed
196
197
198
199
200
201
           if (ncols < 1) then
	     if (wantDebug) then
	       print *, "Returning early from compute_hh_trafo"
	     endif
	     return
	   endif
Andreas Marek's avatar
Andreas Marek committed
202
         endif
203

204
         if (wantDebug) call obj%timer%start("compute_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
205
   &MATH_DATATYPE&
206
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
207
         &_openmp" // &
208
#else
Andreas Marek's avatar
Andreas Marek committed
209
         &" // &
210
211
212
213
214
215
216
217
218
219
220
221
222
#endif
         &PRECISION_SUFFIX &
         )


#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           ttt = mpi_wtime()
#ifdef WITH_OPENMP
         endif
#endif

223
#ifdef WITH_OPENMP
224
225

#if REALCASE == 1
226
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
227
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
228
229
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
230
231
           stop 1
         endif
Andreas Marek's avatar
Andreas Marek committed
232
233
#endif
#if COMPLEXCASE == 1
234
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
235
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
236
237
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
Andreas Marek's avatar
Andreas Marek committed
238
239
           stop 1
         endif
240
#endif
241
242
243
244
245
246
#endif /* WITH_OPENMP */

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP */

247
248
249
250
251
252
         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
253
             if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
254
       &MATH_DATATYPE&
255
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
256
             &_openmp" // &
257
#else
Andreas Marek's avatar
Andreas Marek committed
258
             &" // &
259
260
261
262
263
264
265
266
267
#endif
             &PRECISION_SUFFIX &
             )

             return
           endif
         endif
#endif /* not WITH_OPENMP */

268
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
269
! GPU kernel real
270
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
Andreas Marek's avatar
Andreas Marek committed
271
272
273
	   if (wantDebug) then
	     call obj%timer%start("compute_hh_trafo: GPU")
	   endif
274
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
275
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
276
277
278
                  &_&
                  &MATH_DATATYPE

Andreas Marek's avatar
Andreas Marek committed
279
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
280
281
282
283
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
284
#endif /* REALCASE */
Andreas Marek's avatar
Andreas Marek committed
285
286
#if COMPLEXCASE == 1
! GPU kernel complex
287
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
288
289
290
	   if (wantDebug) then
	     call obj%timer%start("compute_hh_trafo: GPU")
	   endif
Andreas Marek's avatar
Andreas Marek committed
291
292

           dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
293
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
294
295
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
296
297

           dev_offset_1 = (0 +  (  off-1 )* nbw) * size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
298
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
299
300
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
301
302

           dev_offset_2 =( off-1 )* size_of_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
303
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
304
305
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
306

Andreas Marek's avatar
Andreas Marek committed
307
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
308
309
310
311
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
Andreas Marek's avatar
Andreas Marek committed
312
313
314
315
                                                         hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)


#endif /* COMPLEXCASE */
Andreas Marek's avatar
Andreas Marek committed
316
317
318
	   if (wantDebug) then
	     call obj%timer%stop("compute_hh_trafo: GPU")
	   endif
Andreas Marek's avatar
Andreas Marek committed
319
320

         else ! not CUDA kernel
321

Andreas Marek's avatar
Andreas Marek committed
322
323
324
	   if (wantDebug) then
	     call obj%timer%start("compute_hh_trafo: CPU")
	   endif
325
#if REALCASE == 1
326
#ifndef WITH_FIXED_REAL_KERNEL
327
328
329
330
331
332
333
334
335
         if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
             kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
             kernel .eq. ELPA_2STAGE_REAL_BGQ) then
336
#endif /* not WITH_FIXED_REAL_KERNEL */
337

338
339
340
#endif /* REALCASE */
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

341
             !FORTRAN CODE / X86 INTRINSIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
342
343
#if REALCASE == 1
! generic kernel real case
344
#if defined(WITH_REAL_GENERIC_KERNEL)
345
#ifndef WITH_FIXED_REAL_KERNEL
346
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
347
#endif /* not WITH_FIXED_REAL_KERNEL */
348
349
350
351
352
353
354
355

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
356
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
357
358
359
360
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
361
362

#else
Andreas Marek's avatar
Andreas Marek committed
363
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
364
365
366
367
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
368
369
370
371
372
373
                    nbw, nl, stripe_width, nbw)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
374
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
375
376
377
378
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
379
380

#else
Andreas Marek's avatar
Andreas Marek committed
381
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
382
383
384
385
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
386
387
388
389
390
#endif
#endif /* WITH_OPENMP */

               enddo

391
#ifndef WITH_FIXED_REAL_KERNEL
392
             endif
393
#endif /* not WITH_FIXED_REAL_KERNEL */
394
395
#endif /* WITH_REAL_GENERIC_KERNEL */

396
397
398
399
400
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
401
#ifndef WITH_FIXED_COMPLEX_KERNEL
402
403
404
           if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
405
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
406
407
408
409
410
411
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE

              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
412
413
414
415
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
416
417
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
418
419
420
421
422
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
		     bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
423
#endif
424

425
426
427
428
#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
429
430
431
432
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
433
434
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
435
436
437
438
439
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
		      nbw, nl, stripe_width)
440
441
442
443
#endif
#endif /* WITH_OPENMP */

            enddo
444
#ifndef WITH_FIXED_COMPLEX_KERNEL
445
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
446
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
447
448
449
450
451
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
452
453


454
! generic simple real kernel
455
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
456
#ifndef WITH_FIXED_REAL_KERNEL
457
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
458
#endif /* not WITH_FIXED_REAL_KERNEL */
459
460
461
462
463
464
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
465
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
466
467
468
469
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
470
#else
471
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
472
473
474
475
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
476
477
478
479
480
481

#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
482
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
483
484
485
486
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
487
#else
488
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
489
490
491
492
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
493
494
495
496
497
498

#endif

#endif /* WITH_OPENMP */

               enddo
499
#ifndef WITH_FIXED_REAL_KERNEL
500
             endif
501
#endif /* not WITH_FIXED_REAL_KERNEL */
502
503
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

504
505
506
507
#endif /* REALCASE */

#if COMPLEXCASE == 1
! generic simple complex case
508

509
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
510
#ifndef WITH_FIXED_COMPLEX_KERNEL
511
            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
512
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
513
514
515
516
517
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
518
519
520
521
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
522
523
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
524
525
526
527
528
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
		       nbw, nl, stripe_width)
529
530
531
532
533
534
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
535
536
537
538
                     &MATH_DATATYPE&
                     &_generic_simple_&
                     &PRECISION&
                     & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
539
540
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
541
542
543
544
545
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
		       nbw, nl, stripe_width)
546
547
548
549
#endif

#endif /* WITH_OPENMP */
             enddo
550
#ifndef WITH_FIXED_COMPLEX_KERNEL
551
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
552
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
553
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
554

555
556
557
558
#endif /* COMPLEXCASE */

#if REALCASE == 1
! sse assembly kernel real case
559
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
560
#ifndef WITH_FIXED_REAL_KERNEL
561
             if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
562

563
#endif /* not WITH_FIXED_REAL_KERNEL */
564
565
566
567
568
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_&
569
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
570
571
572
573
                 &_&
                 &PRECISION&
                 &_sse_assembly&
                 & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
574
575
#else
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
576
577
578
579
580
                      &MATH_DATATYPE&
                      &_&
                      &PRECISION&
                      &_sse_assembly&
                      & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
581
582
#endif
               enddo
583
#ifndef WITH_FIXED_REAL_KERNEL
584
             endif
585
#endif /* not WITH_FIXED_REAL_KERNEL */
586
587
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

588
589
590
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
591

592
593
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
594
#ifndef WITH_FIXED_COMPLEX_KERNEL
595
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
596
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
597
598
599
600
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
601
602
603
604
605
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
606
607
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
608
609
610
611
612
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
613
614
#endif
            enddo
615
#ifndef WITH_FIXED_COMPLEX_KERNEL
616
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
617
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
618
619
620
621
622
623
624
625
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
! no sse block1 real kernel
#endif

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
626

627
628
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
629
#ifndef WITH_FIXED_COMPLEX_KERNEL
630
          if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
631
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
632

633
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
634
635
636
637
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
638
639
640
641
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
642
643
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
644
645
646
647
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
648
649
#endif
            enddo
650
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
651

652
#ifndef WITH_FIXED_COMPLEX_KERNEL
653
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
654
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
655
656
657
658
659
660
661
662
663
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
!no avx block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
664

665
666
! avx block1 complex kernel
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
667
#ifndef WITH_FIXED_COMPLEX_KERNEL
668
669
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
              (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
670
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
671

672
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
673
674
675
676
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
677
678
679
680
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
681
682
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
683
684
685
686
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
687
688
#endif
            enddo
689
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
690

691
#ifndef WITH_FIXED_COMPLEX_KERNEL
692
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
693
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
694
695
696
697
698
699
700
701
702
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
! no avx512 block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
703

704
705
! AVX-512 block1 complex kernel dispatch.
! Calls the single_hh_trafo ..._avx512_1hv_... kernel once per column j,
! walking j from ncols down to 1. The segment is compiled only when
! the AVX-512 block1 complex kernel is enabled at build time.
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
706
#ifndef WITH_FIXED_COMPLEX_KERNEL
707
          ! Kernel is selected at run time: only enter when this kernel was requested.
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
708
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
709

710
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
711
712
713
714
            ! The loop below is compiled unless the kernel is fixed at build time
            ! and the AVX-512 block2 complex kernel is enabled as well.
            ! ttt receives a timestamp taken just before the kernel calls; where it
            ! is consumed is outside this segment.
            ttt = mpi_wtime()
            ! One kernel call per column, from the last column down to the first.
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              ! OpenMP build: a carries an additional my_thread subscript.
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
715
716
717
718
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
719
720
#else
              ! Non-OpenMP build: same call without the thread subscript on a.
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
721
722
723
724
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
725
726
#endif
            enddo
727
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
728

729
#ifndef WITH_FIXED_COMPLEX_KERNEL
730
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
731
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
732
733
734
735
736
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
#endif /* COMPLEXCASE */

#if REALCASE == 1
! SSE block2 real kernel dispatch.
! Columns are processed in pairs: columns j+off and j+off-1 of bcast_buffer
! are copied into w(:,1) and w(:,2), then the double_hh_trafo ..._sse_2hv_...
! kernel is called on the matching columns of a.
737
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)
738

739
#ifndef WITH_FIXED_REAL_KERNEL
740
           ! Kernel is selected at run time: only enter when this kernel was requested.
           if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then
Andreas Marek's avatar
Andreas Marek committed
741

742
#endif /* not WITH_FIXED_REAL_KERNEL */
743

744
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
745
746
747
748
749
             ! The loop below is compiled unless the kernel is fixed at build time
             ! and an SSE block4 or block6 real kernel is enabled as well.
             ! j steps down by two and stops at 2.
             ! NOTE(review): for odd ncols, column 1 is not processed inside this
             ! segment - confirm it is handled elsewhere or that ncols is always even.
             do j = ncols, 2, -2
               ! First nbw rows of the two columns handled in this iteration.
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               ! OpenMP build: a carries an additional my_thread subscript.
               ! The trailing nbw argument is presumably the leading dimension of w
               ! - TODO confirm against the kernel interface.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
750
751
752
753
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
754
755
#else
               ! Non-OpenMP build: same call without the thread subscript on a.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
756
757
758
759
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
760
761
#endif
             enddo
762
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */
763

764
#ifndef WITH_FIXED_REAL_KERNEL
765
           endif
766
#endif /* not WITH_FIXED_REAL_KERNEL */
767
768
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */

769
770
771
772
773
774
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! SSE block2 complex kernel dispatch.
! Columns are processed in pairs with the double_hh_trafo ..._sse_2hv_...
! kernel; a column left over after the paired loop is handled by one call
! to the single_hh_trafo ..._sse_1hv_... kernel.

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
775
#ifndef WITH_FIXED_COMPLEX_KERNEL
776
           ! Kernel is selected at run time: only enter when this kernel was requested.
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
777
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
778
779
780
781
782
783
784

             ! ttt receives a timestamp taken just before the kernel calls; where it
             ! is consumed is outside this segment.
             ttt = mpi_wtime()
             ! j steps down by two and stops at 2.
             do j = ncols, 2, -2
               ! First nbw rows of the two columns handled in this iteration.
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               ! OpenMP build: a carries an additional my_thread subscript.
               ! The trailing nbw argument is presumably the leading dimension of w
               ! - TODO confirm against the kernel interface.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
785
786
787
788
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
789
790
#else
               ! Non-OpenMP build: same call without the thread subscript on a.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
791
792
793
794
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
795
796
797
798
#endif
             enddo
             ! On loop exit j is 1 when ncols is odd and 0 when ncols is even, so the
             ! test below picks up the remaining column 1+off with the single kernel.
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
799
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
800
801
802
                       &_sse_1hv_&
                       &PRECISION&
                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
803
804
#else
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
805
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
806
807
808
                            &_sse_1hv_&
                            &PRECISION&
                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
809
810
#endif

811
#ifndef WITH_FIXED_COMPLEX_KERNEL
812
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
813
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
814
815
816
817
818
819
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
! AVX and AVX2 block2 real kernel dispatch.
! Both kernel selections share the double_hh_trafo ..._avx_avx2_2hv_... entry
! point. Columns are processed in pairs: columns j+off and j+off-1 of
! bcast_buffer are copied into w(:,1) and w(:,2) before each call.

820
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
821
#ifndef WITH_FIXED_REAL_KERNEL
822

823
824
           ! Kernel is selected at run time: enter for either the AVX or the AVX2 variant.
           if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
               (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2))  then
Andreas Marek's avatar
Andreas Marek committed
825

826
#endif /* not WITH_FIXED_REAL_KERNEL */
827

828
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
829
830
831
832
833
834
               ! The loop below is compiled unless the kernel is fixed at build time
               ! and an AVX or AVX2 block4 or block6 real kernel is enabled as well.
               ! j steps down by two and stops at 2.
               ! NOTE(review): for odd ncols, column 1 is not processed inside this
               ! segment - confirm it is handled elsewhere or that ncols is always even.
               do j = ncols, 2, -2
                 ! First nbw rows of the two columns handled in this iteration.
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

               ! OpenMP build: a carries an additional my_thread subscript.
               ! The trailing nbw argument is presumably the leading dimension of w
               ! - TODO confirm against the kernel interface.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
835
836
837
838
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
839
840
#else
               ! Non-OpenMP build: same call without the thread subscript on a.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
841
842
843
844
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
845
846
#endif
               enddo
847
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) ... */
848

849
#ifndef WITH_FIXED_REAL_KERNEL
850
             endif
851
#endif /* not WITH_FIXED_REAL_KERNEL */
852
853
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL || WITH_REAL_AVX2_BLOCK2_KERNEL */

854
855
856
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
857

858
859
! AVX and AVX2 block2 complex kernel dispatch.
! Both kernel selections share the ..._avx_avx2_... entry points. Columns are
! processed in pairs with the double_hh_trafo 2hv kernel; a column left over
! after the paired loop is handled by one call to the single_hh_trafo 1hv kernel.
#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
860
#ifndef WITH_FIXED_COMPLEX_KERNEL
861
862
           ! Kernel is selected at run time: enter for either the AVX or the AVX2 variant.
           if ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. &
                (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) ) then
863
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
864
865
866
867
868
869
870

              ! ttt receives a timestamp taken just before the kernel calls; where it
              ! is consumed is outside this segment.
              ttt = mpi_wtime()
             ! j steps down by two and stops at 2.
             do j = ncols, 2, -2
               ! First nbw rows of the two columns handled in this iteration.
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               ! OpenMP build: a carries an additional my_thread subscript.
               ! The trailing nbw argument is presumably the leading dimension of w
               ! - TODO confirm against the kernel interface.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
871
872
873
874
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
875
876
#else
               ! Non-OpenMP build: same call without the thread subscript on a.
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
877
878
879
880
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
881
882
883
884
#endif
             enddo
             ! On loop exit j is 1 when ncols is odd and 0 when ncols is even, so the
             ! test below picks up the remaining column 1+off with the single kernel.
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
885
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
886
887
888
                 &_avx_avx2_1hv_&
                 &PRECISION&
                 & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
889
890
#else
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab  
Andreas Marek committed
891
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
892
893
894
                 &_avx_avx2_1hv_&
                 &PRECISION&
                 & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
895
896
#endif

897
#ifndef WITH_FIXED_COMPLEX_KERNEL
898
           endif ! ( (kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK2) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK2) )
899
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
900
901
902
903
904
905
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
! implementation of avx512 block 2 real case
906
907

#if defined(WITH_REAL_AVX512_BLOCK2_KERNEL)
908
#ifndef WITH_FIXED_REAL_KERNEL
909

910
           if ((kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2)) then
Andreas Marek's avatar
Andreas Marek committed
911

912
#endif /* not WITH_FIXED_REAL_KERNEL */
913

914
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX512_BLOCK4_KERNEL))
915
916
917
918
919
920
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
921
922
923
924
                    &MATH_DATATYPE&
                    &_avx512_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
925
926
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
927
928
929
930