compute_hh_trafo.F90 86.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

       subroutine compute_hh_trafo_&
       &MATH_DATATYPE&
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
49
       &_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
       &_&
52
53
#endif
       &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
       (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
55
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
56
       l_nev, &
57
#endif
Andreas Marek's avatar
Andreas Marek committed
58
       a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
59
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
60
       hh_dot_dev, &
61
#endif
Andreas Marek's avatar
Andreas Marek committed
62
       hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
63
64
65
66
67
#ifdef WITH_OPENMP
       my_thread, thread_width, &
#else
       last_stripe_width, &
#endif
68
       kernel)
69
70

         use precision
71
         use elpa_abstract_impl
72
73
         use iso_c_binding
#if REALCASE == 1
74

75
         use single_hh_trafo_real
76
77
78
79
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

Andreas Marek's avatar
Andreas Marek committed
80
81
82
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
Andreas Marek's avatar
Andreas Marek committed
83

84
85
86
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
!         use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
87

88
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
89
         use real_generic_kernel !, only : double_hh_trafo_generic
90
91
92
93
94
95
96
97
98
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
99
100
101
102
103

#endif /* REALCASE */

#if COMPLEXCASE == 1

104
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
105
106
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
107
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
108
109
110
111
112
113
114
115
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif

#endif /* COMPLEXCASE */

         use cuda_c_kernel
         use cuda_functions

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
116
         use elpa_generated_fortran_interfaces
117

118
         implicit none
Andreas Marek's avatar
Andreas Marek committed
119
         class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
120
         logical, intent(in)                        :: useGPU, wantDebug
Andreas Marek's avatar
Andreas Marek committed
121
122
123
         real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)                          :: kernel_flops
         integer(kind=ik), intent(in)               :: nbw, max_blk_size
124
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
125
         real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
126
127
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
128
         complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
129
#endif
Andreas Marek's avatar
Andreas Marek committed
130
         integer(kind=ik), intent(in)               :: a_off
131

Andreas Marek's avatar
Andreas Marek committed
132
         integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
133

Andreas Marek's avatar
Andreas Marek committed
134
         integer(kind=ik), intent(in)               :: max_threads
135
#ifndef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
136
         integer(kind=ik), intent(in)               :: last_stripe_width
137
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
138
139
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
140
141
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
142
143
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
144
145
146
#endif

#else /* WITH_OPENMP */
Andreas Marek's avatar
Andreas Marek committed
147
         integer(kind=ik), intent(in)               :: l_nev, thread_width
148
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
149
150
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
151
 
152
#endif
153
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
154
155
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
156
157
158
159
#endif

#endif /* WITH_OPENMP */

Andreas Marek's avatar
Andreas Marek committed
160
         integer(kind=ik), intent(in)               :: kernel
161

Andreas Marek's avatar
Andreas Marek committed
162
163
         integer(kind=c_intptr_t)                   :: a_dev
   integer(kind=c_intptr_t)                         :: bcast_buffer_dev
Andreas Marek's avatar
Andreas Marek committed
164
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
165
         integer(kind=c_intptr_t)                   :: hh_dot_dev ! why not needed in complex case
166
#endif
Andreas Marek's avatar
Andreas Marek committed
167
168
         integer(kind=c_intptr_t)                   :: hh_tau_dev
         integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
169

170
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
Andreas Marek's avatar
Andreas Marek committed
171
         integer(kind=ik)                           :: off, ncols, istripe
172
#ifdef WITH_OPENMP
Andreas Marek's avatar
Andreas Marek committed
173
         integer(kind=ik)                           :: my_thread, noff
174
#endif
Andreas Marek's avatar
Andreas Marek committed
175
         integer(kind=ik)                           :: j, nl, jj, jjj, n_times
176
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
177
         real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
178
179
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
180
         complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
181
#endif
Andreas Marek's avatar
Andreas Marek committed
182
         real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
183

184
         j = -99
Andreas Marek's avatar
Andreas Marek committed
185
186
187

         if (wantDebug) then
           if (useGPU .and. &
188
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
189
190
191
192
193
194
             ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
             ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
             print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
Andreas Marek's avatar
Andreas Marek committed
195
196
             stop
           endif
197
         endif
Andreas Marek's avatar
Andreas Marek committed
198
199
200

#if REALCASE == 1
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
201
#endif
Andreas Marek's avatar
Andreas Marek committed
202
#if COMPLEXCASE == 1
203
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
204
#endif
Andreas Marek's avatar
Andreas Marek committed
205
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
Andreas Marek's avatar
Andreas Marek committed
206
           if (ncols < 1) then
Andreas Marek's avatar
Andreas Marek committed
207
208
209
210
211
             if (wantDebug) then
               print *, "Returning early from compute_hh_trafo"
             endif
             return
           endif
Andreas Marek's avatar
Andreas Marek committed
212
         endif
213

214
         if (wantDebug) call obj%timer%start("compute_hh_trafo_&
215
                                              &MATH_DATATYPE&
216
#ifdef WITH_OPENMP
217
                                              &_openmp" // &
218
#else
219
                                              &" // &
220
#endif
221
222
                                              &PRECISION_SUFFIX &
                                              )
223
224
225
226
227
228
229
230
231
232


#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           ttt = mpi_wtime()
#ifdef WITH_OPENMP
         endif
#endif

233
#ifdef WITH_OPENMP
234
235

#if REALCASE == 1
236
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
237
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
238
239
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
240
241
           stop 1
         endif
Andreas Marek's avatar
Andreas Marek committed
242
243
#endif
#if COMPLEXCASE == 1
244
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
245
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
246
247
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
Andreas Marek's avatar
Andreas Marek committed
248
249
           stop 1
         endif
250
#endif
251
252
253
254
255
256
#endif /* WITH_OPENMP */

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP */

257
258
259
260
261
262
         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
263
             if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
264
                                                &MATH_DATATYPE&
265
#ifdef WITH_OPENMP
266
                                                &_openmp" // &
267
#else
268
                                                &" // &
269
#endif
270
271
                                                &PRECISION_SUFFIX &
                                                )
272
273
274
275
276
277

             return
           endif
         endif
#endif /* not WITH_OPENMP */

278
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
279
! GPU kernel real
280
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
Andreas Marek's avatar
Andreas Marek committed
281
282
283
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
284
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
285
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
286
287
288
                  &_&
                  &MATH_DATATYPE

Andreas Marek's avatar
Andreas Marek committed
289
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
290
291
292
293
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, hh_tau_dev, nl, nbw, stripe_width, off, ncols)
294
#endif /* REALCASE */
Andreas Marek's avatar
Andreas Marek committed
295
296
#if COMPLEXCASE == 1
! GPU kernel complex
297
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
298
299
300
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
301
302

           dev_offset = (0 + ( (  a_off + off-1 )* stripe_width) + ( (istripe - 1)*stripe_width*a_dim2 )) * size_of_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
303
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
304
305
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
306
307

           dev_offset_1 = (0 +  (  off-1 )* nbw) * size_of_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
308
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
309
310
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
311
312

           dev_offset_2 =( off-1 )* size_of_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
313
                  &PRECISION&
Andreas Marek's avatar
Andreas Marek committed
314
315
                  &_&
                  &MATH_DATATYPE
Andreas Marek's avatar
Andreas Marek committed
316

Andreas Marek's avatar
Andreas Marek committed
317
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
318
319
320
321
                &MATH_DATATYPE&
                &_&
                &PRECISION&
                & (a_dev + dev_offset,bcast_buffer_dev + dev_offset_1, &
Andreas Marek's avatar
Andreas Marek committed
322
323
324
325
                                                         hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, off,ncols)


#endif /* COMPLEXCASE */
326
327
328
           if (wantDebug) then
             call obj%timer%stop("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
329
330

         else ! not CUDA kernel
331

332
333
334
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: CPU")
           endif
335
#if REALCASE == 1
336
#ifndef WITH_FIXED_REAL_KERNEL
337
338
339
340
         if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
341
             kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
342
             kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
343
344
345
346
347
             kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
             kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
             kernel .eq. ELPA_2STAGE_REAL_BGQ) then
348
#endif /* not WITH_FIXED_REAL_KERNEL */
349

350
351
352
#endif /* REALCASE */
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

353
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
354
355
#if REALCASE == 1
! generic kernel real case
356
#if defined(WITH_REAL_GENERIC_KERNEL)
357
#ifndef WITH_FIXED_REAL_KERNEL
358
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
359
#endif /* not WITH_FIXED_REAL_KERNEL */
360
361
362
363
364
365
366
367

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
368
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
369
370
371
372
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
373
374

#else
Andreas Marek's avatar
Andreas Marek committed
375
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
376
377
378
379
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
380
381
382
383
384
385
                    nbw, nl, stripe_width, nbw)
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
386
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
387
388
389
390
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
391
392

#else
Andreas Marek's avatar
Andreas Marek committed
393
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
394
395
396
397
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
398
399
400
401
402
#endif
#endif /* WITH_OPENMP */

               enddo

403
#ifndef WITH_FIXED_REAL_KERNEL
404
             endif
405
#endif /* not WITH_FIXED_REAL_KERNEL */
406
407
#endif /* WITH_REAL_GENERIC_KERNEL */

408
409
410
411
412
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
413
#ifndef WITH_FIXED_COMPLEX_KERNEL
414
415
416
           if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
417
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
418
419
420
421
422
423
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE

              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
424
425
426
427
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
428
429
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
430
431
432
433
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
Andreas Marek's avatar
Andreas Marek committed
434
                     bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
435
#endif
436

437
438
439
440
#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
441
442
443
444
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
445
446
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
447
448
449
450
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
451
                      nbw, nl, stripe_width)
452
453
454
455
#endif
#endif /* WITH_OPENMP */

            enddo
456
#ifndef WITH_FIXED_COMPLEX_KERNEL
457
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
458
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
459
460
461
462
463
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
464
465


466
! generic simple real kernel
467
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
468
#ifndef WITH_FIXED_REAL_KERNEL
469
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
470
#endif /* not WITH_FIXED_REAL_KERNEL */
471
472
473
474
475
476
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

#ifdef USE_ASSUMED_SIZE
477
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
478
479
480
481
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
482
#else
483
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
484
485
486
487
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
488
489
490
491
492
493

#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
494
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
495
496
497
498
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
499
#else
500
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
501
502
503
504
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
505
506
507
508
509
510

#endif

#endif /* WITH_OPENMP */

               enddo
511
#ifndef WITH_FIXED_REAL_KERNEL
512
             endif
513
#endif /* not WITH_FIXED_REAL_KERNEL */
514
515
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

516
517
518
519
#endif /* REALCASE */

#if COMPLEXCASE == 1
! generic simple complex case
520

521
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
522
#ifndef WITH_FIXED_COMPLEX_KERNEL
523
            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
524
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
525
526
527
528
529
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
530
531
532
533
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
534
535
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
536
537
538
539
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
540
                       nbw, nl, stripe_width)
541
542
543
544
545
546
#endif

#else /* WITH_OPENMP */

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
547
548
549
550
                     &MATH_DATATYPE&
                     &_generic_simple_&
                     &PRECISION&
                     & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
551
552
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
553
554
555
556
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
557
                       nbw, nl, stripe_width)
558
559
560
561
#endif

#endif /* WITH_OPENMP */
             enddo
562
#ifndef WITH_FIXED_COMPLEX_KERNEL
563
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
564
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
565
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
566

567
568
569
570
#endif /* COMPLEXCASE */

#if REALCASE == 1
! sse assembly kernel real case
571
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
572
#ifndef WITH_FIXED_REAL_KERNEL
573
             if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
574

575
#endif /* not WITH_FIXED_REAL_KERNEL */
576
577
578
579
580
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
                 call double_hh_trafo_&
581
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
582
583
584
585
                 &_&
                 &PRECISION&
                 &_sse_assembly&
                 & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
586
587
#else
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
588
589
590
591
592
                      &MATH_DATATYPE&
                      &_&
                      &PRECISION&
                      &_sse_assembly&
                      & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
593
594
#endif
               enddo
595
#ifndef WITH_FIXED_REAL_KERNEL
596
             endif
597
#endif /* not WITH_FIXED_REAL_KERNEL */
598
599
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

600
601
602
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
603

604
605
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
606
#ifndef WITH_FIXED_COMPLEX_KERNEL
607
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
608
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
609
610
611
612
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
613
614
615
616
617
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
618
619
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
620
621
622
623
624
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
625
626
#endif
            enddo
627
#ifndef WITH_FIXED_COMPLEX_KERNEL
628
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
629
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
630
631
632
633
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
634
! no sse, vsx, sparc64 block1 real kernel
635
636
#endif

637
638
639
640
#if COMPLEXCASE == 1

! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
667
668
669
670
671
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


672
673
674
675
#if COMPLEXCASE == 1

! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
676
677
678
679
680
681
682
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
683
684
685
686
687
688
689
690
691
692
693
694
695
!#ifdef WITH_OPENMP
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
696
697
698
699
700
701
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
702
703
704
705
706
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


707
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
708

709
710
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
711
#ifndef WITH_FIXED_COMPLEX_KERNEL
712
          if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
713
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
714

715
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
716
717
718
719
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
720
721
722
723
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
724
725
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
726
727
728
729
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
730
731
#endif
            enddo
732
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
733

734
#ifndef WITH_FIXED_COMPLEX_KERNEL
735
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
736
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
737
738
739
740
741
742
743
744
745
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
!no avx block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
746

747
748
! avx block1 complex kernel
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
749
#ifndef WITH_FIXED_COMPLEX_KERNEL
750
751
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. &
              (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
752
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
753

754
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
755
756
757
758
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
759
760
761
762
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
763
764
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
765
766
767
768
                   &MATH_DATATYPE&
                   &_avx_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
769
770
#endif
            enddo
771
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */
772

773
#ifndef WITH_FIXED_COMPLEX_KERNEL
774
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) .or. (kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
775
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
776
777
778
779
780
781
782
783
784
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
! no avx512 block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
785

786
787
! avx512 block1 complex kernel
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
788
#ifndef WITH_FIXED_COMPLEX_KERNEL
789
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
790
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
791

792
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
793
794
795
796
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
797
798
799
800
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
801
802
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
803
804
805
806
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
807
808
#endif
            enddo
809
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
810

811
#ifndef WITH_FIXED_COMPLEX_KERNEL
812
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
813
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
814
815
816
817
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
#endif /* COMPLEXCASE */

#if REALCASE == 1
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
! implementation of sparc64 block 2 real case
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */
852
853


854
#if REALCASE == 1
855
856
! implementation of vsx block 2 real case
#if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
857

858
#ifndef WITH_FIXED_REAL_KERNEL
859
           if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
Andreas Marek's avatar
Andreas Marek committed
860

861
#endif /* not WITH_FIXED_REAL_KERNEL */
862

863
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
864
865
866
867
868
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
869
                    &MATH_DATATYPE&
870
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
871
872
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
873
874
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
875
                    &MATH_DATATYPE&
876
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
877
878
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
879
880
#endif
             enddo
881
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
882

883
#ifndef WITH_FIXED_REAL_KERNEL
884
           endif
885
#endif /* not WITH_FIXED_REAL_KERNEL */
886
#endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
887

888
889
#endif /* REALCASE == 1 */

Andreas Marek's avatar
Andreas Marek committed
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
#if REALCASE == 1
! implementation of sse block 2 real case
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


927
928
929
930
#if COMPLEXCASE == 1
! implementation of sparc64 block 2 complex case

#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
!
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
!#ifdef WITH_OPENMP
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_sparc64_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_sparc64_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
!             enddo
!#ifdef WITH_OPENMP
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                       &_sparc64_1hv_&
!                       &PRECISION&
!                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                            &_sparc64_1hv_&
!                            &PRECISION&
!                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2)
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
970
971
972
#endif /* WITH_COMPLEX_SPARC64_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

973
974
975
976
977

#if COMPLEXCASE == 1
! implementation of vsx block 2 complex case

#if defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)
978
979
980
981
982
983
984
985
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2) then
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
!
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
986
987
988
989
990
991
992
993
994
995
996
997
998
!#ifdef WITH_OPENMP
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_vsx_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
!#else
!               call double_hh_trafo_&
!                    &MATH_DATATYPE&
!                    &_vsx_2hv_&
!                    &PRECISION&
!                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
!#endif
999
!             enddo
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
!#ifdef WITH_OPENMP
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                       &_vsx_1hv_&
!                       &PRECISION&
!                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#else
!             if (j==1) call single_hh_trafo_&
!                 &MATH_DATATYPE&
!                            &_vsx_1hv_&
!                            &PRECISION&
!                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
!#endif
1013
1014
1015
1016
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK2)
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1017
1018
1019
#endif /* WITH_COMPLEX_VSX_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

1020
1021
1022
1023
#if COMPLEXCASE == 1
! implementation of sse block 2 complex case

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
1024
#ifndef WITH_FIXED_COMPLEX_KERNEL
1025
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2) then
1026
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1027
1028
1029
1030
1031
1032
1033

             ttt = mpi_wtime()
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1034
1035
1036
1037
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1038
1039
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1040
1041
1042
1043
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
1044
1045
1046
1047
#endif
             enddo
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
1048
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
1049
1050
1051
                       &_sse_1hv_&
                       &PRECISION&
                       & (c_loc(a(1,1+off+a_off,istripe,my_thread)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1052
1053
#else
             if (j==1) call single_hh_trafo_&
Andreas Marek's avatar
Retab    
Andreas Marek committed
1054
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
1055
1056
1057
                            &_sse_1hv_&
                            &PRECISION&
                            & (c_loc(a(1,1+off+a_off,istripe)), bcast_buffer(1,off+1), nbw, nl, stripe_width)
1058
1059
#endif

1060
#ifndef WITH_FIXED_COMPLEX_KERNEL
1061
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK2)
1062
#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
1063
1064
1065
1066
1067
1068
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */
#endif /* COMPLEXCASE == 1 */

#if REALCASE == 1
! implementation of avx block 2 real case

1069
#if defined(WITH_REAL_AVX_BLOCK2_KERNEL) || defined(WITH_REAL_AVX2_BLOCK2_KERNEL)
1070
#ifndef WITH_FIXED_REAL_KERNEL
1071

1072
1073
           if ((kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2) .or. &
               (kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2))  then
Andreas Marek's avatar
Andreas Marek committed
1074

1075
#endif /* not WITH_FIXED_REAL_KERNEL */
1076

1077
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_AVX_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX_BLOCK4_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK6_KERNEL) && !defined(WITH_REAL_AVX2_BLOCK4_KERNEL))
1078
1079
1080
1081
1082
1083
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP

               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1084
1085
1086
1087
                    &MATH_DATATYPE&
                    &_avx_avx2_2hv_&
                    &PRECISION&
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
1088
1089
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
1090
1091
1092