compute_hh_trafo.F90 104 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

       subroutine compute_hh_trafo_&
       &MATH_DATATYPE&
48
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
49
       &_openmp_&
50
#else
Andreas Marek's avatar
Andreas Marek committed
51
       &_&
52
53
#endif
       &PRECISION &
Andreas Marek's avatar
Andreas Marek committed
54
       (obj, useGPU, wantDebug, a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, &
55
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
56
       l_nev, &
57
#endif
Andreas Marek's avatar
Andreas Marek committed
58
59
       a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
       hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
60
#ifdef WITH_OPENMP_TRADITIONAL
61
62
63
64
       my_thread, thread_width, &
#else
       last_stripe_width, &
#endif
65
       kernel)
66
67

         use precision
68
         use elpa_abstract_impl
69
70
         use iso_c_binding
#if REALCASE == 1
71

72
         use single_hh_trafo_real
73
74
75
76
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_kernel !, only : double_hh_trafo_generic_simple
#endif

Andreas Marek's avatar
Andreas Marek committed
77
78
79
#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK4_KERNEL) && !(defined(USE_ASSUMED_SIZE))
         use real_generic_simple_block4_kernel !, only : double_hh_trafo_generic_simple
#endif
Andreas Marek's avatar
Andreas Marek committed
80

81
82
83
!#if defined(WITH_REAL_GENERIC_SIMPLE_BLOCK6_KERNEL) && !(defined(USE_ASSUMED_SIZE))
!         use real_generic_simple_block6_kernel !, only : double_hh_trafo_generic_simple
!#endif
84

85
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
86
         use real_generic_kernel !, only : double_hh_trafo_generic
87
88
89
90
91
92
93
94
95
#endif

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel !, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel !, only : double_hh_trafo_bgq
#endif
96
97
98
99
100

#endif /* REALCASE */

#if COMPLEXCASE == 1

101
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL) && !(defined(USE_ASSUMED_SIZE))
102
103
           use complex_generic_simple_kernel !, only : single_hh_trafo_complex_generic_simple
#endif
104
#if defined(WITH_COMPLEX_GENERIC_KERNEL) && !(defined(USE_ASSUMED_SIZE))
105
106
107
108
109
110
111
112
           use complex_generic_kernel !, only : single_hh_trafo_complex_generic
#endif

#endif /* COMPLEXCASE */

         use cuda_c_kernel
         use cuda_functions

Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
113
         use elpa_generated_fortran_interfaces
114

115
         implicit none
Andreas Marek's avatar
Andreas Marek committed
116
         class(elpa_abstract_impl_t), intent(inout) :: obj
Andreas Marek's avatar
Andreas Marek committed
117
         logical, intent(in)                        :: useGPU, wantDebug
Andreas Marek's avatar
Andreas Marek committed
118
119
120
         real(kind=c_double), intent(inout)         :: kernel_time  ! MPI_WTIME always needs double
         integer(kind=lik)                          :: kernel_flops
         integer(kind=ik), intent(in)               :: nbw, max_blk_size
121
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
122
         real(kind=C_DATATYPE_KIND)                 :: bcast_buffer(nbw,max_blk_size)
123
124
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
125
         complex(kind=C_DATATYPE_KIND)              :: bcast_buffer(nbw,max_blk_size)
126
#endif
Andreas Marek's avatar
Andreas Marek committed
127
         integer(kind=ik), intent(in)               :: a_off
128

Andreas Marek's avatar
Andreas Marek committed
129
         integer(kind=ik), intent(in)               :: stripe_width,a_dim2,stripe_count
130

Andreas Marek's avatar
Andreas Marek committed
131
         integer(kind=ik), intent(in)               :: max_threads
132
#ifndef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
133
         integer(kind=ik), intent(in)               :: last_stripe_width
134
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
135
136
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:)
137
138
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
139
140
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:)
141
142
#endif

143
#else /* WITH_OPENMP_TRADITIONAL */
Andreas Marek's avatar
Andreas Marek committed
144
         integer(kind=ik), intent(in)               :: l_nev, thread_width
145
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
146
147
!         real(kind=C_DATATYPE_KIND)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=C_DATATYPE_KIND), pointer        :: a(:,:,:,:)
148

149
#endif
150
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
151
152
!          complex(kind=C_DATATYPE_KIND)            :: a(stripe_width,a_dim2,stripe_count,max_threads)
          complex(kind=C_DATATYPE_KIND),pointer     :: a(:,:,:,:)
153
154
#endif

155
#endif /* WITH_OPENMP_TRADITIONAL */
156

Andreas Marek's avatar
Andreas Marek committed
157
         integer(kind=ik), intent(in)               :: kernel
158

Andreas Marek's avatar
Andreas Marek committed
159
         integer(kind=c_intptr_t)                   :: a_dev
160
         integer(kind=c_intptr_t)                   :: bcast_buffer_dev
Andreas Marek's avatar
Andreas Marek committed
161
162
         integer(kind=c_intptr_t)                   :: hh_tau_dev
         integer(kind=c_intptr_t)                   :: dev_offset, dev_offset_1, dev_offset_2
Andreas Marek's avatar
Andreas Marek committed
163

164
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
Andreas Marek's avatar
Andreas Marek committed
165
         integer(kind=ik)                           :: off, ncols, istripe
166
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
167
         integer(kind=ik)                           :: my_thread, noff
168
#endif
Andreas Marek's avatar
Andreas Marek committed
169
         integer(kind=ik)                           :: j, nl, jj, jjj, n_times
170
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
171
         real(kind=C_DATATYPE_KIND)                 :: w(nbw,6)
172
173
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
174
         complex(kind=C_DATATYPE_KIND)              :: w(nbw,2)
175
#endif
Andreas Marek's avatar
Andreas Marek committed
176
         real(kind=c_double)                        :: ttt ! MPI_WTIME always needs double
177

178
179
180
181
182
183
         integer(kind=c_intptr_t), parameter        :: size_of_datatype = size_of_&
                                                                        &PRECISION&
                                                                        &_&
                                                                        &MATH_DATATYPE


184
         j = -99
Andreas Marek's avatar
Andreas Marek committed
185
186
187

         if (wantDebug) then
           if (useGPU .and. &
188
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
189
190
191
192
193
194
             ( kernel .ne. ELPA_2STAGE_REAL_GPU)) then
#endif
#if COMPLEXCASE == 1
             ( kernel .ne. ELPA_2STAGE_COMPLEX_GPU)) then
#endif
             print *,"ERROR: useGPU is set in conpute_hh_trafo but not GPU kernel!"
Andreas Marek's avatar
Andreas Marek committed
195
196
             stop
           endif
197
         endif
Andreas Marek's avatar
Andreas Marek committed
198
199
200

#if REALCASE == 1
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
201
#endif
Andreas Marek's avatar
Andreas Marek committed
202
#if COMPLEXCASE == 1
203
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
204
#endif
Andreas Marek's avatar
Andreas Marek committed
205
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
Andreas Marek's avatar
Andreas Marek committed
206
           if (ncols < 1) then
Andreas Marek's avatar
Andreas Marek committed
207
208
209
210
211
             if (wantDebug) then
               print *, "Returning early from compute_hh_trafo"
             endif
             return
           endif
Andreas Marek's avatar
Andreas Marek committed
212
         endif
213

214
         if (wantDebug) call obj%timer%start("compute_hh_trafo_&
215
                                              &MATH_DATATYPE&
216
#ifdef WITH_OPENMP_TRADITIONAL
217
                                              &_openmp" // &
218
#else
219
                                              &" // &
220
#endif
221
222
                                              &PRECISION_SUFFIX &
                                              )
223
224


225
#ifdef WITH_OPENMP_TRADITIONAL
226
227
228
         if (my_thread==1) then
#endif
           ttt = mpi_wtime()
229
#ifdef WITH_OPENMP_TRADITIONAL
230
231
232
         endif
#endif

233
#ifdef WITH_OPENMP_TRADITIONAL
234
235

#if REALCASE == 1
236
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
237
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
238
239
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
240
241
           stop 1
         endif
Andreas Marek's avatar
Andreas Marek committed
242
243
#endif
#if COMPLEXCASE == 1
244
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
Andreas Marek's avatar
Andreas Marek committed
245
           print *,"compute_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
246
247
                   &MATH_DATATYPE&
                   &_GPU OPENMP: not yet implemented"
Andreas Marek's avatar
Andreas Marek committed
248
249
           stop 1
         endif
250
#endif
251
#endif /* WITH_OPENMP_TRADITIONAL */
252

253
#ifndef WITH_OPENMP_TRADITIONAL
254
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
255
#else /* WITH_OPENMP_TRADITIONAL */
256

257
258
259
260
261
262
         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
263
             if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
264
                                                &MATH_DATATYPE&
265
#ifdef WITH_OPENMP_TRADITIONAL
266
                                                &_openmp" // &
267
#else
268
                                                &" // &
269
#endif
270
271
                                                &PRECISION_SUFFIX &
                                                )
272
273
274
275

             return
           endif
         endif
276
#endif /* not WITH_OPENMP_TRADITIONAL */
277

278
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
279
! GPU kernel real
280
         if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
281
#endif
Andreas Marek's avatar
Andreas Marek committed
282
283
#if COMPLEXCASE == 1
! GPU kernel complex
284
         if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
285
#endif
Andreas Marek's avatar
Andreas Marek committed
286
287
288
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
289

290
           dev_offset = ((a_off+off-1)*stripe_width+(istripe-1)*stripe_width*a_dim2)*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
291

292
           dev_offset_1 = (off-1)*nbw*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
293

294
           dev_offset_2 = (off-1)*size_of_datatype
Andreas Marek's avatar
Andreas Marek committed
295

Andreas Marek's avatar
Andreas Marek committed
296
           call launch_compute_hh_trafo_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
297
298
299
                &MATH_DATATYPE&
                &_&
                &PRECISION&
300
301
                &(a_dev + dev_offset, bcast_buffer_dev + dev_offset_1, &
                hh_tau_dev + dev_offset_2, nl, nbw,stripe_width, ncols)
Andreas Marek's avatar
Andreas Marek committed
302

303
304
305
           if (wantDebug) then
             call obj%timer%stop("compute_hh_trafo: GPU")
           endif
Andreas Marek's avatar
Andreas Marek committed
306
307

         else ! not CUDA kernel
308

309
310
311
           if (wantDebug) then
             call obj%timer%start("compute_hh_trafo: CPU")
           endif
312
#if REALCASE == 1
313
#ifndef WITH_FIXED_REAL_KERNEL
314
315
316
317
         if (kernel .eq. ELPA_2STAGE_REAL_AVX_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX2_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_AVX512_BLOCK2 .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2 .or. &
318
             kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2 .or. &
319
             kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2 .or. &
320
             kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2 .or. &
321
322
323
324
325
             kernel .eq. ELPA_2STAGE_REAL_GENERIC    .or. &
             kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE .or. &
             kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY .or. &
             kernel .eq. ELPA_2STAGE_REAL_BGP .or.        &
             kernel .eq. ELPA_2STAGE_REAL_BGQ) then
326
#endif /* not WITH_FIXED_REAL_KERNEL */
327

328
329
330
#endif /* REALCASE */
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

331
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
332
333
#if REALCASE == 1
! generic kernel real case
334
#if defined(WITH_REAL_GENERIC_KERNEL)
335
#ifndef WITH_FIXED_REAL_KERNEL
336
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC) then
337
#endif /* not WITH_FIXED_REAL_KERNEL */
338
339
340
341
342

               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)

343
#ifdef WITH_OPENMP_TRADITIONAL
344
345

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
346
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
347
348
349
350
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
351
352

#else
Andreas Marek's avatar
Andreas Marek committed
353
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
354
355
356
357
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, istripe,my_thread), w(1:nbw,1:6), &
358
359
360
                    nbw, nl, stripe_width, nbw)
#endif

361
#else /* WITH_OPENMP_TRADITIONAL */
362
363

#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
364
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
365
366
367
368
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe),w, nbw, nl, stripe_width, nbw)
369
370

#else
Andreas Marek's avatar
Andreas Marek committed
371
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
372
373
374
375
                      &MATH_DATATYPE&
                      &_generic_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), nbw, nl, stripe_width, nbw)
376
#endif
377
#endif /* WITH_OPENMP_TRADITIONAL */
378
379
380

               enddo

381
#ifndef WITH_FIXED_REAL_KERNEL
382
             endif
383
#endif /* not WITH_FIXED_REAL_KERNEL */
384
385
#endif /* WITH_REAL_GENERIC_KERNEL */

386
387
388
389
390
#endif /* REALCASE == 1 */

#if COMPLEXCASE == 1
! generic kernel complex case
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
391
#ifndef WITH_FIXED_COMPLEX_KERNEL
392
393
394
           if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. &
               kernel .eq. ELPA_2STAGE_COMPLEX_BGQ ) then
395
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
396
397
             ttt = mpi_wtime()
             do j = ncols, 1, -1
398
#ifdef WITH_OPENMP_TRADITIONAL
399
400
401
#ifdef USE_ASSUMED_SIZE

              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
402
403
404
405
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
406
407
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
408
409
410
411
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
Andreas Marek's avatar
Andreas Marek committed
412
                     bcast_buffer(1:nbw,j+off), nbw, nl, stripe_width)
413
#endif
414

415
#else /* WITH_OPENMP_TRADITIONAL */
416
417
418

#ifdef USE_ASSUMED_SIZE
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
419
420
421
422
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
423
424
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
425
426
427
428
                   &MATH_DATATYPE&
                   &_generic_&
                   &PRECISION&
                   & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
429
                      nbw, nl, stripe_width)
430
#endif
431
#endif /* WITH_OPENMP_TRADITIONAL */
432
433

            enddo
434
#ifndef WITH_FIXED_COMPLEX_KERNEL
435
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGP .or. kernel .eq. ELPA_2STAGE_COMPLEX_BGQ )
436
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
437
438
439
440
441
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
442
443


444
! generic simple real kernel
445
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
446
#ifndef WITH_FIXED_REAL_KERNEL
447
             if (kernel .eq. ELPA_2STAGE_REAL_GENERIC_SIMPLE) then
448
#endif /* not WITH_FIXED_REAL_KERNEL */
449
450
451
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
452
#ifdef WITH_OPENMP_TRADITIONAL
453
454

#ifdef USE_ASSUMED_SIZE
455
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
456
457
458
459
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
460
#else
461
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
462
463
464
465
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), w, nbw, nl, stripe_width, nbw)
466
467
468

#endif

469
#else /* WITH_OPENMP_TRADITIONAL */
470
471

#ifdef USE_ASSUMED_SIZE
472
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
473
474
475
476
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
477
#else
478
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
479
480
481
482
                      &MATH_DATATYPE&
                      &_generic_simple_&
                      &PRECISION&
                      & (a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), w, nbw, nl, stripe_width, nbw)
483
484
485

#endif

486
#endif /* WITH_OPENMP_TRADITIONAL */
487
488

               enddo
489
#ifndef WITH_FIXED_REAL_KERNEL
490
             endif
491
#endif /* not WITH_FIXED_REAL_KERNEL */
492
493
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */

494
495
496
497
#endif /* REALCASE */

#if COMPLEXCASE == 1
! generic simple complex case
498

499
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
500
#ifndef WITH_FIXED_COMPLEX_KERNEL
501
            if (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE) then
502
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
503
504
             ttt = mpi_wtime()
             do j = ncols, 1, -1
505
#ifdef WITH_OPENMP_TRADITIONAL
506
507
#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
508
509
510
511
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1,j+off+a_off,istripe,my_thread), bcast_buffer(1,j+off),nbw,nl,stripe_width)
512
513
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
514
515
516
517
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width, j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
518
                       nbw, nl, stripe_width)
519
520
#endif

521
#else /* WITH_OPENMP_TRADITIONAL */
522
523
524

#ifdef USE_ASSUMED_SIZE
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
525
526
527
528
                     &MATH_DATATYPE&
                     &_generic_simple_&
                     &PRECISION&
                     & (a(1,j+off+a_off,istripe), bcast_buffer(1,j+off),nbw,nl,stripe_width)
529
530
#else
               call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
531
532
533
534
                    &MATH_DATATYPE&
                    &_generic_simple_&
                    &PRECISION&
                    & (a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), bcast_buffer(1:nbw,j+off), &
Andreas Marek's avatar
Andreas Marek committed
535
                       nbw, nl, stripe_width)
536
537
#endif

538
#endif /* WITH_OPENMP_TRADITIONAL */
539
             enddo
540
#ifndef WITH_FIXED_COMPLEX_KERNEL
541
           endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_GENERIC_SIMPLE)
542
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
543
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
544

545
546
547
548
#endif /* COMPLEXCASE */

#if REALCASE == 1
! sse assembly kernel real case
549
#if defined(WITH_REAL_SSE_ASSEMBLY_KERNEL)
550
#ifndef WITH_FIXED_REAL_KERNEL
551
             if (kernel .eq. ELPA_2STAGE_REAL_SSE_ASSEMBLY) then
Andreas Marek's avatar
Andreas Marek committed
552

553
#endif /* not WITH_FIXED_REAL_KERNEL */
554
555
556
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
557
#ifdef WITH_OPENMP_TRADITIONAL
558
                 call double_hh_trafo_&
559
                 &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
560
561
562
563
                 &_&
                 &PRECISION&
                 &_sse_assembly&
                 & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
564
565
#else
                 call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
566
567
568
569
570
                      &MATH_DATATYPE&
                      &_&
                      &PRECISION&
                      &_sse_assembly&
                      & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
571
572
#endif
               enddo
573
#ifndef WITH_FIXED_REAL_KERNEL
574
             endif
575
#endif /* not WITH_FIXED_REAL_KERNEL */
576
577
#endif /* WITH_REAL_SSE_ASSEMBLY_KERNEL */

578
579
580
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
581

582
583
! sse assembly kernel complex case
#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
584
#ifndef WITH_FIXED_COMPLEX_KERNEL
585
           if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_ASSEMBLY) then
586
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
587
588
             ttt = mpi_wtime()
             do j = ncols, 1, -1
589
#ifdef WITH_OPENMP_TRADITIONAL
590
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
591
592
593
594
595
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
596
597
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
598
599
600
601
602
                   &MATH_DATATYPE&
                   &_&
                   &PRECISION&
                   &_sse_assembly&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
603
604
#endif
            enddo
605
#ifndef WITH_FIXED_COMPLEX_KERNEL
606
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE)
607
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
608
609
610
611
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */
#endif /* COMPLEXCASE */

#if REALCASE == 1
612
! no sse, vsx, sparc64 block1 real kernel
613
614
#endif

615
616
617
618
#if COMPLEXCASE == 1

! sparc64 block1 complex kernel
#if defined(WITH_COMPLEX_SPARC64_BLOCK1_KERNEL)
619
620
621
622
623
624
625
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
626
!#ifdef WITH_OPENMP_TRADITIONAL
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_sparc64_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
645
646
647
648
649
#endif /* WITH_COMPLEX_SPARC64_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


650
651
652
653
#if COMPLEXCASE == 1

! vsx block1 complex kernel
#if defined(WITH_COMPLEX_VSX_BLOCK1_KERNEL)
654
655
656
657
658
659
660
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          if (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1) then
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
!
!#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL))
!            ttt = mpi_wtime()
!            do j = ncols, 1, -1
661
!#ifdef WITH_OPENMP_TRADITIONAL
662
663
664
665
666
667
668
669
670
671
672
673
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#else
!              call single_hh_trafo_&
!                   &MATH_DATATYPE&
!                   &_vsx_1hv_&
!                   &PRECISION&
!                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif
674
675
676
677
678
679
!            enddo
!#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_VSX_BLOCK2_KERNEL)) */
!
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_VSX_BLOCK1)
!#endif /* not WITH_FIXED_COMPLEX_KERNEL */
680
681
682
683
684
#endif /* WITH_COMPLEX_VSX_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */


685
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
686

687
688
! sse block1 complex kernel
#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
689
#ifndef WITH_FIXED_COMPLEX_KERNEL
690
          if (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1) then
691
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
692

693
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
694
695
            ttt = mpi_wtime()
            do j = ncols, 1, -1
696
#ifdef WITH_OPENMP_TRADITIONAL
697
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
698
699
700
701
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
702
703
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
704
705
706
707
                   &MATH_DATATYPE&
                   &_sse_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
708
709
#endif
            enddo
710
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */
711

712
#ifndef WITH_FIXED_COMPLEX_KERNEL
713
          endif ! (kernel .eq. ELPA_2STAGE_COMPLEX_SSE_BLOCK1)
714
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
715
716
717
718
719
720
721
722
723
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#endif /* COMPLEXCASE */

#if REALCASE == 1
!no avx block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
724

725
! avx block1 complex kernel
Andreas Marek's avatar
Andreas Marek committed
726
#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL)
727
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
728
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1)) then
729
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
730

Andreas Marek's avatar
Andreas Marek committed
731
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) )
732
733
            ttt = mpi_wtime()
            do j = ncols, 1, -1
734
#ifdef WITH_OPENMP_TRADITIONAL
735
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
736
                   &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
737
                   &_avx_1hv_&
Andreas Marek's avatar
Andreas Marek committed
738
739
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
740
741
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
742
                   &MATH_DATATYPE&
Andreas Marek's avatar
Andreas Marek committed
743
                   &_avx_1hv_&
Andreas Marek's avatar
Andreas Marek committed
744
745
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
746
747
#endif
            enddo
Andreas Marek's avatar
Andreas Marek committed
748
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL)) */
749

750
#ifndef WITH_FIXED_COMPLEX_KERNEL
Andreas Marek's avatar
Andreas Marek committed
751
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX_BLOCK1) )
752
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
Andreas Marek's avatar
Andreas Marek committed
753
754
755
756
757
758
759
760
761
762
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#ifndef WITH_FIXED_COMPLEX_KERNEL
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1)) then
#endif /* not WITH_FIXED_COMPLEX_KERNEL */

#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
            ttt = mpi_wtime()
            do j = ncols, 1, -1
763
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
              call single_hh_trafo_&
                   &MATH_DATATYPE&
                   &_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_&
                   &MATH_DATATYPE&
                   &_avx2_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

#ifndef WITH_FIXED_COMPLEX_KERNEL
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX2_BLOCK1))
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX2_BLOCK1_KERNEL */


785
786
787
788
789
790
791
792

#endif /* COMPLEXCASE */

#if REALCASE == 1
! no avx512 block1 real kernel
#endif /* REALCASE */

#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
793

794
795
! avx512 block1 complex kernel
#if defined(WITH_COMPLEX_AVX512_BLOCK1_KERNEL)
796
#ifndef WITH_FIXED_COMPLEX_KERNEL
797
          if ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1)) then
798
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
799

800
#if (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) )
801
802
            ttt = mpi_wtime()
            do j = ncols, 1, -1
803
#ifdef WITH_OPENMP_TRADITIONAL
804
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
805
806
807
808
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe,my_thread)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
809
810
#else
              call single_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
811
812
813
814
                   &MATH_DATATYPE&
                   &_avx512_1hv_&
                   &PRECISION&
                   & (c_loc(a(1,j+off+a_off,istripe)), bcast_buffer(1,j+off),nbw,nl,stripe_width)
815
816
#endif
            enddo
817
#endif /* (!defined(WITH_FIXED_COMPLEX_KERNEL)) || (defined(WITH_FIXED_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX512_BLOCK2_KERNEL) ) */
818

819
#ifndef WITH_FIXED_COMPLEX_KERNEL
820
          endif ! ((kernel .eq. ELPA_2STAGE_COMPLEX_AVX512_BLOCK1))
821
#endif /* not WITH_FIXED_COMPLEX_KERNEL */
822
823
824
825
#endif /* WITH_COMPLEX_AVX512_BLOCK1_KERNEL  */
#endif /* COMPLEXCASE */

#if REALCASE == 1
826
827
828
829
830
831
832
833
834
835
836
837
! implementation of sparc64 block 2 real case
#if defined(WITH_REAL_SPARC64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SPARC64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
838
#ifdef WITH_OPENMP_TRADITIONAL
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sparc64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK6_KERNEL) && !defined(WITH_REAL_SPARC64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SPARC64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */
860

861
862
863
864
865
866
867
868
869
870
871
872
873
#if REALCASE == 1
! implementation of neon_arch64 block 2 real case
#if defined(WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_NEON_ARCH64_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
874
#ifdef WITH_OPENMP_TRADITIONAL
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_neon_arch64_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK6_KERNEL) && !defined(WITH_REAL_NEON_ARCH64_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_NEON_ARCH64_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


898

899
#if REALCASE == 1
900
901
! implementation of vsx block 2 real case
#if defined(WITH_REAL_VSX_BLOCK2_KERNEL)
902

903
#ifndef WITH_FIXED_REAL_KERNEL
904
           if (kernel .eq. ELPA_2STAGE_REAL_VSX_BLOCK2) then
Andreas Marek's avatar
Andreas Marek committed
905

906
#endif /* not WITH_FIXED_REAL_KERNEL */
907

908
#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL))
909
910
911
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
912
#ifdef WITH_OPENMP_TRADITIONAL
913
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
914
                    &MATH_DATATYPE&
915
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
916
917
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
918
919
#else
               call double_hh_trafo_&
Andreas Marek's avatar
Andreas Marek committed
920
                    &MATH_DATATYPE&
921
                    &_vsx_2hv_&
Andreas Marek's avatar
Andreas Marek committed
922
923
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
924
925
#endif
             enddo
926
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_VSX_BLOCK6_KERNEL) && !defined(WITH_REAL_VSX_BLOCK4_KERNEL)) */
927

928
#ifndef WITH_FIXED_REAL_KERNEL
929
           endif
930
#endif /* not WITH_FIXED_REAL_KERNEL */
931
#endif /* WITH_REAL_VSX_BLOCK2_KERNEL */
932

933
934
#endif /* REALCASE == 1 */

Andreas Marek's avatar
Andreas Marek committed
935
936
937
938
939
940
941
942
943
944
945
946
947
#if REALCASE == 1
! implementation of sse block 2 real case
#if defined(WITH_REAL_SSE_BLOCK2_KERNEL)

#ifndef WITH_FIXED_REAL_KERNEL
           if (kernel .eq. ELPA_2STAGE_REAL_SSE_BLOCK2) then

#endif /* not WITH_FIXED_REAL_KERNEL */

#if (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL))
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
948
#ifdef WITH_OPENMP_TRADITIONAL
Andreas Marek's avatar
Andreas Marek committed
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe,my_thread)), w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_&
                    &MATH_DATATYPE&
                    &_sse_2hv_&
                    &PRECISION &
                    & (c_loc(a(1,j+off+a_off-1,istripe)), w, nbw, nl, stripe_width, nbw)
#endif
             enddo
#endif /* (!defined(WITH_FIXED_REAL_KERNEL)) || (defined(WITH_FIXED_REAL_KERNEL) && !defined(WITH_REAL_SSE_BLOCK6_KERNEL) && !defined(WITH_REAL_SSE_BLOCK4_KERNEL)) */

#ifndef WITH_FIXED_REAL_KERNEL
           endif
#endif /* not WITH_FIXED_REAL_KERNEL */
#endif /* WITH_REAL_SSE_BLOCK2_KERNEL */

#endif /* REALCASE == 1 */


972
973
974
975
#if COMPLEXCASE == 1
! implementation of sparc64 block 2 complex case

#if defined(WITH_COMPLEX_SPARC64_BLOCK2_KERNEL)
976
977
978
979
980
981
982
983
!#ifndef WITH_FIXED_COMPLEX_KERNEL
!           if (kernel .eq. ELPA_2STAGE_COMPLEX_SPARC64_BLOCK2) then
!#endif  /* not WITH_FIXED_COMPLEX_KERNEL */
!
!             ttt = mpi_wtime()
!             do j = ncols, 2, -2
!               w(:,1) = bcast_buffer(1:nbw,j+off)
!               w(:,2) = bcast_buffer(1:nbw,j+off-1)
Andreas Marek's avatar