mod_compute_hh_trafo_real.F90 17.8 KB
Newer Older
1
2
module compute_hh_trafo_real
#include "config-f90.h"
3
  use elpa_mpi
4
5
6
7
8
9
10
11
12
13
14
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_real_cpu_openmp
#else
  public compute_hh_trafo_real_cpu
#endif

  contains

#ifdef WITH_OPENMP
15
16
       subroutine compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads, l_nev,        &
                                                   a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev,    &
17
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,              &
18
                                                   my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
19
#else
20
21
22
23
       subroutine compute_hh_trafo_real_cpu       (a, a_dev, stripe_width, a_dim2, stripe_count,                              &
                                                   a_off, nbw, max_blk_size, bcast_buffer,  bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,                &
                                                   last_stripe_width, THIS_REAL_ELPA_KERNEL)
24
25
26
27
#endif


         use precision
28
         use iso_c_binding
29
30
         use elpa2_utilities
         use single_hh_trafo_real
31
32
33
         use cuda_c_kernel
         use cuda_functions

34
35
36
37
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
         use real_generic_simple_kernel, only : double_hh_trafo_generic_simple
#endif

38
39
40
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
        use real_generic_kernel, only : double_hh_trafo_generic
#endif
41
42
43
44
45
46
47
48
49
50
51
52

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
         use timings
#endif
         implicit none
53
         real(kind=c_double), intent(inout) :: kernel_time  ! MPI_WTIME always needs double
54
55
56
57
58
59
60
61
62
         integer(kind=lik)            :: kernel_flops
         integer(kind=ik), intent(in) :: nbw, max_blk_size
         real(kind=rk)                :: bcast_buffer(nbw,max_blk_size)
         integer(kind=ik), intent(in) :: a_off

         integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count

#ifndef WITH_OPENMP
         integer(kind=ik), intent(in) :: last_stripe_width
63
64
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=rk), allocatable   :: a(:,:,:)
65
#else
66
         integer(kind=ik), intent(in) :: max_threads, l_nev, thread_width
67
68
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=rk), allocatable   :: a(:,:,:,:)
69
70
71
#endif
         integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL

72
73
74
75
76
         integer(kind=c_intptr_t)     :: a_dev
         integer(kind=c_intptr_t)     :: bcast_buffer_dev
         integer(kind=c_size_t)       :: dev_offset
         integer(kind=c_intptr_t)     :: hh_dot_dev
         integer(kind=c_intptr_t)     :: hh_tau_dev
77
78
79
80
81
82
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
         integer(kind=ik)             :: off, ncols, istripe
#ifdef WITH_OPENMP
         integer(kind=ik)             :: my_thread, noff
#endif
         integer(kind=ik)             :: j, nl, jj, jjj
83
84
         real(kind=rk)                :: w(nbw,6)
         real(kind=c_double)          :: ttt ! MPI_WTIME always needs double
85

86
87
88
89
90
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
           if (ncols < 1) return
         endif

91
92
93
94
95
96
97
98
99
100
101
102
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%start("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%start("compute_hh_trafo_real_cpu")
#endif
#endif
         ttt = mpi_wtime()

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
103
104
105
106
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           print *,"compute_hh_trafo_real GPU OPENMP: not yet implemented"
           stop
         endif
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123

         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
             call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
             call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif
             return
           endif
         endif
124
125
126
127
128
129
130
#endif /* not WITH_OPENMP */

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_real_datatype
           call launch_compute_hh_trafo_c_kernel_real(a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, &
                                                      hh_tau_dev, nl, nbw, stripe_width, off, ncols)
         else ! not CUDA kernel
131
132

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
133
134
135
136
137
138
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC    .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
139
140
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

141
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
142
143
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
144
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
145
146
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

147
148
149
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
150
151

#ifdef WITH_OPENMP
152
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
153
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe,my_thread), w, &
154
155
                                            nbw, nl, stripe_width, nbw)

156
#else
157
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, &
158
                                              istripe,my_thread), w(1:nbw,1:6), &
159
                                              nbw, nl, stripe_width, nbw)
160
161
162
163
164
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
165
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe),w, &
166
167
168
                                            nbw, nl, stripe_width, nbw)

#else
169
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), &
170
                                            nbw, nl, stripe_width, nbw)
171
#endif
172
173
#endif /* WITH_OPENMP */

174
               enddo
175
176

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
177
             endif
178
179
180
181
182
183
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_KERNEL */


#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
184
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) then
185
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
186
187
188
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
189
#ifdef WITH_OPENMP
190
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
191
192
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
193
#else
194
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), &
195
196
197
198
199
200
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
201
202
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
203
#else
204
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), &
205
206
                                                     w, nbw, nl, stripe_width, nbw)

207
#endif
208
209
210

#endif /* WITH_OPENMP */

211
               enddo
212
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
213
             endif
214
215
216
217
218
219
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_REAL_SSE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
220
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
221
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
222
223
224
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
225
#ifdef WITH_OPENMP
226
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
227
228
                                      stripe_width, nbw)
#else
229
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
230
231
                                      stripe_width, nbw)
#endif
232
               enddo
233
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
234
             endif
235
236
237
238
239
240
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
241
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then
242
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
243
244
245
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
246
#ifdef WITH_OPENMP
247
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
248
249
                                                       w, nbw, nl, stripe_width, nbw)
#else
250
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), &
251
252
                                                       w, nbw, nl, stripe_width, nbw)
#endif
253
               enddo
254
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
255
             endif
256
257
258
259
260
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */

#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
261
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then
262
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
263
264
265
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
266
#ifdef WITH_OPENMP
267
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
268
269
                                          stripe_width, nbw)
#else
270
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
271
272
                                          stripe_width, nbw)
#endif
273
               enddo
274
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
275
             endif
276
277
278
279
280
281
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGP_KERNEL */


#if defined(WITH_REAL_BGQ_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
282
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
283
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
284
285
286
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
287
#ifdef WITH_OPENMP
288
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
289
290
                                          stripe_width, nbw)
#else
291
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
292
293
                                          stripe_width, nbw)
#endif
294
               enddo
295
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
296
             endif
297
298
299
300
301
302
303
304
305
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGQ_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#endif

#ifdef WITH_OPENMP
306
             if (j==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
307
308
309
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#else
310
             if (j==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe),           &
311
312
313
314
315
316
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#endif


#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
317
           endif !
318
319
320
321
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
322
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
323
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
324
325
326
327
328
329
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
330
#ifdef WITH_OPENMP
331
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, &
332
333
                                                  nbw, nl, stripe_width, nbw)
#else
334
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe), w, &
335
336
                                                  nbw, nl, stripe_width, nbw)
#endif
337
338
339
340
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
341
#ifdef WITH_OPENMP
342
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe,my_thread), &
343
344
                                                    w, nbw, nl, stripe_width, nbw)
#else
345
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe), &
346
347
                                                    w, nbw, nl, stripe_width, nbw)
#endif
348
             enddo
349
#ifdef WITH_OPENMP
350
             if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
351
352
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
353
             if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
354
355
356
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
357
           endif
358
359
360
361
362
363
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
364
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then
365
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
366
367
368
369
370
371
372
373
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
374
#ifdef WITH_OPENMP
375
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, &
376
377
                                                  nbw, nl, stripe_width, nbw)
#else
378
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe), w, &
379
380
                                                  nbw, nl, stripe_width, nbw)
#endif
381
382
383
384
385
386
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
387
#ifdef WITH_OPENMP
388
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
389
390
                                                  nbw, nl, stripe_width, nbw)
#else
391
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, &
392
393
                                                  nbw, nl, stripe_width, nbw)
#endif
394
395
396
397
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
398
#ifdef WITH_OPENMP
399
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), &
400
401
                                                    w, nbw, nl, stripe_width, nbw)
#else
402
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe), &
403
404
                                                    w, nbw, nl, stripe_width, nbw)
#endif
405
             enddo
406
#ifdef WITH_OPENMP
407
             if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
408
409
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
410
             if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
411
412
413
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
414
           endif
415
416
417
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */

418
419
         endif ! GPU_KERNEL

420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
           kernel_time = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
         endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif

#ifdef WITH_OPENMP
       end subroutine compute_hh_trafo_real_cpu_openmp
#else
       end subroutine compute_hh_trafo_real_cpu
#endif

end module