mod_compute_hh_trafo_real.F90 17.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
module compute_hh_trafo_real
#include "config-f90.h"
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_real_cpu_openmp
#else
  public compute_hh_trafo_real_cpu
#endif

  include 'mpif.h'

  contains

#ifdef WITH_OPENMP
16
17
18
       subroutine compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads,                &
                                                   a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,              &
19
20
                                                   my_thread, THIS_REAL_ELPA_KERNEL)
#else
21
22
23
24
       subroutine compute_hh_trafo_real_cpu       (a, a_dev, stripe_width, a_dim2, stripe_count,                              &
                                                   a_off, nbw, max_blk_size, bcast_buffer,  bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,                &
                                                   last_stripe_width, THIS_REAL_ELPA_KERNEL)
25
26
27
28
#endif


         use precision
29
         use iso_c_binding
30
31
         use elpa2_utilities
         use single_hh_trafo_real
32
33
34
         use cuda_c_kernel
         use cuda_functions

35
36
37
38
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
         use real_generic_simple_kernel, only : double_hh_trafo_generic_simple
#endif

39
40
41
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
        use real_generic_kernel, only : double_hh_trafo_generic
#endif
42
43
44
45
46
47
48
49
50
51
52
53
54

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
         use timings
#endif
         implicit none
         include "mpif.h"
55
         real(kind=c_double), intent(inout) :: kernel_time  ! MPI_WTIME always needs double
56
57
58
59
60
61
62
63
64
         integer(kind=lik)            :: kernel_flops
         integer(kind=ik), intent(in) :: nbw, max_blk_size
         real(kind=rk)                :: bcast_buffer(nbw,max_blk_size)
         integer(kind=ik), intent(in) :: a_off

         integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count

#ifndef WITH_OPENMP
         integer(kind=ik), intent(in) :: last_stripe_width
65
66
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=rk), allocatable   :: a(:,:,:)
67
68
#else
         integer(kind=ik), intent(in) :: max_threads
69
70
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=rk), allocatable   :: a(:,:,:,:)
71
72
73
#endif
         integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL

74
75
76
77
78
         integer(kind=c_intptr_t)     :: a_dev
         integer(kind=c_intptr_t)     :: bcast_buffer_dev
         integer(kind=c_size_t)       :: dev_offset
         integer(kind=c_intptr_t)     :: hh_dot_dev
         integer(kind=c_intptr_t)     :: hh_tau_dev
79
80
81
82
83
84
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
         integer(kind=ik)             :: off, ncols, istripe
#ifdef WITH_OPENMP
         integer(kind=ik)             :: my_thread, noff
#endif
         integer(kind=ik)             :: j, nl, jj, jjj
85
86
         real(kind=rk)                :: w(nbw,6)
         real(kind=c_double)          :: ttt ! MPI_WTIME always needs double
87

88
89
90
91
92
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
           if (ncols < 1) return
         endif

93
94
95
96
97
98
99
100
101
102
103
104
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%start("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%start("compute_hh_trafo_real_cpu")
#endif
#endif
         ttt = mpi_wtime()

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
105
106
107
108
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           print *,"compute_hh_trafo_real GPU OPENMP: not yet implemented"
           stop
         endif
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125

         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
             call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
             call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif
             return
           endif
         endif
126
127
128
129
130
131
132
#endif /* not WITH_OPENMP */

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_real_datatype
           call launch_compute_hh_trafo_c_kernel_real(a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, &
                                                      hh_tau_dev, nl, nbw, stripe_width, off, ncols)
         else ! not CUDA kernel
133
134

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
135
136
137
138
139
140
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC    .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
141
142
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

143
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
144
145
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
146
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
147
148
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

149
150
151
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
152
153

#ifdef WITH_OPENMP
154
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
155
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe,my_thread), w, &
156
157
                                            nbw, nl, stripe_width, nbw)

158
#else
159
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, &
160
                                              istripe,my_thread), w(1:nbw,1:6), &
161
                                              nbw, nl, stripe_width, nbw)
162
163
164
165
166
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
167
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe),w, &
168
169
170
                                            nbw, nl, stripe_width, nbw)

#else
171
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), &
172
                                            nbw, nl, stripe_width, nbw)
173
#endif
174
175
#endif /* WITH_OPENMP */

176
               enddo
177
178

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
179
             endif
180
181
182
183
184
185
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_KERNEL */


#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
186
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) then
187
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
188
189
190
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
191
#ifdef WITH_OPENMP
192
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
193
194
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
195
#else
196
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), &
197
198
199
200
201
202
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
203
204
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
205
#else
206
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), &
207
208
                                                     w, nbw, nl, stripe_width, nbw)

209
#endif
210
211
212

#endif /* WITH_OPENMP */

213
               enddo
214
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
215
             endif
216
217
218
219
220
221
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_REAL_SSE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
222
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
223
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
224
225
226
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
227
#ifdef WITH_OPENMP
228
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
229
230
                                      stripe_width, nbw)
#else
231
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
232
233
                                      stripe_width, nbw)
#endif
234
               enddo
235
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
236
             endif
237
238
239
240
241
242
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
243
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then
244
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
245
246
247
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
248
#ifdef WITH_OPENMP
249
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
250
251
                                                       w, nbw, nl, stripe_width, nbw)
#else
252
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), &
253
254
                                                       w, nbw, nl, stripe_width, nbw)
#endif
255
               enddo
256
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
257
             endif
258
259
260
261
262
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */

#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
263
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then
264
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
265
266
267
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
268
#ifdef WITH_OPENMP
269
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
270
271
                                          stripe_width, nbw)
#else
272
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
273
274
                                          stripe_width, nbw)
#endif
275
               enddo
276
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
277
             endif
278
279
280
281
282
283
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGP_KERNEL */


#if defined(WITH_REAL_BGQ_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
284
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
285
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
286
287
288
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
289
#ifdef WITH_OPENMP
290
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
291
292
                                          stripe_width, nbw)
#else
293
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
294
295
                                          stripe_width, nbw)
#endif
296
               enddo
297
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
298
             endif
299
300
301
302
303
304
305
306
307
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGQ_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#endif

#ifdef WITH_OPENMP
308
             if (j==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off_a_off+nbw-1,istripe,my_thread), &
309
310
311
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#else
312
             if (j==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe),           &
313
314
315
316
317
318
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#endif


#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
319
           endif !
320
321
322
323
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
324
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
325
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
326
327
328
329
330
331
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
332
#ifdef WITH_OPENMP
333
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, &
334
335
                                                  nbw, nl, stripe_width, nbw)
#else
336
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe), w, &
337
338
                                                  nbw, nl, stripe_width, nbw)
#endif
339
340
341
342
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
343
#ifdef WITH_OPENMP
344
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe,my_thread), &
345
346
                                                    w, nbw, nl, stripe_width, nbw)
#else
347
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe), &
348
349
                                                    w, nbw, nl, stripe_width, nbw)
#endif
350
             enddo
351
#ifdef WITH_OPENMP
352
             if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
353
354
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
355
             if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
356
357
358
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
359
           endif
360
361
362
363
364
365
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
366
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then
367
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
368
369
370
371
372
373
374
375
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
376
#ifdef WITH_OPENMP
377
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, &
378
379
                                                  nbw, nl, stripe_width, nbw)
#else
380
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe), w, &
381
382
                                                  nbw, nl, stripe_width, nbw)
#endif
383
384
385
386
387
388
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
389
#ifdef WITH_OPENMP
390
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
391
392
                                                  nbw, nl, stripe_width, nbw)
#else
393
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, &
394
395
                                                  nbw, nl, stripe_width, nbw)
#endif
396
397
398
399
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
400
#ifdef WITH_OPENMP
401
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), &
402
403
                                                    w, nbw, nl, stripe_width, nbw)
#else
404
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe), &
405
406
                                                    w, nbw, nl, stripe_width, nbw)
#endif
407
             enddo
408
#ifdef WITH_OPENMP
409
             if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
410
411
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
412
             if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
413
414
415
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
416
           endif
417
418
419
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */

420
421
         endif ! GPU_KERNEL

422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
           kernel_time = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
         endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif

#ifdef WITH_OPENMP
       end subroutine compute_hh_trafo_real_cpu_openmp
#else
       end subroutine compute_hh_trafo_real_cpu
#endif

end module