mod_compute_hh_trafo_real.F90 17.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
module compute_hh_trafo_real
#include "config-f90.h"
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_real_cpu_openmp
#else
  public compute_hh_trafo_real_cpu
#endif

  include 'mpif.h'

  contains

#ifdef WITH_OPENMP
16
17
18
       subroutine compute_hh_trafo_real_cpu_openmp(a, a_dev, stripe_width, a_dim2, stripe_count, max_threads,                &
                                                   a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,              &
19
20
                                                   my_thread, THIS_REAL_ELPA_KERNEL)
#else
21
22
23
24
       subroutine compute_hh_trafo_real_cpu       (a, a_dev, stripe_width, a_dim2, stripe_count,                              &
                                                   a_off, nbw, max_blk_size, bcast_buffer,  bcast_buffer_dev, hh_dot_dev,     &
                                                   hh_tau_dev, kernel_flops, kernel_time, off, ncols, istripe,                &
                                                   last_stripe_width, THIS_REAL_ELPA_KERNEL)
25
26
27
28
#endif


         use precision
29
         use iso_c_binding
30
31
         use elpa2_utilities
         use single_hh_trafo_real
32
33
34
         use cuda_c_kernel
         use cuda_functions

35
36
37
38
#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
         use real_generic_simple_kernel, only : double_hh_trafo_generic_simple
#endif

39
40
41
#if defined(WITH_REAL_GENERIC_KERNEL) && !(defined(DESPERATELY_WANT_ASSUMED_SIZE))
        use real_generic_kernel, only : double_hh_trafo_generic
#endif
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

#if defined(WITH_REAL_BGP_KERNEL)
         use real_bgp_kernel, only : double_hh_trafo_bgp
#endif

#if defined(WITH_REAL_BGQ_KERNEL)
         use real_bgq_kernel, only : double_hh_trafo_bgq
#endif
#ifdef HAVE_DETAILED_TIMINGS
         use timings
#endif
         implicit none
         include "mpif.h"
         real(kind=rk), intent(inout) :: kernel_time
         integer(kind=lik)            :: kernel_flops
         integer(kind=ik), intent(in) :: nbw, max_blk_size
         real(kind=rk)                :: bcast_buffer(nbw,max_blk_size)
         integer(kind=ik), intent(in) :: a_off

         integer(kind=ik), intent(in) :: stripe_width,a_dim2,stripe_count

#ifndef WITH_OPENMP
         integer(kind=ik), intent(in) :: last_stripe_width
65
66
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count)
         real(kind=rk), allocatable   :: a(:,:,:)
67
68
#else
         integer(kind=ik), intent(in) :: max_threads
69
70
!         real(kind=rk)                :: a(stripe_width,a_dim2,stripe_count,max_threads)
         real(kind=rk), allocatable   :: a(:,:,:,:)
71
72
73
#endif
         integer(kind=ik), intent(in) :: THIS_REAL_ELPA_KERNEL

74
75
76
77
78
         integer(kind=c_intptr_t)     :: a_dev
         integer(kind=c_intptr_t)     :: bcast_buffer_dev
         integer(kind=c_size_t)       :: dev_offset
         integer(kind=c_intptr_t)     :: hh_dot_dev
         integer(kind=c_intptr_t)     :: hh_tau_dev
79
80
81
82
83
84
85
86
         ! Private variables in OMP regions (my_thread) should better be in the argument list!
         integer(kind=ik)             :: off, ncols, istripe
#ifdef WITH_OPENMP
         integer(kind=ik)             :: my_thread, noff
#endif
         integer(kind=ik)             :: j, nl, jj, jjj
         real(kind=rk)                :: w(nbw,6), ttt

87
88
89
90
91
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           ! ncols - indicates the number of HH reflectors to apply; at least 1 must be available
           if (ncols < 1) return
         endif

92
93
94
95
96
97
98
99
100
101
102
103
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%start("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%start("compute_hh_trafo_real_cpu")
#endif
#endif
         ttt = mpi_wtime()

#ifndef WITH_OPENMP
         nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else
104
105
106
107
         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           print *,"compute_hh_trafo_real GPU OPENMP: not yet implemented"
           stop
         endif
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

         if (istripe<stripe_count) then
           nl = stripe_width
         else
           noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
           nl = min(my_thread*thread_width-noff, l_nev-noff)
           if (nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
             call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
             call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif
             return
           endif
         endif
125
126
127
128
129
130
131
#endif /* not WITH_OPENMP */

         if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GPU) then
           dev_offset = (0 + (a_off * stripe_width) + ( (istripe - 1) * stripe_width *a_dim2 )) *size_of_real_datatype
           call launch_compute_hh_trafo_c_kernel_real(a_dev + dev_offset, bcast_buffer_dev, hh_dot_dev, &
                                                      hh_tau_dev, nl, nbw, stripe_width, off, ncols)
         else ! not CUDA kernel
132
133

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
134
135
136
137
138
139
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2 .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC    .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE .or. &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP .or.        &
               THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
140
141
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

142
             !FORTRAN CODE / X86 INRINISIC CODE / BG ASSEMBLER USING 2 HOUSEHOLDER VECTORS
143
144
#if defined(WITH_REAL_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
145
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC) then
146
147
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

148
149
150
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
151
152

#ifdef WITH_OPENMP
153
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
154
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe,my_thread), w, &
155
156
                                            nbw, nl, stripe_width, nbw)

157
#else
158
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1, &
159
                                              istripe,my_thread), w(1:nbw,1:6), &
160
                                              nbw, nl, stripe_width, nbw)
161
162
163
164
165
#endif

#else /* WITH_OPENMP */

#ifdef DESPERATELY_WANT_ASSUMED_SIZE
166
                 call double_hh_trafo_generic(a(1,j+off+a_off-1,istripe),w, &
167
168
169
                                            nbw, nl, stripe_width, nbw)

#else
170
                 call double_hh_trafo_generic(a(1:stripe_width,j+off+a_off-1:j+off+a_off+nbw-1,istripe),w(1:nbw,1:6), &
171
                                            nbw, nl, stripe_width, nbw)
172
#endif
173
174
#endif /* WITH_OPENMP */

175
               enddo
176
177

#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
178
             endif
179
180
181
182
183
184
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_KERNEL */


#if defined(WITH_REAL_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
185
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_GENERIC_SIMPLE) then
186
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
187
188
189
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
190
#ifdef WITH_OPENMP
191
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
192
193
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
194
#else
195
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe,my_thread), &
196
197
198
199
200
201
                                                     w, nbw, nl, stripe_width, nbw)

#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
202
203
                 call double_hh_trafo_generic_simple(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
204
#else
205
                 call double_hh_trafo_generic_simple(a(1:stripe_width,j+off+a_off-1:j+off+a_off-1+nbw,istripe), &
206
207
                                                     w, nbw, nl, stripe_width, nbw)

208
#endif
209
210
211

#endif /* WITH_OPENMP */

212
               enddo
213
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
214
             endif
215
216
217
218
219
220
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_REAL_SSE_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
221
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_SSE) then
222
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
223
224
225
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
226
#ifdef WITH_OPENMP
227
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
228
229
                                      stripe_width, nbw)
#else
230
                 call double_hh_trafo(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
231
232
                                      stripe_width, nbw)
#endif
233
               enddo
234
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
235
             endif
236
237
238
239
240
241
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_SSE_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
242
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK2) then
243
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
244
245
246
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
247
#ifdef WITH_OPENMP
248
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
249
250
                                                       w, nbw, nl, stripe_width, nbw)
#else
251
                 call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), &
252
253
                                                       w, nbw, nl, stripe_width, nbw)
#endif
254
               enddo
255
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
256
             endif
257
258
259
260
261
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK2_KERNEL */

#if defined(WITH_REAL_BGP_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
262
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGP) then
263
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
264
265
266
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
267
#ifdef WITH_OPENMP
268
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
269
270
                                          stripe_width, nbw)
#else
271
                 call double_hh_trafo_bgp(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
272
273
                                          stripe_width, nbw)
#endif
274
               enddo
275
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
276
             endif
277
278
279
280
281
282
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGP_KERNEL */


#if defined(WITH_REAL_BGQ_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
283
             if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_BGQ) then
284
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
285
286
287
               do j = ncols, 2, -2
                 w(:,1) = bcast_buffer(1:nbw,j+off)
                 w(:,2) = bcast_buffer(1:nbw,j+off-1)
288
#ifdef WITH_OPENMP
289
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe,my_thread), w, nbw, nl, &
290
291
                                          stripe_width, nbw)
#else
292
                 call double_hh_trafo_bgq(a(1,j+off+a_off-1,istripe), w, nbw, nl, &
293
294
                                          stripe_width, nbw)
#endif
295
               enddo
296
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
297
             endif
298
299
300
301
302
303
304
305
306
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_BGQ_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call double_hh_trafo_real_sse_avx_2hv(a(1,j+off+a_off-1,istripe), w, nbw, nl, stripe_width, nbw)
!#endif

#ifdef WITH_OPENMP
307
             if (j==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off_a_off+nbw-1,istripe,my_thread), &
308
309
310
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#else
311
             if (j==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe),           &
312
313
314
315
316
317
                                      bcast_buffer(1:nbw,off+1), nbw, nl,     &
                                      stripe_width)
#endif


#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
318
           endif !
319
320
321
322
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */

#if defined(WITH_REAL_AVX_BLOCK4_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
323
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK4) then
324
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
325
326
327
328
329
330
             ! X86 INTRINSIC CODE, USING 4 HOUSEHOLDER VECTORS
             do j = ncols, 4, -4
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
331
#ifdef WITH_OPENMP
332
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe,my_thread), w, &
333
334
                                                  nbw, nl, stripe_width, nbw)
#else
335
               call quad_hh_trafo_real_sse_avx_4hv(a(1,j+off+a_off-3,istripe), w, &
336
337
                                                  nbw, nl, stripe_width, nbw)
#endif
338
339
340
341
             enddo
             do jj = j, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
342
#ifdef WITH_OPENMP
343
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe,my_thread), &
344
345
                                                    w, nbw, nl, stripe_width, nbw)
#else
346
               call double_hh_trafo_real_sse_avx_2hv(a(1,jj+off+a_off-1,istripe), &
347
348
                                                    w, nbw, nl, stripe_width, nbw)
#endif
349
             enddo
350
#ifdef WITH_OPENMP
351
             if (jj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
352
353
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
354
             if (jj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
355
356
357
                                          bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
358
           endif
359
360
361
362
363
364
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */


#if defined(WITH_REAL_AVX_BLOCK6_KERNEL)
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
365
           if (THIS_REAL_ELPA_KERNEL .eq. REAL_ELPA_KERNEL_AVX_BLOCK6) then
366
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
367
368
369
370
371
372
373
374
             ! X86 INTRINSIC CODE, USING 6 HOUSEHOLDER VECTORS
             do j = ncols, 6, -6
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
               w(:,3) = bcast_buffer(1:nbw,j+off-2)
               w(:,4) = bcast_buffer(1:nbw,j+off-3)
               w(:,5) = bcast_buffer(1:nbw,j+off-4)
               w(:,6) = bcast_buffer(1:nbw,j+off-5)
375
#ifdef WITH_OPENMP
376
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe,my_thread), w, &
377
378
                                                  nbw, nl, stripe_width, nbw)
#else
379
               call hexa_hh_trafo_real_sse_avx_6hv(a(1,j+off+a_off-5,istripe), w, &
380
381
                                                  nbw, nl, stripe_width, nbw)
#endif
382
383
384
385
386
387
             enddo
             do jj = j, 4, -4
               w(:,1) = bcast_buffer(1:nbw,jj+off)
               w(:,2) = bcast_buffer(1:nbw,jj+off-1)
               w(:,3) = bcast_buffer(1:nbw,jj+off-2)
               w(:,4) = bcast_buffer(1:nbw,jj+off-3)
388
#ifdef WITH_OPENMP
389
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe,my_thread), w, &
390
391
                                                  nbw, nl, stripe_width, nbw)
#else
392
               call quad_hh_trafo_real_sse_avx_4hv(a(1,jj+off+a_off-3,istripe), w, &
393
394
                                                  nbw, nl, stripe_width, nbw)
#endif
395
396
397
398
             enddo
             do jjj = jj, 2, -2
               w(:,1) = bcast_buffer(1:nbw,jjj+off)
               w(:,2) = bcast_buffer(1:nbw,jjj+off-1)
399
#ifdef WITH_OPENMP
400
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe,my_thread), &
401
402
                                                    w, nbw, nl, stripe_width, nbw)
#else
403
               call double_hh_trafo_real_sse_avx_2hv(a(1,jjj+off+a_off-1,istripe), &
404
405
                                                    w, nbw, nl, stripe_width, nbw)
#endif
406
             enddo
407
#ifdef WITH_OPENMP
408
             if (jjj==1) call single_hh_trafo_real_cpu_openmp(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe,my_thread), &
409
410
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#else
411
             if (jjj==1) call single_hh_trafo_real_cpu(a(1:stripe_width,1+off+a_off:1+off+a_off+nbw-1,istripe), &
412
413
414
                                           bcast_buffer(1:nbw,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_REAL_KERNEL)
415
           endif
416
417
418
#endif /* WITH_NO_SPECIFIC_REAL_KERNEL */
#endif /* WITH_REAL_AVX_BLOCK4_KERNEL */

419
420
         endif ! GPU_KERNEL

421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#ifdef WITH_OPENMP
         if (my_thread==1) then
#endif
           kernel_flops = kernel_flops + 4*int(nl,8)*int(ncols,8)*int(nbw,8)
           kernel_time = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
         endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
         call timer%stop("compute_hh_trafo_real_cpu_openmp")
#else
         call timer%stop("compute_hh_trafo_real_cpu")
#endif
#endif

#ifdef WITH_OPENMP
       end subroutine compute_hh_trafo_real_cpu_openmp
#else
       end subroutine compute_hh_trafo_real_cpu
#endif

end module