! mod_compute_hh_trafo_complex.F90
module compute_hh_trafo_complex
#include "config-f90.h"
  use elpa_mpi
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_complex_cpu_openmp
#else
  public compute_hh_trafo_complex_cpu
#endif

  contains

  !> Dispatch to one of the complex "hh_trafo" compute kernels for a single stripe of `a`.
  !>
  !> The kernel is fixed at compile time by the WITH_COMPLEX_*_KERNEL macros; when
  !> WITH_NO_SPECIFIC_COMPLEX_KERNEL is defined it is additionally selected at run time by
  !> comparing THIS_COMPLEX_ELPA_KERNEL against the COMPLEX_ELPA_KERNEL_* constants.
  !>
  !> Columns are processed from `ncols` downwards:
  !>  - "block2" kernels consume two columns of `bcast_buffer` per call (copied into the
  !>    local work array `w`); if `ncols` is odd the remaining first column is handled by
  !>    the corresponding single-vector kernel;
  !>  - all other kernels consume one column of `bcast_buffer` per call.
  !>
  !> Arguments (as used by this routine):
  !>  - a                 : array whose stripe `istripe` (and, with OpenMP, thread slice
  !>                        `my_thread`) is handed to the kernels, starting at column
  !>                        `j+off+a_off`
  !>  - stripe_width      : leading dimension of `a`; also the row count for every stripe
  !>                        except the last one
  !>  - a_dim2            : second extent of `a`
  !>  - stripe_count      : number of stripes (third extent of `a`)
  !>  - max_threads       : (OpenMP only) fourth extent of `a`
  !>  - l_nev             : (OpenMP only) upper bound used to compute the row count of the
  !>                        last stripe
  !>  - a_off, off        : column offsets into `a` / `bcast_buffer`
  !>  - nbw               : leading dimension of `bcast_buffer`, passed to the kernels
  !>  - max_blk_size      : second extent of `bcast_buffer`
  !>  - bcast_buffer      : vectors handed to the kernels, one column per vector
  !>  - kernel_flops      : incremented by 4*4*nl*ncols*nbw
  !>  - kernel_time       : incremented by the wall-clock time spent in the kernel calls
  !>  - ncols             : number of columns of `bcast_buffer` to apply
  !>  - istripe           : index of the stripe to work on
  !>  - last_stripe_width : (non-OpenMP only) row count used for the last stripe
  !>  - my_thread         : (OpenMP only) thread index; only thread 1 updates the counters
  !>  - thread_width      : (OpenMP only) rows per thread, used for the last stripe
  !>  - THIS_COMPLEX_ELPA_KERNEL : run-time kernel selector
#ifdef WITH_OPENMP
  subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev,         &
                                                 a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                 off, ncols, istripe,                                               &
                                                 my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
  subroutine compute_hh_trafo_complex_cpu       (a, stripe_width, a_dim2, stripe_count,                             &
                                                 a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                 off, ncols, istripe, last_stripe_width,                            &
                                                 THIS_COMPLEX_ELPA_KERNEL)
#endif
    use precision
    use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
    use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
    use complex_generic_kernel, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
    use timings
#endif

#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
    use kernel_interfaces
#endif
    implicit none

    real(kind=rk), intent(inout)     :: kernel_time
    integer(kind=lik), intent(inout) :: kernel_flops
    integer(kind=ik), intent(in)     :: nbw, max_blk_size
    ! No intent is declared for bcast_buffer and a: both are handed on to kernels whose
    ! interfaces are not visible in this file.
    complex(kind=ck)                 :: bcast_buffer(nbw,max_blk_size)
    integer(kind=ik), intent(in)     :: a_off

    integer(kind=ik), intent(in)     :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
    integer(kind=ik), intent(in)     :: last_stripe_width
    complex(kind=ck)                 :: a(stripe_width,a_dim2,stripe_count)
#else
    integer(kind=ik), intent(in)     :: max_threads, l_nev, thread_width
    complex(kind=ck)                 :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
    integer(kind=ik), intent(in)     :: THIS_COMPLEX_ELPA_KERNEL

    ! Private variables in OMP regions (my_thread) should better be in the argument list!

    integer(kind=ik), intent(in)     :: off, ncols, istripe
    integer(kind=ik)                 :: j, nl
#ifdef WITH_OPENMP
    integer(kind=ik), intent(in)     :: my_thread
    integer(kind=ik)                 :: noff
#endif
    real(kind=rk)                    :: ttt

    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    !        Currently (on Sandy Bridge), single is faster than double
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    ! Work array holding the two vectors passed to the "block2" kernels.
    complex(kind=ck)                 :: w(nbw,2)

    ! Open the timing region; every exit path below closes it with timer%stop.
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%start("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%start("compute_hh_trafo_complex_cpu")
#endif
#endif

    ! nl = number of rows of the stripe that the kernels operate on.
#ifdef WITH_OPENMP
    if (istripe<stripe_count) then
      nl = stripe_width
    else
      ! Last stripe: clip this thread's row range against l_nev.
      noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
      nl = min(my_thread*thread_width-noff, l_nev-noff)
      if (nl<=0) then
        ! Nothing to do for this thread.
#ifdef HAVE_DETAILED_TIMINGS
        call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#endif
        return
      endif
    endif
#else
    nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif

    ! Start of the timed kernel section. Taken once here (instead of inside every kernel
    ! branch) so that ttt is always defined when kernel_time is updated below, even if no
    ! run-time kernel selection matches.
    ttt = mpi_wtime()

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 2, -2
        w(:,1) = bcast_buffer(1:nbw,j+off)
        w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
        call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
                                             w, nbw, nl, stripe_width, nbw)
#else
        call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe), &
                                             w, nbw, nl, stripe_width, nbw)
#endif
      enddo
      ! After the loop j is 1 exactly when ncols is odd: apply the left-over first vector.
#ifdef WITH_OPENMP
      if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe,my_thread), &
                                                     bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
      if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe), &
                                                     bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
         (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 2, -2
        w(:,1) = bcast_buffer(1:nbw,j+off)
        w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
        call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
                                                  w, nbw, nl, stripe_width, nbw)
#else
        call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), &
                                                  w, nbw, nl, stripe_width, nbw)
#endif
      enddo
      ! After the loop j is 1 exactly when ncols is odd: apply the left-over first vector.
#ifdef WITH_OPENMP
      if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
      if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe), &
                                                          bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
        call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe,my_thread), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
        call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe), &
                                                    bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                    bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
        THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE

        call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                             bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
        call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                             bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */

      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex(a(1,j+off+a_off,istripe,my_thread), &
                                     bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex(a(1,j+off+a_off,istripe), &
                                     bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

!#if defined(WITH_AMD_BULLDOZER)
!              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe,my_thread), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe), &
                                             bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
        (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
      do j = ncols, 1, -1
#ifdef WITH_OPENMP
        call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe,my_thread), &
                                                  bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
        call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe), &
                                                  bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
      enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
    endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL */

    ! Update the performance counters; with OpenMP only thread 1 does so.
#ifdef WITH_OPENMP
    if (my_thread==1) then
#endif
      kernel_flops = kernel_flops + 4*4*int(nl,8)*int(ncols,8)*int(nbw,8)
      kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
    endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
    call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#else
    call timer%stop("compute_hh_trafo_complex_cpu")
#endif
#endif

#ifdef WITH_OPENMP
  end subroutine compute_hh_trafo_complex_cpu_openmp
#else
  end subroutine compute_hh_trafo_complex_cpu

#endif

end module compute_hh_trafo_complex