mod_compute_hh_trafo_complex.F90 15.5 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! Author: Andreas Marek, MPCDF

44
45
!> Dispatch layer for the complex hh_trafo compute kernels.
!>
!> Exactly one driver routine is compiled, depending on the build:
!> compute_hh_trafo_complex_cpu_openmp for threaded builds and
!> compute_hh_trafo_complex_cpu otherwise. The driver walks over a range of
!> columns of bcast_buffer and hands each column (or pair of columns) together
!> with the matching slice of the stripe array a to the kernel that was
!> selected at configure time or, for multi-kernel builds, at run time.
module compute_hh_trafo_complex
#include "config-f90.h"
  use elpa_mpi
  implicit none

#ifdef WITH_OPENMP
  public compute_hh_trafo_complex_cpu_openmp
#else
  public compute_hh_trafo_complex_cpu
#endif


  contains

         !> Apply ncols hh_trafo kernel steps to one stripe of a.
         !>
         !> Columns off+ncols down to off+1 of bcast_buffer are processed in
         !> descending order. The block2 kernels consume two columns per call,
         !> with a single-vector call for a remaining odd column; all other
         !> kernels consume one column per call.
         !>
         !> Arguments (the threaded variant has max_threads, l_nev, my_thread and
         !> thread_width; the serial variant has last_stripe_width instead):
         !>   a                 stripe storage, dimensioned
         !>                     (stripe_width, a_dim2, stripe_count[, max_threads]);
         !>                     slices are handed to the kernels, which presumably
         !>                     update them in place (kernel sources not visible here)
         !>   stripe_width      leading extent of a, also passed to the kernels
         !>   a_dim2            second extent of a
         !>   stripe_count      number of stripes; istripe below this value uses the
         !>                     full stripe_width as row count
         !>   max_threads       fourth extent of a (threaded build only)
         !>   l_nev             bounds the row count of the last stripe (threaded only)
         !>   a_off             offset added to the column index into a
         !>   nbw               first extent of bcast_buffer, passed to the kernels
         !>   max_blk_size      second extent of bcast_buffer
         !>   bcast_buffer      source of the vectors passed to the kernels
         !>   kernel_flops      incremented by 16*nl*ncols*nbw
         !>   kernel_time       incremented by the elapsed mpi_wtime of the kernel loop
         !>   off, ncols        first column offset and number of columns to process
         !>   istripe           index of the stripe to work on
         !>   last_stripe_width row count used when istripe is not below stripe_count
         !>                     (serial build only)
         !>   my_thread         index into the fourth dimension of a; only thread 1
         !>                     updates kernel_flops and kernel_time (threaded only)
         !>   thread_width      per-thread width used for the last stripe (threaded only)
         !>   THIS_COMPLEX_ELPA_KERNEL
         !>                     run-time kernel selector, only inspected in builds
         !>                     without one specific kernel
#ifdef WITH_OPENMP
         subroutine compute_hh_trafo_complex_cpu_openmp(a, stripe_width, a_dim2, stripe_count, max_threads, l_nev,         &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe,                                               &
                                                        my_thread, thread_width, THIS_COMPLEX_ELPA_KERNEL)
#else
         subroutine compute_hh_trafo_complex_cpu       (a, stripe_width, a_dim2, stripe_count,                             &
                                                        a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
                                                        off, ncols, istripe, last_stripe_width,                            &
                                                        THIS_COMPLEX_ELPA_KERNEL)
#endif
           use precision
           use elpa2_utilities
#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
           use complex_generic_simple_kernel, only : single_hh_trafo_complex_generic_simple
#endif
#if defined(WITH_COMPLEX_GENERIC_KERNEL)
           use complex_generic_kernel, only : single_hh_trafo_complex_generic
#endif
#ifdef HAVE_DETAILED_TIMINGS
           use timings
#endif

#if defined(HAVE_AVX) || defined(HAVE_SSE_INTRINSICS) || defined(HAVE_SSE_ASSEMBLY)
           use kernel_interfaces
#endif
           implicit none
           real(kind=rk), intent(inout)     :: kernel_time
           integer(kind=lik), intent(inout) :: kernel_flops
           integer(kind=ik), intent(in)     :: nbw, max_blk_size
           ! NOTE(review): bcast_buffer is only read in this routine, but it is passed
           ! straight to the kernels whose interfaces are not visible here, so its
           ! intent is deliberately left unspecified.
           complex(kind=ck)                 :: bcast_buffer(nbw,max_blk_size)
           integer(kind=ik), intent(in)     :: a_off

           integer(kind=ik), intent(in)     :: stripe_width, a_dim2, stripe_count
#ifndef WITH_OPENMP
           integer(kind=ik), intent(in)     :: last_stripe_width
           complex(kind=ck), intent(inout)  :: a(stripe_width,a_dim2,stripe_count)
#else
           integer(kind=ik), intent(in)     :: max_threads, l_nev, thread_width
           complex(kind=ck), intent(inout)  :: a(stripe_width,a_dim2,stripe_count,max_threads)
#endif
           integer(kind=ik), intent(in)     :: THIS_COMPLEX_ELPA_KERNEL

           ! Private variables in OMP regions (my_thread) should better be in the argument list!

           integer(kind=ik), intent(in)     :: off, ncols, istripe
#ifdef WITH_OPENMP
           integer(kind=ik), intent(in)     :: my_thread
           integer(kind=ik)                 :: noff
#endif
           ! j  : column loop index, nl : number of rows handed to the kernels
           integer(kind=ik)                 :: j, nl
           ! start time stamp of the kernel loop
           real(kind=rk)                    :: ttt

           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
           !        Currently (on Sandy Bridge), single is faster than double
           !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

           ! scratch copy of two consecutive bcast_buffer columns for the block2 kernels
           complex(kind=ck)                 :: w(nbw,2)

#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
          call timer%start("compute_hh_trafo_complex_cpu_openmp")
#else
          call timer%start("compute_hh_trafo_complex_cpu")
#endif
#endif

           ! Determine the number of rows nl for this stripe.
#ifdef WITH_OPENMP
           if (istripe<stripe_count) then
             nl = stripe_width
           else
             noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
             nl = min(my_thread*thread_width-noff, l_nev-noff)
             ! nothing left to do for this thread in the last stripe
             if(nl<=0) then
#ifdef HAVE_DETAILED_TIMINGS
               call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#endif
               return
             endif
           endif
#else
           nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#endif

           ! Fallback time stamp: every kernel branch below resets ttt right before
           ! its loop, but if the run-time selector matches none of the compiled-in
           ! kernels ttt would otherwise be undefined in the kernel_time update.
           ttt = mpi_wtime()

#if defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK2) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             ! two columns per kernel call
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_sse_2hv(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
             ! the loop leaves j at 1 when ncols is odd: one column remains
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_sse_1hv(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK2_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if ( (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK2) .or. &
                (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK2) ) then
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             ! two columns per kernel call
             do j = ncols, 2, -2
               w(:,1) = bcast_buffer(1:nbw,j+off)
               w(:,2) = bcast_buffer(1:nbw,j+off-1)
#ifdef WITH_OPENMP
               call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe,my_thread), &
                                                       w, nbw, nl, stripe_width, nbw)
#else
               call double_hh_trafo_complex_avx_avx2_2hv(a(1,j+off+a_off-1,istripe), &
                                                       w, nbw, nl, stripe_width, nbw)
#endif
             enddo
             ! the loop leaves j at 1 when ncols is odd: one column remains
#ifdef WITH_OPENMP
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe,my_thread), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#else
             if (j==1) call single_hh_trafo_complex_avx_avx2_1hv(a(1,1+off+a_off,istripe), &
                                                             bcast_buffer(1,off+1), nbw, nl, stripe_width)
#endif
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif
#endif  /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK2_KERNEL || WITH_COMPLEX_AVX2_BLOCK2_KERNEL  */


#if defined(WITH_COMPLEX_GENERIC_SIMPLE_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
            if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC_SIMPLE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe,my_thread), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                           bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
               call single_hh_trafo_complex_generic_simple(a(1,j+off+a_off,istripe), &
                                                          bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
               call single_hh_trafo_complex_generic_simple(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                          bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#endif /* WITH_OPENMP */
             enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_SIMPLE_KERNEL */


#if defined(WITH_COMPLEX_GENERIC_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_GENERIC .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGP .or. &
               THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_BGQ ) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
#ifdef DESPERATELY_WANT_ASSUMED_SIZE

              call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe,my_thread), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe,my_thread), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif

#else /* WITH_OPENMP */
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
              call single_hh_trafo_complex_generic(a(1,j+off+a_off,istripe), &
                                                   bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_generic(a(1:stripe_width,j+off+a_off:j+off+a_off+nbw-1,istripe), &
                                                   bcast_buffer(1:nbw,j+off),nbw,nl,stripe_width)
#endif
#endif /* WITH_OPENMP */

            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_GENERIC_KERNEL */

#if defined(WITH_COMPLEX_SSE_ASSEMBLY_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
           if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
             ttt = mpi_wtime()
             do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_complex(a(1,j+off+a_off,istripe,my_thread), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex(a(1,j+off+a_off,istripe), &
                                           bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_ASSEMBLY_KERNEL */


!#if defined(WITH_AVX_SANDYBRIDGE)
!              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

!#if defined(WITH_AMD_BULLDOZER)
!              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe),bcast_buffer(1,j+off),nbw,nl,stripe_width)
!#endif

#if defined(WITH_COMPLEX_SSE_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_SSE_BLOCK1) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

          ! In a single-kernel build the block1 loop is skipped when the block2
          ! kernel is compiled in as well, since block2 already covers all columns.
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL))
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe,my_thread), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_sse_1hv(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_SSE_BLOCK2_KERNEL)) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_SSE_BLOCK1_KERNEL */

#if defined(WITH_COMPLEX_AVX_BLOCK1_KERNEL) || defined(WITH_COMPLEX_AVX2_BLOCK1_KERNEL)
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          if ((THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX_BLOCK1) .or. &
              (THIS_COMPLEX_ELPA_KERNEL .eq. COMPLEX_ELPA_KERNEL_AVX2_BLOCK1)) then
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */

          ! In a single-kernel build the block1 loop is skipped when a block2
          ! kernel is compiled in as well, since block2 already covers all columns.
#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL))
            ttt = mpi_wtime()
            do j = ncols, 1, -1
#ifdef WITH_OPENMP
              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe,my_thread), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#else
              call single_hh_trafo_complex_avx_avx2_1hv(a(1,j+off+a_off,istripe), &
                                                       bcast_buffer(1,j+off),nbw,nl,stripe_width)
#endif
            enddo
#endif /* defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL) || (defined(WITH_ONE_SPECIFIC_COMPLEX_KERNEL) && !defined(WITH_COMPLEX_AVX_BLOCK2_KERNEL) && !defined(WITH_COMPLEX_AVX2_BLOCK2_KERNEL)) */

#if defined(WITH_NO_SPECIFIC_COMPLEX_KERNEL)
          endif
#endif /* WITH_NO_SPECIFIC_COMPLEX_KERNEL */
#endif /* WITH_COMPLEX_AVX_BLOCK1_KERNEL || WITH_COMPLEX_AVX2_BLOCK1_KERNEL */

          ! Book-keeping: in the threaded build only thread 1 updates the counters.
#ifdef WITH_OPENMP
          if (my_thread==1) then
#endif
            ! convert in the kind of kernel_flops so the product cannot overflow
            ! the default integer range
            kernel_flops = kernel_flops + 4*4*int(nl,kind=lik)*int(ncols,kind=lik)*int(nbw,kind=lik)
            kernel_time  = kernel_time + mpi_wtime()-ttt
#ifdef WITH_OPENMP
          endif
#endif
#ifdef HAVE_DETAILED_TIMINGS
#ifdef WITH_OPENMP
          call timer%stop("compute_hh_trafo_complex_cpu_openmp")
#else
          call timer%stop("compute_hh_trafo_complex_cpu")
#endif
#endif

#ifdef WITH_OPENMP
        end subroutine compute_hh_trafo_complex_cpu_openmp
#else
        end subroutine compute_hh_trafo_complex_cpu

#endif

end module compute_hh_trafo_complex