#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
!    http://elpa.rzg.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

    ! Pack a filled row group (i.e. an array of consecutive rows)
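    ! a_dev holds the matrix in a striped device layout (stripe_width x a_dim2 x stripe_count);
    ! row_count consecutive rows, starting at offset n_offset, are gathered into the contiguous
    ! device buffer row_group_dev and then copied back into the host array "rows" in a single
    ! transfer.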

    subroutine pack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
                                       rows, n_offset, row_count)
      use cuda_c_kernel
      use cuda_functions
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t)     :: row_group_dev, a_dev

      integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in) :: n_offset, row_count
#if REALCASE == 1
      real(kind=C_DATATYPE_KIND)   :: rows(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
      integer(kind=ik)             :: max_idx
      logical                      :: successCUDA

      ! Use many blocks for higher GPU occupancy
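      ! max_idx is the number of valid entries covered by the stripes: stripe_count - 1 full
      ! stripes of stripe_width plus the (possibly narrower) last stripe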
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Use one kernel call to pack the entire row group

!      call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)

      call launch_my_pack_gpu_kernel_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)

      ! Issue one single transfer call for all rows (device to host)
!        rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)

      successCUDA = cuda_memcpy(loc(rows(:, 1:row_count)), row_group_dev, row_count * l_nev * size_of_&
      &PRECISION&
      &_&
      &MATH_DATATYPE&
      & , cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"pack_row_group_&
        &MATH_DATATYPE&
        &_gpu_&
        &PRECISION&
        &: error in cudaMemcpy"
        stop 1
      endif

    end subroutine


    ! Unpack a filled row group (i.e. an array of consecutive rows)
    subroutine unpack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
                                         a_dim2, l_nev, rows, n_offset, row_count)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      use cuda_functions
      implicit none
      integer(kind=c_intptr_t)                     :: row_group_dev, a_dev
      integer(kind=ik), intent(in)                 :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in)                 :: n_offset, row_count
#if REALCASE == 1
      real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif
#if COMPLEXCASE == 1
      complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
#endif

      integer(kind=ik)                             :: max_idx
      logical                                      :: successCUDA

      ! Use many blocks for higher GPU occupancy
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Issue one single transfer call for all rows (host to device)
!      row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)


      successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
                                 size_of_&
                                 &PRECISION&
                                 &_&
                                 &MATH_DATATYPE&
                                 &, cudaMemcpyHostToDevice)
      if (.not.(successCUDA)) then
        print *,"unpack_row_group_&
        &MATH_DATATYPE&
        &_gpu_&
        &PRECISION&
        &: error in cudaMemcpy"
        stop 1
      endif

      ! Use one kernel call to unpack the entire row group
      !        call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)

      call launch_my_unpack_gpu_kernel_&
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, &
                                          row_group_dev, a_dev)

    end subroutine

    ! This subroutine must be called before queuing the next row for unpacking; it ensures that the current
    ! row group is unpacked when the queue is full or when the next row belongs to another group
    subroutine unpack_and_prepare_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
                                                     last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
                                                     unpack_idx, next_unpack_idx, force)

      use iso_c_binding
      use precision
      implicit none
#if REALCASE == 1
      real(kind=C_DATATYPE_KIND)      :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=C_DATATYPE_KIND)   :: row_group(:,:)
#endif
      integer(kind=c_intptr_t)        :: row_group_dev, a_dev
      integer(kind=ik), intent(in)    :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(inout) :: row_group_size
      integer(kind=ik), intent(in)    :: nblk
      integer(kind=ik), intent(inout) :: unpack_idx
      integer(kind=ik), intent(in)    :: next_unpack_idx
      logical, intent(in)             :: force

      if (row_group_size == 0) then
        ! Nothing to flush, just prepare for the upcoming row
        row_group_size = 1
      else
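        ! Flush the pending group to a_dev (and restart the group) when a flush is forced,
        ! when the group has reached the block size nblk, or when the next row is not
        ! contiguous with the rows already queued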
        if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
          ! A flush and a reset must be performed
          call unpack_row_group_&
          &MATH_DATATYPE&
          &_gpu_&
          &PRECISION&
          (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
                                         a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
          row_group_size = 1
        else
          ! Just prepare for the upcoming row
          row_group_size = row_group_size + 1
        endif
      endif
      ! Always update the index for the upcoming row
      unpack_idx = next_unpack_idx
    end subroutine

    ! The host wrapper for computing the dot products between consecutive HH reflectors (the kernel is launched via the cuda_c_kernel interface)
    subroutine compute_hh_dot_products_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION&
    & (bcast_buffer_dev, hh_dot_dev, nbw, n)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_dot_dev
      integer(kind=ik), value  :: nbw, n

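      ! With fewer than two reflectors there are no consecutive pairs, hence nothing to compute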
      if (n .le. 1) return
      call launch_compute_hh_dotp_gpu_kernel_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      & ( bcast_buffer_dev, hh_dot_dev, nbw, n)


    end subroutine

    ! The host wrapper for extracting "tau" from the HH reflectors (the kernel is launched via the cuda_c_kernel interface)
    subroutine extract_hh_tau_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION&
    & (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
      integer(kind=ik), value  :: nbw, n
      logical, value           :: is_zero
      integer(kind=ik)         :: val_is_zero
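      ! The kernel launcher expects an integer flag, so convert the logical argument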
      if (is_zero) then
        val_is_zero = 1
      else
        val_is_zero = 0
      endif

      call launch_extract_hh_tau_gpu_kernel_&
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
    end subroutine

    ! -------------------------------------------
    ! Fortran back-transformation support kernels
    ! -------------------------------------------

    ! Reset a reduction block
    ! Limitation: the thread-block size must be a divisor of the reduction block's size
    ! Reset 2 reduction blocks without an explicit synchronization at the end
    ! Limitation: the thread-block size must be a divisor of the reduction block's size
    ! Perform a reduction on an initialized, 128-element shared block
    ! Compute the dot-product between 2 consecutive HH vectors
    ! Limitation 1: the size of the thread block must be at most 128 and a power-of-2
    ! Limitation 2: the size of the warp must be equal to 32
    !
    ! Extract "tau" from the HH matrix and replace it with 1.0 or 0.0 (depending on case)
    ! Having "tau" as the first element in a HH reflector reduces space requirements, but causes undesired branching in the kernels
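    !
    ! A minimal loop-based sketch of what the extraction performs (illustrative names only; the
    ! actual CUDA kernel is provided via the cuda_c_kernel interface):
    !
    !   do i = 1, n
    !     hh_tau(i) = hh(1, i)                 ! save tau, stored as the reflector's first element
    !     hh(1, i)  = merge(0.0, 1.0, is_zero) ! replace it with 0.0 or 1.0
    !   end do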
    !
    ! -------------------------------------------
    ! Fortran back-transformation support kernels
    ! -------------------------------------------
    !
    ! This is the simplest and slowest available backtransformation kernel
    !
    ! This is an improved version of the simple backtransformation kernel; here, we halve the number of iterations and apply
    ! 2 Householder reflectors per iteration
    !
    ! ---------------------------------
    ! Row packing and unpacking kernels
    ! ---------------------------------
    !
    ! The row group packing kernel
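    !
    ! A loop-based sketch of what the packing kernel computes, assuming a striped source
    ! layout a(stripe_width, a_dim2, stripe_count) ("a" and "row_group" are illustrative
    ! names for the device buffers):
    !
    !   do r = 1, row_count
    !     do idx = 1, max_idx
    !       stripe = (idx - 1) / stripe_width + 1
    !       i      = idx - (stripe - 1) * stripe_width
    !       row_group(idx, r) = a(i, n_offset + r, stripe)
    !     end do
    !   end do
    !
    ! The unpacking kernel performs the reverse assignment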

    ! Host wrapper for the Householder backtransformation step. Several kernels are available. Performance note:
    ! - "compute_hh_trafo_c_kernel" is the C kernel for the backtransformation (this exhibits the best performance)
    ! - "compute_hh_trafo_kernel" is the Fortran equivalent of the C kernel
    ! - "compute_hh_trafo_single_kernel" is the reference Fortran kernel