pack_unpack_gpu.F90 8.66 KB
Newer Older
Andreas Marek's avatar
Andreas Marek committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!
!    More information can be found here:
Andreas Marek's avatar
Andreas Marek committed
20
!    https://elpa.mpcdf.mpg.de/
Andreas Marek's avatar
Andreas Marek committed
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
! This file was written by A. Marek, MPCDF
#endif

    ! Pack a filled row group (i.e. an array of consecutive rows)

    subroutine pack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev, &
                                       rows, n_offset, row_count)
      use cuda_c_kernel
      use cuda_functions
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t)     :: row_group_dev, a_dev

      integer(kind=ik), intent(in) :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in) :: n_offset, row_count
#if REALCASE == 1
      real(kind=C_DATATYPE_KIND)   :: rows(:,:)
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
66
      complex(kind=C_DATATYPE_KIND) :: rows(:,:)
Andreas Marek's avatar
Andreas Marek committed
67
68
69
70
71
72
73
74
75
76
77
#endif
      integer(kind=ik)             :: max_idx
      logical                      :: successCUDA

      ! Use many blocks for higher GPU occupancy
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Use one kernel call to pack the entire row group

!      call my_pack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, a_dev, row_group_dev)

Andreas Marek's avatar
Andreas Marek committed
78
      call launch_my_pack_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
79
80
81
82
83
84
85
86
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      (row_count, n_offset, max_idx, stripe_width, a_dim2, stripe_count, l_nev, a_dev, row_group_dev)

      ! Issue one single transfer call for all rows (device to host)
!        rows(:, 1 : row_count) = row_group_dev(:, 1 : row_count)

Andreas Marek's avatar
Andreas Marek committed
87
      successCUDA =  cuda_memcpy(int(loc(rows(:, 1: row_count)),kind=c_intptr_t), row_group_dev , row_count * l_nev * size_of_&
Andreas Marek's avatar
Andreas Marek committed
88
89
90
91
92
93
      &PRECISION&
      &_&
      &MATH_DATATYPE&
      & , cudaMemcpyDeviceToHost)
      if (.not.(successCUDA)) then
        print *,"pack_row_group_&
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
94
95
96
97
        &MATH_DATATYPE&
        &_gpu_&
        &PRECISION&
        &: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
        stop 1
      endif

    end subroutine


    ! Unpack a filled row group (i.e. an array of consecutive rows)
    subroutine unpack_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
                                         a_dim2, l_nev, rows, n_offset, row_count)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      use cuda_functions
      implicit none
Andreas Marek's avatar
Andreas Marek committed
116
117
118
      integer(kind=c_intptr_t)                     :: row_group_dev, a_dev
      integer(kind=ik), intent(in)                 :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(in)                 :: n_offset, row_count
Andreas Marek's avatar
Andreas Marek committed
119
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
120
      real(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
Andreas Marek's avatar
Andreas Marek committed
121
122
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
123
      complex(kind=C_DATATYPE_KIND), intent(in) :: rows(:, :)
Andreas Marek's avatar
Andreas Marek committed
124
125
#endif

Andreas Marek's avatar
Andreas Marek committed
126
127
      integer(kind=ik)                             :: max_idx
      logical                                      :: successCUDA
Andreas Marek's avatar
Andreas Marek committed
128
129
130
131
132
133
134
135

      ! Use many blocks for higher GPU occupancy
      max_idx = (stripe_count - 1) * stripe_width + last_stripe_width

      ! Issue one single transfer call for all rows (host to device)
!      row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)


Andreas Marek's avatar
Andreas Marek committed
136
      successCUDA =  cuda_memcpy( row_group_dev , int(loc(rows(1, 1)),kind=c_intptr_t),row_count * l_nev * &
Andreas Marek's avatar
Andreas Marek committed
137
                                 size_of_&
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
138
139
140
141
                                 &PRECISION&
                                 &_&
                                 &MATH_DATATYPE&
                                 &, cudaMemcpyHostToDevice)
Andreas Marek's avatar
Andreas Marek committed
142
143
      if (.not.(successCUDA)) then
        print *,"unpack_row_group_&
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
144
145
146
147
        &MATH_DATATYPE&
        &_gpu_&
        &PRECISION&
        &: error in cudaMemcpy"
Andreas Marek's avatar
Andreas Marek committed
148
149
150
151
152
153
        stop 1
      endif

      ! Use one kernel call to pack the entire row group
      !        call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)

Andreas Marek's avatar
Andreas Marek committed
154
      call launch_my_unpack_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
      &MATH_DATATYPE&
      &_&
      &PRECISION &
      ( row_count, n_offset, max_idx,stripe_width,a_dim2, stripe_count, l_nev, &
                                          row_group_dev,a_dev)

    end subroutine

    ! This subroutine must be called before queuing the next row for unpacking; it ensures that an unpacking of the current row group
    ! occurs when the queue is full or when the next row belongs to another group
    subroutine unpack_and_prepare_row_group_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION &
    (row_group, row_group_dev, a_dev, stripe_count, stripe_width, &
                                                     last_stripe_width, a_dim2, l_nev, row_group_size, nblk,      &
                                                     unpack_idx, next_unpack_idx, force)

      use iso_c_binding
      use precision
      implicit none
#if REALCASE == 1
177
      real(kind=C_DATATYPE_KIND)      :: row_group(:,:)
Andreas Marek's avatar
Andreas Marek committed
178
179
#endif
#if COMPLEXCASE == 1
180
      complex(kind=C_DATATYPE_KIND)   :: row_group(:,:)
Andreas Marek's avatar
Andreas Marek committed
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#endif
      integer(kind=c_intptr_t)        :: row_group_dev, a_dev
      integer(kind=ik), intent(in)    :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev
      integer(kind=ik), intent(inout) :: row_group_size
      integer(kind=ik), intent(in)    :: nblk
      integer(kind=ik), intent(inout) :: unpack_idx
      integer(kind=ik), intent(in)    :: next_unpack_idx
      logical, intent(in)             :: force

      if (row_group_size == 0) then
        ! Nothing to flush, just prepare for the upcoming row
        row_group_size = 1
      else
        if (force .or. (row_group_size == nblk) .or. (unpack_idx + 1 /= next_unpack_idx)) then
          ! A flush and a reset must be performed
          call unpack_row_group_&
Lorenz Huedepohl's avatar
Lorenz Huedepohl committed
197
198
199
200
          &MATH_DATATYPE&
          &_gpu_&
          &PRECISION&
          (row_group_dev, a_dev, stripe_count, stripe_width, last_stripe_width, &
Andreas Marek's avatar
Andreas Marek committed
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
                                         a_dim2, l_nev, row_group(:, :), unpack_idx - row_group_size, row_group_size)
          row_group_size = 1
        else
          ! Just prepare for the upcoming row
          row_group_size = row_group_size + 1
        endif
      endif
      ! Always update the index for the upcoming row
      unpack_idx = next_unpack_idx
    end subroutine

    ! The host wrapper for extracting "tau" from the HH reflectors (see the kernel below)
    subroutine extract_hh_tau_&
    &MATH_DATATYPE&
    &_gpu_&
    &PRECISION&
    & (bcast_buffer_dev, hh_tau_dev, nbw, n, is_zero)
      use cuda_c_kernel
      use precision
      use iso_c_binding
      implicit none
      integer(kind=c_intptr_t) :: bcast_buffer_dev, hh_tau_dev
      integer(kind=ik), value  :: nbw, n
      logical, value           :: is_zero
      integer(kind=ik)         :: val_is_zero
      if (is_zero) then
      val_is_zero = 1
      else
       val_is_zero = 0
      endif

Andreas Marek's avatar
Andreas Marek committed
232
      call launch_extract_hh_tau_gpu_kernel_&
Andreas Marek's avatar
Andreas Marek committed
233
234
235
236
237
      &MATH_DATATYPE&
      &_&
      &PRECISION&
      & (bcast_buffer_dev, hh_tau_dev, nbw, n, val_is_zero)
    end subroutine