elpa_multiply_a_b.F90 16.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen ,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Plack-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukutren in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
!
! Author: A. Marek, MPCDF

54
#include "../general/sanity.F90"
55
#include "../general/error_checking.inc"
56

Andreas Marek's avatar
Andreas Marek committed
57
58
59
60
  use elpa1_compute
  use elpa_mpi
  use precision
  use elpa_abstract_impl
61
  use, intrinsic :: iso_c_binding
62
  use elpa_gpu
Andreas Marek's avatar
Andreas Marek committed
63
64
65
  use mod_check_for_gpu
  use elpa_blas_interfaces
  implicit none
66

67
#include "../../src/general/precision_kinds.F90"
Andreas Marek's avatar
Andreas Marek committed
68
  class(elpa_abstract_impl_t), intent(inout) :: obj
69

Andreas Marek's avatar
Andreas Marek committed
70
  character*1                   :: uplo_a, uplo_c
71

Andreas Marek's avatar
Andreas Marek committed
72
73
  integer(kind=ik), intent(in)  :: ldb, ldbCols, ldc, ldcCols
  integer(kind=ik)              :: na, ncb
74
#ifdef USE_ASSUMED_SIZE
Andreas Marek's avatar
Andreas Marek committed
75
  MATH_DATATYPE(kind=rck)       :: a(obj%local_nrows,*), b(ldb,*), c(ldc,*)
76
#else
Andreas Marek's avatar
Andreas Marek committed
77
  MATH_DATATYPE(kind=rck)       :: a(obj%local_nrows,obj%local_ncols), b(ldb,ldbCols), c(ldc,ldcCols)
78
#endif
Andreas Marek's avatar
Andreas Marek committed
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
  integer(kind=ik)              :: my_prow, my_pcol, np_rows, np_cols, myid
  integer(kind=MPI_KIND)        :: my_prowMPI, my_pcolMPI, np_rowsMPI, np_colsMPI
  integer(kind=MPI_KIND)        :: mpierr, myidMPI
  integer(kind=ik)              :: l_cols, l_rows, l_rows_np
  integer(kind=ik)              :: np, n, nb, nblk_mult, lrs, lre, lcs, lce
  integer(kind=ik)              :: gcol_min, gcol, goff
  integer(kind=ik)              :: nstor, nr_done, noff, np_bc, n_aux_bc, nvals
  integer(kind=ik), allocatable :: lrs_save(:), lre_save(:)

  logical                       :: a_lower, a_upper, c_lower, c_upper
  MATH_DATATYPE(kind=rck), pointer, contiguous :: aux_mat(:,:), tmp1(:,:)
  MATH_DATATYPE(kind=rck), allocatable :: aux_bc(:), tmp2(:,:)
  integer(kind=ik)              :: istat
  character(200)                :: errorMessage
  character(20)                 :: gpuString
  logical                       :: success
95
  logical                       :: successGPU
Andreas Marek's avatar
Andreas Marek committed
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
  logical                       :: useGPU
  integer(kind=c_int)           :: gpu, numGPU
  integer(kind=ik)              :: mpi_comm_rows, mpi_comm_cols, mpi_comm_all
  integer(kind=ik)              :: nblk, matrixRows, matrixCols, error
  integer(kind=c_intptr_t)      :: aux_dev, b_dev, tmp1_dev
  type(c_ptr)                   :: aux_host, tmp1_host
  integer(kind=c_intptr_t)      :: num
  integer(kind=c_intptr_t)      :: aux_off, b_off
  integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
                                                            &PRECISION&
                                                            &_&
                                                            &MATH_DATATYPE

  success = .true.

  ! GPU settings
Andreas Marek's avatar
Andreas Marek committed
112
  if (gpu_vendor() == NVIDIA_GPU) then
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
    call obj%get("gpu",gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option for GPU. Aborting..."
      stop
    endif
    if (gpu .eq. 1) then
      print *,"You still use the deprecated option 'gpu', consider switching to 'nvidia-gpu'. Will set the new &
              & keyword 'nvidia-gpu'"
      call obj%set("nvidia-gpu",gpu,error)
      if (error .ne. ELPA_OK) then
        print *,"Problem setting option for NVIDIA GPU. Aborting..."
        stop
      endif
    endif

Andreas Marek's avatar
Andreas Marek committed
128
129
130
131
132
133
134
135
136
137
138
139
140
    call obj%get("nvidia-gpu",gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option for NVIDIA GPU. Aborting..."
      stop
    endif
  else if (gpu_vendor() == AMD_GPU) then
    call obj%get("amd-gpu",gpu,error)
    if (error .ne. ELPA_OK) then
      print *,"Problem getting option for AMD GPU. Aborting..."
      stop
    endif
  else
    gpu = 0
Andreas Marek's avatar
Andreas Marek committed
141
142
  endif

Andreas Marek's avatar
Andreas Marek committed
143

Andreas Marek's avatar
Andreas Marek committed
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
  useGPU = (gpu == 1)

  if(useGPU) then
    gpuString = "_gpu"
  else
    gpuString = ""
  endif

  call obj%timer%start("elpa_mult_at_b_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpuString)

  na          = obj%na
  nblk        = obj%nblk
  matrixRows  = obj%local_nrows
  matrixCols  = obj%local_ncols

  call obj%get("mpi_comm_rows",mpi_comm_rows,error)
  if (error .ne. ELPA_OK) then
    print *,"Problem getting option for mpi_comm_rows. Aborting..."
    stop
  endif
  call obj%get("mpi_comm_cols",mpi_comm_cols,error)
  if (error .ne. ELPA_OK) then
    print *,"Problem getting option for mpi_comm_cols. Aborting..."
    stop
  endif
  call obj%get("mpi_comm_parent",mpi_comm_all,error)
  if (error .ne. ELPA_OK) then
    print *,"Problem getting option for mpi_comm_parent. Aborting..."
    stop
  endif

  call obj%timer%start("mpi_communication")
  call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI ,mpierr)
  call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI ,mpierr)
  call mpi_comm_rank(int(mpi_comm_all, kind=MPI_KIND) ,myidMPI ,mpierr)

  my_prow = int(my_prowMPI,kind=c_int)
  np_rows = int(np_rowsMPI,kind=c_int)
  my_pcol = int(my_pcolMPI,kind=c_int)
  np_cols = int(np_colsMPI,kind=c_int)
  myid    = int(myidMPI,kind=c_int)
  call obj%timer%stop("mpi_communication")
  l_rows = local_index(na,  my_prow, np_rows, nblk, -1) ! Local rows of a and b
  l_cols = local_index(ncb, my_pcol, np_cols, nblk, -1) ! Local cols of b

  ! Block factor for matrix multiplications, must be a multiple of nblk

  if (na/np_rows <= 256) then
     nblk_mult = (31/nblk+1)*nblk
  else
     nblk_mult = (63/nblk+1)*nblk
  endif

  if (useGPU) then
    call obj%timer%start("check_for_gpu")
205
    if (check_for_gpu(obj, myid, numGPU)) then
Andreas Marek's avatar
Andreas Marek committed
206
      ! set the neccessary parameters
Andreas Marek's avatar
Andreas Marek committed
207
      call set_gpu_parameters()
Andreas Marek's avatar
Andreas Marek committed
208
209
210
211
212
213
214
215
216
    else
      print *,"GPUs are requested but not detected! Aborting..."
      success = .false.
      return
    endif
    call obj%timer%stop("check_for_gpu")

    ! copy b to b_dev
    num = ldb*ldbCols*size_of_datatype
217
    successGPU = gpu_malloc(b_dev,num)
218
    check_alloc_gpu("elpa_mult_at_b: b_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
219

220
    successGPU = gpu_host_register(int(loc(b),kind=c_intptr_t),num,&
221
                  gpuHostRegisterDefault)
Andreas Marek's avatar
Andreas Marek committed
222

223
    check_host_register_gpu("elpa_mult_at_b: b", successGPU)
Andreas Marek's avatar
Andreas Marek committed
224

225
    successGPU = gpu_memcpy(b_dev,int(loc(b),kind=c_intptr_t),num,&
226
                  gpuMemcpyHostToDevice)
227
    check_memcpy_gpu("elpa_mult_at_b: b to b_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
228
229

    num = l_rows*nblk_mult*size_of_datatype
230
    successGPU = gpu_malloc_host(aux_host,num)
231
    check_host_alloc_gpu("elpa_mult_at_b: aux_host", successGPU)
Andreas Marek's avatar
Andreas Marek committed
232
233
234

    call c_f_pointer(aux_host,aux_mat,(/l_rows,nblk_mult/))

235
    successGPU = gpu_malloc(aux_dev,num)
236
    check_alloc_gpu("elpa_mult_at_b: aux_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
237
238

    num = nblk_mult*l_cols*size_of_datatype
239
    successGPU = gpu_malloc_host(tmp1_host,num)
240
    check_host_alloc_gpu("elpa_mult_at_b: tmp1_host", successGPU)
Andreas Marek's avatar
Andreas Marek committed
241
242
243

    call c_f_pointer(tmp1_host,tmp1,(/nblk_mult,l_cols/))

244
    successGPU = gpu_malloc(tmp1_dev,num)
245
    check_alloc_gpu("elpa_mult_at_b: tmp1_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
  else ! useGPU
    allocate(aux_mat(l_rows,nblk_mult), stat=istat, errmsg=errorMessage)
    check_allocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
  endif ! useGPU

  allocate(aux_bc(l_rows*nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: aux_bc", istat, errorMessage)

  allocate(lrs_save(nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: lrs_save", istat, errorMessage)

  allocate(lre_save(nblk), stat=istat, errmsg=errorMessage)
  check_allocate("elpa_mult_at_b: lre_save", istat, errorMessage)

  a_lower = .false.
  a_upper = .false.
  c_lower = .false.
  c_upper = .false.

  if (uplo_a=='u' .or. uplo_a=='U') a_upper = .true.
  if (uplo_a=='l' .or. uplo_a=='L') a_lower = .true.
  if (uplo_c=='u' .or. uplo_c=='U') c_upper = .true.
  if (uplo_c=='l' .or. uplo_c=='L') c_lower = .true.
Wenzhe Yu's avatar
Wenzhe Yu committed
269

Andreas Marek's avatar
Andreas Marek committed
270
  ! Build up the result matrix by processor rows
Wenzhe Yu's avatar
Wenzhe Yu committed
271

Andreas Marek's avatar
Andreas Marek committed
272
  do np = 0, np_rows-1
Wenzhe Yu's avatar
Wenzhe Yu committed
273

Andreas Marek's avatar
Andreas Marek committed
274
    ! In this turn, procs of row np assemble the result
275

Andreas Marek's avatar
Andreas Marek committed
276
    l_rows_np = local_index(na, np, np_rows, nblk, -1) ! local rows on receiving processors
277

Andreas Marek's avatar
Andreas Marek committed
278
279
280
    nr_done = 0 ! Number of rows done
    aux_mat = 0
    nstor = 0   ! Number of columns stored in aux_mat
281

Andreas Marek's avatar
Andreas Marek committed
282
    ! Loop over the blocks on row np
283

Andreas Marek's avatar
Andreas Marek committed
284
    do nb=0,(l_rows_np-1)/nblk
285

Andreas Marek's avatar
Andreas Marek committed
286
      goff  = nb*np_rows + np ! Global offset in blocks corresponding to nb
287

Andreas Marek's avatar
Andreas Marek committed
288
289
290
      ! Get the processor column which owns this block (A is transposed, so we need the column)
      ! and the offset in blocks within this column.
      ! The corresponding block column in A is then broadcast to all for multiplication with B
291

Andreas Marek's avatar
Andreas Marek committed
292
293
294
      np_bc = MOD(goff,np_cols)
      noff = goff/np_cols
      n_aux_bc = 0
295

Andreas Marek's avatar
Andreas Marek committed
296
      ! Gather up the complete block column of A on the owner
297

Andreas Marek's avatar
Andreas Marek committed
298
      do n = 1, min(l_rows_np-nb*nblk,nblk) ! Loop over columns to be broadcast
299

Andreas Marek's avatar
Andreas Marek committed
300
301
        gcol = goff*nblk + n ! global column corresponding to n
        if (nstor==0 .and. n==1) gcol_min = gcol
302

Andreas Marek's avatar
Andreas Marek committed
303
304
305
306
        lrs = 1       ! 1st local row number for broadcast
        lre = l_rows  ! last local row number for broadcast
        if (a_lower) lrs = local_index(gcol, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)
307

Andreas Marek's avatar
Andreas Marek committed
308
309
310
311
312
        if (lrs<=lre) then
          nvals = lre-lrs+1
          if (my_pcol == np_bc) aux_bc(n_aux_bc+1:n_aux_bc+nvals) = a(lrs:lre,noff*nblk+n)
          n_aux_bc = n_aux_bc + nvals
        endif
313

Andreas Marek's avatar
Andreas Marek committed
314
315
        lrs_save(n) = lrs
        lre_save(n) = lre
316

Andreas Marek's avatar
Andreas Marek committed
317
      enddo
318

Andreas Marek's avatar
Andreas Marek committed
319
      ! Broadcast block column
320
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
321
      call obj%timer%start("mpi_communication")
322
#if REALCASE == 1
Andreas Marek's avatar
Andreas Marek committed
323
324
325
      call MPI_Bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND),    &
                     MPI_REAL_PRECISION,  &
                     int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
326
327
#endif
#if COMPLEXCASE == 1
Andreas Marek's avatar
Andreas Marek committed
328
329
330
      call MPI_Bcast(aux_bc, int(n_aux_bc,kind=MPI_KIND),    &
                     MPI_COMPLEX_PRECISION,  &
                     int(np_bc,kind=MPI_KIND), int(mpi_comm_cols,kind=MPI_KIND), mpierr)
331
#endif
Andreas Marek's avatar
Andreas Marek committed
332
      call obj%timer%stop("mpi_communication")
333
#endif /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
      ! Insert what we got in aux_mat

      n_aux_bc = 0
      do n = 1, min(l_rows_np-nb*nblk,nblk)
        nstor = nstor+1
        lrs = lrs_save(n)
        lre = lre_save(n)
        if (lrs<=lre) then
          nvals = lre-lrs+1
          aux_mat(lrs:lre,nstor) = aux_bc(n_aux_bc+1:n_aux_bc+nvals)
          n_aux_bc = n_aux_bc + nvals
        endif
      enddo

      ! If we got nblk_mult columns in aux_mat or this is the last block
      ! do the matrix multiplication

      if (nstor==nblk_mult .or. nb*nblk+nblk >= l_rows_np) then

        lrs = 1       ! 1st local row number for multiply
        lre = l_rows  ! last local row number for multiply
        if (a_lower) lrs = local_index(gcol_min, my_prow, np_rows, nblk, +1)
        if (a_upper) lre = local_index(gcol, my_prow, np_rows, nblk, -1)

        lcs = 1       ! 1st local col number for multiply
        lce = l_cols  ! last local col number for multiply
        if (c_upper) lcs = local_index(gcol_min, my_pcol, np_cols, nblk, +1)
        if (c_lower) lce = MIN(local_index(gcol, my_pcol, np_cols, nblk, -1),l_cols)

        if (lcs<=lce) then
          allocate(tmp1(nstor,lcs:lce), tmp2(nstor,lcs:lce), stat=istat, errmsg=errorMessage)
          call check_alloc("elpa_mult_at_b_&
          &MATH_DATATYPE ", "tmp1", istat, errorMessage)

          if (lrs<=lre) then
            if (useGPU) then
              num = l_rows*nblk_mult*size_of_datatype
371
              successGPU = gpu_memcpy(aux_dev, int(loc(aux_mat),kind=c_intptr_t), &
Andreas Marek's avatar
Andreas Marek committed
372
                            num, gpuMemcpyHostToDevice)
373
              check_memcpy_gpu("elpa_mult_at_b: aux_mat to aux_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
374
375
376
377

              aux_off = (lrs-1)*size_of_datatype
              b_off = ((lcs-1)*ldb+lrs-1)*size_of_datatype

Andreas Marek's avatar
Andreas Marek committed
378
              call obj%timer%start("gpublas")
Andreas Marek's avatar
Andreas Marek committed
379
              call gpublas_PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', nstor, lce-lcs+1, &
Andreas Marek's avatar
Andreas Marek committed
380
381
                   lre-lrs+1, ONE, aux_dev+aux_off, l_rows, b_dev+b_off, ldb, ZERO, &
                   tmp1_dev, nstor)
Andreas Marek's avatar
Andreas Marek committed
382
              call obj%timer%stop("gpublas")
Andreas Marek's avatar
Andreas Marek committed
383
384

              num = nstor*(lce-lcs+1)*size_of_datatype
385
              successGPU = gpu_memcpy(int(loc(tmp1),kind=c_intptr_t), &
Andreas Marek's avatar
Andreas Marek committed
386
                            tmp1_dev, num, gpuMemcpyDeviceToHost)
387
              check_memcpy_gpu("elpa_mult_at_b: tmp1_dev to tmp1", successGPU)
Andreas Marek's avatar
Andreas Marek committed
388
389
390
391
392
393
394
395
396
397
398
399
400
401
            else ! useGPU
              call obj%timer%start("blas")
              call PRECISION_GEMM(BLAS_TRANS_OR_CONJ, 'N', int(nstor,kind=BLAS_KIND), &
                                int(lce-lcs+1,kind=BLAS_KIND), int(lre-lrs+1,kind=BLAS_KIND), &
                                ONE, aux_mat(lrs:lre,1:nstor), int(lre-lrs+1,kind=BLAS_KIND), &
                                b(lrs,lcs), int(ldb,kind=BLAS_KIND), ZERO, tmp1, &
                                int(nstor,kind=BLAS_KIND))
              call obj%timer%stop("blas")
            endif ! useGPU
          else
            tmp1 = 0
          endif

          ! Sum up the results and send to processor row np
402
#ifdef WITH_MPI
Andreas Marek's avatar
Andreas Marek committed
403
404
405
406
407
408
          call obj%timer%start("mpi_communication")
          call mpi_reduce(tmp1, tmp2, int(nstor*(lce-lcs+1),kind=MPI_KIND),  MPI_MATH_DATATYPE_PRECISION, &
                          MPI_SUM, int(np,kind=MPI_KIND), int(mpi_comm_rows,kind=MPI_KIND), mpierr)
          call obj%timer%stop("mpi_communication")
          ! Put the result into C
          if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp2(1:nstor,lcs:lce)
409
410

#else /* WITH_MPI */
Andreas Marek's avatar
Andreas Marek committed
411
412
413
!          tmp2 = tmp1
          ! Put the result into C
          if (my_prow==np) c(nr_done+1:nr_done+nstor,lcs:lce) = tmp1(1:nstor,lcs:lce)
414
415
416

#endif /* WITH_MPI */

Andreas Marek's avatar
Andreas Marek committed
417
418
419
          deallocate(tmp1,tmp2, stat=istat, errmsg=errorMessage)
          check_deallocate("elpa_mult_at_b: tmp1, tmp2", istat, errorMessage)
        endif
420

Andreas Marek's avatar
Andreas Marek committed
421
422
423
424
425
426
        nr_done = nr_done+nstor
        nstor=0
        aux_mat(:,:)=0
      endif
    enddo
  enddo
427

Andreas Marek's avatar
Andreas Marek committed
428
  if (useGPU) then
429
    successGPU = gpu_free(b_dev)
430
    check_dealloc_gpu("elpa_multiply_a_b: b_dev", successGPU)
Wenzhe Yu's avatar
Wenzhe Yu committed
431

432
    successGPU = gpu_host_unregister(int(loc(b),kind=c_intptr_t))
433
    check_host_unregister_gpu("elpa_multiply_a_b: b", successGPU)
Wenzhe Yu's avatar
Wenzhe Yu committed
434

Andreas Marek's avatar
Andreas Marek committed
435
436
    nullify(aux_mat)
    nullify(tmp1)
Wenzhe Yu's avatar
Wenzhe Yu committed
437

438
    successGPU = gpu_free_host(aux_host)
439
    check_host_dealloc_gpu("elpa_multiply_a_b: aux_host", successGPU)
Wenzhe Yu's avatar
Wenzhe Yu committed
440

441
    successGPU = gpu_free(aux_dev)
442
    check_dealloc_gpu("elpa_multiply_a_b: aux_dev", successGPU)
Wenzhe Yu's avatar
Wenzhe Yu committed
443

444
    successGPU = gpu_free_host(tmp1_host)
445
    check_host_dealloc_gpu("elpa_multiply_a_b: tmp1_host", successGPU)
Wenzhe Yu's avatar
Wenzhe Yu committed
446

447
    successGPU = gpu_free(tmp1_dev)
448
    check_dealloc_gpu("elpa_multiply_a_b: tmp1_dev", successGPU)
Andreas Marek's avatar
Andreas Marek committed
449
450
451
452
  else ! useGPU
    deallocate(aux_mat, stat=istat, errmsg=errorMessage)
    check_deallocate("elpa_mult_at_b: aux_mat", istat, errorMessage)
  endif ! useGPU
Wenzhe Yu's avatar
Wenzhe Yu committed
453

Andreas Marek's avatar
Andreas Marek committed
454
455
  deallocate(aux_bc, lrs_save, lre_save, stat=istat, errmsg=errorMessage)
  check_deallocate("elpa_mult_at_b: aux_bc, lrs_save, lre_save", istat, errorMessage)
456

Andreas Marek's avatar
Andreas Marek committed
457

Andreas Marek's avatar
Andreas Marek committed
458
459
460
461
462
  call obj%timer%stop("elpa_mult_at_b_&
  &MATH_DATATYPE&
  &_&
  &PRECISION&
  &"//gpuString)