#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

!> \brief Transforms the eigenvectors of a tridiagonal matrix back
!>                     to the eigenvectors of the original matrix
!>                     (like Scalapack Routine PDORMTR)
!>
!  Parameters
!
!> \param na          Order of matrix a_mat, number of rows of matrix q_mat
!>
!> \param nqc         Number of columns of matrix q_mat
!>
!> \param a_mat(lda,matrixCols)  Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
!>                           Distribution is like in Scalapack.
!>
!> \param lda         Leading dimension of a_mat
!>
!> \param tau(na)     Factors of the Householder vectors
!>
!> \param q_mat           On input: Eigenvectors of tridiagonal matrix
!>                    On output: Transformed eigenvectors
!>                    Distribution is like in Scalapack.
!>
!> \param ldq         Leading dimension of q_mat
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix a_mat and q_mat
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!>
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
!>
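!>
!> \note  Illustrative summary: the routine applies the Householder reflectors produced by
!>        the tridiagonalization to q_mat in blocked form, i.e. it repeatedly performs
!>        q_mat <- q_mat - hvm * tmat * hvm**H * q_mat
!>        (see the "Q = Q - V * T * V**T * Q" step in the body below).
!>
!>        A hypothetical call of the real double-precision instantiation of this template
!>        (the routine name is assembled from the MATH_DATATYPE and PRECISION macros)
!>        could look like:
!>
!>          call trans_ev_real_double(na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, &
!>                                    matrixCols, mpi_comm_rows, mpi_comm_cols, .false.)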

    subroutine trans_ev_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
      use cuda_functions
      use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)                  :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(in)          :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(in)    :: tau(na)
#endif

#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*), q_mat(ldq,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) ::  a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
#endif
      logical, intent(in)                           :: useGPU

      integer(kind=ik)                              :: max_stored_rows

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                              :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
      integer(kind=ik)                              :: l_cols, l_rows, l_colh, nstor
      integer(kind=ik)                              :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
      integer(kind=ik)                              :: hvn_ubnd, hvm_ubnd

#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
      real(kind=REAL_DATATYPE), allocatable         :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage

      integer(kind=C_intptr_T)                      :: q_dev, tmp_dev, hvm_dev, tmat_dev
      logical                                       :: successCUDA
      integer(kind=c_intptr_t), parameter             :: size_of_datatype = size_of_&
                                                                          &PRECISION&
                                                                          &_&
                                                                          &MATH_DATATYPE

      call timer%start("trans_ev_&
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX &
      )

      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")

      totalblocks = (na-1)/nblk + 1
      max_blocks_row = (totalblocks-1)/np_rows + 1
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q_mat!

      max_local_rows = max_blocks_row*nblk
      max_local_cols = max_blocks_col*nblk

      max_stored_rows = (63/nblk+1)*nblk
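      ! hvm accumulates up to max_stored_rows Householder vectors, which are then applied
      ! to q_mat in blocked form (see the update inside the istep loop below)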

      allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmat", istat, errorMessage)

      allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "h1", istat, errorMessage)

      allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "h2", istat, errorMessage)

      allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmp1", istat, errorMessage)

      allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmp2", istat, errorMessage)

      allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "hvb", istat, errorMessage)

      allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "hvm", istat, errorMessage)

      hvm = 0   ! Must be set to 0 !!!
      hvb = 0   ! Safety only

      l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat

      nstor = 0
      if (useGPU) then
        hvn_ubnd = 0
      endif

#if COMPLEXCASE == 1
      ! In the complex case tau(2) can be nonzero. Its Householder vector has the single
      ! component 1, so applying that reflector just scales the first row of q_mat by (1-tau(2)).
      if (my_prow == prow(1, nblk, np_rows)) then
        q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
      endif
#endif

      if (useGPU) then
        ! todo: this is used only for copying hvm to the device; it should be possible to go without it
        allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
	call check_alloc("trans_ev_&
	&MATH_DATATYPE&
	&", "hvm1", istat, errorMessage)

        successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

!         q_dev = q_mat
        successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif  ! useGPU

      do istep = 1, na, nblk
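        ! For this block of (up to) nblk Householder vectors: gather their local parts,
        ! broadcast them along mpi_comm_cols from the process column that owns them, and
        ! append them to hvm; once max_stored_rows vectors have accumulated (or at the end
        ! of the loop) apply them to q_mat in blocked form.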
        ics = MAX(istep,3)
        ice = MIN(istep+nblk-1,na)
        if (ice<ics) cycle

        cur_pcol = pcol(istep, nblk, np_cols)

        nb = 0
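        ! Gather the local pieces of the Householder vectors of this block into hvb
        ! (only the owning process column reads them from a_mat; the unit element of
        ! each vector is set explicitly on the process row that holds it)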
        do ic = ics, ice

          l_colh = local_index(ic  , my_pcol, np_cols, nblk, -1) ! Column of Householder vector
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector


          if (my_pcol == cur_pcol) then
            hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
            if (my_prow == prow(ic-1, nblk, np_rows)) then
              hvb(nb+l_rows) = 1.
            endif
          endif

          nb = nb+l_rows
        enddo

#ifdef WITH_MPI
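        ! Broadcast the gathered vectors from the owning process column to all process columns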
        call timer%start("mpi_communication")
        if (nb>0) &
          call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
	  &MPI_REAL_PRECISION&
#endif

#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  , cur_pcol, mpi_comm_cols, mpierr)
        call timer%stop("mpi_communication")
#endif /* WITH_MPI */

        nb = 0
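        ! Append the broadcast vectors to hvm, the buffer of currently stored Householder vectors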
        do ic = ics, ice
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
          hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
          if (useGPU) then
            hvm_ubnd = l_rows
          endif
          nstor = nstor+1
          nb = nb+l_rows
        enddo

        ! Please note: for smaller matrix sizes (na/np_rows<=256), a value of 32 for nstor is enough!
        if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then

          ! Calculate scalar products of stored vectors.
          ! This can be done in different ways; we use dsyrk (real case) or zherk (complex case)

          tmat = 0
	  call timer%start("blas")
          if (l_rows>0) &
#if REALCASE == 1
            call PRECISION_SYRK('U', 'T',   &
#endif
#if COMPLEXCASE == 1
            call PRECISION_HERK('U', 'C',   &
#endif
	                         nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
	  call timer%stop("blas")
          nc = 0
          do n = 1, nstor-1
            h1(nc+1:nc+n) = tmat(1:n,n+1)
            nc = nc+n
          enddo
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
          &MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  &, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */

          if (nc > 0) h2 = h1

#endif /* WITH_MPI */
          ! Calculate triangular matrix T
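          ! (the usual recurrence for the triangular factor of a blocked product of Householder
          ! reflectors: the diagonal holds the tau factors, and each new row couples the new
          ! reflector to the previously stored ones via the scalar products computed above)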

          nc = 0
          tmat(1,1) = tau(ice-nstor+1)
          do n = 1, nstor-1
	    call timer%start("blas")
#if REALCASE == 1
            call PRECISION_TRMV('L', 'T', 'N',    &
#endif
#if COMPLEXCASE == 1
            call PRECISION_TRMV('L', 'C', 'N',    &
#endif
                      	        n, tmat, max_stored_rows, h2(nc+1), 1)
	    call timer%stop("blas")

            tmat(n+1,1:n) = &
#if REALCASE == 1
	    -h2(nc+1:nc+n)  &
#endif
#if COMPLEXCASE == 1
            -conjg(h2(nc+1:nc+n)) &
#endif
	    *tau(ice-nstor+n+1)

            tmat(n+1,n+1) = tau(ice-nstor+n+1)
            nc = nc+n
          enddo

          if (useGPU) then
            ! todo: is this reshape really necessary?
            hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))

            !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
            successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)),   &
                          hvm_ubnd * nstor * size_of_datatype, cudaMemcpyHostToDevice)

            check_memcpy_cuda("trans_ev", successCUDA)

            !tmat_dev = tmat
            successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)),   &
                          max_stored_rows * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif

          ! Q = Q - V * T * V**T * Q
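          ! done in three steps: tmp = hvm**H * q_mat (GEMM, reduced over the process rows),
          ! tmp = tmat * tmp (TRMM), and finally q_mat = q_mat - hvm * tmp (GEMM)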

          if (l_rows>0) then
            if (useGPU) then
	      call timer%start("cublas")
#if REALCASE == 1
              call cublas_PRECISION_GEMM('T', 'N',     &
#endif
#if COMPLEXCASE == 1
              call cublas_PRECISION_GEMM('C', 'N',     &
#endif
                                         nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd,  &
                                         q_dev, ldq,  ZERO, tmp_dev, nstor)
	      call timer%stop("cublas")

            else ! useGPU

	      call timer%start("blas")
#if REALCASE == 1
              call PRECISION_GEMM('T', 'N',           &
#endif
#if COMPLEXCASE == 1
              call PRECISION_GEMM('C', 'N',           &
#endif
             	                  nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
                                  q_mat, ldq, ZERO, tmp1, nstor)
              call timer%stop("blas")
            endif ! useGPU

          else !l_rows>0

            if (useGPU) then
              successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_datatype)
              check_memcpy_cuda("trans_ev", successCUDA)
            else
              tmp1(1:l_cols*nstor) = 0
            endif
          endif  !l_rows>0

#ifdef WITH_MPI
          ! In the legacy GPU version this allreduce was omitted, but it probably has to be done for GPU + MPI.
          ! todo: does the whole buffer need to be copied, or would a part be sufficient?
          if (useGPU) then
            successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev,  &
                          max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif
          call timer%start("mpi_communication")
          call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1
          &MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  &, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
          ! copy back tmp2 - after reduction...
          if (useGPU) then
            successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)),  &
                          max_local_cols * max_stored_rows * size_of_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif ! useGPU


#else /* WITH_MPI */
!          tmp2 = tmp1
#endif /* WITH_MPI */

          if (l_rows>0) then
            if (useGPU) then
	      call timer%start("cublas")
              call cublas_PRECISION_TRMM('L', 'L', 'N', 'N',     &
	                                 nstor, l_cols, ONE, tmat_dev, max_stored_rows,  &
                                         tmp_dev, nstor)

              call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor,  &
                                         -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor,   &
                                         ONE, q_dev, ldq)
  	      call timer%stop("cublas")
            else !useGPU
#ifdef WITH_MPI
              ! tmp2 = tmat * tmp2
	      call timer%start("blas")
              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
                                  ONE, tmat, max_stored_rows, tmp2, nstor)
              !q_mat = q_mat - hvm*tmp2
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
                                  -ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
	      call timer%stop("blas")
#else /* WITH_MPI */
	      call timer%start("blas")

              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
                                  ONE, tmat, max_stored_rows, tmp1, nstor)
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
                                  -ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
	      call timer%stop("blas")
#endif /* WITH_MPI */
            endif ! useGPU
          endif  ! l_rows>0
          nstor = 0
        endif  ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32))

      enddo ! istep=1,na,nblk

      deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_&
	&MATH_DATATYPE&
	&: error when deallocating hvm "//errorMessage
        stop 1
      endif

      if (useGPU) then
        !q_mat = q_dev
        successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("trans_ev", successCUDA)

        deallocate(hvm1, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_&
	  &MATH_DATATYPE&
	  &: error when deallocating hvm1 "//errorMessage
          stop 1
        endif

        !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
        successCUDA = cuda_free(q_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmp_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(hvm_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmat_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

      endif

      call timer%stop("trans_ev_&
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX&
      )

    end subroutine trans_ev_&
    &MATH_DATATYPE&
    &_&
    &PRECISION