#if 0
!    This file is part of ELPA.
!
!    The ELPA library was originally created by the ELPA consortium,
!    consisting of the following organizations:
!
!    - Max Planck Computing and Data Facility (MPCDF), formerly known as
!      Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
!    - Bergische Universität Wuppertal, Lehrstuhl für angewandte
!      Informatik,
!    - Technische Universität München, Lehrstuhl für Informatik mit
!      Schwerpunkt Wissenschaftliches Rechnen,
!    - Fritz-Haber-Institut, Berlin, Abt. Theorie,
!    - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
!      Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
!      and
!    - IBM Deutschland GmbH
!
!    This particular source code file contains additions, changes and
!    enhancements authored by Intel Corporation which is not part of
!    the ELPA consortium.
!
!    More information can be found here:
!    http://elpa.mpcdf.mpg.de/
!
!    ELPA is free software: you can redistribute it and/or modify
!    it under the terms of the version 3 of the license of the
!    GNU Lesser General Public License as published by the Free
!    Software Foundation.
!
!    ELPA is distributed in the hope that it will be useful,
!    but WITHOUT ANY WARRANTY; without even the implied warranty of
!    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
!    GNU Lesser General Public License for more details.
!
!    You should have received a copy of the GNU Lesser General Public License
!    along with ELPA.  If not, see <http://www.gnu.org/licenses/>
!
!    ELPA reflects a substantial effort on the part of the original
!    ELPA consortium, and we ask you to respect the spirit of the
!    license that we chose: i.e., please contribute any changes you
!    may have back to the original ELPA library distribution, and keep
!    any derivatives of ELPA under the same license that we chose for
!    the original distribution, the GNU Lesser General Public License.
!
!
! ELPA1 -- Faster replacements for ScaLAPACK symmetric eigenvalue routines
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif

!> \brief Transforms the eigenvectors of a tridiagonal matrix back
!>                     to the eigenvectors of the original matrix
!>                     (like Scalapack Routine PDORMTR)
!>
!  Parameters
!
!> \param na          Order of matrix a_mat, number of rows of matrix q_mat
!>
!> \param nqc         Number of columns of matrix q_mat
!>
!> \param a_mat(lda,matrixCols)  Matrix containing the Householder vectors (i.e. matrix a after tridiag_real)
!>                           Distribution is like in Scalapack.
!>
!> \param lda         Leading dimension of a_mat
!>
!> \param tau(na)     Factors of the Householder vectors
!>
!> \param q_mat           On input: Eigenvectors of tridiagonal matrix
!>                    On output: Transformed eigenvectors
!>                    Distribution is like in Scalapack.
!>
!> \param ldq         Leading dimension of q_mat
!>
!> \param nblk        blocksize of cyclic distribution, must be the same in both directions!
!>
!> \param matrixCols  local columns of matrix a_mat and q_mat
!>
!> \param mpi_comm_rows        MPI-Communicator for rows
!>
!> \param mpi_comm_cols        MPI-Communicator for columns
!>
!> \param useGPU      If true,  GPU version of the subroutine will be used
!>

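! Note: this template is expanded once per MATH_DATATYPE/PRECISION combination; a real
! double-precision build, for example, is assumed to yield a concrete subroutine
! trans_ev_real_double with the argument list documented above.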
    subroutine trans_ev_&
    &MATH_DATATYPE&
    &_&
    &PRECISION &
    (na, nqc, a_mat, lda, tau, q_mat, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, useGPU)
      use cuda_functions
      use iso_c_binding
#ifdef HAVE_DETAILED_TIMINGS
      use timings
#else
      use timings_dummy
#endif
      use precision
      implicit none

      integer(kind=ik), intent(in)                  :: na, nqc, lda, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
#if REALCASE == 1
      real(kind=REAL_DATATYPE), intent(in)          :: tau(na)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), intent(in)    :: tau(na)
#endif

#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,*), q_mat(ldq,*)
#else
      real(kind=REAL_DATATYPE), intent(inout)       :: a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
      complex(kind=COMPLEX_DATATYPE), intent(inout) :: a_mat(lda,*), q_mat(ldq,*)
#else
      complex(kind=COMPLEX_DATATYPE), intent(inout) ::  a_mat(lda,matrixCols), q_mat(ldq,matrixCols)
#endif
#endif
      logical, intent(in)                           :: useGPU

      integer(kind=ik)                              :: max_stored_rows

#if REALCASE == 1
#ifdef DOUBLE_PRECISION_REAL
      real(kind=rk8), parameter                     :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
      real(kind=rk4), parameter                     :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
      complex(kind=ck8), parameter                  :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
      complex(kind=ck4), parameter                  :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
      integer(kind=ik)                              :: my_prow, my_pcol, np_rows, np_cols, mpierr
      integer(kind=ik)                              :: totalblocks, max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
      integer(kind=ik)                              :: l_cols, l_rows, l_colh, nstor
      integer(kind=ik)                              :: istep, i, n, nc, ic, ics, ice, nb, cur_pcol
      integer(kind=ik)                              :: hvn_ubnd, hvm_ubnd

#if REALCASE == 1
      real(kind=REAL_DATATYPE), allocatable         :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
      real(kind=REAL_DATATYPE), allocatable         :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
#if COMPLEXCASE == 1
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
      complex(kind=COMPLEX_DATATYPE), allocatable   :: tmat(:,:), h1(:), h2(:), hvm1(:)
#endif
      integer(kind=ik)                              :: istat
      character(200)                                :: errorMessage

      integer(kind=C_intptr_T)                      :: q_dev, tmp_dev, hvm_dev, tmat_dev
      logical                                       :: successCUDA

      call timer%start("trans_ev_&
      &MATH_DATATYPE&
      &_" // &
      &PRECISION_SUFFIX &
      )

      call timer%start("mpi_communication")
      call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
      call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
      call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
      call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
      call timer%stop("mpi_communication")

      totalblocks = (na-1)/nblk + 1
      max_blocks_row = (totalblocks-1)/np_rows + 1
      max_blocks_col = ((nqc-1)/nblk)/np_cols + 1  ! Columns of q_mat!

      max_local_rows = max_blocks_row*nblk
      max_local_cols = max_blocks_col*nblk

      max_stored_rows = (63/nblk+1)*nblk
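      ! Up to max_stored_rows Householder vectors (roughly 64, rounded up to a multiple
      ! of nblk) are accumulated before they are applied as a single blocked update.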

      allocate(tmat(max_stored_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmat", istat, errorMessage)

      allocate(h1(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "h1", istat, errorMessage)

      allocate(h2(max_stored_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "h2", istat, errorMessage)

      allocate(tmp1(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmp1", istat, errorMessage)

      allocate(tmp2(max_local_cols*max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "tmp2", istat, errorMessage)

      allocate(hvb(max_local_rows*nblk), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "hvn", istat, errorMessage)

      allocate(hvm(max_local_rows,max_stored_rows), stat=istat, errmsg=errorMessage)
      call check_alloc("trans_ev_&
      &MATH_DATATYPE&
      &", "hvm", istat, errorMessage)

      hvm = 0   ! Must be set to 0 !!!
      hvb = 0   ! Safety only

      l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q_mat

      nstor = 0
      if (useGPU) then
        hvm_ubnd = 0
      endif

#if COMPLEXCASE == 1
      ! In the complex case tau(2) /= 0
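      ! The Householder vector belonging to tau(2) has a single (unit) element, so applying
      ! H(2) = I - tau(2)*v*v**H reduces to scaling the first row of q_mat by (1 - tau(2)).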
      if (my_prow == prow(1, nblk, np_rows)) then
        q_mat(1,1:l_cols) = q_mat(1,1:l_cols)*(ONE-tau(2))
      endif
#endif

      if (useGPU) then
        ! todo: this is used only for copying hvm to the device; it should be possible to go without it
        allocate(hvm1(max_local_rows*max_stored_rows), stat=istat, errmsg=errorMessage)
	call check_alloc("trans_ev_&
	&MATH_DATATYPE&
	&", "hvm1", istat, errorMessage)

        successCUDA = cuda_malloc(tmat_dev, max_stored_rows * max_stored_rows * size_of_&
	&PRECISION&
	&_&
	&MATH_DATATYPE&
	&_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(hvm_dev, max_local_rows * max_stored_rows * size_of_&
	&PRECISION&
	&_&
	&MATH_DATATYPE&
	&_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(tmp_dev, max_local_cols * max_stored_rows * size_of_&
	&PRECISION&
	&_&
	&MATH_DATATYPE&
	&_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_malloc(q_dev, ldq * matrixCols * size_of_&
	&PRECISION&
	&_&
	&MATH_DATATYPE&
	&_datatype)
        check_alloc_cuda("trans_ev", successCUDA)

!         q_dev = q_mat
        successCUDA = cuda_memcpy(q_dev, loc(q_mat(1,1)), ldq * matrixCols * size_of_&
	&PRECISION&
	&_&
	&MATH_DATATYPE&
	&_datatype, cudaMemcpyHostToDevice)
        check_memcpy_cuda("trans_ev", successCUDA)
      endif  ! useGPU

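      ! Main loop over blocks of nblk Householder vectors: their local parts are gathered
      ! and broadcast, then appended to hvm; once max_stored_rows vectors have been
      ! accumulated (or the matrix is exhausted), the triangular factor tmat is built and
      ! the blocked update Q <- (I - V*T*V**T) * Q is applied (V**H in the complex case).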
      do istep = 1, na, nblk
        ics = MAX(istep,3)
        ice = MIN(istep+nblk-1,na)
        if (ice<ics) cycle

        cur_pcol = pcol(istep, nblk, np_cols)
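        ! Gather the local parts of the Householder vectors of columns ics..ice into hvb;
        ! they are stored in process column cur_pcol and broadcast to the other columns below.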

        nb = 0
        do ic = ics, ice

          l_colh = local_index(ic  , my_pcol, np_cols, nblk, -1) ! Column of Householder vector
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector


          if (my_pcol == cur_pcol) then
            hvb(nb+1:nb+l_rows) = a_mat(1:l_rows,l_colh)
            if (my_prow == prow(ic-1, nblk, np_rows)) then
              hvb(nb+l_rows) = 1.
            endif
          endif

          nb = nb+l_rows
        enddo

#ifdef WITH_MPI
        call timer%start("mpi_communication")
        if (nb>0) &
          call MPI_Bcast(hvb, nb, &
#if REALCASE == 1
	  &MPI_REAL_PRECISION&
#endif

#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  , cur_pcol, mpi_comm_cols, mpierr)
        call timer%stop("mpi_communication")
#endif /* WITH_MPI */

        nb = 0
        do ic = ics, ice
          l_rows = local_index(ic-1, my_prow, np_rows, nblk, -1) ! # rows of Householder vector
          hvm(1:l_rows,nstor+1) = hvb(nb+1:nb+l_rows)
          if (useGPU) then
            hvm_ubnd = l_rows
          endif
          nstor = nstor+1
          nb = nb+l_rows
        enddo

        ! Please note: for smaller matrix sizes (na/np_rows<=256), a value of 32 for nstor is enough!
        if (nstor+nblk > max_stored_rows .or. istep+nblk > na .or. (na/np_rows <= 256 .and. nstor >= 32)) then

          ! Calculate scalar products of stored vectors.
          ! This can be done in different ways, we use dsyrk or zherk

          tmat = 0
	  call timer%start("blas")
          if (l_rows>0) &
#if REALCASE == 1
            call PRECISION_SYRK('U', 'T',   &
#endif
#if COMPLEXCASE == 1
            call PRECISION_HERK('U', 'C',   &
#endif
	                         nstor, l_rows, ONE, hvm, ubound(hvm,dim=1), ZERO, tmat, max_stored_rows)
	  call timer%stop("blas")
          nc = 0
          do n = 1, nstor-1
            h1(nc+1:nc+n) = tmat(1:n,n+1)
            nc = nc+n
          enddo
#ifdef WITH_MPI
          call timer%start("mpi_communication")
          if (nc>0) call mpi_allreduce( h1, h2, nc, &
#if REALCASE == 1
          &MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  &, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
#else /* WITH_MPI */

          if (nc > 0) h2 = h1

#endif /* WITH_MPI */
          ! Calculate triangular matrix T

          nc = 0
          tmat(1,1) = tau(ice-nstor+1)
          do n = 1, nstor-1
	    call timer%start("blas")
#if REALCASE == 1
            call PRECISION_TRMV('L', 'T', 'N',    &
#endif
#if COMPLEXCASE == 1
            call PRECISION_TRMV('L', 'C', 'N',    &
#endif
                      	        n, tmat, max_stored_rows, h2(nc+1), 1)
	    call timer%stop("blas")

            tmat(n+1,1:n) = &
#if REALCASE == 1
	    -h2(nc+1:nc+n)  &
#endif
#if COMPLEXCASE == 1
            -conjg(h2(nc+1:nc+n)) &
#endif
	    *tau(ice-nstor+n+1)

            tmat(n+1,n+1) = tau(ice-nstor+n+1)
            nc = nc+n
          enddo

          if (useGPU) then
            ! todo: is this reshape really necessary?
            hvm1(1:hvm_ubnd*nstor) = reshape(hvm(1:hvm_ubnd,1:nstor), (/ hvm_ubnd*nstor /))

            !hvm_dev(1:hvm_ubnd*nstor) = hvm1(1:hvm_ubnd*nstor)
            successCUDA = cuda_memcpy(hvm_dev, loc(hvm1(1)),   &
                          hvm_ubnd * nstor * size_of_&
	    &PRECISION&
	    &_&
	    &MATH_DATATYPE&
	    &_datatype, cudaMemcpyHostToDevice)

            check_memcpy_cuda("trans_ev", successCUDA)

            !tmat_dev = tmat
            successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1)),   &
                          max_stored_rows * max_stored_rows * size_of_&
	    &PRECISION&
	    &_&
	    &MATH_DATATYPE&
	    &_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif

          ! Q = Q - V * T * V**T * Q
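          ! computed in stages: tmp = V**T * Q (gemm), allreduce of tmp over mpi_comm_rows,
          ! tmp = T * tmp (trmm), and finally Q = Q - V * tmp (gemm)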

          if (l_rows>0) then
            if (useGPU) then
	      call timer%start("cublas")
#if REALCASE == 1
              call cublas_PRECISION_GEMM('T', 'N',     &
#endif
#if COMPLEXCASE == 1
              call cublas_PRECISION_GEMM('C', 'N',     &
#endif
                                         nstor, l_cols, l_rows, ONE, hvm_dev, hvm_ubnd,  &
                                         q_dev, ldq,  ZERO, tmp_dev, nstor)
	      call timer%stop("cublas")

            else ! useGPU

	      call timer%start("blas")
#if REALCASE == 1
              call PRECISION_GEMM('T', 'N',           &
#endif
#if COMPLEXCASE == 1
              call PRECISION_GEMM('C', 'N',           &
#endif
             	                  nstor, l_cols, l_rows, ONE, hvm, ubound(hvm,dim=1), &
                                  q_mat, ldq, ZERO, tmp1, nstor)
              call timer%stop("blas")
            endif ! useGPU

          else !l_rows>0

            if (useGPU) then
              successCUDA = cuda_memset(tmp_dev, 0, l_cols * nstor * size_of_&
	      &PRECISION&
	      &_&
	      &MATH_DATATYPE&
	      &_datatype)
              check_memcpy_cuda("trans_ev", successCUDA)
            else
              tmp1(1:l_cols*nstor) = 0
            endif
          endif  !l_rows>0

#ifdef WITH_MPI
          ! In the legacy GPU version this allreduce was omitted, but it is probably needed for GPU + MPI.
          ! todo: does the whole buffer need to be copied, or would a part be sufficient?
          if (useGPU) then
            successCUDA = cuda_memcpy(loc(tmp1(1)), tmp_dev,  &
                          max_local_cols * max_stored_rows * size_of_&
			  &PRECISION&
			  &_&
			  &MATH_DATATYPE&
			  &_datatype, cudaMemcpyDeviceToHost)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif
          call timer%start("mpi_communication")
          call mpi_allreduce(tmp1, tmp2, nstor*l_cols, &
#if REALCASE == 1
          &MPI_REAL_PRECISION&
#endif
#if COMPLEXCASE == 1
          &MPI_COMPLEX_PRECISION&
#endif
	  &, MPI_SUM, mpi_comm_rows, mpierr)
          call timer%stop("mpi_communication")
          ! copy back tmp2 - after reduction...
          if (useGPU) then
            successCUDA = cuda_memcpy(tmp_dev, loc(tmp2(1)),  &
                          max_local_cols * max_stored_rows * size_of_&
			  &PRECISION&
			  &_&
			  &MATH_DATATYPE&
			  &_datatype, cudaMemcpyHostToDevice)
            check_memcpy_cuda("trans_ev", successCUDA)
          endif ! useGPU


#else /* WITH_MPI */
!          tmp2 = tmp1
#endif /* WITH_MPI */

          if (l_rows>0) then
            if (useGPU) then
	      call timer%start("cublas")
              call cublas_PRECISION_TRMM('L', 'L', 'N', 'N',     &
	                                 nstor, l_cols, ONE, tmat_dev, max_stored_rows,  &
                                         tmp_dev, nstor)

              call cublas_PRECISION_GEMM('N', 'N' ,l_rows ,l_cols ,nstor,  &
                                         -ONE, hvm_dev, hvm_ubnd, tmp_dev, nstor,   &
                                         ONE, q_dev, ldq)
  	      call timer%stop("cublas")
            else !useGPU
#ifdef WITH_MPI
              ! tmp2 = tmat * tmp2
	      call timer%start("blas")
              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
                                  ONE, tmat, max_stored_rows, tmp2, nstor)
              !q_mat = q_mat - hvm*tmp2
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
                                  -ONE, hvm, ubound(hvm,dim=1), tmp2, nstor, ONE, q_mat, ldq)
	      call timer%stop("blas")
#else /* WITH_MPI */
	      call timer%start("blas")

              call PRECISION_TRMM('L', 'L', 'N', 'N', nstor, l_cols,   &
                                  ONE, tmat, max_stored_rows, tmp1, nstor)
              call PRECISION_GEMM('N', 'N', l_rows, l_cols, nstor,   &
                                  -ONE, hvm, ubound(hvm,dim=1), tmp1, nstor, ONE, q_mat, ldq)
	      call timer%stop("blas")
#endif /* WITH_MPI */
            endif ! useGPU
          endif  ! l_rows>0
          nstor = 0
        endif  ! (nstor+nblk>max_stored_rows .or. istep+nblk>na .or. (na/np_rows<=256 .and. nstor>=32))

      enddo ! istep=1,na,nblk

      deallocate(tmat, h1, h2, tmp1, tmp2, hvb, hvm, stat=istat, errmsg=errorMessage)
      if (istat .ne. 0) then
        print *,"trans_ev_&
	&MATH_DATATYPE&
	&: error when deallocating hvm "//errorMessage
        stop
      endif

      if (useGPU) then
        !q_mat = q_dev
        successCUDA = cuda_memcpy(loc(q_mat(1,1)), q_dev, ldq * matrixCols * size_of_&
	              &PRECISION&
		      &_&
		      &MATH_DATATYPE&
		      &_datatype, cudaMemcpyDeviceToHost)
        check_memcpy_cuda("trans_ev", successCUDA)

        deallocate(hvm1, stat=istat, errmsg=errorMessage)
        if (istat .ne. 0) then
          print *,"trans_ev_&
	  &MATH_DATATYPE&
	  &: error when deallocating hvm1 "//errorMessage
          stop
        endif

        !deallocate(q_dev, tmp_dev, hvm_dev, tmat_dev)
        successCUDA = cuda_free(q_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmp_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(hvm_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

        successCUDA = cuda_free(tmat_dev)
        check_dealloc_cuda("trans_ev", successCUDA)

      endif

      call timer%stop("trans_ev_&
      &MATH_DATATYPE&
      &" // &
      &PRECISION_SUFFIX&
      )

    end subroutine trans_ev_&
    &MATH_DATATYPE&
    &_&
    &PRECISION