Commit 9c2d7f11 authored by Andreas Marek

Start to unify real/complex band_to_full

parent 2a3c5be1
@@ -55,18 +55,17 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
   src/elpa2_template.X90 \
   src/elpa2_bandred_template.X90 \
   src/elpa2_symm_matrix_allreduce_real_template.X90 \
-  src/elpa2_trans_ev_band_to_full_real_template.X90 \
+  src/elpa2_trans_ev_band_to_full_template.X90 \
   src/elpa2_tridiag_band_real_template.X90 \
   src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
   src/elpa2_herm_matrix_allreduce_complex_template.X90 \
-  src/elpa2_trans_ev_band_to_full_complex_template.X90 \
   src/elpa2_tridiag_band_complex_template.X90 \
   src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
   src/elpa2_kernels/elpa2_kernels_real_template.X90 \
   src/elpa2_kernels/elpa2_kernels_complex_template.X90 \
   src/elpa2_kernels/elpa2_kernels_simple_template.X90 \
   src/redist_band.X90 \
   src/precision_macros.h
 lib_LTLIBRARIES = libelpa@SUFFIX@.la
 libelpa@SUFFIX@_la_LINK = $(FCLINK) $(AM_LDFLAGS) -version-info $(ELPA_SO_VERSION)
@@ -913,8 +912,7 @@ EXTRA_DIST = \
   src/elpa2_herm_matrix_allreduce_complex_template.X90 \
   src/elpa2_symm_matrix_allreduce_real_template.X90 \
   src/elpa2_template.X90 \
-  src/elpa2_trans_ev_band_to_full_complex_template.X90 \
-  src/elpa2_trans_ev_band_to_full_real_template.X90 \
+  src/elpa2_trans_ev_band_to_full_template.X90 \
   src/elpa2_trans_ev_tridi_to_band_complex_template.X90 \
   src/elpa2_trans_ev_tridi_to_band_real_template.X90 \
   src/elpa2_tridiag_band_complex_template.X90 \
......
@@ -66,7 +66,9 @@
 #include "elpa2_bandred_template.X90"
 #undef COMPLEXCASE
 #include "elpa2_herm_matrix_allreduce_complex_template.X90"
-#include "elpa2_trans_ev_band_to_full_complex_template.X90"
+#define COMPLEXCASE 1
+#include "elpa2_trans_ev_band_to_full_template.X90"
+#undef COMPLEXCASE
 #include "elpa2_tridiag_band_complex_template.X90"
 #include "elpa2_trans_ev_tridi_to_band_complex_template.X90"
......
@@ -66,7 +66,9 @@
 #include "elpa2_bandred_template.X90"
 #undef REALCASE
 #include "elpa2_symm_matrix_allreduce_real_template.X90"
-#include "elpa2_trans_ev_band_to_full_real_template.X90"
+#define REALCASE 1
+#include "elpa2_trans_ev_band_to_full_template.X90"
+#undef REALCASE
 #include "elpa2_tridiag_band_real_template.X90"
 #include "elpa2_trans_ev_tridi_to_band_real_template.X90"
......
#if 0
! This file is part of ELPA.
!
! The ELPA library was originally created by the ELPA consortium,
! consisting of the following organizations:
!
! - Max Planck Computing and Data Facility (MPCDF), formerly known as
! Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
! - Bergische Universität Wuppertal, Lehrstuhl für angewandte
! Informatik,
! - Technische Universität München, Lehrstuhl für Informatik mit
! Schwerpunkt Wissenschaftliches Rechnen ,
! - Fritz-Haber-Institut, Berlin, Abt. Theorie,
! - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
! Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
! and
! - IBM Deutschland GmbH
!
! This particular source code file contains additions, changes and
! enhancements authored by Intel Corporation which is not part of
! the ELPA consortium.
!
! More information can be found here:
! http://elpa.mpcdf.mpg.de/
!
! ELPA is free software: you can redistribute it and/or modify
! it under the terms of the version 3 of the license of the
! GNU Lesser General Public License as published by the Free
! Software Foundation.
!
! ELPA is distributed in the hope that it will be useful,
! but WITHOUT ANY WARRANTY; without even the implied warranty of
! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! GNU Lesser General Public License for more details.
!
! You should have received a copy of the GNU Lesser General Public License
! along with ELPA. If not, see <http://www.gnu.org/licenses/>
!
! ELPA reflects a substantial effort on the part of the original
! ELPA consortium, and we ask you to respect the spirit of the
! license that we chose: i.e., please contribute any changes you
! may have back to the original ELPA library distribution, and keep
! any derivatives of ELPA under the same license that we chose for
! the original distribution, the GNU Lesser General Public License.
!
! Copyright of the original code rests with the authors inside the ELPA
! consortium. The copyright of any additional modifications shall rest
! with their original authors, but shall adhere to the licensing terms
! distributed along with the original code in the file "COPYING".
#endif
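! This template is compiled once per number type: the including file defines REALCASE
! or COMPLEXCASE around the #include (see the #define/#include/#undef pattern in the
! hunks above), and the MATH_DATATYPE and PRECISION macros, presumably provided via
! src/precision_macros.h, are concatenated into the routine name below, yielding
! names such as (e.g.) trans_ev_band_to_full_real_double.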
subroutine trans_ev_band_to_full_&
&MATH_DATATYPE&
&_&
&PRECISION &
(na, nqc, nblk, nbw, a, &
#if REALCASE == 1
a_dev, &
#endif
lda, tmat, &
#if REALCASE == 1
tmat_dev, &
#endif
q, &
#if REALCASE == 1
q_dev, &
#endif
ldq, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols, useGPU &
#if REALCASE == 1
,useQr)
#endif
#if COMPLEXCASE == 1
)
#endif
!-------------------------------------------------------------------------------
! trans_ev_band_to_full_real/complex:
! Transforms the eigenvectors of a band matrix back to the eigenvectors of the original matrix
!
! Parameters
!
! na Order of matrix a, number of rows of matrix q
!
! nqc Number of columns of matrix q
!
! nblk blocksize of cyclic distribution, must be the same in both directions!
!
! nbw semi bandwidth
!
! a(lda,matrixCols) Matrix containing the Householder vectors (i.e. matrix a after bandred_real/complex)
! Distribution is like in Scalapack.
!
! lda Leading dimension of a
! matrixCols local columns of matrix a and q
!
! tmat(nbw,nbw,numBlocks) Factors returned by bandred_real/complex
!
! q On input: Eigenvectors of band matrix
! On output: Transformed eigenvectors
! Distribution is like in Scalapack.
!
! ldq Leading dimension of q
!
! mpi_comm_rows
! mpi_comm_cols
! MPI-Communicators for rows/columns
!
!-------------------------------------------------------------------------------
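! Each pass of the main loop below applies one block of nbw Householder reflectors in
! compact WY form: the reflectors are gathered into hvm (V), the block factors come
! from tmat (T) as returned by bandred, and q is updated as Q = Q - V * T**T * V**T * Q
! (conjugate transposes in the complex case).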
#ifdef HAVE_DETAILED_TIMINGS
use timings
#else
use timings_dummy
#endif
use precision
use cuda_functions
use iso_c_binding
implicit none
logical, intent(in) :: useGPU
#if REALCASE == 1
logical, intent(in) :: useQR
#endif
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#if REALCASE == 1
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
!real(kind=rk8) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
!real(kind=rk8) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
#endif
#if COMPLEXCASE == 1
#ifdef USE_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
!complex(kind=ck8) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
complex(kind=COMPLEX_DATATYPE) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
!complex(kind=ck8) :: a(lda,matrixCols), q(ldq,matrixCols), tmat(nbw, nbw, numBlocks)
#endif
#endif
#if REALCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
real(kind=REAL_DATATYPE), parameter :: ZERO = 0.0_rk8, ONE = 1.0_rk8
#else
real(kind=REAL_DATATYPE), parameter :: ZERO = 0.0_rk4, ONE = 1.0_rk4
#endif
#endif
#if COMPLEXCASE == 1
#ifdef DOUBLE_PRECISION_COMPLEX
complex(kind=COMPLEX_DATATYPE), parameter :: ZERO = (0.0_rk8,0.0_rk8), ONE = (1.0_rk8,0.0_rk8)
#else
complex(kind=COMPLEX_DATATYPE), parameter :: ZERO = (0.0_rk4,0.0_rk4), ONE = (1.0_rk4,0.0_rk4)
#endif
#endif
#if REALCASE == 1
integer(kind=C_intptr_T) :: a_dev ! passed from bandred_real; at the moment not used since it is copied in bandred_real
#endif
integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, &
max_local_cols
integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
#if REALCASE ==1
real(kind=REAL_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
#endif
#if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
#endif
! hvm_dev is first used and set in this routine
! q is changed in trans_ev_tridi on the host, copied to device and passed here. This can be adapted
! tmp_dev is first used in this routine
! tmat_dev is passed along from bandred_real
integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmp_dev, tmat_dev
integer(kind=ik) :: i
#if REALCASE == 1
real(kind=REAL_DATATYPE), allocatable :: tmat_complete(:,:), t_tmp(:,:), t_tmp2(:,:)
integer(kind=ik) :: cwy_blocking, t_blocking, t_cols, t_rows
#endif
integer(kind=ik) :: istat
character(200) :: errorMessage
logical :: successCUDA
call timer%start("trans_ev_band_to_full_&
&MATH_DATATYPE&
&_" // &
&PRECISION_SUFFIX &
)
call timer%start("mpi_communication")
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call timer%stop("mpi_communication")
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
max_local_rows = max_blocks_row*nblk
max_local_cols = max_blocks_col*nblk
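! A purely illustrative example, assuming na = nqc = 10000, nblk = 32 and np_rows = np_cols = 4:
! max_blocks_row = ((10000-1)/32)/4 + 1 = 79, hence max_local_rows = 79*32 = 2528 (and likewise for the columns)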
if (useGPU) then
#if REALCASE == 1
! here the GPU and CPU versions diverge: the CPU version now always uses the useQR path, which
! is not implemented in the GPU version
#endif
allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error when allocating tmp1 "//errorMessage
stop
endif
allocate(tmp2(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error when allocating tmp2 "//errorMessage
stop
endif
allocate(hvb(max_local_rows*nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error when allocating hvb "//errorMessage
stop
endif
allocate(hvm(max_local_rows,nbw), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error when allocating hvm "//errorMessage
stop
endif
successCUDA = cuda_malloc(hvm_dev, (max_local_rows)*nbw* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
successCUDA = cuda_malloc(tmp_dev, (max_local_cols)*nbw* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
!#ifdef WITH_MPI
! it should be possible to keep tmat_dev on the device and not copy it around
! it already exists on the GPU
successCUDA = cuda_malloc(tmat_dev, nbw*nbw* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMalloc"
stop
endif
!#endif
#if REALCASE == 1
! q_dev already living on device
! successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_real)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_temp(:,:) = 0.0
! q_temp(1:ldq,1:na_cols) = q(1:ldq,1:na_cols)
! ! copy q_dev to device, maybe this can be avoided if q_dev can be kept on device in trans_ev_tridi_to_band
! successCUDA = cuda_memcpy(q_dev, loc(q), (ldq)*(matrixCols)*size_of_PRECISION_real, cudaMemcpyHostToDevice)
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
#endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_PRECISION_complex)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMalloc"
stop
endif
successCUDA = cuda_memcpy(q_dev, loc(q),ldq*matrixCols*size_of_PRECISION_complex, cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#endif
! if MPI is NOT used the following steps could be done on the GPU and memory transfers could be avoided
successCUDA = cuda_memset(hvm_dev, 0, (max_local_rows)*(nbw)* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemset"
stop
endif
#if REALCASE == 1
hvm = CONST_0_0 ! Must be set to 0 !!!
hvb = CONST_0_0 ! Safety only
#endif
#if COMPLEXCASE == 1
hvm = CONST_COMPLEX_0_0 ! Must be set to 0 !!!
hvb = CONST_COMPLEX_0_0 ! Safety only
#endif
l_cols = local_index(nqc, my_pcol, np_cols, nblk, -1) ! Local columns of q
do istep=1,(na-1)/nbw
n_cols = MIN(na,(istep+1)*nbw) - istep*nbw ! Number of columns in current step
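! n_cols equals nbw in every step except possibly the last one, where only the remaining na - istep*nbw columns are left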
! Broadcast all Householder vectors for current step compressed in hvb
nb = 0
ns = 0
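! nb counts the elements packed into hvb so far; ns marks the start of the chunk that still has to be
! broadcast, which happens whenever a block-column boundary (mod(ncol,nblk)==0) or the last column is reached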
do lc = 1, n_cols
ncol = istep*nbw + lc ! absolute column number of householder vector
nrow = ncol - nbw ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
l_colh = local_index(ncol , my_pcol, np_cols, nblk, -1) ! HV local column number
if (my_pcol==pcol(ncol, nblk, np_cols)) hvb(nb+1:nb+l_rows) = a(1:l_rows,l_colh)
nb = nb+l_rows
if (lc==n_cols .or. mod(ncol,nblk)==0) then
#ifdef WITH_MPI
call timer%start("mpi_communication")
call MPI_Bcast(hvb(ns+1), nb-ns, &
#if REALCASE == 1
MPI_REAL_PRECISION,&
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
pcol(ncol, nblk, np_cols), mpi_comm_cols, mpierr)
call timer%stop("mpi_communication")
#endif /* WITH_MPI */
ns = nb
endif
enddo
! Expand compressed Householder vectors into matrix hvm
nb = 0
do lc = 1, n_cols
nrow = (istep-1)*nbw+lc ! absolute number of pivot row
l_rows = local_index(nrow-1, my_prow, np_rows, nblk, -1) ! row length for bcast
hvm(1:l_rows,lc) = hvb(nb+1:nb+l_rows)
#if REALCASE == 1
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = CONST_1_0
#endif
#if COMPLEXCASE == 1
if (my_prow==prow(nrow, nblk, np_rows)) hvm(l_rows+1,lc) = 1.
#endif
nb = nb+l_rows
enddo
successCUDA = cuda_memcpy(hvm_dev, loc(hvm), ((max_local_rows)*nbw* &
#if REALCASE == 1
size_of_PRECISION_real), &
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex), &
#endif
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
l_rows = local_index(MIN(na,(istep+1)*nbw), my_prow, np_rows, nblk, -1)
! Q = Q - V * T**T * V**T * Q
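! (T**H and V**H in the complex case, cf. the 'C' op in the cublas call below)
! The update is done in pieces: tmp = V**T * Q is formed on the device, summed over the
! process rows with mpi_allreduce (staged through tmp1/tmp2 on the host when MPI is used),
! and then combined with tmat, which is copied to tmat_dev further down, before q_dev is updated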
if (l_rows>0) then
call timer%start("cublas")
#if REALCASE == 1
call cublas_PRECISION_GEMM('T', 'N', &
#endif
#if COMPLEXCASE == 1
call cublas_PRECISION_GEMM('C', 'N', &
#endif
n_cols, l_cols, l_rows, ONE, hvm_dev, max_local_rows, &
q_dev, ldq , ZERO, tmp_dev, n_cols)
call timer%stop("cublas")
#ifdef WITH_MPI
! copy data from device to host for a later MPI_ALLREDUCE
#if REALCASE == 1
! copy to host; maybe this can be avoided, since it is only needed when MPI is used (allreduce)
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, l_cols*n_cols*size_of_PRECISION_real, cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
#endif
#else /* WITH_MPI */
! check whether in the complex case this is also only necessary for MPI
#if COMPLEXCASE == 1
successCUDA = cuda_memcpy(loc(tmp1), tmp_dev, n_cols*l_cols*size_of_PRECISION_complex, &
cudaMemcpyDeviceToHost)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#endif
#endif /* WITH_MPI */
else ! l_rows>0
#ifdef WITH_MPI
#if REALCASE == 1
tmp1(1:l_cols*n_cols) = 0
#endif
#if COMPLEXCASE == 1
tmp1(1:l_cols*n_cols) = CONST_COMPLEX_0_0
#endif
#else /* WITH_MPI */
! if MPI is not used (no MPI_ALLREDUCE, hence no host transfer needed) we can set tmp_dev to zero directly on the device
successCUDA = cuda_memset(tmp_dev, 0, l_cols*n_cols* &
#if REALCASE == 1
size_of_PRECISION_real)
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemset"
stop
endif
#endif /* WITH_MPI */
endif ! l_rows>0
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(loc(tmp1), tmp_dev, max_local_cols*nbw*size_of_real_datatype,cudaMemcpyDeviceToHost)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
#ifdef WITH_MPI
call timer%start("mpi_communication")
call mpi_allreduce(tmp1, tmp2, n_cols*l_cols, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION, &
#endif
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
! tmp2(1:n_cols*l_cols) = tmp1(1:n_cols*l_cols)
#endif /* WITH_MPI */
!#ifdef WITH_GPU_VERSION
! istat = cuda_memcpy(tmp_dev, loc(tmp2), max_local_cols*nbw*size_of_real_datatype,cudaMemcpyHostToDevice)
! if (istat .ne. 0) then
! print *,"error in cudaMemcpy"
! stop
! endif
!#endif
if (l_rows>0) then
#ifdef WITH_MPI
! after the mpi_allreduce we have to copy the result back to the device
successCUDA = cuda_memcpy(tmp_dev, loc(tmp2), n_cols*l_cols* &
#if REALCASE == 1
size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex, &
#endif
cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_&
&MATH_DATATYPE&
&: error in cudaMemcpy"
stop
endif
#else /* WITH_MPI */
#if COMPLEXCASE == 1
! check whether this could be avoided like in the real case
successCUDA = cuda_memcpy(tmp_dev,loc(tmp1),l_cols*n_cols*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_complex: error in cudaMemcpy"
stop
endif
#endif
#endif /* WITH_MPI */
!#ifdef WITH_MPI
! it should be possible to keep tmat on the device and not copy it around
! ! copy to device, maybe this can be avoided since tmat is input from bandred_real
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw* &
#if REALCASE == 1