Commit 034c5adc authored by Andreas Marek's avatar Andreas Marek
Browse files

Prepare unify real/complex bandred

parent d2442f92
...@@ -177,8 +177,11 @@ ...@@ -177,8 +177,11 @@
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw) complex(kind=COMPLEX_DATATYPE) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
#endif
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp_CPU(:,:), vmrCPU(:,:), umcCPU(:,:) #if COMPLEXCASE == 1
complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:) complex(kind=COMPLEX_DATATYPE), allocatable :: vr(:)
#endif #endif
...@@ -260,13 +263,11 @@ ...@@ -260,13 +263,11 @@
! na_rows in used nowhere; only na_cols ! na_rows in used nowhere; only na_cols
if (useGPU) then if (useGPU) then
#ifdef WITH_MPI #ifdef WITH_MPI
! na_rows = numroc(na, nblk, my_prow, 0, np_rows)
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
na_rows = numroc(na, nblk, my_prow, 0, np_rows) na_rows = numroc(na, nblk, my_prow, 0, np_rows)
#endif #endif
na_cols = numroc(na, nblk, my_pcol, 0, np_cols) na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
#else #else
! na_rows = na
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
na_rows = na na_rows = na
#endif #endif
...@@ -391,11 +392,6 @@ ...@@ -391,11 +392,6 @@
#endif /* REALCASE */ #endif /* REALCASE */
if (useGPU) then if (useGPU) then
!#if !defined(USE_ASSUMED_SIZE)
! if (size(a,dim=1) .ne. lda .or. size(a,dim=2) .ne. na_cols) then
! print *,"bandred_complex: sizes of a wrong ? ",lda,size(a,dim=1),na_cols,size(a,dim=2)
! endif
!#endif
cur_l_rows = 0 cur_l_rows = 0
cur_l_cols = 0 cur_l_cols = 0
...@@ -456,135 +452,100 @@ ...@@ -456,135 +452,100 @@
endif endif
#if REALCASE == 1
if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then if ((.not. allocated(vmrCUDA)) .or. (vmr_size .gt. ubound(vmrCUDA, dim=1))) then
if (allocated(vmrCUDA)) then if (allocated(vmrCUDA)) then
deallocate(vmrCUDA, stat=istat, errmsg=errorMessage) deallocate(vmrCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vmrCUDA "//errorMessage
stop stop
endif endif
successCUDA = cuda_free(vmr_dev) successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"bandred_real: error in cuda_free" print *,"bandred_&
&MATH_DATATYPE&: error in cuda_free"
stop stop
endif endif
endif endif
#if REALCASE == 1
allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage) allocate(vmrCUDA(vmr_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when allocating vmrCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_PRECISION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if ((.not. allocated(vmrCPU)) .or. (vmr_size .gt. ubound(vmrCPU, dim=1))) then allocate(vmrCUDA(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (allocated(vmrCPU)) then #endif
deallocate(vmrCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when deallocating vmrCPU "//errorMessage
stop
endif
successCUDA = cuda_free(vmr_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"bandred_complex: error when allocating vmrCPU "//errorMessage print *,"bandred_&
&MATH_DATATYPE&
&: error when allocating vmrCUDA "//errorMessage
stop stop
endif endif
successCUDA = cuda_malloc(vmr_dev, vmr_size* &
if (max(l_rows,1) * 2*n_cols .gt. vmr_size) then #if REALCASE == 1
print *,"bandred_complex: vmc_size ",max(l_rows,1) * 2*n_cols,vmr_size size_of_PRECISION_real)
endif #endif
#if COMPLEXCASE == 1
successCUDA = cuda_malloc(vmr_dev, vmr_size*size_of_PRECISION_complex) size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed vmr_dev ", istat print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc: vmr_dev"
stop stop
endif endif
endif endif
#endif
#if REALCASE == 1
if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then if ((.not. allocated(umcCUDA)) .or. (umc_size .gt. ubound(umcCUDA, dim=1))) then
if (allocated(umcCUDA)) then if (allocated(umcCUDA)) then
deallocate(umcCUDA, stat=istat, errmsg=errorMessage) deallocate(umcCUDA, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage print *,"bandred_&
&MATH_DATATYPE&
&: error when deallocating umcCUDA "//errorMessage
stop stop
endif endif
successCUDA = cuda_free(umc_dev) successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree" print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaFree umc_dev"
stop stop
endif endif
endif endif
allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_real: error when deallocating umcCUDA "//errorMessage
stop
endif
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_PRECISION_real)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaMalloc"
stop
endif
endif
#endif /* REALCASE == 1 */
#if REALCASE == 1
allocate(umcCUDA(umc_size), stat=istat, errmsg=errorMessage)
#endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
if ((.not. allocated(umcCPU)) .or. (umc_size .gt. ubound(umcCPU, dim=1))) then allocate(umcCUDA(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (allocated(umcCPU)) then #endif
deallocate(umcCPU, stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umcCPU "//errorMessage
stop
endif
successCUDA = cuda_free(umc_dev)
if (.not.(successCUDA))then
print *,"bandred_complex: error in cudaFree"
stop
endif
endif
allocate(umcCPU(max(l_cols,1),2*n_cols), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then if (istat .ne. 0) then
print *,"bandred_complex: error when allocating umcCPU "//errorMessage print *,"bandred_&
&MATH_DATATYPE&
&: error when deallocating umcCUDA "//errorMessage
stop stop
endif endif
if (max(l_cols,1) * 2*n_cols .gt. umc_size) then successCUDA = cuda_malloc(umc_dev, umc_size* &
print *,"bandred_complex: umc_size ",max(l_cols,1) * 2*n_cols,umc_size #if REALCASE == 1
endif size_of_PRECISION_real)
successCUDA = cuda_malloc(umc_dev, umc_size*size_of_PRECISION_complex) #endif
#if COMPLEXCASE == 1
size_of_PRECISION_complex)
#endif
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *, "bandred_complex: cuda malloc failed umc_dev ", istat print *,"bandred_&
&MATH_DATATYPE&
&: error in cudaMalloc umc_dev"
stop stop
endif endif
endif endif
#endif
else ! GPU not used else ! GPU not used
...@@ -621,6 +582,9 @@ ...@@ -621,6 +582,9 @@
if (useGPU) then if (useGPU) then
#if REALCASE == 1 #if REALCASE == 1
vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0 vmrCUDA(1 : cur_l_rows * n_cols) = CONST_0_0
#endif
#if COMPLEXCASE == 1
vmrCUDA(1:l_rows,1:n_cols) = CONST_COMPLEX_0_0
#endif #endif
else else
#if REALCASE == 1 #if REALCASE == 1
...@@ -652,20 +616,34 @@ ...@@ -652,20 +616,34 @@
! Here we assume that the processor grid and the block grid are aligned ! Here we assume that the processor grid and the block grid are aligned
cur_pcol = pcol(istep*nbw+1, nblk, np_cols) cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if(my_pcol == cur_pcol) then if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), &
#if REALCASE == 1
lda*size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
int(lda*size_of_PRECISION_complex,kind=c_size_t), &
#endif
#if REALCASE == 1 #if REALCASE == 1
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), lda*size_of_PRECISION_real, & (a_dev + ((lc_start-1) * lda*size_of_PRECISION_real)), &
(a_dev + ((lc_start-1) * lda*size_of_PRECISION_real)), &
lda*size_of_PRECISION_real, lr_end*size_of_PRECISION_real, &
(lc_end - lc_start+1), cudaMemcpyDeviceToHost)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
successCUDA = cuda_memcpy2d(loc(a(1, lc_start)), int(lda*size_of_PRECISION_complex,kind=c_size_t), &
(a_dev + int( ( (lc_start-1) * lda*size_of_PRECISION_complex),kind=c_size_t )), & (a_dev + int( ( (lc_start-1) * lda*size_of_PRECISION_complex),kind=c_size_t )), &
int(lda*size_of_PRECISION_complex,kind=c_size_t), &
int(lr_end*size_of_PRECISION_complex,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif #endif
#if REALCASE == 1
lda*size_of_PRECISION_real, lr_end*size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
int(lda*size_of_PRECISION_complex,kind=c_size_t), &
int(lr_end*size_of_PRECISION_complex,kind=c_size_t), &
#endif
#if REALCASE == 1
(lc_end - lc_start+1), cudaMemcpyDeviceToHost)
#endif
#if COMPLEXCASE == 1
int((lc_end - lc_start+1),kind=c_size_t),int(cudaMemcpyDeviceToHost,kind=c_int))
#endif
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *,"bandred_& print *,"bandred_&
&MATH_DATATYPE& &MATH_DATATYPE&
...@@ -803,16 +781,16 @@ ...@@ -803,16 +781,16 @@
#endif /* WITH_MPI */ #endif /* WITH_MPI */
#if REALCASE == 1
if (useGPU) then if (useGPU) then
#if REALCASE == 1
vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr) vmrCUDA(cur_l_rows * (lc - 1) + 1 : cur_l_rows * (lc - 1) + lr) = vr(1:lr)
else
vmrCPU(1:lr,lc) = vr(1:lr)
endif
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
vmrCPU(1:lr,lc) = vr(1:lr) vmrCUDA(1:lr,lc) = vr(1:lr)
#endif #endif
else
vmrCPU(1:lr,lc) = vr(1:lr)
endif
tau = vr(lr+1) tau = vr(lr+1)
#if REALCASE == 1 #if REALCASE == 1
...@@ -831,9 +809,9 @@ ...@@ -831,9 +809,9 @@
aux1 = CONST_COMPLEX_0_0 aux1 = CONST_COMPLEX_0_0
#endif #endif
#if REALCASE == 1
#ifdef WITH_OPENMP #ifdef WITH_OPENMP
#if REALCASE == 1
!Open up one omp region to avoid paying openmp overhead. !Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call, !This does not help performance due to the addition of two openmp barriers around the MPI call,
!But in the future this may be beneficial if these barriers are replaced with a faster implementation !But in the future this may be beneficial if these barriers are replaced with a faster implementation
...@@ -888,38 +866,8 @@ ...@@ -888,38 +866,8 @@
endif endif
enddo enddo
!$omp end parallel !$omp end parallel
#else /* WITH_OPENMP */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, MPI_REAL_PRECISION, MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_OPENMP */
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
nlc = 0 ! number of local columns nlc = 0 ! number of local columns
do j=1,lc-1 do j=1,lc-1
...@@ -975,35 +923,90 @@ ...@@ -975,35 +923,90 @@
! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr) ! a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif ! endif
! enddo ! enddo
#endif /* COMPLEXCASE */
#else /* WITH_OPENMP */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
if (lr>0) aux1(nlc) = dot_product(vr(1:lr),a(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
call timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, nlc, &
#if REALCASE == 1
MPI_REAL_PRECISION, &
#endif
#if COMPLEXCASE == 1
MPI_COMPLEX_PRECISION,&
#endif #endif
MPI_SUM, mpi_comm_rows, mpierr)
call timer%stop("mpi_communication")
#else /* WITH_MPI */
if (nlc>0) aux2=aux1
#endif /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
#if REALCASE == 1
a(1:lr,lcx) = a(1:lr,lcx) - tau*aux2(nlc)*vr(1:lr)
#endif
#if COMPLEXCASE == 1
a(1:lr,lcx) = a(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
#endif
endif
enddo
#endif /* WITH_OPENMP */
enddo ! lc enddo ! lc
if (useGPU) then if (useGPU) then
! store column tiles back to GPU ! store column tiles back to GPU
cur_pcol = pcol(istep*nbw+1, nblk, np_cols) cur_pcol = pcol(istep*nbw+1, nblk, np_cols)
if (my_pcol == cur_pcol) then if (my_pcol == cur_pcol) then
successCUDA = cuda_memcpy2d((a_dev+ &
#if REALCASE == 1 #if REALCASE == 1
successCUDA = cuda_memcpy2d((a_dev+((lc_start-1)*lda*size_of_PRECISION_real)), & ((lc_start-1)*lda*size_of_PRECISION_real)), &
lda*size_of_PRECISION_real, loc(a(1, lc_start)), & #endif
lda*size_of_PRECISION_real, lr_end*size_of_PRECISION_real, & #if COMPLEXCASE == 1
(lc_end - lc_start+1),cudaMemcpyHostToDevice) int(((lc_start-1)*lda*size_of_PRECISION_complex),kind=c_size_t)), &
if (.not.(successCUDA)) then #endif
print *,"bandred_real: error in cudaMemcpy2d" #if REALCASE == 1
stop lda*size_of_PRECISION_real, loc(a(1, lc_start)), &
endif #endif
#if COMPLEXCASE == 1
int(lda*size_of_PRECISION_complex,kind=c_size_t), loc(a(1,lc_start)), &
#endif
#if REALCASE == 1
lda*size_of_PRECISION_real, lr_end*size_of_PRECISION_real, &
#endif
#if COMPLEXCASE == 1
int(lda*size_of_PRECISION_complex,kind=c_size_t), &
int(lr_end*size_of_PRECISION_complex,kind=c_size_t), &
#endif
#if REALCASE == 1
(lc_end - lc_start+1),cudaMemcpyHostToDevice)
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
successCUDA = cuda_memcpy2d((a_dev+int(((lc_start-1)*lda*size_of_PRECISION_complex),kind=c_size_t)), & int((lc_end - lc_start+1),kind=c_size_t), &
int(lda*size_of_PRECISION_complex,kind=c_size_t), loc(a(1,lc_start)), & int(cudaMemcpyHostToDevice,kind=c_int))
int(lda*size_of_PRECISION_complex,kind=c_size_t), & #endif
int(lr_end*size_of_PRECISION_complex,kind=c_size_t), &
int((lc_end - lc_start+1),kind=c_size_t) &
,int(cudaMemcpyHostToDevice,kind=c_int))
if (.not.(successCUDA)) then if (.not.(successCUDA)) then
print *, "bandred_complex: cuda memcpy a_dev failed ", istat print *, "bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed ", istat
stop stop
endif endif
#endif
endif endif
endif endif
...@@ -1025,10 +1028,10 @@ ...@@ -1025,10 +1028,10 @@
vmrCUDA, cur_l_rows, & vmrCUDA, cur_l_rows, &
#endif #endif
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
vmrCPU, ubound(vmrCPU,dim=1), & vmrCUDA, ubound(vmrCUDA,dim=1), &
#endif #endif
ZERO, vav, ubound(vav,dim=1)) ZERO, vav, ubound(vav,dim=1))
else else ! useGPU
if (l_rows>0) & if (l_rows>0) &
#if REALCASE == 1 #if REALCASE == 1
call PRECISION_SYRK('U', 'T', & call PRECISION_SYRK('U', 'T', &
...@@ -1073,16 +1076,26 @@ ...@@ -1073,16 +1076,26 @@
endif !useQR endif !useQR
#endif #endif
! Transpose vmr -> vmc (stored in umc, second half) ! Transpose vmr -> vmc (stored in umc, second half)
#if REALCASE == 1
if (useGPU) then if (useGPU) then
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
&PRECISION & &PRECISION &
(vmrCUDA, cur_l_rows, mpi_comm_rows, & #if REALCASE == 1
umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, mpi_comm_cols, & (vmrCUDA, cur_l_rows, &
1, istep*nbw, n_cols, nblk) #endif
else #if COMPLEXCASE == 1
(vmrCUDA, ubound(vmrCUDA,dim=1), &
#endif
mpi_comm_rows, &
#if REALCASE == 1
umcCUDA(cur_l_cols * n_cols + 1), cur_l_cols, &
#endif
#if COMPLEXCASE == 1
umcCUDA(1,n_cols+1), ubound(umcCUDA,dim=1), &
#endif
mpi_comm_cols, 1, istep*nbw, n_cols, nblk)
else ! useGPU
call elpa_transpose_vectors_& call elpa_transpose_vectors_&
&MATH_DATATYPE& &MATH_DATATYPE&
&_& &_&
...@@ -1091,16 +1104,6 @@ ...@@ -1091,16 +1104,6 @@
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, & umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk) 1, istep*nbw, n_cols, nblk)
endif endif
#endif
#if COMPLEXCASE == 1
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(vmrCPU, ubound(vmrCPU,dim=1), mpi_comm_rows, &
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), mpi_comm_cols, &
1, istep*nbw, n_cols, nblk)
#endif
! Calculate umc = A**T * vmr ! Calculate umc = A**T * vmr
! Note that the distributed A has to be transposed ! Note that the distributed A has to be transposed
...@@ -1256,25 +1259,31 @@ ...@@ -1256,25 +1259,31 @@
#endif /* REALCASE == 1 */ #endif /* REALCASE == 1 */
#if COMPLEXCASE == 1 #if COMPLEXCASE == 1
umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0 if (useGPU) then
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0 umcCUDA(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
vmrCUDA(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
else
umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
endif
if (l_cols>0 .and. l_rows>0) then if (l_cols>0 .and. l_rows>0) then
if (useGPU) then if (useGPU) then
if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then ! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size ! print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
stop