Commit 82e195a5 authored by Andreas Marek

Combine real/complex case

parent 2d7c8e91
@@ -1119,22 +1119,42 @@
 ! Note that the distributed A has to be transposed
 ! Opposed to direct tridiagonalization there is no need to use the cache locality
 ! of the tiles, so we can use strips of the matrix
-#if REALCASE == 1
 ! here the GPU version and CPU version diverged substantially, due to the newest
 ! optimizations due to Intel. The GPU version has to be re-written
 if (useGPU) then
+#if REALCASE == 1
   umcCUDA(1 : l_cols * n_cols) = CONST_0_0
   vmrCUDA(cur_l_rows * n_cols + 1 : cur_l_rows * n_cols * 2) = CONST_0_0
+#endif
+#if COMPLEXCASE == 1
+  umcCUDA(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
+  vmrCUDA(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
+#endif
   if (l_cols>0 .and. l_rows>0) then
+#if REALCASE == 1
     successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1)), vmr_size*size_of_PRECISION_real,cudaMemcpyHostToDevice)
+#endif
+#if COMPLEXCASE == 1
+    successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
+#endif
     if (.not.(successCUDA)) then
-      print *,"bandred_real: error in cudaMemcpy"
+      print *,"bandred_&
+              &MATH_DATATYPE&
+              &: error in cudaMemcpy"
       stop
     endif
+#if REALCASE == 1
     successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1)), umc_size*size_of_PRECISION_real,cudaMemcpyHostToDevice)
+#endif
+#if COMPLEXCASE == 1
+    successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
+#endif
     if (.not.(successCUDA)) then
-      print *,"bandred_real: error in cudaMemcpy"
+      print *,"bandred_&
+              &MATH_DATATYPE&
+              &: error in cudaMemcpy"
       stop
     endif
@@ -1143,32 +1163,93 @@
     lcs = i*l_cols_tile+1
     lce = min(l_cols,(i+1)*l_cols_tile)
     if (lce<lcs) cycle
     call timer%start("cublas")
     lre = min(l_rows,(i+1)*l_rows_tile)
-    call cublas_PRECISION_GEMM('T', 'N', lce-lcs+1, n_cols, lre, &
-                               CONST_1_0, (a_dev + ((lcs-1)*lda*size_of_PRECISION_real)), lda, vmr_dev,cur_l_rows, &
-                               CONST_1_0, (umc_dev+ (lcs-1)*size_of_PRECISION_real), cur_l_cols)
+#if REALCASE == 1
+    call cublas_PRECISION_GEMM('T', 'N', &
+#endif
+#if COMPLEXCASE == 1
+    call cublas_PRECISION_GEMM('C', 'N', &
+#endif
+                               lce-lcs+1, n_cols, lre, &
+                               ONE, (a_dev + ((lcs-1)*lda* &
+#if REALCASE == 1
+                               size_of_PRECISION_real)), &
+#endif
+#if COMPLEXCASE == 1
+                               size_of_PRECISION_complex)),&
+#endif
+                               lda, vmr_dev,cur_l_rows, ONE, &
+                               (umc_dev+ (lcs-1)* &
+#if REALCASE == 1
+                               size_of_PRECISION_real), &
+#endif
+#if COMPLEXCASE == 1
+                               size_of_PRECISION_complex), &
+#endif
+                               cur_l_cols)
     if(i==0) cycle
     lre = min(l_rows,i*l_rows_tile)
-    call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1,&
-                               CONST_1_0, (a_dev+ ((lcs-1)*lda*size_of_PRECISION_real)), lda, &
-                               (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_real), cur_l_cols, &
-                               CONST_1_0, (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_real), cur_l_rows)
+    call cublas_PRECISION_GEMM('N', 'N', lre,n_cols, lce-lcs+1, ONE, &
+                               (a_dev+ ((lcs-1)*lda* &
+#if REALCASE == 1
+                               size_of_PRECISION_real)), &
+#endif
+#if COMPLEXCASE == 1
+                               size_of_PRECISION_complex)), &
+#endif
+                               lda, (umc_dev+(cur_l_cols * n_cols+lcs-1)* &
+#if REALCASE == 1
+                               size_of_PRECISION_real), &
+#endif
+#if COMPLEXCASE == 1
+                               size_of_PRECISION_complex), &
+#endif
+                               cur_l_cols, ONE, (vmr_dev+(cur_l_rows * n_cols)* &
+#if REALCASE == 1
+                               size_of_PRECISION_real), &
+#endif
+#if COMPLEXCASE == 1
+                               size_of_PRECISION_complex), &
+#endif
+                               cur_l_rows)
     call timer%stop("cublas")
   enddo
+#if REALCASE == 1
   successCUDA = cuda_memcpy(loc(vmrCUDA(1)), vmr_dev,vmr_size*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
+#endif
+#if COMPLEXCASE == 1
+  successCUDA = cuda_memcpy(loc(vmrCUDA(1,1)),vmr_dev,vmr_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
+#endif
   if (.not.(successCUDA)) then
-    print *,"bandred_real: error in cudaMemcpy"
+    print *,"bandred_&
+            &MATH_DATATYPE&
+            &: error in cudaMemcpy"
     stop
   endif
+#if REALCASE == 1
   successCUDA = cuda_memcpy(loc(umcCUDA(1)), umc_dev, umc_size*size_of_PRECISION_real,cudaMemcpyDeviceToHost)
+#endif
+#if COMPLEXCASE == 1
+  successCUDA = cuda_memcpy(loc(umcCUDA(1,1)), umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
+#endif
   if (.not.(successCUDA)) then
-    print *,"bandred_real: error in cudaMemcpy"
+    print *,"bandred_&
+            &MATH_DATATYPE&
+            &: error in cudaMemcpy"
     stop
   endif
 endif ! l_cols>0 .and. l_rows>0
+#if REALCASE == 1
 else ! do not useGPU version
   !Code for Algorithm 4
@@ -1265,13 +1346,14 @@
 #ifdef WITH_OPENMP
       !$omp end parallel
 #endif
-      endif ! do not useGPU version
 #endif /* REALCASE == 1 */
+      endif ! do not useGPU version
 #if COMPLEXCASE == 1
       if (useGPU) then
-        umcCUDA(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
-        vmrCUDA(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
+        !umcCUDA(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
+        !vmrCUDA(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
       else
         umcCPU(1:l_cols,1:n_cols) = CONST_COMPLEX_0_0
         vmrCPU(1:l_rows,n_cols+1:2*n_cols) = CONST_COMPLEX_0_0
@@ -1279,25 +1361,25 @@
     if (l_cols>0 .and. l_rows>0) then
       if (useGPU) then
-        ! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
-        ! print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
-        ! stop
-        ! endif
-        successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
-        if (.not.(successCUDA)) then
-          print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
-          stop
-        endif
-        !if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
-        ! print *,"bandred_complex: umc size 2 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
-        ! stop
-        !endif
-        successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
-        if (.not.(successCUDA)) then
-          print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
-          stop
-        endif
+        !! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
+        !! print *,"bandred_complex: vmr size 2 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
+        !! stop
+        !! endif
+        ! successCUDA = cuda_memcpy(vmr_dev, loc(vmrCUDA(1,1)),vmr_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
+        ! if (.not.(successCUDA)) then
+        !   print *, "bandred_complex: cuda memcpy vmr_dev failed ", istat
+        !   stop
+        ! endif
+        ! !if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
+        ! ! print *,"bandred_complex: umc size 2 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
+        ! ! stop
+        ! !endif
+        ! successCUDA = cuda_memcpy(umc_dev, loc(umcCUDA(1,1)),umc_size*size_of_PRECISION_complex,cudaMemcpyHostToDevice)
+        ! if (.not.(successCUDA)) then
+        !   print *, "bandred_complex: cuda memcpy umc_dev failed ", istat
+        !   stop
+        ! endif
       endif
       do i=0,(istep*nbw-1)/tile_size
@@ -1308,11 +1390,11 @@
         lre = min(l_rows,(i+1)*l_rows_tile)
         if (useGPU) then
-          call timer%start("cublas")
-          call cublas_PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, (a_dev + ((lcs-1)*lda* &
-                                     size_of_PRECISION_complex)), lda, &
-                                     vmr_dev, cur_l_rows, ONE, (umc_dev +(lcs-1)*size_of_PRECISION_complex), cur_l_cols)
-          call timer%stop("cublas")
+          !call timer%start("cublas")
+          !call cublas_PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, (a_dev + ((lcs-1)*lda* &
+          !                           size_of_PRECISION_complex)), lda, &
+          !                           vmr_dev, cur_l_rows, ONE, (umc_dev +(lcs-1)*size_of_PRECISION_complex), cur_l_cols)
+          !call timer%stop("cublas")
         else
           call timer%start("blas")
           call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a(1,lcs), ubound(a,dim=1), &
@@ -1323,12 +1405,12 @@
         if (i==0) cycle
         lre = min(l_rows,i*l_rows_tile)
         if (useGPU) then
-          call timer%start("cublas")
-          call cublas_PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, (a_dev+((lcs-1)*lda* &
-                                     size_of_PRECISION_complex)),lda, &
-                                     (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_complex), cur_l_cols,ONE, &
-                                     (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_complex), cur_l_rows)
-          call timer%stop("cublas")
+          !call timer%start("cublas")
+          !call cublas_PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, (a_dev+((lcs-1)*lda* &
+          !                           size_of_PRECISION_complex)),lda, &
+          !                           (umc_dev+(cur_l_cols * n_cols+lcs-1)*size_of_PRECISION_complex), cur_l_cols,ONE, &
+          !                           (vmr_dev+(cur_l_rows * n_cols)*size_of_PRECISION_complex), cur_l_rows)
+          !call timer%stop("cublas")
         else
           call timer%start("blas")
           call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a(1,lcs), lda, &
@@ -1338,24 +1420,24 @@
       enddo
       if (useGPU) then
-        ! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
-        ! print *,"bandred_complex: vmr size 3 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
-        ! stop
-        ! endif
-        successCUDA = cuda_memcpy(loc(vmrCUDA(1,1)),vmr_dev,vmr_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
-        if (.not.(successCUDA)) then
-          print *, "bandred_complex: cuad memcpy failed vmrCUDA ", istat
-          stop
-        endif
-        ! if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
-        ! print *,"bandred_complex: umc size 3 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
-        ! stop
-        ! endif
-        successCUDA = cuda_memcpy(loc(umcCUDA(1,1)), umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
-        if (.not.(successCUDA)) then
-          print *, "bandred_complex: cuad memcpy failed umcCUDA ", istat
-          stop
-        endif
+        !! if (size(vmrCPU,dim=1)*size(vmrCPU,dim=2) .gt. vmr_size) then
+        !! print *,"bandred_complex: vmr size 3 :",size(vmrCPU,dim=1)*size(vmrCPU,dim=2),vmr_size
+        !! stop
+        !! endif
+        ! successCUDA = cuda_memcpy(loc(vmrCUDA(1,1)),vmr_dev,vmr_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
+        ! if (.not.(successCUDA)) then
+        !   print *, "bandred_complex: cuad memcpy failed vmrCUDA ", istat
+        !   stop
+        ! endif
+        ! ! if (size(umcCPU,dim=1)*size(umcCPU,dim=2) .gt. umc_size) then
+        ! ! print *,"bandred_complex: umc size 3 :",size(umcCPU,dim=1)*size(umcCPU,dim=2),umc_size
+        ! ! stop
+        ! ! endif
+        ! successCUDA = cuda_memcpy(loc(umcCUDA(1,1)), umc_dev,umc_size*size_of_PRECISION_complex,cudaMemcpyDeviceToHost)
+        ! if (.not.(successCUDA)) then
+        !   print *, "bandred_complex: cuad memcpy failed umcCUDA ", istat
+        !   stop
+        ! endif
       endif ! useGPU
     endif ! (l_cols>0 .and. l_rows>0)
 #endif /* COMPLEXCASE == 1 */
...
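The change visible in the hunks above is the whole point of the commit: the formerly separate real and complex GPU paths of the band reduction are merged into one template, with #if REALCASE == 1 / #if COMPLEXCASE == 1 guards selecting the datatype-specific pieces (size_of_PRECISION_real vs. size_of_PRECISION_complex, CONST_0_0 vs. CONST_COMPLEX_0_0, 'T' vs. 'C' in the GEMM calls) and the MATH_DATATYPE macro spliced into names and messages through Fortran continuation lines. The sketch below illustrates that preprocessor pattern in isolation; it is a minimal, hypothetical example (demo_MATH_DATATYPE, MY_CONST_ZERO and MY_ELEMENT_SIZE are invented for illustration and are not ELPA identifiers).

! Hypothetical sketch only -- not ELPA code. The file is compiled twice,
! once with -DREALCASE=1 and once with -DCOMPLEXCASE=1.
#if REALCASE == 1
#define MATH_DATATYPE real
#define MY_CONST_ZERO 0.0_8
#define MY_ELEMENT_SIZE 8
#endif
#if COMPLEXCASE == 1
#define MATH_DATATYPE complex
#define MY_CONST_ZERO (0.0_8, 0.0_8)
#define MY_ELEMENT_SIZE 16
#endif

subroutine demo_&
           &MATH_DATATYPE&
           &(n, bytes)
  implicit none
  integer, intent(in)  :: n      ! number of matrix elements
  integer, intent(out) :: bytes  ! bytes those elements occupy on the device

  MATH_DATATYPE(kind=8) :: zero  ! becomes real(kind=8) or complex(kind=8)

  ! a zero of the matching type, analogous to CONST_0_0 vs. CONST_COMPLEX_0_0 above
  zero = MY_CONST_ZERO

  ! one error message serves both instantiations: the macro sits alone on a
  ! continuation line, so the preprocessor substitutes it even though it ends up
  ! inside a string, like the "bandred_& &MATH_DATATYPE&" print statements above
  if (n < 0) then
    print *, "demo_&
             &MATH_DATATYPE&
             &: invalid size"
    stop
  endif

  ! per-datatype element size, mirroring size_of_PRECISION_real/_complex
  bytes = n * MY_ELEMENT_SIZE
end subroutine

Compiling the same file once with -DREALCASE=1 and once with -DCOMPLEXCASE=1 then yields demo_real and demo_complex from a single source, which is the mechanism this section of the band reduction is being moved onto.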