Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
elpa
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
11
Issues
11
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Environments
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
elpa
elpa
Commits
3ec9e836
Commit
3ec9e836
authored
Aug 31, 2017
by
Andreas Marek
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Try to run QR and GPU code
parent
8596c33e
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
59 additions
and
18 deletions
+59
-18
src/elpa2/elpa2_bandred_template.F90
src/elpa2/elpa2_bandred_template.F90
+59
-18
No files found.
src/elpa2/elpa2_bandred_template.F90
View file @
3ec9e836
...
...
@@ -159,9 +159,9 @@
#endif
integer
(
kind
=
ik
)
::
ierr
integer
(
kind
=
ik
)
::
cur_l_rows
,
cur_l_cols
,
vmr_size
,
umc_size
integer
(
kind
=
c_intptr_t
)
::
lc_start
,
lc_end
integer
(
kind
=
c_intptr_t
)
::
lc_start
,
lc_end
#if COMPLEXCASE == 1
integer
(
kind
=
c_intptr_t
)
::
lce_1
,
lcs_1
,
lre_1
integer
(
kind
=
c_intptr_t
)
::
lce_1
,
lcs_1
,
lre_1
#endif
integer
(
kind
=
ik
)
::
lr_end
integer
(
kind
=
ik
)
::
na_cols
...
...
@@ -179,17 +179,32 @@
logical
,
intent
(
in
)
::
useQR
#endif
integer
(
kind
=
ik
)
::
mystart
,
myend
,
m_way
,
n_way
,
work_per_thread
,
m_id
,
n_id
,
n_threads
,
&
ii
,
pp
ii
,
pp
,
transformChunkSize
integer
(
kind
=
c_intptr_t
),
parameter
::
size_of_datatype
=
size_of_
&
&
PRECISION
&
&
_
&
&
MATH_DATATYPE
logical
::
useGPU_reduction_lower_block_to_tridiagonal
call
obj
%
timer
%
start
(
"bandred_&
&MATH_DATATYPE&
&"
//
&
&
PRECISION_SUFFIX
&
)
if
(
useGPU
)
then
useGPU_reduction_lower_block_to_tridiagonal
=
.true.
#if REALCASE == 1
if
(
useQR
)
then
!in this case switch off GPU usage for step "reduce current block to lower triangular form"
! since this is done by QR decomposition
useGPU_reduction_lower_block_to_tridiagonal
=
.false.
endif
#endif
endif
if
(
wantDebug
)
call
obj
%
timer
%
start
(
"mpi_communication"
)
call
mpi_comm_rank
(
mpi_comm_rows
,
my_prow
,
mpierr
)
...
...
@@ -206,18 +221,18 @@
if
(
my_prow
==
0
.and.
my_pcol
==
0
)
then
if
(
wantDebug
)
then
write
(
error_unit
,
*
)
'ELPA2_bandred_&
&MATH_DATATYPE&
&: ERROR: nbw='
,
nbw
,
', nblk='
,
nblk
&MATH_DATATYPE&
&: ERROR: nbw='
,
nbw
,
', nblk='
,
nblk
write
(
error_unit
,
*
)
'ELPA2_bandred_&
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
&MATH_DATATYPE&
&: ELPA2 works only for nbw==n*nblk'
endif
success
=
.false.
return
endif
endif
! na_rows is used nowhere; only na_cols
! na_rows is used nowhere; only na_cols
if
(
useGPU
)
then
#ifdef WITH_MPI
#if COMPLEXCASE == 1
...
...
@@ -268,11 +283,6 @@
#if REALCASE == 1
if
(
useQR
)
then
if
(
useGPU
)
then
print
*
,
"qr decomposition at the moment not supported with GPU"
stop
1
endif
if
(
which_qr_decomposition
==
1
)
then
call
qr_pqrparam_init
(
obj
,
pqrparam
(
1
:
11
),
nblk
,
'M'
,
0
,
nblk
,
'M'
,
0
,
nblk
,
'M'
,
1
,
's'
)
allocate
(
tauvector
(
na
),
stat
=
istat
,
errmsg
=
errorMessage
)
...
...
@@ -533,6 +543,10 @@
! Reduce current block to lower triangular form
#if REALCASE == 1
if
(
useQR
)
then
if
(
useGPU
)
then
! vmrCPU(1:cur_l_rows,1:n_cols) = vmrCUDA(1 : cur_l_rows * n_cols)
endif
if
(
which_qr_decomposition
==
1
)
then
vmrCols
=
2
*
n_cols
#ifdef USE_ASSUMED_SIZE_QR
...
...
@@ -637,7 +651,7 @@
#endif /* WITH_MPI */
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
vmrCUDA
(
cur_l_rows
*
(
lc
-
1
)
+
1
:
cur_l_rows
*
(
lc
-
1
)
+
lr
)
=
vr
(
1
:
lr
)
else
vmrCPU
(
1
:
lr
,
lc
)
=
vr
(
1
:
lr
)
...
...
@@ -815,7 +829,7 @@
#endif /* WITH_OPENMP */
enddo
! lc
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
! store column tiles back to GPU
cur_pcol
=
pcol
(
istep
*
nbw
+1
,
nblk
,
np_cols
)
if
(
my_pcol
==
cur_pcol
)
then
...
...
@@ -841,7 +855,7 @@
vav
=
0
call
obj
%
timer
%
start
(
"blas"
)
if
(
useGPU
)
then
if
(
useGPU
_reduction_lower_block_to_tridiagonal
)
then
if
(
l_rows
>
0
)
&
#if REALCASE == 1
call
PRECISION_SYRK
(
'U'
,
'T'
,
&
...
...
@@ -853,7 +867,7 @@
vmrCUDA
,
cur_l_rows
,
&
ZERO
,
vav
,
ubound
(
vav
,
dim
=
1
))
else
! useGPU
else
! useGPU
_reduction_to_tridiagonal
if
(
l_rows
>
0
)
&
#if REALCASE == 1
call
PRECISION_SYRK
(
'U'
,
'T'
,
&
...
...
@@ -892,6 +906,33 @@
#if REALCASE == 1
endif
!useQR
#endif
#if REALCASE == 1
if
(
useGPU
.and.
useQR
)
then
! copy the data for further usage
! qr worked on *CPU arrays
!vmrCUDA(1:cur_l_rows * n_cols) = vmrCPU(1:cur_l_rows,1:n_cols)
cur_pcol
=
pcol
(
istep
*
nbw
+1
,
nblk
,
np_cols
)
if
(
my_pcol
==
cur_pcol
)
then
successCUDA
=
cuda_memcpy2d
((
a_dev
+
&
int
(((
lc_start
-1
)
*
lda
*
size_of_datatype
),
kind
=
c_intptr_t
)),
&
int
(
lda
*
size_of_datatype
,
kind
=
c_intptr_t
),
loc
(
a
(
1
,
lc_start
)),
&
int
(
lda
*
size_of_datatype
,
kind
=
c_intptr_t
),
&
int
(
lr_end
*
size_of_datatype
,
kind
=
c_intptr_t
),
&
int
((
lc_end
-
lc_start
+1
),
kind
=
c_intptr_t
),
&
int
(
cudaMemcpyHostToDevice
,
kind
=
c_int
))
if
(
.not.
(
successCUDA
))
then
print
*
,
"bandred_&
&MATH_DATATYPE&
&: cuda memcpy a_dev failed "
,
istat
stop
1
endif
endif
endif
#endif
! Transpose vmr -> vmc (stored in umc, second half)
if
(
useGPU
)
then
call
elpa_transpose_vectors_
&
...
...
@@ -1517,7 +1558,7 @@
endif
endif
!useGPU
enddo
! istep
enddo
! istep
- loop
if
(
useGPU
)
then
successCUDA
=
cuda_free
(
vav_dev
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment