elpa / Commits

Commit fa6778b7, authored Aug 31, 2017 by Andreas Marek

Remove complex GPU call from OpenMP region

This closes issue #51.

parent 8638a970
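The change follows a simple pattern: a CUDA runtime call that is not safe to issue from multiple threads is hoisted out of the OpenMP parallel region and made once, by a single thread, before the parallel work starts. A minimal sketch of that pattern (not the actual ELPA code; device_copy is a hypothetical stand-in for the real cuda_memcpy wrapper):

      ! Sketch only: hoisting a device call out of an OpenMP region.
      program hoist_gpu_call
        implicit none
        integer :: i
        real    :: work(8) = 1.0

        ! Issue the (stand-in) device call once, outside the parallel region,
        ! instead of letting every OpenMP thread call into the CUDA runtime.
        call device_copy(work)

        !$omp parallel do
        do i = 1, size(work)
          work(i) = work(i) * 2.0   ! host-side work stays parallel
        end do
        !$omp end parallel do

        print *, work
      contains
        subroutine device_copy(buf)
          real, intent(inout) :: buf(:)
          buf = 0.0   ! placeholder for a real host<->device transfer
        end subroutine device_copy
      end program hoist_gpu_call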
Changes: 2 files   Pipelines: 1
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90
@@ -105,12 +105,12 @@
      integer(kind=ik), intent(in)                 :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef USE_ASSUMED_SIZE
      MATH_DATATYPE(kind=rck)                      :: q(ldq,*)
#else
      MATH_DATATYPE(kind=rck)                      :: q(ldq,matrixCols)
#endif
      MATH_DATATYPE(kind=rck), intent(in)          :: hh_trans(:,:)

      integer(kind=c_intptr_t)                     :: q_dev
      integer(kind=ik)                             :: np_rows, my_prow, np_cols, my_pcol
@@ -130,86 +130,75 @@
      logical                                      :: flag
#ifdef WITH_OPENMP
      MATH_DATATYPE(kind=rck), pointer             :: aIntern(:,:,:,:)
#else
      MATH_DATATYPE(kind=rck), pointer             :: aIntern(:,:,:)
#endif
      MATH_DATATYPE(kind=rck)                      :: a_var

      type(c_ptr)                                  :: aIntern_ptr

      MATH_DATATYPE(kind=rck), allocatable         :: row(:)
      MATH_DATATYPE(kind=rck), allocatable         :: row_group(:,:)

#ifdef WITH_OPENMP
      MATH_DATATYPE(kind=rck), allocatable         :: top_border_send_buffer(:,:), top_border_recv_buffer(:,:)
      MATH_DATATYPE(kind=rck), allocatable         :: bottom_border_send_buffer(:,:), bottom_border_recv_buffer(:,:)
#else
      MATH_DATATYPE(kind=rck), allocatable         :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable         :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif

      integer(kind=c_intptr_t)                     :: aIntern_dev
      integer(kind=c_intptr_t)                     :: bcast_buffer_dev
      integer(kind=c_intptr_t)                     :: num
      integer(kind=c_intptr_t)                     :: dev_offset, dev_offset_1, dev_offset_2
      integer(kind=c_intptr_t)                     :: row_dev
      integer(kind=c_intptr_t)                     :: row_group_dev
      integer(kind=c_intptr_t)                     :: hh_tau_dev
      integer(kind=c_intptr_t)                     :: hh_dot_dev
      integer(kind=ik)                             :: row_group_size, unpack_idx

      integer(kind=ik)                             :: n_times
      integer(kind=ik)                             :: top, chunk, this_chunk

      MATH_DATATYPE(kind=rck), allocatable         :: result_buffer(:,:,:)
      MATH_DATATYPE(kind=rck), allocatable         :: bcast_buffer(:,:)

      integer(kind=ik)                             :: n_off

      integer(kind=ik), allocatable                :: result_send_request(:), result_recv_request(:), limits(:)
      integer(kind=ik), allocatable                :: top_send_request(:), bottom_send_request(:)
      integer(kind=ik), allocatable                :: top_recv_request(:), bottom_recv_request(:)
-#ifdef WITH_OPENMP
-     ! integer(kind=ik), allocatable              :: mpi_statuses(:,:)
-#endif
-#ifdef WITH_OPENMP
-#ifdef WITH_MPI
-     ! integer(kind=ik)                           :: my_MPI_STATUS_(MPI_STATUS_SIZE)
-#endif
-#endif

      ! MPI send/recv tags, arbitrary
      integer(kind=ik), parameter                  :: bottom_recv_tag = 111
      integer(kind=ik), parameter                  :: top_recv_tag    = 222
      integer(kind=ik), parameter                  :: result_recv_tag = 333

#ifdef WITH_OPENMP
      integer(kind=ik)                             :: max_threads, my_thread
      integer(kind=ik)                             :: omp_get_max_threads
#endif

      ! Just for measuring the kernel performance
      real(kind=c_double)                          :: kernel_time, kernel_time_recv  ! MPI_WTIME always needs double
      ! long integer
      integer(kind=lik)                            :: kernel_flops, kernel_flops_recv

      logical, intent(in)                          :: wantDebug
      logical                                      :: success
      integer(kind=ik)                             :: istat, print_flops
      character(200)                               :: errorMessage
      logical                                      :: successCUDA
#ifndef WITH_MPI
      integer(kind=ik)                             :: j1
#endif
      integer(kind=c_intptr_t), parameter          :: size_of_datatype = size_of_&
                                                      &PRECISION&
                                                      &_&
                                                      &MATH_DATATYPE

      call obj%timer%start("trans_ev_tridi_to_band_&
      &MATH_DATATYPE&
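The size_of_&&PRECISION&&_&&MATH_DATATYPE construct above is how these template files splice preprocessor macros into a single Fortran identifier: cpp first expands PRECISION and MATH_DATATYPE, then the & continuation markers join the fragments into one name such as size_of_double_real. A small self-contained demonstration of the mechanism (the macro values and the byte size are assumptions for illustration):

      ! Compile with a preprocessing compiler, e.g. gfortran -cpp splice_demo.F90
#define MATH_DATATYPE real
#define PRECISION double
      program splice_demo
        implicit none
        ! The spliced identifier below resolves to this constant:
        integer, parameter :: size_of_double_real = 8   ! assumed: bytes per double
        integer :: size_of_datatype
        ! cpp expands the macros, then the & continuations join the pieces
        ! into the single name size_of_double_real.
        size_of_datatype = size_of_&
                           &PRECISION&
                           &_&
                           &MATH_DATATYPE
        print *, size_of_datatype
      end program splice_demo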
@@ -1434,7 +1423,7 @@
              ! host_offset= (0 + (0 * stripe_width) + ( (i-1) * stripe_width * nbw ) ) * 8
              successCUDA = cuda_memcpy(aIntern_dev + dev_offset, loc(top_border_recv_buffer(1,1,i)), &
                                        stripe_width * top_msg_length * size_of_datatype, &
                                        cudaMemcpyHostToDevice)
              if (.not.(successCUDA)) then
                print *, "trans_ev_tridi_to_band_&
                &MATH_DATATYPE&
@@ -1485,7 +1474,7 @@
              &(obj, useGPU, wantDebug, aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
                a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
#if REALCASE == 1
                hh_dot_dev, &
#endif
                hh_tau_dev, kernel_flops, kernel_time, n_times, 0, current_local_n, i, &
                last_stripe_width, kernel)
@@ -1784,7 +1773,7 @@
&
(
obj
,
useGPU
,
wantDebug
,
aIntern
,
aIntern_dev
,
stripe_width
,
a_dim2
,
stripe_count
,
&
a_off
,
nbw
,
max_blk_size
,
bcast_buffer
,
bcast_buffer_dev
,
&
#if REALCASE == 1
hh_dot_dev
,
&
hh_dot_dev
,
&
#endif
hh_tau_dev
,
kernel_flops
,
kernel_time
,
n_times
,
0
,
top_msg_length
,
i
,
&
last_stripe_width
,
kernel
)
...
...
@@ -1862,7 +1851,7 @@
              dev_offset = (0 + (a_off * stripe_width) + ((i-1) * stripe_width * a_dim2)) * size_of_datatype
              successCUDA = cuda_memcpy(loc(top_border_send_buffer(:,1,i)), aIntern_dev + dev_offset, &
                                        stripe_width * nbw * size_of_datatype, &
                                        cudaMemcpyDeviceToHost)
              if (.not.(successCUDA)) then
                print *, "trans_ev_tridi_to_band_&
                &MATH_DATATYPE&
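The dev_offset expression above computes a byte offset into the device copy of the stripe array aIntern(stripe_width, a_dim2, stripe_count): in column-major order it skips a_off columns inside stripe i plus i-1 whole stripes, then scales by the element size. A worked example with assumed dimensions (illustration only, not ELPA values):

      ! Worked example of the byte-offset arithmetic (assumed sizes).
      program offset_demo
        use iso_c_binding, only: c_intptr_t
        implicit none
        integer, parameter       :: stripe_width = 64, a_dim2 = 128
        integer, parameter       :: size_of_datatype = 8   ! assumed: double precision real
        integer                  :: i, a_off
        integer(kind=c_intptr_t) :: dev_offset

        i     = 3    ! stripe index
        a_off = 16   ! column offset within the stripe

        ! Column-major: skip a_off columns of stripe_width elements, plus
        ! (i-1) whole stripes of stripe_width*a_dim2 elements, then bytes.
        dev_offset = (0 + (a_off * stripe_width) + ((i - 1) * stripe_width * a_dim2)) &
                     * int(size_of_datatype, kind=c_intptr_t)

        print *, 'dev_offset in bytes:', dev_offset   ! 8 * (1024 + 16384) = 139264
      end program offset_demo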
@@ -1983,13 +1972,13 @@
          else ! (dst == 0)
            if (useGPU) then
              call pack_row_group_&
                   &MATH_DATATYPE&
                   &_gpu_&
                   &PRECISION&
                   &(row_group_dev, aIntern_dev, stripe_count, stripe_width, &
                     last_stripe_width, a_dim2, l_nev, &
                     result_buffer(:, :, nbuf), j * nblk + a_off, nblk)
            else ! useGPU
              do i = 1, nblk
@@ -2338,6 +2327,7 @@
if
(
useGPU
)
then
#if COMPLEXCASE == 1
! should this not hbe done always?
successCUDA
=
cuda_free
(
aIntern_dev
)
if
(
.not.
(
successCUDA
))
then
print
*
,
"trans_ev_tridi_to_band_complex: error in cudaFree"
...
...
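Every CUDA wrapper call in these files follows the same status-check idiom seen above: capture the logical result, test it, print a message naming the routine, and stop. A stripped-down sketch of the idiom (cuda_free_stub is a hypothetical stand-in for the real ISO-C-bound wrapper):

      ! Sketch of the successCUDA check idiom used throughout the diff.
      program cuda_check_demo
        implicit none
        logical :: successCUDA

        successCUDA = cuda_free_stub()
        if (.not.(successCUDA)) then
          print *, "trans_ev_tridi_to_band_complex: error in cudaFree"
          stop 1
        endif
      contains
        ! Hypothetical stand-in: a real wrapper would call cudaFree via
        ! ISO C binding and return (status == cudaSuccess).
        logical function cuda_free_stub() result(ok)
          ok = .true.
        end function cuda_free_stub
      end program cuda_check_demo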
src/elpa2/pack_unpack_gpu.F90
@@ -63,7 +63,7 @@
      real(kind=C_DATATYPE_KIND)    :: rows(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=C_DATATYPE_KIND) :: rows(:,:)
#endif
      integer(kind=ik)              :: max_idx
      logical                       :: successCUDA
@@ -97,7 +97,6 @@
        &: error in cudaMemcpy"
        stop 1
      endif
-     !write(*,*) cudaGetErrorString(istat)
    end subroutine
@@ -133,8 +132,6 @@
      ! Issue one single transfer call for all rows (host to device)
      ! row_group_dev(:, 1 : row_count) = rows(:, 1 : row_count)
-     !istat = cuda_memcpy( row_group_dev , loc(rows(:, 1: row_count)), row_count * l_nev * size_of_double_real_datatype , &
-     !                     cudaMemcpyHostToDevice)
      successCUDA = cuda_memcpy(row_group_dev, loc(rows(1, 1)), row_count * l_nev * &
                                size_of_&
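The commented-out per-slice copy above was superseded by a single cuda_memcpy of row_count * l_nev elements: because rows(:, 1:row_count) is contiguous in column-major order, one transfer starting at rows(1,1) moves the whole row group and avoids per-row call overhead. A sketch of the idea, with a hypothetical copy_elements standing in for the transfer wrapper:

      ! Sketch: one contiguous copy for a whole row group vs. per-row copies.
      program batch_copy_demo
        implicit none
        integer, parameter :: l_nev = 4, row_count = 3
        real    :: rows(l_nev, row_count), dev(l_nev * row_count)
        integer :: j

        rows = 1.0

        ! Per-row variant: row_count separate transfers (more call overhead).
        do j = 1, row_count
          call copy_elements(dev((j-1)*l_nev + 1 : j*l_nev), rows(:, j), l_nev)
        end do

        ! Batched variant: rows(:, 1:row_count) is contiguous, so a single
        ! transfer of row_count*l_nev elements suffices.
        call copy_elements(dev, reshape(rows, [l_nev*row_count]), l_nev*row_count)

        print *, dev
      contains
        subroutine copy_elements(dst, src, n)   ! stand-in for cuda_memcpy
          integer, intent(in)  :: n
          real,    intent(out) :: dst(n)
          real,    intent(in)  :: src(n)
          dst = src
        end subroutine copy_elements
      end program batch_copy_demo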
@@ -150,7 +147,6 @@
        &: error in cudaMemcpy"
        stop 1
      endif
-     !write(*,*) cudaGetErrorString(istat)

      ! Use one kernel call to pack the entire row group
      ! call my_unpack_kernel<<<grid_size, stripe_width>>>(n_offset, max_idx, stripe_width, a_dim2, stripe_count, row_group_dev, a_dev)
@@ -178,10 +174,10 @@
      use precision
      implicit none
#if REALCASE == 1
      real(kind=C_DATATYPE_KIND)    :: row_group(:,:)
#endif
#if COMPLEXCASE == 1
      complex(kind=C_DATATYPE_KIND) :: row_group(:,:)
#endif
      integer(kind=c_intptr_t)      :: row_group_dev, a_dev
      integer(kind=ik), intent(in)  :: stripe_count, stripe_width, last_stripe_width, a_dim2, l_nev