Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
elpa
elpa
Commits
4c4a78c1
Commit
4c4a78c1
authored
Jun 16, 2016
by
Andreas Marek
Browse files
Merge branch 'fix_gpu' into ELPA_GPU
parents
19390d50
cae76d2c
Changes
4
Pipelines
1
Expand all
Hide whitespace changes
Inline
Side-by-side
.gitlab-ci.yml
View file @
4c4a78c1
This diff is collapsed.
Click to expand it.
src/elpa2.F90
View file @
4c4a78c1
...
...
@@ -777,11 +777,11 @@ contains
#ifdef DOUBLE_PRECISION_REAL
call
trans_ev_band_to_full_real_double
(
na
,
nev
,
nblk
,
nbw
,
a
,
a_dev
,
lda
,
tmat
,
tmat_dev
,
q
,
q_dev
,
ldq
,
&
matrixCols
,
num_blocks
,
mpi_comm_rows
,
&
mpi_comm_cols
,
useGPU
,
useQRActual
)
mpi_comm_cols
,
useGPU
,
useQRActual
)
#else
call
trans_ev_band_to_full_real_single
(
na
,
nev
,
nblk
,
nbw
,
a
,
a_dev
,
lda
,
tmat
,
tmat_dev
,
q
,
q_dev
,
ldq
,
&
matrixCols
,
num_blocks
,
mpi_comm_rows
,
&
mpi_comm_cols
,
useGPU
,
useQRActual
)
mpi_comm_cols
,
useGPU
,
useQRActual
)
#endif
ttt1
=
MPI_Wtime
()
...
...
src/elpa2_compute_real_template.X90
View file @
4c4a78c1
...
...
@@ -1446,12 +1446,15 @@
stop
endif
! this is not necessart tmat_dev is passed (unchanged) from one routine to the other
! successCUDA = cuda_free(tmat_dev)
! if (.not.(successCUDA)) then
! print *,"bandred_real: error in cudaFree"
! stop
! endif
!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it arround
! this is not necessary tmat_dev is passed (unchanged) from one routine to the other
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
!#endif
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
...
...
@@ -1775,16 +1778,19 @@
stop
endif
!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it around
! already existent on GPU
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_dev already living on device
!#ifdef DOUBLE_PRECISION_REAL
...
...
@@ -1989,19 +1995,21 @@
endif
#endif /* WITH_MPI */
!
already existend on GPU
!
#ifdef DOUBLE_PRECISION_REAL
!#ifdef WITH_MPI
!
it should be possible to keep tmat on the device and not copy it aroud
#ifdef DOUBLE_PRECISION_REAL
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
!#endif /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
...
...
@@ -2012,17 +2020,17 @@
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
#endif
!
#ifdef DOUBLE_PRECISION_REAL
!
! copy to host maybe this can be avoided
!
! this is not necessary hvm is not used anymore
!
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
!
#else
!
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
!
#endif
!
if (.not.(successCUDA)) then
!
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
!
stop
!
endif
#ifdef DOUBLE_PRECISION_REAL
! copy to host maybe this can be avoided
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
...
...
@@ -4121,15 +4129,13 @@
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count,&
stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(my_prow), .false.)
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.)
#else
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(my_prow), .false.)
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.)
#endif
#ifdef WITH_MPI
...
...
@@ -4765,16 +4771,16 @@
endif
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops,
kernel_time, 0, current_local_n, i,
&
my_thread, thread_width,
THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, 0, current_local_n, i,
my_thread, thread_width,
&
THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops,
kernel_time, 0, current_local_n, i,
&
my_thread, thread_width,
THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, 0, current_local_n, i,
my_thread, thread_width,
&
THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern,stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
...
...
@@ -4794,12 +4800,12 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
...
...
@@ -4904,18 +4910,16 @@
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
...
...
@@ -4961,14 +4965,15 @@
endif
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
...
...
@@ -5038,18 +5043,18 @@
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, &
max_threads, l_nev,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops, kernel_time,
&
top_msg_length,
current_local_n-top_msg_length-bottom_msg_length, i,
&
my_thread,
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, top_msg_length,
&
current_local_n-top_msg_length-bottom_msg_length, i,
my_thread,
&
thread_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, &
max_threads, l_nev,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops, kernel_time,
&
top_msg_length,
current_local_n-top_msg_length-bottom_msg_length, i,
&
my_thread,
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, top_msg_length,
&
current_local_n-top_msg_length-bottom_msg_length, i,
my_thread,
&
thread_width, THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width, a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
...
...
@@ -5068,13 +5073,13 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
...
...
@@ -5145,16 +5150,16 @@
endif
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops, kernel_time,
&
0, top_msg_length, i, my_thread, thread_width,
THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, 0, top_msg_length, i, my_thread, thread_width,
&
THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer
_dev, hh_
dot
_dev, &
hh_tau_dev, kernel_flops, kernel_time,
&
0, top_msg_length, i, my_thread, thread_width,
THIS_REAL_ELPA_KERNEL)
max_threads, l_nev,
a_off, nbw, max_blk_size, bcast_buffer,
&
bcast_buffer_dev, hh_dot
_dev, hh_
tau
_dev,
kernel_flops,
&
kernel_time, 0, top_msg_length, i, my_thread, thread_width,
&
THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
...
...
@@ -5172,12 +5177,13 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count,
&
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
...
...
src/elpa_qr/elpa_pdlarfb.F90
View file @
4c4a78c1
...
...
@@ -1411,6 +1411,9 @@ subroutine qr_tmerge_pdlarfb_1dcomm_single(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,basei
integer
(
kind
=
ik
)
::
mergeoffset
,
mergelda
,
mergesize
integer
(
kind
=
ik
)
::
tgenoffset
,
tgenlda
,
tgensize
! quickfix
mergeoffset
=
0
if
(
updatemode
.eq.
ichar
(
'I'
))
then
updatelda
=
oldk
+
k
else
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment