Commit 4c4a78c1 authored by Andreas Marek's avatar Andreas Marek

Merge branch 'fix_gpu' into ELPA_GPU

parents 19390d50 cae76d2c
This diff is collapsed.
......@@ -777,11 +777,11 @@ contains
#ifdef DOUBLE_PRECISION_REAL
call trans_ev_band_to_full_real_double(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
mpi_comm_cols, useGPU, useQRActual)
#else
call trans_ev_band_to_full_real_single(na, nev, nblk, nbw, a, a_dev, lda, tmat, tmat_dev, q, q_dev, ldq, &
matrixCols, num_blocks, mpi_comm_rows, &
mpi_comm_cols, useGPU, useQRActual)
mpi_comm_cols, useGPU, useQRActual)
#endif
ttt1 = MPI_Wtime()
......
......@@ -1446,12 +1446,15 @@
stop
endif
! this is not necessart tmat_dev is passed (unchanged) from one routine to the other
! successCUDA = cuda_free(tmat_dev)
! if (.not.(successCUDA)) then
! print *,"bandred_real: error in cudaFree"
! stop
! endif
!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it arround
! this is not necessary tmat_dev is passed (unchanged) from one routine to the other
successCUDA = cuda_free(tmat_dev)
if (.not.(successCUDA)) then
print *,"bandred_real: error in cudaFree"
stop
endif
!#endif
successCUDA = cuda_free(vav_dev)
if (.not.(successCUDA)) then
......@@ -1775,16 +1778,19 @@
stop
endif
!#ifdef WITH_MPI
! it should be possible to keep tmat dev on the device and not copy it around
! already existent on GPU
!#ifdef DOUBLE_PRECISION_REAL
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
!#else
! successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#ifdef DOUBLE_PRECISION_REAL
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_double_real_datatype)
#else
successCUDA = cuda_malloc(tmat_dev, nbw*nbw*size_of_single_real_datatype)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMalloc"
stop
endif
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMalloc"
! stop
! endif
! q_dev already living on device
!#ifdef DOUBLE_PRECISION_REAL
......@@ -1989,19 +1995,21 @@
endif
#endif /* WITH_MPI */
! already existend on GPU
!#ifdef DOUBLE_PRECISION_REAL
!#ifdef WITH_MPI
! it should be possible to keep tmat on the device and not copy it aroud
#ifdef DOUBLE_PRECISION_REAL
! ! copy to device, maybe this can be avoided tmat is input from bandred_real
!
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
!#else
! successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_double_real_datatype,cudaMemcpyHostToDevice)
#else
successCUDA = cuda_memcpy(tmat_dev, loc(tmat(1,1,istep)), nbw*nbw*size_of_single_real_datatype,cudaMemcpyHostToDevice)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
!#endif /* WITH_MPI */
#ifdef DOUBLE_PRECISION_REAL
call cublas_dtrmm('L', 'U', 'T', 'N', n_cols, l_cols, 1.0_rk8, tmat_dev, nbw, tmp_dev, n_cols)
call cublas_dgemm('N', 'N', l_rows, l_cols, n_cols, -1.0_rk8, hvm_dev, max_local_rows, &
......@@ -2012,17 +2020,17 @@
tmp_dev, n_cols, 1.0_rk4, q_dev, ldq)
#endif
!#ifdef DOUBLE_PRECISION_REAL
! ! copy to host maybe this can be avoided
! ! this is not necessary hvm is not used anymore
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
!#else
! successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
!#endif
! if (.not.(successCUDA)) then
! print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
! stop
! endif
#ifdef DOUBLE_PRECISION_REAL
! copy to host maybe this can be avoided
! this is not necessary hvm is not used anymore
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_double_real_datatype),cudaMemcpyDeviceToHost)
#else
successCUDA = cuda_memcpy(loc(hvm), hvm_dev, ((max_local_rows)*nbw*size_of_single_real_datatype),cudaMemcpyDeviceToHost)
#endif
if (.not.(successCUDA)) then
print *,"trans_ev_band_to_full_real: error in cudaMemcpy"
stop
endif
endif ! l_rows > 0
!#ifdef WITH_GPU_VERSION
......@@ -4121,15 +4129,13 @@
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count,&
stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(my_prow), .false.)
call unpack_and_prepare_row_group_real_gpu_double(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.)
#else
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, &
last_stripe_width, a_dim2, l_nev, row_group_size, nblk, &
unpack_idx, i - limits(my_prow), .false.)
stripe_width, last_stripe_width, a_dim2, l_nev, &
row_group_size, nblk, unpack_idx, i - limits(my_prow), .false.)
#endif
#ifdef WITH_MPI
......@@ -4765,16 +4771,16 @@
endif
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, current_local_n, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, current_local_n, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern,stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
......@@ -4794,12 +4800,12 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, current_local_n, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
......@@ -4904,18 +4910,16 @@
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, current_local_n - bottom_msg_length, &
bottom_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
......@@ -4961,14 +4965,15 @@
endif
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
current_local_n - bottom_msg_length, bottom_msg_length, i, &
......@@ -5038,18 +5043,18 @@
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width ,a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
top_msg_length, current_local_n-top_msg_length-bottom_msg_length, i, &
my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, my_thread, &
thread_width, THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width, a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
......@@ -5068,13 +5073,13 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, top_msg_length, &
current_local_n-top_msg_length-bottom_msg_length, i, &
......@@ -5145,16 +5150,16 @@
endif
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_openmp_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, top_msg_length, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_openmp_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
max_threads, l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, &
0, top_msg_length, i, my_thread, thread_width, THIS_REAL_ELPA_KERNEL)
max_threads, l_nev, a_off, nbw, max_blk_size, bcast_buffer, &
bcast_buffer_dev, hh_dot_dev, hh_tau_dev, kernel_flops, &
kernel_time, 0, top_msg_length, i, my_thread, thread_width, &
THIS_REAL_ELPA_KERNEL)
#endif
! call compute_hh_trafo_real_cpu_openmp(aIntern, stripe_width,a_dim2,stripe_count, max_threads, l_nev, &
! a_off, nbw, max_blk_size, bcast_buffer, kernel_flops, kernel_time, &
......@@ -5172,12 +5177,13 @@
#else /* WITH_OPENMP */
#ifdef DOUBLE_PRECISION_REAL
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_double(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
#else
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
call compute_hh_trafo_real_cpu_single(aIntern, aIntern_dev, stripe_width, a_dim2, stripe_count, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, hh_dot_dev, &
hh_tau_dev, kernel_flops, kernel_time, 0, top_msg_length, i, &
last_stripe_width, THIS_REAL_ELPA_KERNEL)
......
......@@ -1411,6 +1411,9 @@ subroutine qr_tmerge_pdlarfb_1dcomm_single(m,mb,n,oldk,k,v,ldv,t,ldt,a,lda,basei
integer(kind=ik) :: mergeoffset,mergelda,mergesize
integer(kind=ik) :: tgenoffset,tgenlda,tgensize
! quickfix
mergeoffset = 0
if (updatemode .eq. ichar('I')) then
updatelda = oldk+k
else
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment