Unverified Commit 08086f30 authored by Andreas Marek's avatar Andreas Marek
Browse files

Better comments

- add markers for if / endif clauses
- fix intendation for variable definitions
parent 5c1c9230
......@@ -1263,7 +1263,8 @@
implicit none
logical, intent(in) :: useGPU
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, &
mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
......@@ -3001,10 +3002,6 @@
end subroutine tridiag_band_complex_single ! has to be checked for GPU
#endif
#define ATODEV istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
#define ATOHOST istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
......@@ -3298,7 +3295,9 @@
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
#endif
#ifdef WITH_OPENMP
if (useGPU) then
......@@ -3442,7 +3441,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -3459,7 +3458,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#endif /* WITH_OPENMP */
......@@ -3543,7 +3542,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -3562,7 +3561,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -3597,7 +3596,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -3616,7 +3615,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -3747,7 +3746,8 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
......@@ -3765,7 +3765,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4269,7 +4269,8 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
......@@ -4285,7 +4286,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4427,7 +4428,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4456,7 +4457,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
if (useGPU) then
......@@ -4601,7 +4602,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4625,7 +4626,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4775,7 +4776,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4800,7 +4801,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4891,7 +4892,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4920,7 +4921,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......
......@@ -192,7 +192,7 @@
! na_rows = na
na_cols = na
#endif
endif
endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
......@@ -423,7 +423,9 @@
endif
endif
else ! GPU not used
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
......@@ -443,7 +445,9 @@
print *,"bandred_real: error when allocating vr "//errorMessage
stop
endif
endif ! use GPU
#ifdef DOUBLE_PRECISION_REAL
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = 0._rk8
......@@ -547,6 +551,7 @@
#endif /* DOUBLE_PRECISION_REAL */
endif
else !useQR
do lc = n_cols, 1, -1
......@@ -706,7 +711,9 @@
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
......@@ -919,6 +926,7 @@
endif ! l_cols>0 .and. l_rows>0
else ! do not useGPU version
!Code for Algorithm 4
n_way = 1
......@@ -1338,7 +1346,9 @@
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &
1, istep*nbw, n_cols, nblk)
#endif
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
n_threads = omp_get_num_threads()
......@@ -1378,7 +1388,9 @@
#endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
......@@ -1727,6 +1739,7 @@
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
max_blocks_row = ((na -1)/nblk)/np_rows + 1 ! Rows of A
max_blocks_col = ((nqc-1)/nblk)/np_cols + 1 ! Columns of q!
......@@ -1734,6 +1747,7 @@
max_local_cols = max_blocks_col*nblk
if (useGPU) then
! here the GPU and CPU version diverged: the CPU version now always uses the useQR path which
! is not implemented in the GPU version
allocate(tmp1(max_local_cols*nbw), stat=istat, errmsg=errorMessage)
......@@ -2466,6 +2480,7 @@
global_id(:,:) = 0
global_id(my_prow, my_pcol) = my_pe
#ifdef WITH_OPENMP
allocate(global_id_tmp(0:np_rows-1,0:np_cols-1), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
......@@ -2489,6 +2504,7 @@
#endif /* WITH_OPENMP */
#endif /* WITH_MPI */
! Total number of blocks in the band:
nblocks_total = (na-1)/nb + 1
......@@ -2686,6 +2702,7 @@
hv_t = 0
tau_t = 0
#endif /* WITH_OPENMP */
! ---------------------------------------------------------------------------
! Start of calculations
......@@ -3541,11 +3558,13 @@
integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
integer(kind=ik) :: a_off, current_tv_off, max_blk_size
integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
#ifdef WITH_OPENMP
#ifdef WITH_MPI
integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
#endif
#endif
logical :: flag
#ifdef WITH_OPENMP
......@@ -3772,7 +3791,7 @@
aIntern(:,:,:) = 0._rk4
#endif
#endif
#endif /* WITH_OPENMP */
endif !useGPU
allocate(row(l_nev), stat=istat, errmsg=errorMessage)
......@@ -3809,6 +3828,7 @@
print *,"trans_ev_tridi_to_band_real: error when allocating row_group"//errorMessage
stop
endif
#ifdef DOUBLE_PRECISION_REAL
row_group(:, :) = 0._rk8
......@@ -3901,7 +3921,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -3921,7 +3941,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -3972,6 +3992,7 @@
endif
#endif /* WITH_OPENMP */
elseif (src==my_prow) then
src_offset = src_offset+1
if (.not.(useGPU)) row(:) = q(src_offset, 1:l_nev)
......@@ -3984,8 +4005,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
......@@ -4009,9 +4029,10 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
......@@ -4032,9 +4053,12 @@
call unpack_row_real_cpu_single(aIntern, row,i-limits(ip), stripe_count, stripe_width, last_stripe_width)
#endif
endif
#endif /* WITH_OPENMP */
endif
enddo
! Send all rows which have not yet been send
src_offset = 0
do dst = 0, ip-1
......@@ -4104,7 +4128,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4126,6 +4150,7 @@
#endif
#else /* WITH_OPENMP */
if (useGPU) then
! An unpacking of the current row group may occur before queuing the next row
#ifdef DOUBLE_PRECISION_REAL
......@@ -4188,7 +4213,6 @@
call unpack_and_prepare_row_group_real_gpu_single(row_group, row_group_dev, aIntern_dev, stripe_count, &
stripe_width, last_stripe_width, &
a_dim2, l_nev, row_group_size, nblk, unpack_idx, -1, .true.)
#endif
successCUDA = cuda_devicesynchronize()
......@@ -4322,7 +4346,9 @@
#endif
! Initialize broadcast buffer
#else /* WITH_OPENMP */
allocate(top_border_send_buffer(stripe_width, nbw, stripe_count), stat=istat, errmsg=errorMessage)
if (istat .ne. 0) then
print *,"trans_ev_tridi_to_band_real: error when allocating top_border_send_bufer"//errorMessage
......@@ -4463,7 +4489,9 @@
if (sweep==0 .and. current_n_end < current_n .and. l_nev > 0) then
do i = 1, stripe_count
#ifdef WITH_OPENMP
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
......@@ -4504,6 +4532,7 @@
#endif /* WITH_MPI */
#endif /* WITH_OPENMP */
enddo
endif
......@@ -4544,7 +4573,8 @@
#endif
endif
else
else ! (current_local_n > 1) then
! for current_local_n == 1 the one and only HH vector is 0 and not stored in hh_trans_real
#ifdef DOUBLE_PRECISION_REAL
bcast_buffer(:,1) = 0._rk8
......@@ -4569,7 +4599,7 @@
call extract_hh_tau_real_gpu_single(bcast_buffer_dev, hh_tau_dev, nbw, 1, .true.)
#endif
endif
endif
endif ! (current_local_n > 1) then
if (l_nev == 0) cycle
......@@ -4608,7 +4638,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4625,7 +4655,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4749,13 +4779,15 @@
!compute
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
call timer%start("OpenMP parallel_double")
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
if (useGPU) then
print *,"trans_ev_tridi_to_band_real: not yet implemented"
stop
......@@ -4795,7 +4827,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -4904,7 +4936,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -4933,7 +4965,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!send_b
#ifdef WITH_MPI
......@@ -5037,7 +5069,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -5068,7 +5100,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -5138,7 +5170,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -5172,7 +5204,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
......@@ -5444,7 +5476,7 @@
call pack_row_real_cpu_double(aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif
#else
#else /* DOUBLE_PRECISION_REAL */
#ifdef WITH_OPENMP
call pack_row_real_cpu_openmp_single(aIntern, row, j*nblk+i+a_off, stripe_width, &
......@@ -5453,7 +5485,7 @@
call pack_row_real_cpu_single(aIntern, row, j*nblk+i+a_off, stripe_width, last_stripe_width, stripe_count)
#endif
#endif
#endif /* DOUBLE_PRECISION_REAL */
q((num_blk/np_rows)*nblk+i,1:l_nev) = row(:)
enddo
endif ! useGPU
......@@ -5484,7 +5516,7 @@
last_stripe_width, stripe_count)
#endif
#else
#else /* DOUBLE_PRECISION_REAL */
#ifdef WITH_OPENMP
call pack_row_real_cpu_openmp_single(aIntern, result_buffer(:,i,nbuf), j*nblk+i+a_off, &
......@@ -5494,7 +5526,8 @@
last_stripe_width, stripe_count)
#endif
#endif
#endif /* DOUBLE_PRECISION_REAL */
enddo
endif ! useGPU
#ifdef WITH_MPI
......@@ -5630,7 +5663,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, i, j), schedule(static, 1)
do my_thread = 1, max_threads
......@@ -5682,14 +5715,16 @@
#endif /* WITH_OPENMP */
#ifdef WITH_OPENMP
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
call timer%stop("OpenMP parallel_double")
#else
call timer%stop("OpenMP parallel_single")
#endif
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
#endif /* WITH_OPENMP */
a_off = 0
endif
......@@ -5903,7 +5938,8 @@
#else
call timer%stop("trans_ev_tridi_to_band_real_single")
#endif
#endif
#endif /* HAVE_DETAILED_TIMINGS */
return
#ifdef DOUBLE_PRECISION_REAL
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment