Unverified Commit 08086f30 authored by Andreas Marek

Better comments

- add markers for if / endif clauses
- fix indentation for variable definitions
parent 5c1c9230
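
As a guide to the pattern repeated throughout the diff below, a minimal sketch of the two conventions this commit applies, reusing names that occur in the changed code (the numroc call itself is illustrative, not taken from this commit): each endif/#endif gets a trailing marker naming the clause it closes, and the :: separators of variable definitions are aligned:

    #ifdef WITH_MPI
      integer(kind=ik), external :: numroc
    #endif /* WITH_MPI */

      if (useGPU) then
        na_cols = numroc(na, nblk, my_pcol, 0, np_cols)
      endif ! useGPU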
@@ -105,9 +105,9 @@
implicit none
-logical, intent(in) :: useGPU
+logical, intent(in) :: useGPU
-integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
+integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*)
#else
@@ -120,29 +120,29 @@
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4, 0.0_rk4), CONE = (1.0_rk4, 0.0_rk4)
#endif
-integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
-integer(kind=ik) :: l_cols, l_rows
-integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
-integer(kind=ik) :: istep, ncol, lch, lcx, nlc
-integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
+integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+integer(kind=ik) :: l_cols, l_rows
+integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
+integer(kind=ik) :: istep, ncol, lch, lcx, nlc
+integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
-real(kind=REAL_DATATYPE) :: vnorm2
+real(kind=REAL_DATATYPE) :: vnorm2
complex(kind=COMPLEX_DATATYPE) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp(:,:), vr(:), vmr(:,:), umc(:,:)
-integer(kind=c_intptr_t) :: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev
-integer(kind=ik) :: cur_l_rows, cur_l_cols,vmr_size ,umc_size
-integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1
-integer(kind=ik) :: na_rows, na_cols
+integer(kind=c_intptr_t) :: umc_dev, tmat_dev,vav_dev,vmr_dev,a_dev
+integer(kind=ik) :: cur_l_rows, cur_l_cols,vmr_size ,umc_size
+integer(kind=c_size_t) :: lc_start, lc_end, lr_end, lce_1, lcs_1,lre_1
+integer(kind=ik) :: na_rows, na_cols
#ifdef WITH_MPI
-integer(kind=ik), external :: numroc
+integer(kind=ik), external :: numroc
#endif
-logical, intent(in) :: wantDebug
-logical, intent(out) :: success
-character(200) :: errorMessage
-integer(kind=ik) :: istat
-logical :: successCUDA
+logical, intent(in) :: wantDebug
+logical, intent(out) :: success
+character(200) :: errorMessage
+integer(kind=ik) :: istat
+logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
call timer%start("bandred_complex_double")
@@ -1143,10 +1143,10 @@
use precision
implicit none
-integer(kind=ik) :: n, lda, ldb, comm
+integer(kind=ik) :: n, lda, ldb, comm
complex(kind=COMPLEX_DATATYPE) :: a(lda,ldb)
-integer(kind=ik) :: i, nc, mpierr
+integer(kind=ik) :: i, nc, mpierr
complex(kind=COMPLEX_DATATYPE) :: h1(n*n), h2(n*n)
#ifdef HAVE_DETAILED_TIMINGS
@@ -1262,8 +1262,9 @@
implicit none
-logical, intent(in) :: useGPU
-integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
+logical, intent(in) :: useGPU
+integer(kind=ik) :: na, nqc, lda, ldq, nblk, nbw, matrixCols, numBlocks, &
+                    mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: a(lda,*), q(ldq,*), tmat(nbw,nbw,*)
#else
@@ -1276,19 +1277,19 @@
complex(kind=COMPLEX_DATATYPE), parameter :: CZERO = (0.0_rk4,0.0_rk4), CONE = (1.0_rk4,0.0_rk4)
#endif
-integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
-integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
-integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
-integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
+integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+integer(kind=ik) :: max_blocks_row, max_blocks_col, max_local_rows, max_local_cols
+integer(kind=ik) :: l_cols, l_rows, l_colh, n_cols
+integer(kind=ik) :: istep, lc, ncol, nrow, nb, ns
complex(kind=COMPLEX_DATATYPE), allocatable :: tmp1(:), tmp2(:), hvb(:), hvm(:,:)
-integer(kind=ik) :: i
-integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmat_dev, tmp_dev
+integer(kind=ik) :: i
+integer(kind=C_intptr_T) :: hvm_dev, q_dev, tmat_dev, tmp_dev
-integer(kind=ik) :: istat
-character(200) :: errorMessage
-logical :: successCUDA
+integer(kind=ik) :: istat
+character(200) :: errorMessage
+logical :: successCUDA
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -1738,47 +1739,47 @@
implicit none
!#ifdef WITH_GPU_VERSION
-! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev
-! complex*16, allocatable :: ab_temp(:,:)
+! integer(C_SIZE_T) :: h_dev, hv_new_dev ,ab_dev,x_dev,hs_dev,tau_new_dev,hv_dev,hd_dev
+! complex*16, allocatable :: ab_temp(:,:)
!#endif
-integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
+integer(kind=ik), intent(in) :: na, nb, nblk, lda, matrixCols, mpi_comm_rows, mpi_comm_cols, mpi_comm
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE),intent(in) :: a(lda,*)
#else
complex(kind=COMPLEX_DATATYPE), intent(in) :: a(lda,matrixCols)
#endif
-real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0
+real(kind=REAL_DATATYPE), intent(out) :: d(na), e(na) ! set only on PE 0
complex(kind=COMPLEX_DATATYPE), intent(inout), &
-allocatable :: hh_trans_complex(:,:)
+allocatable :: hh_trans_complex(:,:)
-real(kind=REAL_DATATYPE) :: vnorm2
+real(kind=REAL_DATATYPE) :: vnorm2
complex(kind=COMPLEX_DATATYPE) :: hv(nb), tau, x, h(nb), ab_s(1+nb), hv_s(nb), hv_new(nb), tau_new, hf
complex(kind=COMPLEX_DATATYPE) :: hd(nb), hs(nb)
-integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
-integer(kind=ik) :: my_pe, n_pes, mpierr
-integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols
-integer(kind=ik) :: ireq_ab, ireq_hv
-integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
+integer(kind=ik) :: i, j, n, nc, nr, ns, ne, istep, iblk, nblocks_total, nblocks, nt
+integer(kind=ik) :: my_pe, n_pes, mpierr
+integer(kind=ik) :: my_prow, np_rows, my_pcol, np_cols
+integer(kind=ik) :: ireq_ab, ireq_hv
+integer(kind=ik) :: na_s, nx, num_hh_vecs, num_chunks, local_size, max_blk_size, n_off
#ifdef WITH_OPENMP
-integer(kind=ik), allocatable :: mpi_statuses(:,:)
-integer(kind=ik), allocatable :: omp_block_limits(:)
-integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter
-integer(kind=ik) :: omp_get_max_threads
+integer(kind=ik), allocatable :: mpi_statuses(:,:)
+integer(kind=ik), allocatable :: omp_block_limits(:)
+integer(kind=ik) :: max_threads, my_thread, my_block_s, my_block_e, iter
+integer(kind=ik) :: omp_get_max_threads
#ifdef WITH_MPI
-integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
+integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
#endif
complex(kind=COMPLEX_DATATYPE), allocatable :: hv_t(:,:), tau_t(:)
#endif
-integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
-integer(kind=ik), allocatable :: limits(:), snd_limits(:,:)
-integer(kind=ik), allocatable :: block_limits(:)
+integer(kind=ik), allocatable :: ireq_hhr(:), ireq_hhs(:), global_id(:,:), hh_cnt(:), hh_dst(:)
+integer(kind=ik), allocatable :: limits(:), snd_limits(:,:)
+integer(kind=ik), allocatable :: block_limits(:)
complex(kind=COMPLEX_DATATYPE), allocatable :: ab(:,:), hh_gath(:,:,:), hh_send(:,:,:)
-integer(kind=ik) :: istat
-character(200) :: errorMessage
+integer(kind=ik) :: istat
+character(200) :: errorMessage
#ifndef WITH_MPI
-integer(kind=ik) :: startAddr
+integer(kind=ik) :: startAddr
#endif
! ! dummies for calling redist_band
@@ -3001,10 +3002,6 @@
end subroutine tridiag_band_complex_single ! has to be checked for GPU
#endif
#define ATODEV istat = cuda_memcpy(loc(a), a_dev, stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
#define ATOHOST istat = cuda_memcpy(a_dev, loc(a), stripe_width*a_dim2*stripe_count*size_of_complex_datatype, cudaMemcpyDeviceToHost)
#ifdef DOUBLE_PRECISION_COMPLEX
subroutine trans_ev_tridi_to_band_complex_double(na, nev, nblk, nbw, q, ldq, matrixCols, &
hh_trans_complex, mpi_comm_rows, mpi_comm_cols, &
@@ -3051,30 +3048,30 @@
use iso_c_binding
implicit none
-logical, intent(in) :: useGPU
-integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
-integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
+logical, intent(in) :: useGPU
+integer(kind=ik), intent(in) :: THIS_COMPLEX_ELPA_KERNEL
+integer(kind=ik), intent(in) :: na, nev, nblk, nbw, ldq, matrixCols, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
complex(kind=COMPLEX_DATATYPE) :: q(ldq,*)
#else
complex(kind=COMPLEX_DATATYPE) :: q(ldq,matrixCols)
#endif
complex(kind=COMPLEX_DATATYPE) :: hh_trans_complex(:,:)
-integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
-integer(kind=ik) :: tmp
-integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
-integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
-integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
-integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
-integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
+integer(kind=ik) :: np_rows, my_prow, np_cols, my_pcol
+integer(kind=ik) :: tmp
+integer(kind=ik) :: i, j, ip, sweep, nbuf, l_nev, a_dim2
+integer(kind=ik) :: current_n, current_local_n, current_n_start, current_n_end
+integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
+integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
+integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
#ifdef WITH_OPENMP
-integer(kind=ik) :: thread_width, csw, b_off, b_len
+integer(kind=ik) :: thread_width, csw, b_off, b_len
#endif
-integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
-integer(kind=ik) :: a_off, current_tv_off, max_blk_size
-integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
-logical :: flag
+integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
+integer(kind=ik) :: a_off, current_tv_off, max_blk_size
+integer(kind=ik) :: mpierr, src, src_offset, dst, offset, nfact, num_blk
+logical :: flag
#ifdef WITH_OPENMP
complex(kind=COMPLEX_DATATYPE), pointer :: a(:,:,:,:)
@@ -3093,72 +3090,72 @@
complex(kind=COMPLEX_DATATYPE), allocatable :: top_border_send_buffer(:,:,:), top_border_recv_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bottom_border_send_buffer(:,:,:), bottom_border_recv_buffer(:,:,:)
#endif
-integer(kind=c_intptr_t) :: a_dev
-integer(kind=c_intptr_t) :: bcast_buffer_dev
-integer(kind=c_size_t) :: num
-integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
+integer(kind=c_intptr_t) :: a_dev
+integer(kind=c_intptr_t) :: bcast_buffer_dev
+integer(kind=c_size_t) :: num
+integer(kind=c_size_t) :: dev_offset, dev_offset_1, dev_offset_2
-integer(kind=c_intptr_t) :: row_dev
-integer(kind=c_intptr_t) :: row_group_dev
-integer(kind=c_intptr_t) :: hh_tau_dev
-integer(kind=c_intptr_t) :: hh_dot_dev
-integer(kind=ik) :: row_group_size, unpack_idx
-integer(kind=ik) :: n_times
+integer(kind=c_intptr_t) :: row_dev
+integer(kind=c_intptr_t) :: row_group_dev
+integer(kind=c_intptr_t) :: hh_tau_dev
+integer(kind=c_intptr_t) :: hh_dot_dev
+integer(kind=ik) :: row_group_size, unpack_idx
+integer(kind=ik) :: n_times
-integer(kind=ik) :: top, chunk, this_chunk
+integer(kind=ik) :: top, chunk, this_chunk
complex(kind=COMPLEX_DATATYPE), allocatable :: result_buffer(:,:,:)
complex(kind=COMPLEX_DATATYPE), allocatable :: bcast_buffer(:,:)
-integer(kind=ik) :: n_off
-integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
-integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
-integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
+integer(kind=ik) :: n_off
+integer(kind=ik), allocatable :: result_send_request(:), result_recv_request(:), limits(:)
+integer(kind=ik), allocatable :: top_send_request(:), bottom_send_request(:)
+integer(kind=ik), allocatable :: top_recv_request(:), bottom_recv_request(:)
#ifdef WITH_OPENMP
-integer(kind=ik), allocatable :: mpi_statuses(:,:)
+integer(kind=ik), allocatable :: mpi_statuses(:,:)
#ifdef WITH_MPI
-integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
+integer(kind=ik) :: mpi_status(MPI_STATUS_SIZE)
#endif
#endif
#ifdef WITH_MPI
-integer(kind=ik), external :: numroc
-#endif
-integer(kind=ik) :: na_rows, na_cols
-! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
-! t0_mpi_time, t1_mpi_time,t2_mpi_time
-! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
-! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
-! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
-! t1_mpi_recv_time,t2_mpi_recv_time
-! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
-! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
-! t2_mpi_outer_wait_time, time0
-! real*4 :: time1
+integer(kind=ik), external :: numroc
+#endif
+integer(kind=ik) :: na_rows, na_cols
+! real*8 :: ttt0, ttt1, ttt2, t2_compute_kernel, t0_compute_kernel,t1_compute_kernel, &
+! t0_mpi_time, t1_mpi_time,t2_mpi_time
+! real*8 :: t0_cpu_code,t1_cpu_code,t2_cpu_code,t0_block_time,t1_block_time,t2_block_time,t0_cuda_memcpy
+! real*8 :: t0_inner_do_time, t1_inner_do_time , t2_inner_do_time,t0_outer_do_time ,t1_outer_do_time , &
+! t2_outer_do_time ,t0_result_time ,t1_result_time, t2_result_time,t0_mpi_recv_time, &
+! t1_mpi_recv_time,t2_mpi_recv_time
+! real*8 :: t1_mpi_wait_time,t0_mpi_wait_time,t2_mpi_wait_time,t1_memcpy_time,t0_memcpy_time,t2_memcpy_time, &
+! t1_mpi_irecv_time,t0_mpi_irecv_time,t2_mpi_irecv_time,t0_mpi_outer_wait_time,t1_mpi_outer_wait_time,&
+! t2_mpi_outer_wait_time, time0
+! real*4 :: time1
! MPI send/recv tags, arbitrary
-integer(kind=ik), parameter :: bottom_recv_tag = 111
-integer(kind=ik), parameter :: top_recv_tag = 222
-integer(kind=ik), parameter :: result_recv_tag = 333
+integer(kind=ik), parameter :: bottom_recv_tag = 111
+integer(kind=ik), parameter :: top_recv_tag = 222
+integer(kind=ik), parameter :: result_recv_tag = 333
#ifdef WITH_OPENMP
-integer(kind=ik) :: max_threads, my_thread
-integer(kind=ik) :: omp_get_max_threads
+integer(kind=ik) :: max_threads, my_thread
+integer(kind=ik) :: omp_get_max_threads
#endif
! Just for measuring the kernel performance
-real(kind=c_double) :: kernel_time ! MPI_WTIME always needs double
+real(kind=c_double) :: kernel_time ! MPI_WTIME always needs double
! long integer
-integer(kind=lik) :: kernel_flops
+integer(kind=lik) :: kernel_flops
-logical, intent(in) :: wantDebug
-integer(kind=ik) :: istat
-character(200) :: errorMessage
-logical :: successCUDA
-logical :: success
+logical, intent(in) :: wantDebug
+integer(kind=ik) :: istat
+character(200) :: errorMessage
+logical :: successCUDA
+logical :: success
#ifndef WITH_MPI
-integer(kind=ik) :: j1
+integer(kind=ik) :: j1
#endif
#ifdef HAVE_DETAILED_TIMINGS
@@ -3298,7 +3295,9 @@
max_blk_size = maxval(limits(1:np_rows) - limits(0:np_rows-1))
a_dim2 = max_blk_size + nbw
+#if 0
!DEC$ ATTRIBUTES ALIGN: 64:: a
+#endif
#ifdef WITH_OPENMP
if (useGPU) then
@@ -3442,7 +3441,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
@@ -3459,7 +3458,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#endif /* WITH_OPENMP */
@@ -3543,7 +3542,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
@@ -3562,7 +3561,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -3597,7 +3596,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
@@ -3616,7 +3615,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -3747,7 +3746,8 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
#ifdef DOUBLE_PRECISION_COMPLEX
@@ -3765,7 +3765,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -4269,7 +4269,8 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
n_off = current_local_n+a_off
@@ -4285,7 +4286,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -4427,7 +4428,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, n_off, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
@@ -4456,7 +4457,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
if (useGPU) then
@@ -4601,7 +4602,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
@@ -4625,7 +4626,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -4775,7 +4776,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread), schedule(static, 1)
do my_thread = 1, max_threads
@@ -4800,7 +4801,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -4891,7 +4892,7 @@
#else
call timer%start("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
!$omp parallel do private(my_thread, b_len, b_off), schedule(static, 1)
do my_thread = 1, max_threads
@@ -4920,7 +4921,7 @@
#else
call timer%stop("OpenMP parallel_single")
#endif
-#endif
+#endif /* HAVE_DETAILED_TIMINGS */
#else /* WITH_OPENMP */
@@ -110,20 +110,20 @@
use precision
implicit none
-integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
+integer(kind=ik) :: na, lda, nblk, nbw, matrixCols, numBlocks, mpi_comm_rows, mpi_comm_cols
#ifdef DESPERATELY_WANT_ASSUMED_SIZE
real(kind=REAL_DATATYPE) :: a(lda,*), tmat(nbw,nbw,*)
#else
real(kind=REAL_DATATYPE) :: a(lda,matrixCols), tmat(nbw,nbw,numBlocks)
#endif
real(kind=REAL_DATATYPE) :: eps
-logical, intent(in) :: useGPU
+logical, intent(in) :: useGPU
-integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
-integer(kind=ik) :: l_cols, l_rows, vmrCols
-integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
-integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc
-integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
+integer(kind=ik) :: my_prow, my_pcol, np_rows, np_cols, mpierr
+integer(kind=ik) :: l_cols, l_rows, vmrCols
+integer(kind=ik) :: i, j, lcs, lce, lrs, lre, lc, lr, cur_pcol, n_cols, nrow
+integer(kind=ik) :: istep, ncol, lch, lcx, nlc, mynlc
+integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=REAL_DATATYPE) :: vnorm2, xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
@@ -131,31 +131,31 @@
real(kind=REAL_DATATYPE), allocatable :: tmpCPU(:,:), vmrCPU(:,:), umcCPU(:,:)
real(kind=REAL_DATATYPE), allocatable :: vr(:)
! needed for blocked QR decomposition
-integer(kind=ik) :: PQRPARAM(11), work_size
+integer(kind=ik) :: PQRPARAM(11), work_size
real(kind=REAL_DATATYPE) :: dwork_size(1)
real(kind=REAL_DATATYPE), allocatable :: work_blocked(:), tauvector(:), blockheuristic(:)
! a_dev is passed from bandred_real to trans_ev_band
-integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
+integer(kind=C_intptr_T) :: a_dev, vmr_dev, umc_dev, tmat_dev, vav_dev
#ifdef WITH_MPI
-integer(kind=ik), external :: numroc
+integer(kind=ik), external :: numroc
#endif
-integer(kind=ik) :: ierr
-integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
-integer(kind=c_size_t) :: lc_start, lc_end
-integer(kind=ik) :: lr_end
-integer(kind=ik) :: na_cols !, na_rows
+integer(kind=ik) :: ierr
+integer(kind=ik) :: cur_l_rows, cur_l_cols, vmr_size, umc_size
+integer(kind=c_size_t) :: lc_start, lc_end
+integer(kind=ik) :: lr_end
+integer(kind=ik) :: na_cols !, na_rows
-logical, intent(in) :: wantDebug
-logical, intent(out) :: success
-logical :: successCUDA
-integer(kind=ik) :: istat
-character(200) :: errorMessage
+logical, intent(in) :: wantDebug
+logical, intent(out) :: success
+logical :: successCUDA
+integer(kind=ik) :: istat
+character(200) :: errorMessage
-logical, intent(in) :: useQR
+logical, intent(in) :: useQR
-integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
-                    ii, pp, transformChunkSize
+integer(kind=ik) :: mystart, myend, m_way, n_way, work_per_thread, m_id, n_id, n_threads, &
+                    ii, pp, transformChunkSize
#ifdef HAVE_DETAILED_TIMINGS
#ifdef DOUBLE_PRECISION_REAL
@@ -192,7 +192,7 @@
! na_rows = na
na_cols = na
#endif
-endif
+endif ! useGPU
! Matrix is split into tiles; work is done only for tiles on the diagonal or above
@@ -423,7 +423,9 @@
endif
endif
else ! GPU not used
! Allocate vmr and umc to their exact sizes so that they can be used in bcasts and reduces
allocate(vmrCPU(max(l_rows,1),2*n_cols), stat=istat, errmsg=errorMessage)
@@ -443,7 +445,9 @@
print *,"bandred_real: error when allocating vr "//errorMessage
stop
-endif
+endif ! use GPU
#ifdef DOUBLE_PRECISION_REAL
if (useGPU) then
vmrCUDA(1 : cur_l_rows * n_cols) = 0._rk8
@@ -547,6 +551,7 @@
#endif /* DOUBLE_PRECISION_REAL */
endif
else !useQR
do lc = n_cols, 1, -1
@@ -706,7 +711,9 @@
endif
enddo
!$omp end parallel
#else /* WITH_OPENMP */
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
@@ -919,6 +926,7 @@
endif ! l_cols>0 .and. l_rows>0
else ! do not useGPU version
!Code for Algorithm 4
n_way = 1
@@ -1338,7 +1346,9 @@
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1), mpi_comm_rows, &