Commit 5b55ad7a authored by Andreas Marek

Merge branch 'GPU_in_openmp_code_path' into 'master_pre_stage'

GPU in OpenMP code path

See merge request !56
parents 4ff2f9e6 ea6daa06
@@ -57,7 +57,7 @@
#include "cuUtils_template.cu"
#undef DOUBLE_PRECISION_REAL
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#undef DOUBLE_PRECISION_REAL
#include "cuUtils_template.cu"
@@ -71,7 +71,7 @@
#include "cuUtils_template.cu"
#undef DOUBLE_PRECISION_COMPLEX
#if WANT_SINGLE_PRECISION_COMPLEX
#ifdef WANT_SINGLE_PRECISION_COMPLEX
#undef DOUBLE_PRECISION_COMPLEX
#include "cuUtils_template.cu"
@@ -137,7 +137,7 @@ module elpa1_auxiliary_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
@@ -287,7 +287,7 @@ module elpa1_auxiliary_impl
#undef DOUBLE_PRECISION
#undef REALCASE
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
#define REALCASE 1
#define SINGLE_PRECISION
#include "../general/precision_macros.h"
@@ -279,8 +279,6 @@ function elpa_solve_evp_&
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
call obj%timer%stop("elpa_solve_evp_&
@@ -356,6 +354,18 @@ function elpa_solve_evp_&
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
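The block added above clamps ELPA to a single internal OpenMP thread as soon as a GPU is in use. A standalone sketch of the same clamp, with a plain logical standing in for the ELPA option handling (obj%get/obj%set) and for a successful check_for_gpu:

program gpu_thread_clamp
  use omp_lib
  implicit none
  integer :: nrThreads
  logical, parameter :: useGPU = .true.      ! assumption: a GPU was detected
  nrThreads = omp_get_max_threads()
  if (useGPU .and. nrThreads /= 1) then
     print *, "GPU code path: clamping to 1 internal OpenMP thread"
     nrThreads = 1
     call omp_set_num_threads(nrThreads)
  end if
  !$omp parallel
  if (omp_get_thread_num() == 0) print *, "running with", omp_get_num_threads(), "thread(s)"
  !$omp end parallel
end program gpu_thread_clamp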
@@ -589,8 +599,6 @@ function elpa_solve_evp_&
#endif
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
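omp_threads_caller holds the thread count the calling application had configured before ELPA restricted it, and the omp_set_num_threads(omp_threads_caller) calls put that setting back on exit. A minimal sketch of the save/clamp/restore pattern (standalone; the solver work is elided):

program restore_caller_threads
  use omp_lib
  implicit none
  integer :: omp_threads_caller
  omp_threads_caller = omp_get_max_threads()   ! remember the caller's setting on entry
  call omp_set_num_threads(1)                  ! ELPA-internal restriction (GPU path)
  ! ... solver work would happen here ...
  call omp_set_num_threads(omp_threads_caller) ! restore the caller's setting on exit
  print *, "restored to", omp_get_max_threads(), "thread(s)"
end program restore_caller_threads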
@@ -612,26 +612,28 @@ subroutine tridiag_&
if (l_row_end < l_row_beg) cycle
#ifdef WITH_OPENMP_TRADITIONAL
if (mod(n_iter,n_threads) == my_thread) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
if (.not. useGPU) then
if (wantDebug) call obj%timer%start("blas")
call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_row(l_row_beg:max_local_rows+1), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), &
int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(matrixRows,kind=BLAS_KIND), &
v_col(l_col_beg:max_local_cols), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
endif
endif
if (wantDebug) call obj%timer%stop("blas")
@@ -750,11 +752,12 @@ subroutine tridiag_&
#ifdef WITH_OPENMP_TRADITIONAL
!$OMP END PARALLEL
call obj%timer%stop("OpenMP parallel")
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
if (.not.(useGPU)) then
do i=0,max_threads-1
u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
enddo
endif
#endif /* WITH_OPENMP_TRADITIONAL */
! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
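In the CPU path every OpenMP thread accumulates its GEMV result into its own column of uc_p/ur_p, and those columns are summed once the parallel region ends; the GPU path now skips both steps because the device kernels produce u_col/u_row directly. A reduced sketch of the per-thread accumulation (scalar work instead of GEMV, hypothetical size n):

program per_thread_accumulate
  use omp_lib
  implicit none
  integer, parameter :: n = 1000
  integer :: i, t, max_threads
  real, allocatable :: partial(:,:)
  real :: total
  max_threads = omp_get_max_threads()
  allocate(partial(1, 0:max_threads-1))
  partial = 0.0
  !$omp parallel private(i, t)
  t = omp_get_thread_num()
  !$omp do
  do i = 1, n
     partial(1, t) = partial(1, t) + real(i)   ! each thread owns column t
  end do
  !$omp end do
  !$omp end parallel
  total = 0.0
  do t = 0, max_threads-1                      ! sum the per-thread columns afterwards
     total = total + partial(1, t)
  end do
  print *, total                               ! n*(n+1)/2 = 500500.0
end program per_thread_accumulate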
@@ -134,8 +134,11 @@ subroutine elpa_reduce_add_vectors_&
aux2(:) = 0
#ifdef WITH_OPENMP_TRADITIONAL
!call omp_set_num_threads(nrThreads)
!$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
!$omp parallel &
!$omp default(none) &
!$omp private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads) &
!$omp shared(nps, npt, lcm_s_t, nblk, vmat_t, vmat_s, myps, mypt, mpierr, obj, &
!$omp& comm_t, nblks_tot, aux2, aux1, nvr, nvc)
#endif
do n = 0, lcm_s_t-1
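This and the following transpose/reduce hunks harden the !$omp parallel directives with default(none) plus explicit private/shared lists, so every variable the region touches has to be classified deliberately instead of silently inheriting shared status. A self-contained sketch of the same style (hypothetical array, not the ELPA variables):

program default_none_demo
  implicit none
  integer, parameter :: n = 16
  integer :: i
  real :: vec(n)
  !$omp parallel do &
  !$omp default(none) &
  !$omp private(i) &
  !$omp shared(vec)
  do i = 1, n
     vec(i) = real(i)**2
  end do
  !$omp end parallel do
  print *, sum(vec)
end program default_none_demo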
@@ -148,7 +148,11 @@ subroutine ROUTINE_NAME&
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
!$omp parallel &
!$omp default(none) &
!$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
!$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
!$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
#endif
do n = 0, lcm_s_t-1
@@ -131,7 +131,11 @@ subroutine elpa_transpose_vectors_ss_&
allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
!$omp parallel &
!$omp default(none) &
!$omp private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n) &
!$omp shared(nps, npt, lcm_s_t, mypt, nblk, myps, vmat_t, mpierr, comm_s, &
!$omp& obj, vmat_s, aux, nblks_skip, nblks_tot, nvc, nvr)
#endif
do n = 0, lcm_s_t-1
@@ -58,11 +58,10 @@ l_nev, &
a_off, nbw, max_blk_size, bcast_buffer, bcast_buffer_dev, &
hh_tau_dev, kernel_flops, kernel_time, n_times, off, ncols, istripe, &
#ifdef WITH_OPENMP_TRADITIONAL
my_thread, thread_width, &
my_thread, thread_width, kernel, last_stripe_width)
#else
last_stripe_width, &
last_stripe_width, kernel)
#endif
kernel)
use precision
use elpa_abstract_impl
@@ -141,6 +140,7 @@ kernel)
#else /* WITH_OPENMP_TRADITIONAL */
integer(kind=ik), intent(in) :: l_nev, thread_width
integer(kind=ik), intent(in), optional :: last_stripe_width
#if REALCASE == 1
! real(kind=C_DATATYPE_KIND) :: a(stripe_width,a_dim2,stripe_count,max_threads)
real(kind=C_DATATYPE_KIND), pointer :: a(:,:,:,:)
@@ -221,54 +221,39 @@ kernel)
#ifdef WITH_OPENMP_TRADITIONAL
if (my_thread==1) then
if (my_thread==1) then ! in the calling routine threads go from 1 .. max_threads
#endif
ttt = mpi_wtime()
#ifdef WITH_OPENMP_TRADITIONAL
endif
#endif
#ifdef WITH_OPENMP_TRADITIONAL
#if REALCASE == 1
if (kernel .eq. ELPA_2STAGE_REAL_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#if COMPLEXCASE == 1
if (kernel .eq. ELPA_2STAGE_COMPLEX_GPU) then
print *,"compute_hh_trafo_&
&MATH_DATATYPE&
&_GPU OPENMP: not yet implemented"
stop 1
endif
#endif
#endif /* WITH_OPENMP_TRADITIONAL */
#ifndef WITH_OPENMP_TRADITIONAL
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
#else /* WITH_OPENMP_TRADITIONAL */
if (istripe<stripe_count) then
nl = stripe_width
if (present(last_stripe_width)) then
nl = merge(stripe_width, last_stripe_width, istripe<stripe_count)
else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
if (istripe<stripe_count) then
nl = stripe_width
else
noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
nl = min(my_thread*thread_width-noff, l_nev-noff)
if (nl<=0) then
if (wantDebug) call obj%timer%stop("compute_hh_trafo_&
&MATH_DATATYPE&
#ifdef WITH_OPENMP_TRADITIONAL
&_openmp" // &
&_openmp" // &
#else
&" // &
&" // &
#endif
&PRECISION_SUFFIX &
)
&PRECISION_SUFFIX &
)
return
return
endif
endif
endif
#endif /* not WITH_OPENMP_TRADITIONAL */
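With OpenMP the stripe length nl is now taken from the merge() one-liner whenever the optional last_stripe_width argument is present, and only falls back to clipping a per-thread slice against l_nev when it is absent. A standalone sketch of that selection (names mirror the diff; the demo values are made up):

program stripe_length_demo
  implicit none
  print *, stripe_length(1, 4, 32, 1, 128, 100)       ! full stripe      -> 32
  print *, stripe_length(4, 4, 32, 1, 128, 100, 4)    ! merge() picks 4
  print *, stripe_length(4, 4, 32, 1, 128, 100)       ! clipped to l_nev -> 4
contains
  integer function stripe_length(istripe, stripe_count, stripe_width, &
                                 my_thread, thread_width, l_nev, last_stripe_width)
    integer, intent(in)           :: istripe, stripe_count, stripe_width
    integer, intent(in)           :: my_thread, thread_width, l_nev
    integer, intent(in), optional :: last_stripe_width
    integer :: noff
    if (present(last_stripe_width)) then
       stripe_length = merge(stripe_width, last_stripe_width, istripe < stripe_count)
    else if (istripe < stripe_count) then
       stripe_length = stripe_width
    else
       noff = (my_thread-1)*thread_width + (istripe-1)*stripe_width
       stripe_length = min(my_thread*thread_width - noff, l_nev - noff)
    end if
  end function stripe_length
end program stripe_length_demo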
@@ -629,65 +629,65 @@ max_threads)
aux1 = 0.0_rck
#ifdef WITH_OPENMP_TRADITIONAL
#if 0
! original complex implementation without openmp. check performance
nlc = 0 ! number of local columns
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
endif
enddo
! Get global dot products
#ifdef WITH_MPI
if (wantDebug) call obj%timer%start("mpi_communication")
if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
int(mpi_comm_rows,kind=MPI_KIND), mpierr)
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
endif
enddo
if (wantDebug) call obj%timer%stop("mpi_communication")
#else /* WITH_MPI */
! Transform
nlc = 0
do j=1,lc-1
lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
if (lcx>0) then
nlc = nlc+1
a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
endif
enddo
#endif /* WITH_MPI */
!#if 0
! ! original complex implementation without openmp. check performance
! nlc = 0 ! number of local columns
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! aux1(nlc) = dot_product(vr(1:lr),a_mat(1:lr,lcx))
! endif
! enddo
!
! ! Transform
! ! Get global dot products
!#ifdef WITH_MPI
! if (wantDebug) call obj%timer%start("mpi_communication")
! if (nlc>0) call mpi_allreduce(aux1, aux2, int(nlc,kind=MPI_KIND), MPI_COMPLEX_PRECISION, MPI_SUM, &
! int(mpi_comm_rows,kind=MPI_KIND), mpierr)
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
! endif
! enddo
#endif /* if 0 */
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
! endif
! enddo
!
!
! if (wantDebug) call obj%timer%stop("mpi_communication")
!
!#else /* WITH_MPI */
!
! ! Transform
!
! nlc = 0
! do j=1,lc-1
! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
! if (lcx>0) then
! nlc = nlc+1
! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux1(nlc)*vr(1:lr)
! endif
! enddo
!
!#endif /* WITH_MPI */
!!
!! ! Transform
!!
!! nlc = 0
!! do j=1,lc-1
!! lcx = local_index(istep*nbw+j, my_pcol, np_cols, nblk, 0)
!! if (lcx>0) then
!! nlc = nlc+1
!! a_mat(1:lr,lcx) = a_mat(1:lr,lcx) - conjg(tau)*aux2(nlc)*vr(1:lr)
!
!! endif
!! enddo
!#endif /* if 0 */
!Open up one omp region to avoid paying openmp overhead.
!This does not help performance due to the addition of two openmp barriers around the MPI call,
@@ -909,35 +909,35 @@ max_threads)
! of the tiles, so we can use strips of the matrix
#if 0
! original complex implementation; check for performance
umcCPU(1:l_cols,1:n_cols) = 0.0_rck
vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
if (l_cols>0 .and. l_rows>0) then
do i=0,(istep*nbw-1)/tile_size
lcs = i*l_cols_tile+1
lce = min(l_cols,(i+1)*l_cols_tile)
if (lce<lcs) cycle
lre = min(l_rows,(i+1)*l_rows_tile)
call obj%timer%start("blas")
call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
call obj%timer%stop("blas")
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call obj%timer%start("blas")
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
call obj%timer%stop("blas")
enddo
endif ! (l_cols>0 .and. l_rows>0)
#endif /* if 0 */
!#if 0
! ! original complex implementation; check for performance
! umcCPU(1:l_cols,1:n_cols) = 0.0_rck
! vmrCPU(1:l_rows,n_cols+1:2*n_cols) = 0.0_rck
!
! if (l_cols>0 .and. l_rows>0) then
! do i=0,(istep*nbw-1)/tile_size
!
! lcs = i*l_cols_tile+1
! lce = min(l_cols,(i+1)*l_cols_tile)
! if (lce<lcs) cycle
!
! lre = min(l_rows,(i+1)*l_rows_tile)
!
! call obj%timer%start("blas")
! call PRECISION_GEMM('C', 'N', lce-lcs+1, n_cols, lre, ONE, a_mat(1,lcs), ubound(a_mat,dim=1), &
! vmrCPU, ubound(vmrCPU,dim=1), ONE, umcCPU(lcs,1), ubound(umcCPU,dim=1))
! call obj%timer%stop("blas")
!
! if (i==0) cycle
! lre = min(l_rows,i*l_rows_tile)
! call obj%timer%start("blas")
! call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
! umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), ONE, vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
! call obj%timer%stop("blas")
! enddo
!
! endif ! (l_cols>0 .and. l_rows>0)
!#endif /* if 0 */
!Code for Algorithm 4
@@ -1396,7 +1396,11 @@ max_threads)
! A = A - V*U**T - U*V**T
#ifdef WITH_OPENMP_TRADITIONAL
!$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
!$omp parallel &
!$omp default(none) &
!$omp private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend ) &
!$omp shared(n_threads, istep, tile_size, nbw, n_cols, obj, vmrcpu, l_cols_tile, l_rows, l_rows_tile, &
!$omp& umccpu, l_cols, a_dev, vmr_dev, useGPU, cur_l_rows, umc_dev, cur_l_cols, lda )
n_threads = omp_get_num_threads()
if (mod(n_threads, 2) == 0) then
@@ -1424,13 +1428,30 @@ max_threads)
myend = mystart + work_per_thread - 1
if ( myend > lre ) myend = lre
if ( myend-mystart+1 < 1) cycle
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
if (useGPU) then
if (n_way .gt. 1) then
print *,"error more than 1 openmp thread used in GPU part of elpa2_bandred"
print *,"this should never happen"
stop
endif
call obj%timer%start("cublas")
call cublas_PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, myend-mystart+1, &
lce-lcs+1, 2*n_cols, -ONE, &
vmr_dev, cur_l_rows, (umc_dev +(lcs-1)* &
size_of_datatype), &
cur_l_cols, ONE, (a_dev+(lcs-1)*lda* &
size_of_datatype), lda)
call obj%timer%stop("cublas")
else
call obj%timer%start("blas")
call PRECISION_GEMM('N', BLAS_TRANS_OR_CONJ, int(myend-mystart+1,kind=BLAS_KIND), &
int(lce-lcs+1,kind=BLAS_KIND), int(2*n_cols,kind=BLAS_KIND), -ONE, &
vmrCPU(mystart, 1), int(ubound(vmrCPU,1),kind=BLAS_KIND), &
umcCPU(lcs,1), int(ubound(umcCPU,1),kind=BLAS_KIND), &
ONE, a_mat(mystart,lcs), int(ubound(a_mat,1),kind=BLAS_KIND) )
call obj%timer%stop("blas")
endif
enddo
!$omp end parallel
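Both branches of the new if (useGPU) perform the same tile update, A <- A - V*U**T - U*V**T, carried out as a single GEMM over 2*n_cols-wide factors; only where the data lives (device buffers with cuBLAS versus host arrays with BLAS) differs. A dense, unblocked sketch of that algebra using matmul instead of GEMM (hypothetical sizes):

program rank2_update_demo
  implicit none
  integer, parameter :: n = 4, ncols = 2
  real :: a(n,n), v(n,ncols), u(n,ncols)
  real :: vu(n,2*ncols), uv(n,2*ncols)
  call random_number(a); call random_number(v); call random_number(u)
  a = a + transpose(a)                    ! symmetric A, as in the band reduction
  vu(:,1:ncols) = v;  vu(:,ncols+1:) = u  ! pack [V U] and [U V] side by side
  uv(:,1:ncols) = u;  uv(:,ncols+1:) = v
  a = a - matmul(vu, transpose(uv))       ! A <- A - V*U**T - U*V**T in one product
  print *, "max asymmetry after update:", maxval(abs(a - transpose(a)))
end program rank2_update_demo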
@@ -391,6 +391,18 @@
success = .false.
return
endif
#ifdef WITH_OPENMP_TRADITIONAL
! check the number of threads that ELPA should use internally
! in the GPU case at the moment only _1_ thread internally is allowed
call obj%get("omp_threads", nrThreads, error)
if (nrThreads .ne. 1) then
print *,"Experimental feature: Using OpenMP with GPU code paths needs internal to ELPA _1_ OpenMP thread"
print *,"setting 1 openmp thread now"
call obj%set("omp_threads",1, error)
nrThreads=1
call omp_set_num_threads(nrThreads)
endif
#endif
call obj%timer%stop("check_for_gpu")
endif
@@ -518,7 +530,7 @@
endif
#endif
#endif
#endif /* REALCASE == 1 */
! consistency check: is user set kernel still identical with "kernel" or did
! we change it above? This is a mess and should be cleaned up
@@ -614,9 +626,7 @@
else
useQR = .false.
endif
#endif
#endif /* REALCASE == 1 */
#if REALCASE == 1
useQRActual = .false.
@@ -1009,8 +1019,6 @@
! restore original OpenMP settings
#ifdef WITH_OPENMP_TRADITIONAL
! store the number of OpenMP threads used in the calling function
! restore this at the end of ELPA 2
call omp_set_num_threads(omp_threads_caller)
#endif
@@ -528,8 +528,13 @@ subroutine tridiag_band_&
! with MPI calls
call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
!$omp parallel do private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i), schedule(static,1), num_threads(max_threads)
!$omp parallel do &
!$omp default(none) &
!$omp private(my_thread, my_block_s, my_block_e, iblk, ns, ne, hv, tau, &
!$omp& nc, nr, hs, hd, vnorm2, hf, x, h, i) &
!$omp shared(max_threads, obj, ab, isSkewsymmetric, wantDebug, hh_gath, &
!$omp hh_cnt, tau_t, hv_t, na, istep, n_off, na_s, nb, omp_block_limits, iter) &
!$omp schedule(static,1), num_threads(max_threads)
do my_thread = 1, max_threads
if (iter == 1) then
@@ -1087,7 +1092,7 @@ subroutine tridiag_band_&
endif
#endif
#if WITH_OPENMP_TRADITIONAL
#ifdef WITH_OPENMP_TRADITIONAL
do iblk = 1, nblocks
if (hh_dst(iblk) >= np_rows) exit
@@ -57,7 +57,7 @@ module qr_utils_mod
public :: reverse_matrix_1dcomm_double
public :: reverse_matrix_2dcomm_ref_double
#if WANT_SINGLE_PRECISION_REAL
#ifdef WANT_SINGLE_PRECISION_REAL
public :: reverse_vector_local_single