Commit 0d08507c authored by Andreas Marek

Rename OPENMP preprocessor macro

parent 1aa89171
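
The change is mechanical: every occurrence of the preprocessor guard WITH_OPENMP becomes WITH_OPENMP_TRADITIONAL, in the build configuration and throughout the Fortran sources; the guarded code itself is untouched. A minimal sketch of the pattern after the rename (hypothetical subroutine, for illustration only; the guard was formerly spelled WITH_OPENMP):

    subroutine sketch_after_rename(n, x)
    #ifdef WITH_OPENMP_TRADITIONAL
      use omp_lib
    #endif
      implicit none
      integer, intent(in)    :: n
      real(8), intent(inout) :: x(n)
      integer :: i
    #ifdef WITH_OPENMP_TRADITIONAL
      !$omp parallel do
    #endif
      do i = 1, n
        x(i) = 2.0d0 * x(i)
      enddo
    #ifdef WITH_OPENMP_TRADITIONAL
      !$omp end parallel do
    #endif
    end subroutine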
@@ -93,9 +93,9 @@ AC_ARG_ENABLE([openmp],
 ],
 [enable_openmp=no])
 AC_MSG_RESULT([${enable_openmp}])
-AM_CONDITIONAL([WITH_OPENMP],[test x"$enable_openmp" = x"yes"])
+AM_CONDITIONAL([WITH_OPENMP_TRADITIONAL],[test x"$enable_openmp" = x"yes"])
 if test x"${enable_openmp}" = x"yes"; then
-AC_DEFINE([WITH_OPENMP], [1], [use OpenMP threading])
+AC_DEFINE([WITH_OPENMP_TRADITIONAL], [1], [use OpenMP threading])
 fi
......
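On the configure side (hunk above), the user-visible switch is unchanged: --enable-openmp still controls everything. Only the internal names move, with both the Automake conditional and the generated define becoming WITH_OPENMP_TRADITIONAL. A hedged sketch of a program consuming the renamed define; the generated header name config-f90.h is an assumption, not shown in this diff:

    #include "config-f90.h"
    program check_openmp_traditional
    #ifdef WITH_OPENMP_TRADITIONAL
      use omp_lib
    #endif
      implicit none
    #ifdef WITH_OPENMP_TRADITIONAL
      ! present only when configure ran with --enable-openmp
      print *, 'traditional OpenMP build, max threads:', omp_get_max_threads()
    #else
      print *, 'serial build'
    #endif
    end program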
@@ -64,7 +64,7 @@ subroutine merge_systems_&
 use elpa_abstract_impl
 use elpa_blas_interfaces
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 use omp_lib
 #endif
 implicit none
@@ -93,7 +93,7 @@ subroutine merge_systems_&
 dbase(na), ddiff(na), ev_scale(na), tmp(na)
 real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
 real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
 #endif
@@ -122,7 +122,7 @@ subroutine merge_systems_&
 &PRECISION&
 &_real
 integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 integer(kind=ik) :: my_thread
 allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
@@ -442,7 +442,7 @@ subroutine merge_systems_&
 ! Solve secular equation
 z(1:na1) = 1
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 z_p(1:na1,:) = 1
 #endif
 dbase(1:na1) = 0
@@ -450,7 +450,7 @@ subroutine merge_systems_&
 info = 0
 infoBLAS = int(info,kind=BLAS_KIND)
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 !
 ! call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
 !!$OMP PARALLEL PRIVATE(i,my_thread,delta,s,info,infoBLAS,j)
@@ -474,7 +474,7 @@ subroutine merge_systems_&
 ! Compute updated z
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 ! do j=1,na1
 ! if (i/=j) z_p(j,my_thread) = z_p(j,my_thread)*( delta(j) / (d1(j)-d1(i)) )
 ! enddo
@@ -500,7 +500,7 @@ subroutine merge_systems_&
 ddiff(i) = delta(i)
 endif
 enddo
-!#ifdef WITH_OPENMP
+!#ifdef WITH_OPENMP_TRADITIONAL
 !!$OMP END PARALLEL
 !
 ! call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
@@ -526,7 +526,7 @@ subroutine merge_systems_&
 ! Calculate scale factors for eigenvectors
 ev_scale(:) = 0.0_rk
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 call obj%timer%start("OpenMP parallel" // PRECISION_SUFFIX)
@@ -548,7 +548,7 @@ subroutine merge_systems_&
 &(obj, d1, dbase, ddiff, z, ev_scale(i), na1,i)
 ! ev_scale(i) = ev_scale_val
 enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL DO
 call obj%timer%stop("OpenMP parallel" // PRECISION_SUFFIX)
@@ -888,7 +888,7 @@ subroutine merge_systems_&
 deallocate(ev, qtmp1, qtmp2, stat=istat, errmsg=errorMessage)
 check_deallocate("merge_systems: ev, qtmp1, qtmp2",istat, errorMessage)
 endif !very outer test (na1==1 .or. na1==2)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 deallocate(z_p, stat=istat, errmsg=errorMessage)
 check_deallocate("merge_systems: z_p",istat, errorMessage)
 #endif
......
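All merge_systems hunks above guard one construct: a per-thread work array z_p(na, 0:max_threads-1) that gives every OpenMP thread a private column to accumulate secular-equation factors into, so no reduction clause is needed. A minimal sketch of that idiom under the new guard (names shortened; assumes max_threads matches omp_get_max_threads(), as in the caller's contract):

    #ifdef WITH_OPENMP_TRADITIONAL
    subroutine sketch_thread_private_columns(na, max_threads, z)
      use omp_lib
      implicit none
      integer, intent(in)  :: na, max_threads
      real(8), intent(out) :: z(na)
      real(8), allocatable :: z_p(:,:)
      integer :: my_thread, j
      allocate(z_p(na, 0:max_threads-1))
      z_p = 1.0d0                         ! mirrors z_p(1:na1,:) = 1 above
      !$omp parallel private(my_thread)
      my_thread = omp_get_thread_num()
      z_p(:, my_thread) = z_p(:, my_thread) * 1.0d0  ! stand-in for the secular-equation factors
      !$omp end parallel
      do j = 1, na
        z(j) = product(z_p(j, :))         ! serial combine across thread columns
      enddo
      deallocate(z_p)
    end subroutine
    #endif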
@@ -201,7 +201,7 @@ function elpa_solve_evp_&
 call mpi_comm_rank(int(mpi_comm_all,kind=MPI_KIND), my_peMPI, mpierr)
 my_pe = int(my_peMPI,kind=c_int)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 omp_threads_caller = omp_get_max_threads()
@@ -263,7 +263,7 @@ function elpa_solve_evp_&
 endif
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
@@ -562,7 +562,7 @@ function elpa_solve_evp_&
 call nvtxRangePop()
 #endif
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
......
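Both hunks in this solver wrap the same idiom, which recurs in several files below: remember the caller's OpenMP thread count on entry and restore it on every exit path, so ELPA's internal omp_set_num_threads() calls do not leak into the caller. A minimal sketch:

    #ifdef WITH_OPENMP_TRADITIONAL
    subroutine sketch_save_restore_threads(solver_threads)
      use omp_lib
      implicit none
      integer, intent(in) :: solver_threads
      integer :: omp_threads_caller
      omp_threads_caller = omp_get_max_threads()    ! save the caller's setting on entry
      call omp_set_num_threads(solver_threads)      ! solver runs with its own count
      ! ... compute ...
      call omp_set_num_threads(omp_threads_caller)  ! restore before returning
    end subroutine
    #endif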
@@ -145,7 +145,7 @@ subroutine tridiag_&
 integer(kind=c_intptr_t) :: a_offset
 integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 integer(kind=ik) :: my_thread, n_threads, n_iter
 #endif
@@ -170,7 +170,7 @@ subroutine tridiag_&
 ! pattern: u1,v1,u2,v2,u3,v3,....
 MATH_DATATYPE(kind=rck), allocatable :: uv_stored_cols(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 MATH_DATATYPE(kind=rck), allocatable :: ur_p(:,:), uc_p(:,:)
 #endif
@@ -355,7 +355,7 @@ subroutine tridiag_&
 endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 allocate(ur_p(max_local_rows,0:max_threads-1), stat=istat, errmsg=errorMessage)
 call check_alloc("tridiag_&
 &MATH_DATATYPE ", "ur_p", istat, errorMessage)
@@ -363,7 +363,7 @@ subroutine tridiag_&
 allocate(uc_p(max_local_cols,0:max_threads-1), stat=istat, errmsg=errorMessage)
 call check_alloc("tridiag_&
 &MATH_DATATYPE ", "uc_p", istat, errorMessage)
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 tmp = 0
 v_row = 0
@@ -579,7 +579,7 @@ subroutine tridiag_&
 check_memcpy_cuda("tridiag: v_row_dev", successCUDA)
 endif ! useGPU
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 call obj%timer%start("OpenMP parallel")
 !$OMP PARALLEL PRIVATE(my_thread,n_threads,n_iter,i,l_col_beg,l_col_end,j,l_row_beg,l_row_end)
@@ -592,7 +592,7 @@ subroutine tridiag_&
 ! first calculate A*v part of (A + VU**T + UV**T)*v
 uc_p(1:l_cols,my_thread) = 0.
 ur_p(1:l_rows,my_thread) = 0.
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 do i= 0, (istep-2)/tile_size
 l_col_beg = i*l_cols_per_tile+1
 l_col_end = min(l_cols,(i+1)*l_cols_per_tile)
@@ -601,7 +601,7 @@ subroutine tridiag_&
 l_row_beg = j*l_rows_per_tile+1
 l_row_end = min(l_rows,(j+1)*l_rows_per_tile)
 if (l_row_end < l_row_beg) cycle
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 if (mod(n_iter,n_threads) == my_thread) then
 if (wantDebug) call obj%timer%start("blas")
 call PRECISION_GEMV(BLAS_TRANS_OR_CONJ, &
@@ -628,7 +628,7 @@ subroutine tridiag_&
 if (wantDebug) call obj%timer%stop("blas")
 endif
 n_iter = n_iter+1
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
 ! multiplication by blocks is efficient only for CPU
 ! for GPU we introduced 2 other ways, either by stripes (more similar to the original
@@ -658,7 +658,7 @@ subroutine tridiag_&
 if (wantDebug) call obj%timer%stop("blas")
 endif ! not useGPU
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 enddo ! j=0,i
 enddo ! i=0,(istep-2)/tile_size
@@ -738,7 +738,7 @@ subroutine tridiag_&
 check_memcpy_cuda("tridiag: u_row_dev 1", successCUDA)
 endif ! useGPU
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$OMP END PARALLEL
 call obj%timer%stop("OpenMP parallel")
@@ -746,7 +746,7 @@ subroutine tridiag_&
 u_col(1:l_cols) = u_col(1:l_cols) + uc_p(1:l_cols,i)
 u_row(1:l_rows) = u_row(1:l_rows) + ur_p(1:l_rows,i)
 enddo
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 ! second calculate (VU**T + UV**T)*v part of (A + VU**T + UV**T)*v
 if (n_stored_vecs > 0) then
......
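The tridiag hunks combine two traditional-OpenMP idioms: tiles of the A*v product are dealt out round-robin with mod(n_iter, n_threads) == my_thread, and each thread accumulates into private slices ur_p/uc_p that are summed serially after the parallel region. A compact sketch of the dealing loop, with the tile GEMV replaced by a stand-in update (assumes max_threads covers the actual thread count):

    #ifdef WITH_OPENMP_TRADITIONAL
    subroutine sketch_round_robin_tiles(n_tiles, max_threads, partial)
      use omp_lib
      implicit none
      integer, intent(in)  :: n_tiles, max_threads
      real(8), intent(out) :: partial(0:max_threads-1)
      integer :: my_thread, n_threads, n_iter, i
      partial = 0.0d0
      !$omp parallel private(my_thread, n_threads, n_iter, i)
      my_thread = omp_get_thread_num()
      n_threads = omp_get_num_threads()
      n_iter = 0
      do i = 1, n_tiles
        if (mod(n_iter, n_threads) == my_thread) then
          partial(my_thread) = partial(my_thread) + 1.0d0  ! stand-in for the tile GEMV
        endif
        n_iter = n_iter + 1
      enddo
      !$omp end parallel
      ! a serial loop then sums partial(:), as u_col/u_row are summed above
    end subroutine
    #endif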
@@ -82,7 +82,7 @@
 &PRECISION&
 &")
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 omp_threads_caller = omp_get_max_threads()
@@ -330,7 +330,7 @@
 enddo
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
......
@@ -75,7 +75,7 @@ subroutine elpa_reduce_add_vectors_&
 !-------------------------------------------------------------------------------
 use precision
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 use omp_lib
 #endif
 use elpa_mpi
@@ -132,7 +132,7 @@ subroutine elpa_reduce_add_vectors_&
 check_allocate("elpa_reduce_add: aux2", istat, errorMessage)
 aux1(:) = 0
 aux2(:) = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !call omp_set_num_threads(nrThreads)
 !$omp parallel private(ips, ipt, auxstride, lc, i, k, ns, nl) num_threads(nrThreads)
@@ -147,7 +147,7 @@ subroutine elpa_reduce_add_vectors_&
 if (myps == ips) then
 ! k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
 do lc=1,nvc
@@ -161,7 +161,7 @@ subroutine elpa_reduce_add_vectors_&
 enddo
 k = nvc * auxstride
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif
@@ -184,13 +184,13 @@ subroutine elpa_reduce_add_vectors_&
 if (k>0) aux2 = aux1
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier
 #endif
 if (mypt == ipt) then
 ! k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
 do lc=1,nvc
@@ -207,7 +207,7 @@ subroutine elpa_reduce_add_vectors_&
 endif
 enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
......
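Here the guards enclose bare directives inside one long parallel region: worksharing !$omp do loops pack and unpack in parallel, while a !$omp master block fenced by barriers lets a single thread issue the MPI reduction. A minimal sketch of that structure, with the MPI call reduced to a comment (the same shape recurs in the two transpose_vectors files below):

    #ifdef WITH_OPENMP_TRADITIONAL
    subroutine sketch_master_around_mpi(n, buf)
      implicit none
      integer, intent(in)    :: n
      real(8), intent(inout) :: buf(n)
      integer :: i
      !$omp parallel
      !$omp do
      do i = 1, n                ! all threads pack in parallel
        buf(i) = buf(i) + 1.0d0
      enddo
      !$omp barrier
      !$omp master
      ! a single thread would call mpi_allreduce(...) on buf here
      !$omp end master
      !$omp barrier
      !$omp do
      do i = 1, n                ! all threads unpack in parallel
        buf(i) = buf(i) * 2.0d0
      enddo
      !$omp end parallel
    end subroutine
    #endif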
@@ -92,7 +92,7 @@
 matrixRows = obj%local_nrows
 matrixCols = obj%local_ncols
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 omp_threads_caller = omp_get_max_threads()
@@ -135,7 +135,7 @@
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
......
@@ -87,7 +87,7 @@ subroutine ROUTINE_NAME&
 !-------------------------------------------------------------------------------
 use precision
 use elpa_abstract_impl
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 use omp_lib
 #endif
 use elpa_mpi
@@ -147,7 +147,7 @@ subroutine ROUTINE_NAME&
 allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ), stat=istat, errmsg=errorMessage)
 check_allocate("elpa_transpose_vectors: aux", istat, errorMessage)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
 #endif
 do n = 0, lcm_s_t-1
@@ -163,7 +163,7 @@ subroutine ROUTINE_NAME&
 if (nblks_comm .ne. 0) then
 if (myps == ips) then
 ! k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
 do lc=1,nvc
@@ -177,7 +177,7 @@ subroutine ROUTINE_NAME&
 enddo
 endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif
@@ -198,7 +198,7 @@ subroutine ROUTINE_NAME&
 call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier
@@ -222,7 +222,7 @@ subroutine ROUTINE_NAME&
 endif
 enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
 deallocate(aux, stat=istat, errmsg=errorMessage)
......
@@ -78,7 +78,7 @@ subroutine elpa_transpose_vectors_ss_&
 !-------------------------------------------------------------------------------
 use precision
 use elpa_abstract_impl
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 use omp_lib
 #endif
 use elpa_mpi
@@ -130,7 +130,7 @@ subroutine elpa_transpose_vectors_ss_&
 allocate(aux( ((nblks_tot-nblks_skip+lcm_s_t-1)/lcm_s_t) * nblk * nvc ))
 check_allocate("elpa_transpose_vectors_ss: aux", istat, errorMessage)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private(lc, i, k, ns, nl, nblks_comm, auxstride, ips, ipt, n)
 #endif
 do n = 0, lcm_s_t-1
@@ -146,7 +146,7 @@ subroutine elpa_transpose_vectors_ss_&
 if (nblks_comm .ne. 0) then
 if (myps == ips) then
 ! k = 0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp do
 #endif
 do lc=1,nvc
@@ -160,7 +160,7 @@ subroutine elpa_transpose_vectors_ss_&
 enddo
 endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp barrier
 !$omp master
 #endif
@@ -181,7 +181,7 @@ subroutine elpa_transpose_vectors_ss_&
 call obj%timer%stop("mpi_communication")
 #endif /* WITH_MPI */
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end master
 !$omp barrier
@@ -201,7 +201,7 @@ subroutine elpa_transpose_vectors_ss_&
 endif
 enddo
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp end parallel
 #endif
 deallocate(aux, stat=istat, errmsg=errorMessage)
......
@@ -103,7 +103,7 @@
 use cuda_functions
 use iso_c_binding
 use elpa1_compute
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 use omp_lib
 #endif
 use precision
@@ -140,7 +140,7 @@
 #if REALCASE == 1
 integer(kind=ik) :: vmrCols
 #endif
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 integer(kind=ik) :: mynlc, lrs, transformChunkSize
 #endif
 integer(kind=ik) :: i, j, lcs, lce, lre, lc, lr, cur_pcol, n_cols, nrow
@@ -628,7 +628,7 @@
 aux1 = 0.0_rck
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 #if 0
 ! original complex implementation without openmp. check performance
 nlc = 0 ! number of local columns
@@ -750,7 +750,7 @@
 enddo
 !$omp end parallel
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
 nlc = 0 ! number of local columns
 do j=1,lc-1
@@ -785,7 +785,7 @@
 #endif
 endif
 enddo
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 enddo ! lc
 if (useGPU_reduction_lower_block_to_tridiagonal) then
@@ -939,7 +939,7 @@
 ! n_way is actually a branch for the number of OpenMP threads
 n_way = 1
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 #if REALCASE == 1
 n_way = max_threads
@@ -1022,7 +1022,7 @@
 endif ! l_cols>0 .and. l_rows>0
 else ! n_way > 1
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 if (.not. useGPU) then
 umcCPU(1:l_cols,1:n_cols) = 0.0_rck
@@ -1137,7 +1137,7 @@
 endif ! useGPU
 endif ! l_cols>0 .and. l_rows>0
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 endif ! n_way > 1
 #if REALCASE == 1
 !$omp end parallel
@@ -1394,7 +1394,7 @@
 ! A = A - V*U**T - U*V**T
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 !$omp parallel private( ii, i, lcs, lce, lre, n_way, m_way, m_id, n_id, work_per_thread, mystart, myend )
 n_threads = omp_get_num_threads()
@@ -1433,7 +1433,7 @@
 enddo
 !$omp end parallel
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
 do i=0,(istep*nbw-1)/tile_size
 lcs = i*l_cols_tile+1
@@ -1464,7 +1464,7 @@
 call obj%timer%stop("blas")
 endif ! useGPU
 enddo ! i=0,(istep*nbw-1)/tile_size
-#endif /* WITH_OPENMP */
+#endif /* WITH_OPENMP_TRADITIONAL */
 if (.not.(useGPU)) then
 if (allocated(vr)) then
......
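bandred keeps two complete code paths: the OpenMP variant and, behind #else /* WITH_OPENMP_TRADITIONAL */, the serial one; at run time n_way further switches between single- and multi-threaded algorithms. A minimal sketch of the compile-time split:

    subroutine sketch_compile_time_split(max_threads, n_way)
      implicit none
      integer, intent(in)  :: max_threads
      integer, intent(out) :: n_way
      n_way = 1
    #ifdef WITH_OPENMP_TRADITIONAL
      n_way = max_threads          ! multi-threaded algorithm (real case)
    #else /* WITH_OPENMP_TRADITIONAL */
      ! serial algorithm: n_way stays 1
    #endif /* WITH_OPENMP_TRADITIONAL */
    end subroutine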
@@ -88,7 +88,7 @@ program print_available_elpa2_kernels
 print *, "information if (and how) the kernels can be chosen at "
 print *, "runtime"
 print *
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 print *, " ELPA supports threads: yes"
 #else
 print *, " ELPA supports threads: no"
......
@@ -223,7 +223,7 @@
 reDistributeMatrix = .false.
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 omp_threads_caller = omp_get_max_threads()
@@ -305,7 +305,7 @@
 endif
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
@@ -972,7 +972,7 @@
 endif
 ! restore original OpenMP settings
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! store the number of OpenMP threads used in the calling function
 ! restore this at the end of ELPA 2
 call omp_set_num_threads(omp_threads_caller)
......
@@ -94,7 +94,7 @@
 use cuda_functions
 use precision
 use iso_c_binding
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 ! use omp_lib
 #endif
 implicit none
@@ -120,7 +120,7 @@
 integer(kind=ik) :: next_n, next_local_n, next_n_start, next_n_end
 integer(kind=ik) :: bottom_msg_length, top_msg_length, next_top_msg_length
 integer(kind=ik) :: stripe_width, last_stripe_width, stripe_count
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 integer(kind=ik) :: thread_width, csw, b_off, b_len
 #endif
 integer(kind=ik) :: num_result_blocks, num_result_buffers, num_bufs_recvd
@@ -129,7 +129,7 @@
 integer(kind=MPI_KIND) :: mpierr
 logical :: flag
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:,:)
 #else
 MATH_DATATYPE(kind=rck), pointer :: aIntern(:,:,:)
@@ -141,7 +141,7 @@
 MATH_DATATYPE(kind=rck), allocatable :: row(:)
 MATH_DATATYPE(kind=rck), pointer :: row_group(:,:)
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 MATH_DATATYPE(kind=rck), allocatable :: top_border_send_buffer(:,:)
 MATH_DATATYPE(kind=rck), allocatable :: top_border_recv_buffer(:,:)
 MATH_DATATYPE(kind=rck), allocatable :: bottom_border_send_buffer(:,:)
@@ -184,7 +184,7 @@
 integer(kind=ik), intent(in) :: max_threads
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 integer(kind=ik) :: my_thread
 #endif
@@ -266,7 +266,7 @@
 l_nev = local_index(nev, my_pcol, np_cols, nblk, -1)
 if (l_nev==0) then
-#ifdef WITH_OPENMP
+#ifdef WITH_OPENMP_TRADITIONAL
 thread_width = 0
 #endif
 stripe_width = 0
@@ -275,7 +275,7 @@
 else ! l_nev
-#if WITH_OPENMP
+#if WITH_OPENMP_TRADITIONAL
 ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
 ! every primary cache
 ! Suggested stripe width is 48 - should this be reduced for the complex case ???
@@ -382,7 +382,7 @@
 endif ! useGPU
-#else /* WITH_OPENMP */
+#else /* WITH_OPENMP_TRADITIONAL */
 ! Suggested stripe width is 48 since 48*64 real*8 numbers should fit into
 ! every primary cache
@@ -481,7 +481,7 @@
 last_stripe_width = l_nev - (stripe_count-1)*stripe_width
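
In trans_ev_tridi the guard changes data layout, not just directives: with WITH_OPENMP_TRADITIONAL the workspace aIntern carries a fourth, per-thread dimension (plus thread_width bookkeeping). A minimal sketch of the layout switch; the middle dimension name a_dim2 is an assumption for illustration:

    subroutine sketch_per_thread_workspace(stripe_width, a_dim2, stripe_count, max_threads)
      implicit none
      integer, intent(in) :: stripe_width, a_dim2, stripe_count, max_threads
    #ifdef WITH_OPENMP_TRADITIONAL
      real(8), allocatable :: aIntern_sketch(:,:,:,:)
      allocate(aIntern_sketch(stripe_width, a_dim2, stripe_count, max_threads))  ! extra thread dimension
    #else
      real(8), allocatable :: aIntern_sketch(:,:,:)
      allocate(aIntern_sketch(stripe_width, a_dim2, stripe_count))
    #endif
      aIntern_sketch = 0.0d0
    end subroutine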