Commit 69972fbf authored by Andreas Marek's avatar Andreas Marek

Some cleanup

parent 03d44007
......@@ -46,7 +46,7 @@ AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, w
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20200417], [Current ELPA autotune version])
AC_DEFINE_SUBST(CURRENT_AUTOTUNE_VERSION, 20200417, "Current ELPA autotune version")
AC_DEFINE_UNQUOTED([ELPA_BUILDTIME], [$ELPA_BUILDTIME], ["Time of build"])
AX_COMPARE_VERSION([$ELPA_BUILDTIME], [gt], [1604905771],[old_elpa_version=yes],[old_elpa_version=no])
AX_CHECK_GNU_MAKE()
if test x$_cv_gnu_make_command = x ; then
......@@ -1776,6 +1776,9 @@ else
echo "build config should be compiled into the library: no"
fi
if test x"$have_loop_blocking" = x"yes"; then
AC_DEFINE([LOOP_BLOCKING],[1],[use blocking in loops])
fi
AC_SUBST([SUFFIX])
AC_SUBST([PKG_CONFIG_FILE],[elpa${SUFFIX}-${PACKAGE_VERSION}.pc])
......@@ -1986,4 +1989,10 @@ else
make -f $srcdir/generated_headers.am generated-headers top_srcdir="$srcdir" CPP="$CPP"
fi
if test x"$old_elpa_version" = x"yes"; then
echo " "
echo " It is possible that your current version of ELPA is not the latest one."
echo " You might want to have a look at https://elpa.mpcdf.mpg.de, whether a more recent"
echo " version has been released already"
echo " "
fi
AC_DEFUN([AX_GCC_VERSION], [
GCC_VERSION=""
echo "calling gcc"
echo $CC
$CC | grep gcc
echo $?
AX_CHECK_COMPILE_FLAG([-dumpversion],
[ax_gcc_version_option=yes],
[ax_gcc_version_option=no])
AS_IF([test "x$GCC" = "xyes"],[
AS_IF([test "x$ax_gcc_version_option" != "xno"],[
AC_CACHE_CHECK([gcc version],[ax_cv_gcc_version],[
ax_cv_gcc_version="`$CC -dumpversion`"
AS_IF([test "x$ax_cv_gcc_version" = "x"],[
ax_cv_gcc_version=""
])
])
GCC_VERSION=$ax_cv_gcc_version
])
])
AC_SUBST([GCC_VERSION])
])
# ===========================================================================
# https://www.gnu.org/software/autoconf-archive/ax_compare_version.html
# ===========================================================================
#
# SYNOPSIS
#
# AX_COMPARE_VERSION(VERSION_A, OP, VERSION_B, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
#
# DESCRIPTION
#
# This macro compares two version strings. Due to the various number of
# minor-version numbers that can exist, and the fact that string
# comparisons are not compatible with numeric comparisons, this is not
# necessarily trivial to do in a autoconf script. This macro makes doing
# these comparisons easy.
#
# The six basic comparisons are available, as well as checking equality
# limited to a certain number of minor-version levels.
#
# The operator OP determines what type of comparison to do, and can be one
# of:
#
# eq - equal (test A == B)
# ne - not equal (test A != B)
# le - less than or equal (test A <= B)
# ge - greater than or equal (test A >= B)
# lt - less than (test A < B)
# gt - greater than (test A > B)
#
# Additionally, the eq and ne operator can have a number after it to limit
# the test to that number of minor versions.
#
# eq0 - equal up to the length of the shorter version
# ne0 - not equal up to the length of the shorter version
# eqN - equal up to N sub-version levels
# neN - not equal up to N sub-version levels
#
# When the condition is true, shell commands ACTION-IF-TRUE are run,
# otherwise shell commands ACTION-IF-FALSE are run. The environment
# variable 'ax_compare_version' is always set to either 'true' or 'false'
# as well.
#
# Examples:
#
# AX_COMPARE_VERSION([3.15.7],[lt],[3.15.8])
# AX_COMPARE_VERSION([3.15],[lt],[3.15.8])
#
# would both be true.
#
# AX_COMPARE_VERSION([3.15.7],[eq],[3.15.8])
# AX_COMPARE_VERSION([3.15],[gt],[3.15.8])
#
# would both be false.
#
# AX_COMPARE_VERSION([3.15.7],[eq2],[3.15.8])
#
# would be true because it is only comparing two minor versions.
#
# AX_COMPARE_VERSION([3.15.7],[eq0],[3.15])
#
# would be true because it is only comparing the lesser number of minor
# versions of the two values.
#
# Note: The characters that separate the version numbers do not matter. An
# empty string is the same as version 0. OP is evaluated by autoconf, not
# configure, so must be a string, not a variable.
#
# The author would like to acknowledge Guido Draheim whose advice about
# the m4_case and m4_ifvaln functions make this macro only include the
# portions necessary to perform the specific comparison specified by the
# OP argument in the final configure script.
#
# LICENSE
#
# Copyright (c) 2008 Tim Toolan <toolan@ele.uri.edu>
#
# Copying and distribution of this file, with or without modification, are
# permitted in any medium without royalty provided the copyright notice
# and this notice are preserved. This file is offered as-is, without any
# warranty.
#serial 13
dnl #########################################################################
AC_DEFUN([AX_COMPARE_VERSION], [
AC_REQUIRE([AC_PROG_AWK])
# Used to indicate true or false condition
ax_compare_version=false
# Convert the two version strings to be compared into a format that
# allows a simple string comparison. The end result is that a version
# string of the form 1.12.5-r617 will be converted to the form
# 0001001200050617. In other words, each number is zero padded to four
# digits, and non digits are removed.
AS_VAR_PUSHDEF([A],[ax_compare_version_A])
A=`echo "$1" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
AS_VAR_PUSHDEF([B],[ax_compare_version_B])
B=`echo "$3" | sed -e 's/\([[0-9]]*\)/Z\1Z/g' \
-e 's/Z\([[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/Z\([[0-9]][[0-9]][[0-9]]\)Z/Z0\1Z/g' \
-e 's/[[^0-9]]//g'`
dnl # In the case of le, ge, lt, and gt, the strings are sorted as necessary
dnl # then the first line is used to determine if the condition is true.
dnl # The sed right after the echo is to remove any indented white space.
m4_case(m4_tolower($2),
[lt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[gt],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/false/;s/x${B}/true/;1q"`
],
[le],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],
[ge],[
ax_compare_version=`echo "x$A
x$B" | sed 's/^ *//' | sort -r | sed "s/x${A}/true/;s/x${B}/false/;1q"`
],[
dnl Split the operator from the subversion count if present.
m4_bmatch(m4_substr($2,2),
[0],[
# A count of zero means use the length of the shorter version.
# Determine the number of characters in A and B.
ax_compare_version_len_A=`echo "$A" | $AWK '{print(length)}'`
ax_compare_version_len_B=`echo "$B" | $AWK '{print(length)}'`
# Set A to no more than B's length and B to no more than A's length.
A=`echo "$A" | sed "s/\(.\{$ax_compare_version_len_B\}\).*/\1/"`
B=`echo "$B" | sed "s/\(.\{$ax_compare_version_len_A\}\).*/\1/"`
],
[[0-9]+],[
# A count greater than zero means use only that many subversions
A=`echo "$A" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
B=`echo "$B" | sed "s/\(\([[0-9]]\{4\}\)\{m4_substr($2,2)\}\).*/\1/"`
],
[.+],[
AC_WARNING(
[invalid OP numeric parameter: $2])
],[])
# Pad zeros at end of numbers to make same length.
ax_compare_version_tmp_A="$A`echo $B | sed 's/./0/g'`"
B="$B`echo $A | sed 's/./0/g'`"
A="$ax_compare_version_tmp_A"
# Check for equality or inequality as necessary.
m4_case(m4_tolower(m4_substr($2,0,2)),
[eq],[
test "x$A" = "x$B" && ax_compare_version=true
],
[ne],[
test "x$A" != "x$B" && ax_compare_version=true
],[
AC_WARNING([invalid OP parameter: $2])
])
])
AS_VAR_POPDEF([A])dnl
AS_VAR_POPDEF([B])dnl
dnl # Execute ACTION-IF-TRUE / ACTION-IF-FALSE.
if test "$ax_compare_version" = "true" ; then
have_loop_blocking=yes
m4_ifvaln([$4],[$4],[:])dnl
m4_ifvaln([$5],[else $5])dnl
fi
]) dnl AX_COMPARE_VERSION
......@@ -54,1180 +54,1180 @@
#include "../general/sanity.F90"
subroutine merge_systems_&
&PRECISION &
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
use elpa_blas_interfaces
subroutine merge_systems_&
&PRECISION &
(obj, na, nm, d, e, q, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, &
l_col, p_col, l_col_out, p_col_out, npc_0, npc_n, useGPU, wantDebug, success, max_threads)
use cuda_functions
use iso_c_binding
use precision
use elpa_abstract_impl
use elpa_blas_interfaces
#ifdef WITH_OPENMP
use omp_lib
use omp_lib
#endif
implicit none
implicit none
#include "../general/precision_kinds.F90"
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, npc_0, npc_n
integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=REAL_DATATYPE), intent(inout) :: d(na), e
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, nm, ldq, nqoff, nblk, matrixCols, mpi_comm_rows, &
mpi_comm_cols, npc_0, npc_n
integer(kind=ik), intent(in) :: l_col(na), p_col(na), l_col_out(na), p_col_out(na)
real(kind=REAL_DATATYPE), intent(inout) :: d(na), e
#ifdef USE_ASSUMED_SIZE
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,*)
#else
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
real(kind=REAL_DATATYPE), intent(inout) :: q(ldq,matrixCols)
#endif
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
! TODO: play with max_strip. If it was larger, matrices being multiplied
! might be larger as well!
integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
qtrans(2,2), dmax, zmax, d1new, d2new
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
logical, intent(in) :: useGPU, wantDebug
logical, intent(out) :: success
! TODO: play with max_strip. If it was larger, matrices being multiplied
! might be larger as well!
integer(kind=ik), parameter :: max_strip=128
real(kind=REAL_DATATYPE) :: beta, sig, s, c, t, tau, rho, eps, tol, &
qtrans(2,2), dmax, zmax, d1new, d2new
real(kind=REAL_DATATYPE) :: z(na), d1(na), d2(na), z1(na), delta(na), &
dbase(na), ddiff(na), ev_scale(na), tmp(na)
real(kind=REAL_DATATYPE) :: d1u(na), zu(na), d1l(na), zl(na)
real(kind=REAL_DATATYPE), allocatable :: qtmp1(:,:), qtmp2(:,:), ev(:,:)
#ifdef WITH_OPENMP
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
real(kind=REAL_DATATYPE), allocatable :: z_p(:,:)
#endif
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols
integer(kind=MPI_KIND) :: mpierr
integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=BLAS_KIND) :: idxBLAS(NA)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=c_intptr_t) :: num
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_real
integer(kind=ik), intent(in) :: max_threads
integer(kind=ik) :: i, j, na1, na2, l_rows, l_cols, l_rqs, l_rqe, &
l_rqm, ns, info
integer(kind=BLAS_KIND) :: infoBLAS
integer(kind=ik) :: l_rnm, nnzu, nnzl, ndef, ncnt, max_local_cols, &
l_cols_qreorg, np, l_idx, nqcols1, nqcols2
integer(kind=ik) :: my_proc, n_procs, my_prow, my_pcol, np_rows, &
np_cols
integer(kind=MPI_KIND) :: mpierr
integer(kind=MPI_KIND) :: my_prowMPI, np_rowsMPI, my_pcolMPI, np_colsMPI
integer(kind=ik) :: np_next, np_prev, np_rem
integer(kind=ik) :: idx(na), idx1(na), idx2(na)
integer(kind=BLAS_KIND) :: idxBLAS(NA)
integer(kind=ik) :: coltyp(na), idxq1(na), idxq2(na)
integer(kind=ik) :: istat
character(200) :: errorMessage
integer(kind=ik) :: gemm_dim_k, gemm_dim_l, gemm_dim_m
integer(kind=c_intptr_t) :: num
integer(kind=C_intptr_T) :: qtmp1_dev, qtmp2_dev, ev_dev
logical :: successCUDA
integer(kind=c_intptr_t), parameter :: size_of_datatype = size_of_&
&PRECISION&
&_real
integer(kind=ik), intent(in) :: max_threads
#ifdef WITH_OPENMP
integer(kind=ik) :: my_thread
integer(kind=ik) :: my_thread
allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
check_allocate("merge_systems: z_p",istat, errorMessage)
allocate(z_p(na,0:max_threads-1), stat=istat, errmsg=errorMessage)
check_allocate("merge_systems: z_p",istat, errorMessage)
#endif
call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
! If my processor column isn't in the requested set, do nothing
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
! Determine number of "next" and "prev" column for ring sends
if (my_pcol == npc_0+npc_n-1) then
np_next = npc_0
else
np_next = my_pcol + 1
endif
call obj%timer%start("merge_systems" // PRECISION_SUFFIX)
success = .true.
call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND) ,my_prowMPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND) ,np_rowsMPI, mpierr)
call mpi_comm_rank(int(mpi_comm_cols,kind=MPI_KIND) ,my_pcolMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND) ,np_colsMPI, mpierr)
my_prow = int(my_prowMPI,kind=c_int)
np_rows = int(np_rowsMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
call obj%timer%stop("mpi_communication")
! If my processor column isn't in the requested set, do nothing
if (my_pcol<npc_0 .or. my_pcol>=npc_0+npc_n) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
! Determine number of "next" and "prev" column for ring sends
if (my_pcol == npc_0+npc_n-1) then
np_next = npc_0
else
np_next = my_pcol + 1
endif
if (my_pcol == npc_0) then
np_prev = npc_0+npc_n-1
else
np_prev = my_pcol - 1
endif
call check_monotony_&
&PRECISION&
&(obj, nm,d,'Input1',wantDebug, success)
if (.not.(success)) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
call check_monotony_&
&PRECISION&
&(obj,na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
! Get global number of processors and my processor number.
! Please note that my_proc does not need to match any real processor number,
! it is just used for load balancing some loops.
n_procs = np_rows*npc_n
my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
! Local limits of the rows of Q
l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
l_rows = l_rqe-l_rqs+1 ! Total number of local rows
! My number of local columns
l_cols = COUNT(p_col(1:na)==my_pcol)
! Get max number of local columns
max_local_cols = 0
do np = npc_0, npc_0+npc_n-1
max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np))
enddo
! Calculations start here
beta = abs(e)
sig = sign(1.0_rk,e)
! Calculate rank-1 modifier z
z(:) = 0
if (MOD((nqoff+nm-1)/nblk,np_rows)==my_prow) then
! nm is local on my row
do i = 1, na
if (p_col(i)==my_pcol) z(i) = q(l_rqm,l_col(i))
enddo
endif
if (MOD((nqoff+nm)/nblk,np_rows)==my_prow) then
! nm+1 is local on my row
do i = 1, na
if (p_col(i)==my_pcol) z(i) = z(i) + sig*q(l_rqm+1,l_col(i))
enddo
endif
call global_gather_&
&PRECISION&
&(obj, z, na)
! Normalize z so that norm(z) = 1. Since z is the concatenation of
! two normalized vectors, norm2(z) = sqrt(2).
z = z/sqrt(2.0_rk)
rho = 2.0_rk*beta
! Calculate index for merging both systems by ascending eigenvalues
call obj%timer%start("blas")
call PRECISION_LAMRG( int(nm,kind=BLAS_KIND), int(na-nm,kind=BLAS_KIND), d, &
1_BLAS_KIND, 1_BLAS_KIND, idxBLAS )
idx(:) = int(idxBLAS(:),kind=ik)
call obj%timer%stop("blas")
! Calculate the allowable deflation tolerance
zmax = maxval(abs(z))
dmax = maxval(abs(d))
EPS = PRECISION_LAMCH( 'E' ) ! return epsilon
TOL = 8.0_rk*EPS*MAX(dmax,zmax)
! If the rank-1 modifier is small enough, no more needs to be done
! except to reorganize D and Q
IF ( RHO*zmax <= TOL ) THEN
! Rearrange eigenvalues
tmp = d
do i=1,na
d(i) = tmp(idx(i))
enddo
! Rearrange eigenvectors
call resort_ev_&
&PRECISION &
(obj, idx, na)
if (my_pcol == npc_0) then
np_prev = npc_0+npc_n-1
else
np_prev = my_pcol - 1
endif
call check_monotony_&
&PRECISION&
&(obj, nm,d,'Input1',wantDebug, success)
if (.not.(success)) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
call check_monotony_&
&PRECISION&
&(obj,na-nm,d(nm+1),'Input2',wantDebug, success)
if (.not.(success)) then
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
return
endif
! Get global number of processors and my processor number.
! Please note that my_proc does not need to match any real processor number,
! it is just used for load balancing some loops.
call obj%timer%stop("merge_systems" // PRECISION_SUFFIX)
n_procs = np_rows*npc_n
my_proc = my_prow*npc_n + (my_pcol-npc_0) ! Row major
return
ENDIF
! Merge and deflate system
! Local limits of the rows of Q
na1 = 0
na2 = 0
l_rqs = local_index(nqoff+1 , my_prow, np_rows, nblk, +1) ! First row of Q
l_rqm = local_index(nqoff+nm, my_prow, np_rows, nblk, -1) ! Last row <= nm
l_rqe = local_index(nqoff+na, my_prow, np_rows, nblk, -1) ! Last row of Q
! COLTYP:
! 1 : non-zero in the upper half only;
! 2 : dense;
! 3 : non-zero in the lower half only;
! 4 : deflated.
l_rnm = l_rqm-l_rqs+1 ! Number of local rows <= nm
l_rows = l_rqe-l_rqs+1 ! Total number of local rows
coltyp(1:nm) = 1
coltyp(nm+1:na) = 3
do i=1,na
! My number of local columns
if (rho*abs(z(idx(i))) <= tol) then
l_cols = COUNT(p_col(1:na)==my_pcol)
! Deflate due to small z component.
! Get max number of local columns
na2 = na2+1
d2(na2) = d(idx(i))
idx2(na2) = idx(i)
coltyp(idx(i)) = 4
max_local_cols = 0
do np = npc_0, npc_0+npc_n-1
max_local_cols = MAX(max_local_cols,COUNT(p_col(1:na)==np))
enddo
else if (na1>0) then
! Calculations start here
! Check if eigenvalues are close enough to allow deflation.
beta = abs(e)
sig = sign(1.0_rk,e)
S = Z(idx(i))