Commit befee56f authored by Andreas Marek's avatar Andreas Marek
Browse files

Merge branch 'skew' into 'master_pre_stage'

Skew

See merge request !26
parents 576b9f6d bb29f9da
Changelog for upcoming release
- not yet decided
- support of ELPA for real valued skew-symmetric matrices, please see:
CITATION
Changelog for ELPA 2019.05.002
......
......@@ -885,6 +885,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa_c_interface.c \
@top_srcdir@/src/elpa2/mod_pack_unpack_cpu.F90 \
@top_srcdir@/src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \
@top_srcdir@/src/elpa2/elpa2_ssymm_matrix_allreduce_real_template.F90 \
@top_srcdir@/src/elpa2/elpa2_tridiag_band_template.F90 \
@top_srcdir@/src/elpa2/mod_redist_band.F90 \
@top_srcdir@/src/elpa2/pack_unpack_cpu.F90 \
......@@ -1001,6 +1002,7 @@ EXCLUDE = @top_srcdir@/src/GPU/check_for_gpu.F90 \
@top_srcdir@/src/elpa1/elpa_solve_tridi_impl_public.F90 \
@top_srcdir@/src/elpa1/elpa1_trans_ev_template.F90 \
@top_srcdir@/src/elpa1/elpa_transpose_vectors.F90 \
@top_srcdir@/src/elpa1/elpa_transpose_vectors_ss.F90 \
@top_srcdir@/src/elpa1/elpa1_auxiliary.F90 \
@top_srcdir@/src/elpa1/elpa1_tridiag_template.F90 \
@top_srcdir@/src/elpa1/elpa1_tools_template.F90 \
......
......@@ -84,7 +84,7 @@ An excerpt of the most important (*ELPA* specific) options reads as follows:
| --disable-Fortran2008-features | disable Fortran 2008 if compiler does not support it |
| --enable-pyhton | build and install python wrapper, default no |
| --enable-python-tests | enable python tests, default no. |
| --enable-skew-symmetric-support | enable support for real valued skew-symmetric matrices |
We recommend that you do not build ELPA in its main directory but that you use it
in a sub-directory:
......
......@@ -64,8 +64,8 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa1/elpa1.F90 \
src/elpa2/elpa2.F90 \
src/elpa_generalized/cannon.c \
#src/elpa_generalized/test_c_bindings.c \
src/helpers/matrix_plot.F90 \
src/general/mod_elpa_skewsymmetric_blas.F90 \
src/elpa_index.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
......@@ -122,6 +122,8 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa1/elpa_invert_trm.F90 \
src/elpa1/elpa_multiply_a_b.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/general/elpa_ssr2_template.F90 \
src/general/elpa_ssmv_template.F90 \
src/general/precision_macros.h \
src/general/precision_typedefs.h \
src/general/precision_kinds.F90
......@@ -518,6 +520,10 @@ dist_man_MANS = \
man/elpa_autotune_save_state.3 \
man/elpa_autotune_load_state.3 \
man/elpa_autotune_print_state.3 \
man/elpa_autotune_setup.3 \
man/elpa_autotune_step.3 \
man/elpa_autotune_set_best.3 \
man/elpa_autotune_deallocate.3 \
man/elpa_uninit.3
if ENABLE_LEGACY
......@@ -542,11 +548,7 @@ dist_man_MANS += \
man/elpa_solve_evp_real_double.3 \
man/elpa_solve_evp_real_single.3 \
man/elpa_solve_evp_complex_double.3 \
man/elpa_solve_evp_complex_single.3 \
man/elpa_autotune_setup.3 \
man/elpa_autotune_step.3 \
man/elpa_autotune_set_best.3 \
man/elpa_autotune_deallocate.3
man/elpa_solve_evp_complex_single.3
endif
......@@ -824,6 +826,8 @@ EXTRA_DIST = \
test/shared/test_precision_kinds.F90 \
src/general/prow_pcol.F90 \
src/general/sanity.F90 \
src/general/elpa_ssr2_template.F90 \
src/general/elpa_ssmv_template.F90 \
test/Fortran/assert.h \
test/Fortran/elpa_print_headers.F90 \
test/shared/test_check_correctness_template.F90 \
......
......@@ -39,7 +39,7 @@ AC_SUBST([$1], ['$2'])
# API Version
AC_DEFINE([EARLIEST_API_VERSION], [20170403], [Earliest supported ELPA API version])
AC_DEFINE_SUBST(CURRENT_API_VERSION, 20190524, "Current ELPA API version")
AC_DEFINE_SUBST(CURRENT_API_VERSION, 20191110, "Current ELPA API version")
# Autotune Version
AC_DEFINE([EARLIEST_AUTOTUNE_VERSION], [20171201], [Earliest ELPA API version, which supports autotuning])
AC_DEFINE([CURRENT_AUTOTUNE_VERSION], [20190524], [Current ELPA autotune version])
......@@ -1506,6 +1506,27 @@ fi
AM_CONDITIONAL([WANT_SINGLE_PRECISION_REAL],[test x"$want_single_precision" = x"yes"])
AM_CONDITIONAL([WANT_SINGLE_PRECISION_COMPLEX],[test x"$want_single_precision" = x"yes"])
#always define SKEWSYMMETRIC for the moment
AC_MSG_CHECKING(whether we should enable skew-symmetric support)
AC_ARG_ENABLE([skew-symmetric-support],
AS_HELP_STRING([--enable-skew-symmetric-support],
[enable support for real valued skew-symmetric matrices]),
[if test x"$enableval" = x"yes"; then
enable_skewsymmetric=yes
else
enable_skewsymmetric=no
fi],
[enable_skewsymmetric=no])
AC_MSG_RESULT([${enable_skewsymmetric}])
AM_CONDITIONAL([HAVE_SKEWSYMMETRIC],[test x"$enable_skewsymmetric" = x"yes"])
if test x"${enable_skewsymmetric}" = x"yes"; then
if test x"${USE_ASSUMED_SIZE}" = x"no"; then
AC_MSG_ERROR(you have to enable assumed size arrays, if you want to build with skew-symmetric support)
fi
AC_DEFINE([HAVE_SKEWSYMMETRIC],[1],[build for skewsyemmtric case])
fi
AC_SUBST([MPI_BINARY])
AC_SUBST([WITH_MKL])
AC_SUBST([WITH_BLACS])
......
......@@ -4,6 +4,7 @@ import sys
simple_tokens = [
"PRECISION",
"elpa_transpose_vectors_NUMBER_PRECISION",
"elpa_transpose_vectors_ssNUMBER_PRECISION",
"elpa_reduce_add_vectors_NUMBER_PRECISION",
"bandred_NUMBER_PRECISION",
"trans_ev_band_to_full_NUMBER_PRECISION",
......@@ -19,6 +20,7 @@ simple_tokens = [
"qr_pdgeqrf_2dcomm_PRECISION",
"hh_transform_NUMBER_PRECISION",
"symm_matrix_allreduce_PRECISION",
"ssymm_matrix_allreduce_PRECISION",
"herm_matrix_allreduce_PRECISION",
"redist_band_NUMBER_PRECISION",
"unpack_row_NUMBER_cpu_PRECISION",
......
......@@ -285,6 +285,30 @@ print(" " + " \\\n ".join([
prec_flag['double']]))
print("endif")
name = "test_skewsymmetric_real_double"
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
print(name + "_SOURCES = test/Fortran/test_skewsymmetric.F90")
print(name + "_LDADD = $(test_program_ldadd)")
print(name + "_FCFLAGS = $(test_program_fcflags) \\")
print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['double']]))
name = "test_skewsymmetric_real_single"
print("if WANT_SINGLE_PRECISION_REAL")
print("check_SCRIPTS += " + name + "_extended.sh")
print("noinst_PROGRAMS += " + name)
print(name + "_SOURCES = test/Fortran/test_skewsymmetric.F90")
print(name + "_LDADD = $(test_program_ldadd)")
print(name + "_FCFLAGS = $(test_program_fcflags) \\")
print(" " + " \\\n ".join([
domain_flag['real'],
prec_flag['single']]))
print("endif")
name = "validate_multiple_objs_real_double_c_version"
print("if ENABLE_C_TESTS")
print("if ENABLE_AUTOTUNING")
......
......@@ -111,7 +111,7 @@
!>
!> ! We urge the user to always check the error code of all ELPA functions
!>
!> if (elpa_init(20181112) /= ELPA_OK) then
!> if (elpa_init(20191110) /= ELPA_OK) then
!> print *, "ELPA API version not supported"
!> stop
!> endif
......@@ -178,7 +178,7 @@
!>
!> /* We urge the user to always check the error code of all ELPA functions */
!>
!> if (elpa_init(20181113) != ELPA_OK) {
!> if (elpa_init(20191110) != ELPA_OK) {
!> fprintf(stderr, "Error: ELPA API version not supported");
!> exit(1);
!> }
......@@ -233,7 +233,7 @@
!> class(elpa_autotune_t), pointer :: tune_state
!> integer :: success
!>
!> if (elpa_init(20181112) /= ELPA_OK) then
!> if (elpa_init(20191110) /= ELPA_OK) then
!> print *, "ELPA API version not supported"
!> stop
!> endif
......
......@@ -105,7 +105,9 @@ module elpa1_compute
public :: elpa_reduce_add_vectors_real_double
public :: elpa_reduce_add_vectors_real
public :: elpa_transpose_vectors_real_double
public :: elpa_transpose_vectors_ss_real_double
public :: elpa_transpose_vectors_real
public :: elpa_transpose_vectors_ss_real
interface hh_transform_real
module procedure hh_transform_real_double
......@@ -118,11 +120,16 @@ module elpa1_compute
interface elpa_transpose_vectors_real
module procedure elpa_transpose_vectors_real_double
end interface
interface elpa_transpose_vectors_ss_real
module procedure elpa_transpose_vectors_ss_real_double
end interface
#ifdef WANT_SINGLE_PRECISION_REAL
public :: hh_transform_real_single
public :: elpa_reduce_add_vectors_real_single
public :: elpa_transpose_vectors_real_single
public :: elpa_transpose_vectors_ss_real_single
#endif
public :: hh_transform_complex_double
......@@ -130,7 +137,9 @@ module elpa1_compute
public :: elpa_reduce_add_vectors_complex_double
public :: elpa_reduce_add_vectors_complex
public :: elpa_transpose_vectors_complex_double
public :: elpa_transpose_vectors_ss_complex_double
public :: elpa_transpose_vectors_complex
public :: elpa_transpose_vectors_ss_complex
interface hh_transform_complex
module procedure hh_transform_complex_double
......@@ -143,11 +152,16 @@ module elpa1_compute
interface elpa_transpose_vectors_complex
module procedure elpa_transpose_vectors_complex_double
end interface
interface elpa_transpose_vectors_ss_complex
module procedure elpa_transpose_vectors_ss_complex_double
end interface
#ifdef WANT_SINGLE_PRECISION_COMPLEX
public :: hh_transform_complex_single
public :: elpa_reduce_add_vectors_complex_single
public :: elpa_transpose_vectors_complex_single
public :: elpa_transpose_vectors_ss_complex_single
#endif
contains
......@@ -158,7 +172,11 @@ module elpa1_compute
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#define SKEW_SYMMETRIC_BUILD
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC_BUILD
#include "elpa_reduce_add_vectors.F90"
#undef DOUBLE_PRECISION
#undef REALCASE
......@@ -170,6 +188,9 @@ module elpa1_compute
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#define SKEW_SYMMETRIC_BUILD
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC_BUILD
#include "elpa_reduce_add_vectors.F90"
#undef SINGLE_PRECISION
#undef REALCASE
......@@ -181,6 +202,9 @@ module elpa1_compute
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#define SKEW_SYMMETRIC_BUILD
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC_BUILD
#include "elpa_reduce_add_vectors.F90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
......@@ -191,6 +215,9 @@ module elpa1_compute
#define SINGLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#define SKEW_SYMMETRIC_BUILD
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC_BUILD
#include "elpa_reduce_add_vectors.F90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
......
......@@ -78,7 +78,11 @@ function elpa_solve_evp_&
MATH_DATATYPE(kind=rck), optional,target,intent(out) :: q(obj%local_nrows,*)
#else
MATH_DATATYPE(kind=rck), intent(inout) :: a(obj%local_nrows,obj%local_ncols)
MATH_DATATYPE(kind=rck), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
#ifdef HAVE_SKEWSYMMETRIC
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,2*obj%local_ncols)
#else
MATH_DATATYPE(kind=C_DATATYPE_KIND), optional, target, intent(out) :: q(obj%local_nrows,obj%local_ncols)
#endif
#endif
#if REALCASE == 1
......@@ -92,11 +96,15 @@ function elpa_solve_evp_&
complex(kind=C_DATATYPE_KIND), allocatable :: tau(:)
complex(kind=C_DATATYPE_KIND), allocatable,target :: q_dummy(:,:)
complex(kind=C_DATATYPE_KIND), pointer :: q_actual(:,:)
#endif /* COMPLEXCASE */
integer(kind=c_int) :: l_cols, l_rows, l_cols_nev, np_rows, np_cols
integer(kind=MPI_KIND) :: np_rowsMPI, np_colsMPI
#endif /* COMPLEXCASE */
logical :: useGPU
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
logical :: success
logical :: do_useGPU, do_useGPU_tridiag, &
......@@ -115,7 +123,8 @@ function elpa_solve_evp_&
logical :: do_tridiag, do_solve, do_trans_ev
integer(kind=ik) :: nrThreads
integer(kind=ik) :: global_index
call obj%timer%start("elpa_solve_evp_&
&MATH_DATATYPE&
&_1stage_&
......@@ -206,7 +215,15 @@ function elpa_solve_evp_&
else
useGPU = .false.
endif
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
call obj%timer%start("mpi_communication")
call mpi_comm_rank(int(mpi_comm_rows,kind=MPI_KIND), my_prowMPI, mpierr)
......@@ -215,13 +232,11 @@ function elpa_solve_evp_&
my_prow = int(my_prowMPI,kind=c_int)
my_pcol = int(my_pcolMPI,kind=c_int)
#if COMPLEXCASE == 1
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsmPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsmPI, mpierr)
call mpi_comm_size(int(mpi_comm_rows,kind=MPI_KIND), np_rowsMPI, mpierr)
call mpi_comm_size(int(mpi_comm_cols,kind=MPI_KIND), np_colsMPI, mpierr)
np_rows = int(np_rowsMPI,kind=c_int)
np_cols = int(np_colsMPI,kind=c_int)
#endif
call obj%timer%stop("mpi_communication")
......@@ -418,6 +433,30 @@ function elpa_solve_evp_&
#if COMPLEXCASE == 1
q(1:l_rows,1:l_cols_nev) = q_real(1:l_rows,1:l_cols_nev)
#endif
if (isSkewsymmetric) then
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0
do i = 1, obj%local_nrows
! global_index = indxl2g(i, nblk, my_prow, 0, np_rows)
global_index = np_rows*nblk*((i-1)/nblk) + MOD(i-1,nblk) + MOD(np_rows+my_prow-0, np_rows)*nblk + 1
if (mod(global_index-1,4) .eq. 0) then
! do nothing
end if
if (mod(global_index-1,4) .eq. 1) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
end if
if (mod(global_index-1,4) .eq. 2) then
q(i,1:obj%local_ncols) = -q(i,1:obj%local_ncols)
end if
if (mod(global_index-1,4) .eq. 3) then
q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
end if
end do
endif
call obj%timer%start("back")
#ifdef HAVE_LIKWID
......@@ -427,11 +466,22 @@ function elpa_solve_evp_&
call nvtxRangePush("trans_ev")
#endif
! In the skew-symmetric case this transforms the real part
call trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, nev, a, lda, tau, q, ldq, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
if (isSkewsymmetric) then
! Transform imaginary part
! Transformation of real and imaginary part could also be one call of trans_ev_tridi acting on the n x 2n matrix.
call trans_ev_&
&MATH_DATATYPE&
&_&
&PRECISION&
& (obj, na, nev, a, lda, tau, q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols), ldq, nblk, matrixCols, &
mpi_comm_rows, mpi_comm_cols, do_useGPU_trans_ev)
endif
#ifdef WITH_NVTX
call nvtxRangePop()
......
......@@ -110,6 +110,8 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
class(elpa_abstract_impl_t), intent(inout) :: obj
integer(kind=ik), intent(in) :: na, lda, nblk, matrixCols, mpi_comm_rows, mpi_comm_cols
logical, intent(in) :: useGPU, wantDebug
integer(kind=c_int) :: skewsymmetric
logical :: isSkewsymmetric
MATH_DATATYPE(kind=rck), intent(out) :: tau(na)
#ifdef USE_ASSUMED_SIZE
......@@ -181,6 +183,13 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
&PRECISION&
&_&
&MATH_DATATYPE
call obj%get("is_skewsymmetric",skewsymmetric,istat)
if (istat .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
......@@ -538,10 +547,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
v_row(l_row_beg), 1_BLAS_KIND, ONE, uc_p(l_col_beg,my_thread), 1_BLAS_KIND)
if (i/=j) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
if (isSkewsymmetric) then
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
else
call PRECISION_GEMV('N', int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), v_col(l_col_beg), 1_BLAS_KIND, &
ONE, ur_p(l_row_beg,my_thread), 1_BLAS_KIND)
endif
endif
if (wantDebug) call obj%timer%stop("blas")
endif
......@@ -560,11 +575,16 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
ONE, u_col(l_col_beg), 1_BLAS_KIND)
if (i/=j) then
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, &
ONE, u_row(l_row_beg), 1_BLAS_KIND)
if (isSkewsymmetric) then
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
-ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
else
call PRECISION_GEMV('N',int(l_row_end-l_row_beg+1,kind=BLAS_KIND), int(l_col_end-l_col_beg+1,kind=BLAS_KIND), &
ONE, a_mat(l_row_beg,l_col_beg), int(lda,kind=BLAS_KIND), &
v_col(l_col_beg), 1_BLAS_KIND, ONE, u_row(l_row_beg), 1_BLAS_KIND)
endif
endif
if (wantDebug) call obj%timer%stop("blas")
endif ! not useGPU
......@@ -626,11 +646,17 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
a_offset = ((l_row_beg-1) + (l_col_beg - 1) * lda) * &
size_of_datatype
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
if (isSkewsymmetric) then
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
else
call cublas_PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_dev + a_offset, lda, &
v_col_dev + (l_col_beg - 1) * size_of_datatype,1, &
ONE, u_row_dev + (l_row_beg - 1) * size_of_datatype, 1)
endif
enddo
end if !multiplication as one block / per stripes
......@@ -710,13 +736,21 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
u_col = tmp
#endif /* WITH_MPI */
endif
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
if (isSkewsymmetric) then
call elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
else
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
(obj, u_col, ubound(u_col,dim=1), mpi_comm_cols, u_row, ubound(u_row,dim=1), &
mpi_comm_rows, 1, istep-1, 1, nblk, max_threads)
endif
! calculate u**T * v (same as v**T * (A + VU**T + UV**T) * v )
x = 0
......@@ -843,7 +877,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
+ dot_product(vu_stored_rows(l_rows,1:2*n_stored_vecs),uv_stored_cols(l_cols,1:2*n_stored_vecs))
end if
#if REALCASE == 1
d_vec(istep-1) = a_mat(l_rows,l_cols)
if (isSkewsymmetric) then
d_vec(istep-1) = 0.0_rk
else
d_vec(istep-1) = a_mat(l_rows,l_cols)
endif
#endif
#if COMPLEXCASE == 1
d_vec(istep-1) = real(a_mat(l_rows,l_cols),kind=rk)
......@@ -937,7 +975,11 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
successCUDA = cuda_memcpy(int(loc(d_vec(1)),kind=c_intptr_t), a_dev, 1 * size_of_datatype, cudaMemcpyDeviceToHost)
check_memcpy_cuda("tridiag: a_dev 8", successCUDA)
else !useGPU
d_vec(1) = a_mat(1,1)
if (isSkewsymmetric) then
d_vec(1) = 0.0_rk
else
d_vec(1) = a_mat(1,1)
endif
endif !useGPU
endif
#endif
......
......@@ -50,7 +50,15 @@
#include "config-f90.h"
#include "../general/sanity.F90"
subroutine elpa_transpose_vectors_&
#undef ROUTINE_NAME
#ifdef SKEW_SYMMETRIC_BUILD
#define ROUTINE_NAME elpa_transpose_vectors_ss_
#else
#define ROUTINE_NAME elpa_transpose_vectors_
#endif
subroutine ROUTINE_NAME&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -98,7 +106,7 @@ subroutine elpa_transpose_vectors_&
integer(kind=ik) :: auxstride
integer(kind=ik), intent(in) :: nrThreads
call obj%timer%start("elpa_transpose_vectors_&
call obj%timer%start("ROUTINE_NAME&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
......@@ -197,7 +205,11 @@ subroutine elpa_transpose_vectors_&
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
#ifdef SKEW_SYMMETRIC_BUILD
vmat_t(ns+1:ns+nl,lc) = - aux(k+1:k+nl)
#else
vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl)
#endif
! k = k+nblk
enddo
enddo
......@@ -210,7 +222,7 @@ subroutine elpa_transpose_vectors_&
#endif
deallocate(aux)
call obj%timer%stop("elpa_transpose_vectors_&
call obj%timer%stop("ROUTINE_NAME&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
......