Commit 4d6f8a4f authored by Andreas Marek's avatar Andreas Marek
Browse files

WIP: start to templatize skew symm code; Not yet compilable

parent b634f55e
......@@ -63,8 +63,8 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa_generalized/cannon.c \
#src/elpa_generalized/test_c_bindings.c \
src/helpers/matrix_plot.F90 \
src/general/dssmv.F90 \
src/general/dssr2.F90 \
src/general/ssmv_template.F90 \
src/general/ssr2_template.F90 \
src/elpa_index.c
libelpa@SUFFIX@_private_la_SOURCES += src/elpa_c_interface.c
......@@ -103,7 +103,6 @@ EXTRA_libelpa@SUFFIX@_private_la_DEPENDENCIES = \
src/elpa2/qr/elpa_pdgeqrf_template.F90 \
src/elpa2/elpa2_bandred_template.F90 \
src/elpa2/elpa2_symm_matrix_allreduce_real_template.F90 \
src/elpa2/elpa2_ssymm_matrix_allreduce_real_template.F90 \
src/elpa2/elpa2_trans_ev_band_to_full_template.F90 \
src/elpa2/elpa2_tridiag_band_template.F90 \
src/elpa2/elpa2_trans_ev_tridi_to_band_template.F90 \
......@@ -706,7 +705,6 @@ EXTRA_DIST = \
src/elpa1/elpa_reduce_add_vectors.F90 \
src/elpa1/elpa_solve_tridi_impl_public.F90 \
src/elpa1/elpa_transpose_vectors.F90 \
src/elpa1/elpa_transpose_vectors_ss.F90 \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_complex_template.cu \
src/elpa2/GPU/ev_tridi_band_gpu_c_v2_real_template.cu \
src/elpa2/compute_hh_trafo.F90 \
......
......@@ -172,8 +172,11 @@ module elpa1_compute
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#define SKEW_SYMMETRIC
#include "elpa_transpose_vectors.F90"
#include "elpa_transpose_vectors_ss.F90"
#undef SKEW_SYMMETRIC
#include "elpa_reduce_add_vectors.F90"
#undef DOUBLE_PRECISION
#undef REALCASE
......@@ -185,7 +188,9 @@ module elpa1_compute
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#include "elpa_transpose_vectors_ss.F90"
#define SKEW_SYMMETRIC
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC
#include "elpa_reduce_add_vectors.F90"
#undef SINGLE_PRECISION
#undef REALCASE
......@@ -197,7 +202,9 @@ module elpa1_compute
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#include "elpa_transpose_vectors_ss.F90"
#define SKEW_SYMMETRIC
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC
#include "elpa_reduce_add_vectors.F90"
#undef COMPLEXCASE
#undef DOUBLE_PRECISION
......@@ -208,7 +215,9 @@ module elpa1_compute
#define SINGLE_PRECISION 1
#include "../general/precision_macros.h"
#include "elpa_transpose_vectors.F90"
#include "elpa_transpose_vectors_ss.F90"
#define SKEW_SYMMETRIC
#include "elpa_transpose_vectors.F90"
#undef SKEW_SYMMETRIC
#include "elpa_reduce_add_vectors.F90"
#undef COMPLEXCASE
#undef SINGLE_PRECISION
......
......@@ -185,9 +185,9 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (istat .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
......@@ -538,12 +538,10 @@ call prmat(na,useGpu,a_mat,a_dev,lda,matrixCols,nblk,my_prow,my_pcol,np_rows,np_
if (isSkewsymmetric) then
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
-ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1, &
ONE, ur_p(l_row_beg,my_thread), 1)
else
call PRECISION_GEMV('N', l_row_end-l_row_beg+1, l_col_end-l_col_beg+1, &
ONE, a_mat(l_row_beg,l_col_beg), lda, v_col(l_col_beg), 1, &
ONE, ur_p(l_row_beg,my_thread), 1)
endif
endif
......
......@@ -50,7 +50,14 @@
#include "config-f90.h"
#include "../general/sanity.F90"
subroutine elpa_transpose_vectors_&
#ifdef SKEW_SYMMETRIC
#define ROUTINE_NAME elpa_transpose_vectors_ss_
#else
#define ROUTINE_NAME elpa_transpose_vectors_
#endif
subroutine ROUTINE_NAME&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -96,7 +103,7 @@ subroutine elpa_transpose_vectors_&
integer(kind=ik) :: auxstride
integer(kind=ik), intent(in) :: nrThreads
call obj%timer%start("elpa_transpose_vectors_&
call obj%timer%start("ROUTINE_NAME&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
......@@ -190,7 +197,11 @@ subroutine elpa_transpose_vectors_&
k = (i - nblks_skip - n)/lcm_s_t * nblk + (lc - 1) * auxstride
ns = (i/npt)*nblk ! local start of block i
nl = min(nvr-i*nblk,nblk) ! length
#ifdef SKEW_SYMMETRIC
vmat_t(ns+1:ns+nl,lc) = - aux(k+1:k+nl)
#else
vmat_t(ns+1:ns+nl,lc) = aux(k+1:k+nl)
#endif
! k = k+nblk
enddo
enddo
......@@ -203,7 +214,7 @@ subroutine elpa_transpose_vectors_&
#endif
deallocate(aux)
call obj%timer%stop("elpa_transpose_vectors_&
call obj%timer%stop("ROUTINE_NAME&
&MATH_DATATYPE&
&" // &
&PRECISION_SUFFIX &
......
......@@ -139,7 +139,7 @@
integer(kind=ik) :: istep, ncol, lch, lcx, nlc
integer(kind=ik) :: tile_size, l_rows_tile, l_cols_tile
real(kind=rk) :: vnorm2
real(kind=rk) :: vnorm2
MATH_DATATYPE(kind=rck) :: xf, aux1(nbw), aux2(nbw), vrl, tau, vav(nbw,nbw)
! complex(kind=COMPLEX_DATATYPE), allocatable :: tmpCUDA(:,:), vmrCUDA(:,:), umcCUDA(:,:) ! note the different dimension in real case
......@@ -194,9 +194,9 @@
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
gpuString = "_gpu"
else
......@@ -1183,11 +1183,11 @@
if (i==0) cycle
lre = min(l_rows,i*l_rows_tile)
call obj%timer%start("blas")
if (isSkewsymmetric) then
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, -ONE, a_mat(1,lcs), lda, &
umcCPU(lcs,n_cols+1), ubound(umcCPU,dim=1), &
ONE, &
ONE, &
vmrCPU(1,n_cols+1), ubound(vmrCPU,dim=1))
else
call PRECISION_GEMM('N', 'N', lre, n_cols, lce-lcs+1, ONE, a_mat(1,lcs), lda, &
......@@ -1465,7 +1465,7 @@
! Transpose umc -> umr (stored in vmr, second half)
if (isSkewsymmetric) then
call elpa_transpose_vectors_ss_&
call elpa_transpose_vectors_ss_&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -1473,7 +1473,7 @@
vmrCUDA(cur_l_rows * n_cols + 1), cur_l_rows, mpi_comm_rows, &
1, istep*nbw, n_cols, nblk, max_threads)
else
call elpa_transpose_vectors_&
call elpa_transpose_vectors_&
&MATH_DATATYPE&
&_&
&PRECISION &
......@@ -1516,7 +1516,7 @@
umcCPU(1,n_cols+1), ubound(umcCPU,dim=1), vav, &
ubound(vav,dim=1), ONE, umcCPU, ubound(umcCPU,dim=1))
endif
#endif
#endif
#if COMPLEXCASE == 1
call PRECISION_GEMM('N', 'N', l_cols, n_cols, n_cols, &
(-0.5_rk, 0.0_rk), &
......
......@@ -59,9 +59,12 @@
#undef COMPLEXCASE
#include "elpa2_bandred_template.F90"
#define REALCASE 1
#undef SKEW_SYMMETRIC
#include "elpa2_symm_matrix_allreduce_real_template.F90"
#if SKEWSYMMETRIC == 1
#include "elpa2_ssymm_matrix_allreduce_real_template.F90"
#define SKEW_SYMMETRIC
#include "elpa2_symm_matrix_allreduce_real_template.F90"
#undef SKEW_SYMMETRIC
#endif
#undef REALCASE
#define REALCASE 1
......
......@@ -52,7 +52,18 @@
#include "../general/sanity.F90"
#ifdef SKEW_SYMMETRIC
#define ROUTINE_NAME ssymm_matrix_allreduce
#else
#define ROUTINE_NAME symm_matrix_allreduce
#endif
#ifdef SKEW_SYMMETRIC
subroutine ssymm_matrix_allreduce_&
#else
subroutine symm_matrix_allreduce_&
#endif
&PRECISION &
(obj, n, a, lda, ldb, comm)
!-------------------------------------------------------------------------------
......@@ -73,7 +84,7 @@
integer(kind=ik) :: i, nc, mpierr
real(kind=REAL_DATATYPE) :: h1(n*n), h2(n*n)
call obj%timer%start("symm_matrix_allreduce" // PRECISION_SUFFIX)
call obj%timer%start("ROUTINE_NAME" // PRECISION_SUFFIX)
nc = 0
do i=1,n
......@@ -88,7 +99,11 @@
nc = 0
do i=1,n
a(1:i,i) = h2(nc+1:nc+i)
#ifdef SKEW_SYMMETRIC
a(i,1:i-1) = - a(1:i-1,i)
#else
a(i,1:i-1) = a(1:i-1,i)
#endif
nc = nc+i
enddo
......@@ -98,7 +113,11 @@
nc = 0
do i=1,n
a(1:i,i) = h1(nc+1:nc+i)
#ifdef SKEW_SYMMETRIC
a(i,1:i-1) = - a(1:i-1,i)
#else
a(i,1:i-1) = a(1:i-1,i)
#endif
nc = nc+i
enddo
......@@ -110,9 +129,13 @@
! nc = nc+i
! enddo
call obj%timer%stop("symm_matrix_allreduce" // PRECISION_SUFFIX)
call obj%timer%stop("ROUTINE_NAME" // PRECISION_SUFFIX)
#ifdef SKEW_SYMMETRIC
end subroutine ssymm_matrix_allreduce_&
#else
end subroutine symm_matrix_allreduce_&
#endif
&PRECISION
......
......@@ -132,7 +132,7 @@
integer(kind=ik) :: nrThreads
! #if SKEWSYMMETRIC ==1
integer(kind=ik) :: global_index
integer(kind=ik) :: global_index
! #endif
#if REALCASE == 1
#undef GPU_KERNEL
......@@ -186,7 +186,7 @@
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("mpi_comm_cols",mpi_comm_cols,error)
call obj%get("mpi_comm_cols",mpi_comm_cols,error
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
......@@ -236,13 +236,13 @@
print *,"Problem getting option. Aborting..."
stop
endif
call obj%get("is_skewsymmetric",skewsymmetric,error)
if (error .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
isSkewsymmetric = (skewsymmetric == 1)
! GPU settings
......@@ -690,10 +690,10 @@
endif
#endif
endif
if (isSkewsymmetric) then
! Extra transformation step for skew-symmetric matrix. Multiplication with diagonal complex matrix D.
! This makes the eigenvectors complex.
! This makes the eigenvectors complex.
! For now real part of eigenvectors is generated in first half of q, imaginary part in second part.
q(1:obj%local_nrows, obj%local_ncols+1:2*obj%local_ncols) = 0.0
do i = 1, obj%local_nrows
......@@ -713,7 +713,7 @@
q(i,obj%local_ncols+1:2*obj%local_ncols) = -q(i,1:obj%local_ncols)
q(i,1:obj%local_ncols) = 0
end if
end do
end do
endif
! print * , "q="
! do i=1,na
......@@ -754,7 +754,7 @@
call obj%timer%stop("trans_ev_to_band")
if (.not.(success)) return
! ! We can now deallocate the stored householder vectors
! deallocate(hh_trans, stat=istat, errmsg=errorMessage)
! if (istat .ne. 0) then
......@@ -798,7 +798,7 @@
call obj%timer%start("trans_ev_to_full")
if ( (do_useGPU_trans_ev_band_to_full) .and. .not.(do_useGPU_trans_ev_tridi_to_band) ) then
! copy to device if we want to continue on GPU
successCUDA = cuda_malloc(q_dev, ldq*matrixCols*size_of_datatype)
! if (.not.(successCUDA)) then
! print *,"elpa2_template, error in cuda_malloc"
......
......@@ -142,7 +142,7 @@
if (istat .ne. ELPA_OK) then
print *,"Problem getting option. Aborting..."
stop
endif
endif
isSkewsymmetric = (skewsymmetric == 1)
if(useGPU) then
......@@ -653,7 +653,7 @@
endif
if (wantDebug) call obj%timer%start("blas")
#if REALCASE == 1
if (isSkewsymmetric) then
if (isSkewsymmetric) then
! call PRECISION_SSR2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
call dssr2('L', nc, -ONE, hd, 1, hv, 1, ab(1,ns), 2*nb-1)
else
......
SUBROUTINE DSSMV( UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY )
subroutine BLAS_SUFFIX&
ssmv(UPLO, N, ALPHA, A, LDA, X, INCX, BETA, Y, INCY )
!
use precision
use iso_c_binding
IMPLICIT NONE
!
! .. Scalar Arguments ..
CHARACTER UPLO
INTEGER N, LDA, INCX, INCY
DOUBLE PRECISION ALPHA, BETA
CHARACTER :: UPLO
INTEGER(kind=c_int) :: N, LDA, INCX, INCY
MATH_DATATYPE(kind=rck) :: ALPHA, BETA
! ..
! .. Array Arguments ..
DOUBLE PRECISION A( LDA, * ), X( * ), Y( * )
MATH_DATATYPE(kind=rck) :: A( LDA, * ), X( * ), Y( * )
! ..
!
! Purpose
......@@ -103,15 +106,14 @@
! .. Parameters ..
DOUBLE PRECISION ZERO, ONE
PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 )
INTEGER NB
PARAMETER ( NB = 64 )
INTEGER(kind=c_int), parameter :: NB = 64
! ..
! .. Local Scalars ..
INTEGER II, JJ, IC, IY, JC, JX, KX, KY, INFO
DOUBLE PRECISION TEMP
LOGICAL UPPER
INTEGER(kind=c_int) :: II, JJ, IC, IY, JC, JX, KX, KY, INFO
MATH_DATATYPE(kind=rck) :: TEMP
LOGICAL :: UPPER
! .. Local Arrays ..
DOUBLE PRECISION WORK( NB )
MATH_DATATYPE(kind=rck) :: WORK( NB )
! ..
! .. External Functions ..
LOGICAL LSAME
......@@ -183,10 +185,10 @@
! and DGEMV for off-diagonal blocks.
!
IF ( II .GT. JJ ) THEN
CALL DGEMV( 'T', NB, IC, -ALPHA, A( JJ, II ), LDA, &
CALL PRECISIOM_GEMV( 'T', NB, IC, -ALPHA, A( JJ, II ), LDA, &
& X( JX ), INCX, TEMP, Y( IY ), INCY )
ELSE IF ( II .LT. JJ ) THEN
CALL DGEMV( 'N', NB, JC, ALPHA, A( II, JJ ), LDA, &
CALL PRECISION_GEMV( 'N', NB, JC, ALPHA, A( II, JJ ), LDA, &
& X( JX ), INCX, TEMP, Y( IY ), INCY )
ELSE
CALL DSSMV_SM( UPPER, JC, ALPHA, A( JJ, JJ ), LDA, &
......@@ -213,10 +215,10 @@
! and DGEMV for off-diagonal blocks.
!
IF ( II .LT. JJ ) THEN
CALL DGEMV( 'T', JC, NB, -ALPHA, A( JJ, II ), LDA, &
CALL PRECISION_GEMV( 'T', JC, NB, -ALPHA, A( JJ, II ), LDA, &
& X( JX ), INCX, TEMP, Y( IY ), INCY )
ELSE IF ( II .GT. JJ ) THEN
CALL DGEMV( 'N', IC, NB, ALPHA, A( II, JJ ), LDA, &
CALL PRECISION_GEMV( 'N', IC, NB, ALPHA, A( II, JJ ), LDA, &
& X( JX ), INCX, TEMP, Y( IY ), INCY )
ELSE
CALL DSSMV_SM( UPPER, JC, ALPHA, A( JJ, JJ ), LDA, &
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment