Commit 3eb37776 authored by Pavel Kus's avatar Pavel Kus

integration of C implementation of Cannon algorithm to ELPA

this commit does not work, the generalized EVP problems fail from 2
- Cannons algorithm does not work for the processor grid shape we use at
  the moment in tests (It only works if num. of process columns is a
  multiple of num. of process rows, which is not true in ELPA for 2 mpi
- Cannons algorithm does not work as integrated to ELPA for larger
  matrices (not clear why at the moment)

There are 2 new tests, which should work
- test_cannon.c: it tests the new algorithm without going through
  Fortran, as it has been delivered
- test_c_bindings: it tests transfering a 2D fortran matrix to C and
  back, as it is done with the cannons algorithm in the normal ELPA
parent c768a38c
......@@ -60,6 +60,9 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/qr/elpa_pdgeqrf.F90 \
src/elpa1/elpa1.F90 \
src/elpa2/elpa2.F90 \
src/elpa_generalized/cannon_forw_real_double.c \
src/elpa_generalized/cannon_back_real_double.c \
src/elpa_generalized/test_c_bindings.c \
src/helpers/matrix_plot.F90 \
......@@ -562,6 +565,8 @@ if ENABLE_LEGACY
noinst_PROGRAMS += double_instance@SUFFIX@
check_SCRIPTS +=
double_instance@SUFFIX@_SOURCES = test/Fortran/elpa2/double_instance.F90
......@@ -596,6 +601,19 @@ single_complex_2stage_banded@SUFFIX@_LDADD = $(test_program_ldadd)
single_complex_2stage_banded@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules
noinst_PROGRAMS += test_c_bindings@SUFFIX@
check_SCRIPTS +=
test_c_bindings@SUFFIX@_SOURCES = test/Fortran/elpa_generalized/test_bindings.F90
test_c_bindings@SUFFIX@_LDADD = $(test_program_ldadd) $(FCLIBS)
test_c_bindings@SUFFIX@_FCFLAGS = $(AM_FCFLAGS) $(FC_MODINC)test_modules $(FC_MODINC)modules
noinst_PROGRAMS += test_c_cannon@SUFFIX@
check_SCRIPTS +=
test_c_cannon@SUFFIX@_SOURCES = test/C/elpa_generalized/test_cannon.c
test_c_cannon@SUFFIX@_LDADD = $(test_program_ldadd) $(FCLIBS)
test_c_cannon@SUFFIX@_CFLAGS = $(test_program_cflags)
# test scripts
TASKS ?= 2
......@@ -37,7 +37,7 @@ test/shared/generated.h: $(wildcard $(top_srcdir)/test/shared/*.*90) | test/shar
$(call extract_interface,!c>)
generated_headers += src/elpa_generated_fortran_interfaces.h
src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/elpa2/kernels/*.*c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch])) | src
src/elpa_generated_fortran_interfaces.h: $(filter-out $(wildcard $(top_srcdir)/src/*generated*), $(wildcard $(top_srcdir)/src/elpa2/kernels/*.c $(top_srcdir)/src/elpa2/kernels/*.s $(top_srcdir)/src/*.[ch] $(top_srcdir)/src/elpa_generalized/*.[ch])) | src
@rm -f $@
$(call extract_interface,!f>)
$(call extract_interface,#!f>)
This diff is collapsed.
This diff is collapsed.
#include "config-f90.h"
#include <stdio.h>
#include <stdlib.h>
#ifdef WITH_MPI
#include <mpi.h>
#include <math.h>
//#include <elpa/elpa.h>
//#include <elpa/elpa_generated.h>
//#include <elpa/elpa_constants.h>
//#include <elpa/elpa_generated_legacy.h>
//#include <elpa/elpa_generic.h>
//#include <elpa/elpa_legacy.h>
void pdlacpy_(char*, int*, int*, double*, int*, int*, int*, double*, int*, int*, int*);
void dlacpy_(char*, int*, int*, double*, int*, double*, int*);
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
void pdtran_(int*, int*, double*, double*, int*, int*, int*, double*, double*, int*, int*, int*);
//void pdelset_(double*, int*, int*, int*, double*);
//void pdsymm_(char*, char*, int*, int*, double*, double*, int*, int*, int*, double*, int*, int*, int*, double*, double*, int*, int*, int*);
//void pdpotrf_(char*, int*, double*, int*, int*, int*, int*);
//void pdsyngst_(int*, char*, int*, double*, int*, int*, int*, double*, int*, int*, int*, double*, double*, int*, int*);
//void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*);
int numroc_(int*, int*, int*, int*, int*);
//void set_up_blacsgrid_f1(int, int*, int*, int*, int*, int*, int*, int*);
//void pdtrtrs_(char*, char*, char*, int*, int*, double*, int*, int*, int*, double*, int*, int*, int*, int*);
//void pdsyevr_(char*, char*, char*, int*, double*, int*, int*, int*, int*, int*, int*, int*, int*, int*, double*, double*, int*, int*, int*, double*, int*, int*, int*, int*);
//////////////////////////////////////////////////////////////////////////////////////////// My function for reduction //////////////////////////////////////////////////////////
void d_test_c_bindings(double* A, int np_rows, int np_cols, int my_prow, int my_pcol, int* a_desc,
double *Res, MPI_Comm row_comm, MPI_Comm col_comm)
int na, nblk, i, j, Size_send_A, Size_receive_A, Size_send_U, Size_receive_U, Buf_rows, Buf_cols, where_to_send_A, from_where_to_receive_A, where_to_send_U, from_where_to_receive_U, last_proc_row, last_proc_col, cols_in_buffer_A, rows_in_buffer_A, intNumber;
double *Buf_to_send_A, *Buf_to_receive_A, *Buf_to_send_U, *Buf_to_receive_U, *double_ptr, *Buf_A, *Buf_pos, *U_local_start, *Res_ptr, *M, *M_T, *A_local_start, *U_local_start_curr, *U_stored, *CopyTo, *CopyFrom, *U_to_calc;
int ratio, num_of_iters, cols_in_buffer, rows_in_block, rows_in_buffer, curr_col_loc, cols_in_block, curr_col_glob, curr_row_loc, Size_receive_A_now, Nb, owner, cols_in_buffer_A_now;
int row_of_origin_U, rows_in_block_U, num_of_blocks_in_U_buffer, k, startPos, cols_in_buffer_U, rows_in_buffer_U, col_of_origin_A, curr_row_loc_res, curr_row_loc_A, curr_col_glob_res;
int curr_col_loc_res, curr_col_loc_buf, proc_row_curr, curr_col_loc_U, A_local_index, LDA_A, LDA_A_new, index_row_A_for_LDA, ii, rows_in_block_U_curr, width, row_origin_U, rows_in_block_A, cols_in_buffer_A_my_initial, rows_in_buffer_A_my_initial, proc_col_min;
int *SizesU;
int Size_U_skewed, Size_U_stored, Curr_pos_in_U_stored, rows_in_buffer_A_now;
double done = 1.0;
double dzero = 0.0;
int one = 1;
int zero = 0;
int na_rows, na_cols;
MPI_Status status;
MPI_Request request_A_Recv;
MPI_Request request_A_Send;
MPI_Request request_U_Recv;
MPI_Request request_U_Send;
na = a_desc[2];
nblk = a_desc[4];
na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
for (int i = 0; i < na_rows * na_cols; i++)
Res[i] = A[i] + 2;
!f> interface
!f> subroutine test_c_bindings(A, local_rows, local_cols, np_rows, np_cols, my_prow, my_pcol, a_desc, &
!f> Res, row_comm, col_comm) &
!f> bind(C, name="d_test_c_bindings_c")
!f> use, intrinsic :: iso_c_binding
!f> real(c_double) :: A(local_rows, local_cols), Res(local_rows, local_cols)
!f> !type(c_ptr), value :: A, Res
!f> integer(kind=c_int) :: a_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: np_rows, np_cols, my_prow, my_pcol, row_comm, col_comm
!f> end subroutine
!f> end interface
void d_test_c_bindings_c(double* A, int local_rows, int local_cols, int np_rows, int np_cols, int my_prow, int my_pcol, int* a_desc,
double *Res, int row_comm, int col_comm)
//printf("%d, %d, %d, %d, %lf, %lf, %lf, %lf, com: %d, %d\n", np_rows, np_cols, my_prow, my_pcol, A[0], A[1], U[0], U[1], row_comm, col_comm);
MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm);
//int c_my_prow, c_my_pcol;
//MPI_Comm_rank(c_row_comm, &c_my_prow);
//MPI_Comm_rank(c_col_comm, &c_my_pcol);
//printf("FORT<->C row: %d<->%d, col: %d<->%d\n", my_prow, c_my_prow, my_pcol, c_my_pcol);
// in the cannons algorithm, column and row communicators are exchanged
// What we usually call row_comm in elpa, is thus passed to col_comm parameter of the function and vice versa
// (order is swapped in the following call)
// It is a bit unfortunate, maybe it should be changed in the Cannon algorithm to comply with ELPA standard notation?
d_test_c_bindings(A, np_rows, np_cols, my_prow, my_pcol, a_desc, Res, c_col_comm, c_row_comm);
! using elpa internal Hermitian multiply is faster then scalapack multiply, but we need an extra
! temporary matrix.
! using cannon algorithm should be the fastest. After this is verified, the other options should be removed
! however, we need the extra temporary matrix as well.
#if defined(REALCASE) && defined(DOUBLE_PRECISION)
!TODO first just for real double...
subroutine elpa_transform_generalized_&
&(self, a, b, is_already_decomposed, error)
......@@ -12,12 +31,10 @@
integer :: error
logical :: is_already_decomposed
integer :: sc_desc(SC_DESC_LEN)
integer(kind=ik) :: my_p, my_prow, my_pcol, np_rows, np_cols, mpierr, mpi_comm_rows, mpi_comm_cols, mpi_comm_all
integer(kind=ik) :: BuffLevelInt
! using elpa internal Hermitian multiply is faster then scalapack multiply, but we need an extra
! temporary variable. Therefore both options are provided and at the moment controled by this switch
MATH_DATATYPE(kind=rck) :: tmp(self%local_nrows, self%local_ncols)
......@@ -39,7 +56,7 @@
if(error .NE. ELPA_OK) return
end if
! tmp <- inv(U^T) * A (we have to use temporary variable)
call self%elpa_hermitian_multiply_&
......@@ -49,7 +66,8 @@
! A <- inv(U)^T * A
a(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols)
! A <- inv(U)^T * A (using scalapack, we can directly update A)
call self%timer_start("scalapack multiply inv(U)^T * A")
#ifdef WITH_MPI
......@@ -64,8 +82,9 @@
call self%timer_stop("scalapack multiply inv(U)^T * A")
! A <- inv(U)^T * A * inv(U)
! For this multiplication we do not have internal function in ELPA,
! so we have to call scalapack anyway
......@@ -81,6 +100,29 @@
ONE, b, self%na, a, self%na)
call self%timer_stop("scalapack multiply A * inv(U)")
!TODO set the value properly
!TODO tunable parameter?
BuffLevelInt = 1
call self%get("mpi_comm_rows",mpi_comm_rows,error)
call self%get("mpi_comm_cols",mpi_comm_cols,error)
call self%get("mpi_comm_parent", mpi_comm_all,error)
call mpi_comm_rank(mpi_comm_rows,my_prow,mpierr)
call mpi_comm_size(mpi_comm_rows,np_rows,mpierr)
call mpi_comm_rank(mpi_comm_cols,my_pcol,mpierr)
call mpi_comm_size(mpi_comm_cols,np_cols,mpierr)
call mpi_comm_rank(mpi_comm_all,my_p,mpierr)
call cannons_reduction(a, b, self%local_nrows, self%local_ncols, np_rows, np_cols, my_prow, my_pcol, &
sc_desc, tmp, BuffLevelInt, mpi_comm_rows, mpi_comm_cols)
a(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols)
write(*, *) my_prow, my_pcol, "A(2,3)", a(2,3)
call self%timer_stop("transform_generalized()")
end subroutine
This diff is collapsed.
#include "config-f90.h"
#include "../assert.h"
program test_bindings
use elpa
use test_util
use test_setup_mpi
! use test_prepare_matrix
use test_read_input_parameters
use test_blacs_infrastructure
! use test_check_correctness
! use test_analytic
! use test_scalapack
implicit none
#include "src/elpa_generated_fortran_interfaces.h"
! matrix dimensions
integer :: na, nev, nblk
! mpi
integer :: myid, nprocs
integer :: na_cols, na_rows ! local matrix size
integer :: np_cols, np_rows ! number of MPI processes per column/row
integer :: my_prow, my_pcol ! local MPI task position (my_prow, my_pcol) in the grid (0..np_cols -1, 0..np_rows -1)
integer :: mpierr, mpi_comm_rows, mpi_comm_cols
type(output_t) :: write_to_file
! blacs
integer :: my_blacs_ctxt, sc_desc(9), info, nprow, npcol, i, j
character(len=1) :: layout
! The Matrix
real(kind=C_DOUBLE) , allocatable :: a(:,:), res(:,:)
logical :: skip_check_correctness
class(elpa_t), pointer :: e
integer :: error, status
call read_input_parameters_traditional(na, nev, nblk, write_to_file, skip_check_correctness)
call setup_mpi(myid, nprocs)
#ifdef WITH_MPI
!call redirect_stdout(myid)
if (elpa_init(CURRENT_API_VERSION) /= ELPA_OK) then
print *, "ELPA API version not supported"
stop 1
layout = 'C'
do np_cols = NINT(SQRT(REAL(nprocs))),2,-1
if(mod(nprocs,np_cols) == 0 ) exit
np_rows = nprocs/np_cols
assert(nprocs == np_rows * np_cols)
if (myid == 0) then
print '((a,i0))', 'Matrix size: ', na
print '((a,i0))', 'Num eigenvectors: ', nev
print '((a,i0))', 'Blocksize: ', nblk
#ifdef WITH_MPI
print '((a,i0))', 'Num MPI proc: ', nprocs
print '(3(a,i0))','Number of processor rows=',np_rows,', cols=',np_cols,', total=',nprocs
print '(a)', 'Process layout: ' // layout
print *,''
call set_up_blacsgrid(mpi_comm_world, np_rows, np_cols, layout, &
my_blacs_ctxt, my_prow, my_pcol)
call set_up_blacs_descriptor(na, nblk, my_prow, my_pcol, np_rows, np_cols, &
na_rows, na_cols, sc_desc, my_blacs_ctxt, info)
allocate(a (na_rows,na_cols))
e => elpa_allocate()
if(myid .ne. 0) then
call e%set("suppress_warnings", 1, error)
call e%set("na", na, error)
call e%set("nev", nev, error)
call e%set("local_nrows", na_rows, error)
call e%set("local_ncols", na_cols, error)
call e%set("nblk", nblk, error)
#ifdef WITH_MPI
call e%set("mpi_comm_parent", MPI_COMM_WORLD, error)
call e%set("process_row", my_prow, error)
call e%set("process_col", my_pcol, error)
call e%get("mpi_comm_rows",mpi_comm_rows,error)
call e%get("mpi_comm_cols",mpi_comm_cols,error)
a(:,:) = 1.0
res(:,:) = 0.0
call test_c_bindings(a, na_rows, na_cols, np_rows, np_cols, my_prow, my_pcol, sc_desc, res, mpi_comm_rows, mpi_comm_cols)
status = 0
do i = 1, na_rows
do j = 1, na_cols
if(a(i,j) .ne. 1.0) then
write(*,*) i, j, ": wrong value of A: ", a(i,j), ", should be 1"
status = 1
if(res(i,j) .ne. 3.0) then
write(*,*) i, j, ": wrong value of res: ", res(i,j), ", should be 3"
status = 1
call check_status(status, myid)
call elpa_deallocate(e)
call elpa_uninit()
#ifdef WITH_MPI
call blacs_gridexit(my_blacs_ctxt)
call mpi_finalize(mpierr)
call exit(status)
subroutine check_status(status, myid)
implicit none
integer, intent(in) :: status, myid
integer :: mpierr
if (status /= 0) then
if (myid == 0) print *, "Result incorrect!"
#ifdef WITH_MPI
call mpi_finalize(mpierr)
call exit(status)
end subroutine
end program
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment