Commit 844a4c7c authored by Pavel Kus

back transformation handled by Cannon's algorithm for all cases

parent 4e4a7074
@@ -60,8 +60,7 @@ libelpa@SUFFIX@_private_la_SOURCES = \
src/elpa2/qr/elpa_pdgeqrf.F90 \
src/elpa1/elpa1.F90 \
src/elpa2/elpa2.F90 \
src/elpa_generalized/cannon_forw.c \
src/elpa_generalized/cannon_back_real_double.c \
src/elpa_generalized/cannon.c \
#src/elpa_generalized/test_c_bindings.c \
src/helpers/matrix_plot.F90 \
src/elpa_index.c
@@ -745,6 +744,7 @@ EXTRA_DIST = \
src/elpa2/qr/qr_utils_template.F90 \
src/elpa2/redist_band.F90 \
src/elpa_generalized/cannon_forw_template.c \
src/elpa_generalized/cannon_back_template.c \
src/elpa_index.h \
src/fortran_constants.h \
src/general/map_global_to_local.F90 \
@@ -8,17 +8,18 @@
#ifdef WITH_MPI
#include <mpi.h>
int numroc_(int*, int*, int*, int*, int*);
//***********************************************************************************************************
#define REALCASE 1
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "cannon_forw_template.c"
#include "cannon_back_template.c"
#undef DOUBLE_PRECISION
#undef REALCASE
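// Illustrative note (not part of the commit): for this instantiation,
// precision_macros.h is expected to map the template names, e.g.
//   math_type                     -> double
//   MPI_MATH_DATATYPE_PRECISION_C -> MPI_DOUBLE
//   C_LACPY / C_GEMM              -> dlacpy_ / dgemm_
//   ..._impl name suffix          -> _d
// so each include of the two templates emits one fully typed instance.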
//***********************************************************************************************************
/*
!f> interface
!f> subroutine cannons_reduction_d(A, U, local_rows, local_cols, a_desc, Res, toStore, row_comm, col_comm) &
@@ -35,14 +36,31 @@ int numroc_(int*, int*, int*, int*, int*);
void cannons_reduction_c_d(double* A, double* U, int local_rows, int local_cols, int* a_desc,
double *Res, int ToStore, int row_comm, int col_comm);
/*
!f> interface
!f> subroutine cannons_triang_rectangular_d(U, B, local_rows, local_cols, u_desc, b_desc, Res, row_comm, col_comm) &
!f> bind(C, name="cannons_triang_rectangular_c_d")
!f> use, intrinsic :: iso_c_binding
!f> real(c_double) :: U(local_rows, local_cols), B(local_rows, local_cols), Res(local_rows, local_cols)
!f> integer(kind=c_int) :: u_desc(9), b_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: row_comm, col_comm
!f> end subroutine
!f> end interface
*/
void cannons_triang_rectangular_c_d(double* U, double* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, double *Res, int row_comm, int col_comm);
//***********************************************************************************************************
#define REALCASE 1
#define SINGLE_PRECISION 1
#include "../general/precision_macros.h"
#include "cannon_forw_template.c"
#include "cannon_back_template.c"
#undef SINGLE_PRECISION
#undef REALCASE
//***********************************************************************************************************
/*
!f> interface
!f> subroutine cannons_reduction_f(A, U, local_rows, local_cols, a_desc, Res, toStore, row_comm, col_comm) &
@@ -59,14 +77,31 @@ void cannons_reduction_c_d(double* A, double* U, int local_rows, int local_cols,
void cannons_reduction_c_f(float* A, float* U, int local_rows, int local_cols, int* a_desc,
float *Res, int ToStore, int row_comm, int col_comm);
/*
!f> interface
!f> subroutine cannons_triang_rectangular_f(U, B, local_rows, local_cols, u_desc, b_desc, Res, row_comm, col_comm) &
!f> bind(C, name="cannons_triang_rectangular_c_f")
!f> use, intrinsic :: iso_c_binding
!f> real(c_float) :: U(local_rows, local_cols), B(local_rows, local_cols), Res(local_rows, local_cols)
!f> integer(kind=c_int) :: u_desc(9), b_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: row_comm, col_comm
!f> end subroutine
!f> end interface
*/
void cannons_triang_rectangular_c_f(float* U, float* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, float *Res, int row_comm, int col_comm);
//***********************************************************************************************************
#define COMPLEXCASE 1
#define DOUBLE_PRECISION 1
#include "../general/precision_macros.h"
#include "cannon_forw_template.c"
#include "cannon_back_template.c"
#undef DOUBLE_PRECISION
#undef COMPLEXCASE
//***********************************************************************************************************
/*
!f> interface
!f> subroutine cannons_reduction_dc(A, U, local_rows, local_cols, a_desc, Res, toStore, row_comm, col_comm) &
@@ -83,14 +118,30 @@ void cannons_reduction_c_f(float* A, float* U, int local_rows, int local_cols, i
void cannons_reduction_c_dc(double complex* A, double complex* U, int local_rows, int local_cols, int* a_desc,
double complex *Res, int ToStore, int row_comm, int col_comm);
/*
!f> interface
!f> subroutine cannons_triang_rectangular_dc(U, B, local_rows, local_cols, u_desc, b_desc, Res, row_comm, col_comm) &
!f> bind(C, name="cannons_triang_rectangular_c_dc")
!f> use, intrinsic :: iso_c_binding
!f> complex(c_double) :: U(local_rows, local_cols), B(local_rows, local_cols), Res(local_rows, local_cols)
!f> integer(kind=c_int) :: u_desc(9), b_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: row_comm, col_comm
!f> end subroutine
!f> end interface
*/
void cannons_triang_rectangular_c_dc(double complex* U, double complex* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, double complex *Res, int row_comm, int col_comm);
//***********************************************************************************************************
#define COMPLEXCASE 1
#define SINGLE_PRECISION 1
#include "../general/precision_macros.h"
#include "cannon_forw_template.c"
#include "cannon_back_template.c"
#undef SINGLE_PRECISION
#undef COMPLEXCASE
//***********************************************************************************************************
/*
!f> interface
!f> subroutine cannons_reduction_fc(A, U, local_rows, local_cols, a_desc, Res, toStore, row_comm, col_comm) &
@@ -107,26 +158,62 @@ void cannons_reduction_c_dc(double complex* A, double complex* U, int local_rows
void cannons_reduction_c_fc(float complex* A, float complex* U, int local_rows, int local_cols, int* a_desc,
float complex *Res, int ToStore, int row_comm, int col_comm);
/*
!f> interface
!f> subroutine cannons_triang_rectangular_fc(U, B, local_rows, local_cols, u_desc, b_desc, Res, row_comm, col_comm) &
!f> bind(C, name="cannons_triang_rectangular_c_fc")
!f> use, intrinsic :: iso_c_binding
!f> complex(c_float) :: U(local_rows, local_cols), B(local_rows, local_cols), Res(local_rows, local_cols)
!f> integer(kind=c_int) :: u_desc(9), b_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: row_comm, col_comm
!f> end subroutine
!f> end interface
*/
void cannons_triang_rectangular_c_fc(float complex* U, float complex* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, float complex *Res, int row_comm, int col_comm);
#else
// These stubs exist only because of the Intel preprocessor.
// TODO: do something about this; ideally the build system, which generates
// the Fortran interfaces, should respect ifdefs and not generate interfaces
// for the non-MPI case.
void cannons_reduction_c_d(double* A, double* U, int local_rows, int local_cols, int* a_desc,
double *Res, int ToStore, int row_comm, int col_comm)
{
}
void cannons_triang_rectangular_c_d(double* U, double* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, double *Res, int row_comm, int col_comm)
{
}
void cannons_reduction_c_f(float* A, float* U, int local_rows, int local_cols, int* a_desc,
float *Res, int ToStore, int row_comm, int col_comm)
{
}
void cannons_triang_rectangular_c_f(float* U, float* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, float *Res, int row_comm, int col_comm)
{
}
void cannons_reduction_c_dc(double complex* A, double complex* U, int local_rows, int local_cols, int* a_desc,
double complex *Res, int ToStore, int row_comm, int col_comm)
{
}
void cannons_triang_rectangular_c_dc(double complex* U, double complex* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, double complex *Res, int row_comm, int col_comm)
{
}
void cannons_reduction_c_fc(float complex* A, float complex* U, int local_rows, int local_cols, int* a_desc,
float complex *Res, int ToStore, int row_comm, int col_comm)
{
}
void cannons_triang_rectangular_c_fc(float complex* U, float complex* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, float complex *Res, int row_comm, int col_comm)
{
}
#endif
#include "config-f90.h"
// it seems that we need these two levels of indirection to correctly expand the macros
#define cannons_triang_rectangular_impl_expand2(SUFFIX) cannons_triang_rectangular_##SUFFIX
#define cannons_triang_rectangular_impl_expand1(SUFFIX) cannons_triang_rectangular_impl_expand2(SUFFIX)
#define cannons_triang_rectangular_impl cannons_triang_rectangular_impl_expand1(ELPA_IMPL_SUFFIX)
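// Illustration (assumed expansion, not part of the commit): the two levels
// matter because ELPA_IMPL_SUFFIX is itself a macro. Token pasting with ##
// suppresses argument expansion, so a single-level
//   #define cannons_triang_rectangular_impl cannons_triang_rectangular_##ELPA_IMPL_SUFFIX
// would paste the literal token "ELPA_IMPL_SUFFIX". The extra expand1 level
// forces the argument to be expanded first, e.g. for ELPA_IMPL_SUFFIX == d:
//   cannons_triang_rectangular_impl -> cannons_triang_rectangular_d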
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define cannons_triang_rectangular_c_impl_expand2(SUFFIX) cannons_triang_rectangular_c_##SUFFIX
#define cannons_triang_rectangular_c_impl_expand1(SUFFIX) cannons_triang_rectangular_c_impl_expand2(SUFFIX)
#define cannons_triang_rectangular_c_impl cannons_triang_rectangular_c_impl_expand1(ELPA_IMPL_SUFFIX)
// most of the file is not compiled if not using MPI
#ifdef WITH_MPI
#include <mpi.h>
//#include <elpa/elpa.h>
//#include <elpa/elpa_generated.h>
//#include <elpa/elpa_constants.h>
//#include <elpa/elpa_generated_legacy.h>
//#include <elpa/elpa_generic.h>
//#include <elpa/elpa_legacy.h>
//
void pdlacpy_(char*, int*, int*, double*, int*, int*, int*, double*, int*, int*, int*);
void dlacpy_(char*, int*, int*, double*, int*, double*, int*);
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
void pdtran_(int*, int*, double*, double*, int*, int*, int*, double*, double*, int*, int*, int*);
//void pdelset_(double*, int*, int*, int*, double*);
//void pdsymm_(char*, char*, int*, int*, double*, double*, int*, int*, int*, double*, int*, int*, int*, double*, double*, int*, int*, int*);
//void pdpotrf_(char*, int*, double*, int*, int*, int*, int*);
//void pdsyngst_(int*, char*, int*, double*, int*, int*, int*, double*, int*, int*, int*, double*, double*, int*, int*);
//void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*);
int numroc_(int*, int*, int*, int*, int*);
//void set_up_blacsgrid_f1(int, int*, int*, int*, int*, int*, int*, int*);
//void pdtrtrs_(char*, char*, char*, int*, int*, double*, int*, int*, int*, double*, int*, int*, int*, int*);
//void pdsyevr_(char*, char*, char*, int*, double*, int*, int*, int*, int*, int*, int*, int*, int*, int*, double*, double*, int*, int*, int*, double*, int*, int*, int*, int*);
void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols, int my_prow, int my_pcol, int* U_desc, int*b_desc, double *Res, MPI_Comm row_comm, MPI_Comm col_comm)
void cannons_triang_rectangular_impl(math_type* U, math_type* B, int np_rows, int np_cols, int my_prow, int my_pcol, int* U_desc, int*b_desc, math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm)
{
// Cannon's algorithm, non-blocking version
// Input:
@@ -45,11 +23,11 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
int i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber, Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block, curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult;
double *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;
math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;
int where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B, last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min;
double *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;
math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;
int ratio;
@@ -57,8 +35,8 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
int one = 1;
int zero = 0;
double done = 1.0;
double dzero = 0.0;
math_type done = 1.0;
math_type dzero = 0.0;
na = U_desc[2];
nblk = U_desc[4];
@@ -107,15 +85,15 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
ratio = np_cols/np_rows;
intNumber = ceil((double)na/(double)(np_cols*nblk)); // max. possible number of the local block columns of U
intNumber = ceil((math_type)na/(math_type)(np_cols*nblk)); // max. possible number of the local block columns of U
Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.)
Buf_to_send_U = malloc(ratio*Size_U_stored*sizeof(double));
Buf_to_receive_U = malloc(ratio*Size_U_stored*sizeof(double));
Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*sizeof(double));
Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*sizeof(double));
Buf_to_send_U = malloc(ratio*Size_U_stored*sizeof(math_type));
Buf_to_receive_U = malloc(ratio*Size_U_stored*sizeof(math_type));
Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
if(ratio != 1)
Buf_U = malloc(Size_U_stored*sizeof(double)); // in this case we receive data into an initial buffer and afterwards place the block-rows at the needed positions of the calculation buffer
Buf_U = malloc(Size_U_stored*sizeof(math_type)); // in this case we receive data into an initial buffer and afterwards place the block-rows at the needed positions of the calculation buffer
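// Worked example (illustrative, assumed values): for na = 1000, nblk = 32,
// np_rows = 2, np_cols = 4 we get ratio = 2 and
// intNumber = ceil(1000/(4*32)) = 8, hence
// Size_U_stored = 2*32*32*8*9/2 + 2 = 73730 elements; the trailing "+ 2"
// reserves room for the cols/rows counters appended at the end of the buffer.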
for(i = 0; i < na_rows*nb_cols; i++)
Res[i] = 0;
@@ -133,14 +111,14 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
if(my_pcol >= my_prow) // if I am in the upper part of proc. grid
curr_col_loc = 0; // my first local block-column has block from the upper part of matrix
else
curr_col_loc = 1; //ceil((double)(((double)my_prow - (double)my_pcol)/(double)np_cols)) always will give 1 since np_cols > np_rows
curr_col_loc = 1; //ceil((math_type)(((math_type)my_prow - (math_type)my_pcol)/(math_type)np_cols)) always will give 1 since np_cols > np_rows
num_of_iters = ceil((double)na_cols/(double)nblk); // my number of block-columns
num_of_iters = ceil((math_type)na_cols/(math_type)nblk); // my number of block-columns
num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first <curr_col_loc> block-columns since they do not have blocks from the upper part of matrix U
curr_col_loc = curr_col_loc*nblk; // local index of the found block-column
if(my_pcol >= my_prow )
rows_in_block = ceil(((double)(my_pcol + 1) - (double)my_prow)/(double)np_rows)*nblk;
rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
else
rows_in_block = ratio*nblk;
cols_in_buffer_U_my_initial = 0;
@@ -158,7 +136,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
if((rows_in_block > 0)&&(cols_in_block > 0))
{
double_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer
dlacpy_("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
C_LACPY("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied
Size_send_U = Size_send_U + rows_in_block*cols_in_block;
cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block;
@@ -167,9 +145,9 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
rows_in_block = rows_in_block + ratio*nblk;
}
rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop
*Buf_pos = (double)cols_in_buffer_U_my_initial; // write the number of columns at the end of the buffer; we will need this for further multiplications on the other processors
*Buf_pos = (math_type)cols_in_buffer_U_my_initial; // write the number of columns at the end of the buffer; we will need this for further multiplications on the other processors
Buf_pos = Buf_pos + 1;
*Buf_pos = (double)rows_in_buffer_U_my_initial; // write the number of rows at the end of the buffer; we will need this for further multiplications on the other processors
*Buf_pos = (math_type)rows_in_buffer_U_my_initial; // write the number of rows at the end of the buffer; we will need this for further multiplications on the other processors
Size_send_U = Size_send_U + 2;
// now we have the local buffer to send
@@ -196,8 +174,8 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
{
if(where_to_send_U != my_pcol) // if I need to send and receive on this step
{
MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_DOUBLE, where_to_send_U, 0, Buf_U, Size_U_stored, MPI_DOUBLE, from_where_to_receive_U, 0, row_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_U_now);
MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, Buf_U, Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &status);
MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_now);
Size_receive_U = Size_receive_U + Size_receive_U_now - 2; // we need only the number of elements, so we exclude the information about cols_in_buffer_U and rows_in_buffer_U
cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 2];
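// (Illustrative note, not part of the commit: the sender appends the column
// and row counters as the last two entries of each U buffer, which is why the
// column count is read back here from position Size_receive_U_now - 2.)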
@@ -239,9 +217,9 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
}
// copy by block-columns
intNumber = ceil((double)cols_in_buffer_U_now/(double)nblk); // how many block-columns I have received on this iteration
intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk); // how many block-columns I have received on this iteration
if(from_where_to_receive_U >= my_prow)
rows_in_block = ceil(((double)(from_where_to_receive_U + 1) - (double)my_prow)/(double)np_rows)*nblk; // number of rows in the first block-column of U buffer
rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk; // number of rows in the first block-column of U buffer
else
rows_in_block = ratio*nblk;
for(j = 0; j < intNumber; j++)
@@ -251,7 +229,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
else
cols_in_block = cols_in_buffer_U_now - j*nblk;
dlacpy_("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
CopyFrom = CopyFrom + rows_in_block*cols_in_block;
CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2; // I need to leave space for ratio block-columns of the other procs., of lengths rows_in_block, (rows_in_block+nblk), (rows_in_block+2*nblk), and so on
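// Worked example (illustrative, assumed values): with ratio = 2 and nblk = 4,
// after copying a block-column of height rows_in_block, CopyTo advances by
// 2*rows_in_block*4 + 4*4*2*1/2 = 8*rows_in_block + 16 elements, leaving
// interleaved slots for the block-columns of the other ratio-1 processes,
// whose heights grow by nblk from slot to slot.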
@@ -264,8 +242,8 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
{
if(my_prow > 0)
{
MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_DOUBLE, where_to_send_U, 0, Buf_to_receive_U, Size_U_stored, MPI_DOUBLE, from_where_to_receive_U, 0, row_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_U);
MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, Buf_to_receive_U, Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &status);
MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U);
cols_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-2];
rows_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-1];
}
@@ -295,20 +273,20 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
if(where_to_send_B != my_prow) // for rectangular proc grids it may happen that I need to "send to myself"; if that is not the case, then I send
{
// form array to send
dlacpy_("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
MPI_Sendrecv(Buf_to_send_B, nb_cols*na_rows, MPI_DOUBLE, where_to_send_B, 0, Buf_to_receive_B, nb_cols*Buf_rows, MPI_DOUBLE, from_where_to_receive_B, 0, col_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_B); // find out how many elements I have received
C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
MPI_Sendrecv(Buf_to_send_B, nb_cols*na_rows, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_B, 0, Buf_to_receive_B, nb_cols*Buf_rows, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_B, 0, col_comm, &status);
MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_B); // find out how many elements I have received
Size_receive_B = Size_receive_B/nb_cols; // how many rows I have received
}
else
{
dlacpy_("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // else I copy the data as if I had "received" it
C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // else I copy the data as if I had "received" it
Size_receive_B = na_rows;
}
}
else
{
dlacpy_("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // if I am in proc row 0, I need not send, so I copy the data as if I had "received" it
C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows); // if I am in proc row 0, I need not send, so I copy the data as if I had "received" it
Size_receive_B = na_rows;
}
@@ -333,12 +311,12 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
Size_send_B = Size_receive_B;
///// shift for U ////////////////////////////////////////////////////////////
MPI_Isend(Buf_to_send_U, Size_send_U, MPI_DOUBLE, where_to_send_U, 0, row_comm, &request_U_Send);
MPI_Irecv(Buf_to_receive_U, ratio*Size_U_stored, MPI_DOUBLE, from_where_to_receive_U, 0, row_comm, &request_U_Recv);
MPI_Isend(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, row_comm, &request_U_Send);
MPI_Irecv(Buf_to_receive_U, ratio*Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &request_U_Recv);
///// shift for B /////////////////////////////////////////////
MPI_Isend(Buf_to_send_B, Size_send_B*nb_cols, MPI_DOUBLE, where_to_send_B, 0, col_comm, &request_B_Send);
MPI_Irecv(Buf_to_receive_B, Buf_rows*nb_cols, MPI_DOUBLE, from_where_to_receive_B, 0, col_comm, &request_B_Recv);
MPI_Isend(Buf_to_send_B, Size_send_B*nb_cols, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_B, 0, col_comm, &request_B_Send);
MPI_Irecv(Buf_to_receive_B, Buf_rows*nb_cols, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_B, 0, col_comm, &request_B_Recv);
///// multiplication ////////////////////////////////////////////////////////////////////////////////////////////
cols_in_buffer_U = (int)Buf_to_send_U[Size_receive_U-2];
@@ -353,7 +331,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
}
col_of_origin_U = proc_col_min;
num_of_blocks_in_U_buffer = ceil((double)cols_in_buffer_U/(double)nblk);
num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
if (col_of_origin_U >= my_prow)
B_local_start = Buf_to_send_B;
@@ -373,7 +351,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
else
b_rows_mult = cols_in_buffer_U - j*nblk;
dgemm_("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
U_local_start = U_local_start + nblk*curr_rows;
B_local_start = B_local_start + nblk;
@@ -381,11 +359,11 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
MPI_Wait(&request_U_Send, &status);
MPI_Wait(&request_U_Recv, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_U); // find out how many elements I have received
MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U); // find out how many elements I have received
MPI_Wait(&request_B_Send, &status);
MPI_Wait(&request_B_Recv, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_B); // find out how many elements I have received
MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_B); // find out how many elements I have received
Size_receive_B = Size_receive_B/nb_cols; // how many rows I have received
}
@@ -402,7 +380,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
}
col_of_origin_U = proc_col_min;
num_of_blocks_in_U_buffer = ceil((double)cols_in_buffer_U/(double)nblk);
num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);
if (col_of_origin_U >= my_prow)
B_local_start = Buf_to_receive_B;
@@ -422,7 +400,7 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
else
b_rows_mult = cols_in_buffer_U - j*nblk;
dgemm_("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);
U_local_start = U_local_start + nblk*curr_rows;
B_local_start = B_local_start + nblk;
@@ -436,23 +414,9 @@ void d_cannons_triang_rectangular(double* U, double* B, int np_rows, int np_cols
free(Buf_U);
}
#endif
//***********************************************************************************************************
/*
!f> interface
!f> subroutine cannons_triang_rectangular(U, B, local_rows, local_cols, u_desc, b_desc, Res, row_comm, col_comm) &
!f> bind(C, name="d_cannons_triang_rectangular_c")
!f> use, intrinsic :: iso_c_binding
!f> real(c_double) :: U(local_rows, local_cols), B(local_rows, local_cols), Res(local_rows, local_cols)
!f> integer(kind=c_int) :: u_desc(9), b_desc(9)
!f> integer(kind=c_int),value :: local_rows, local_cols
!f> integer(kind=c_int),value :: row_comm, col_comm
!f> end subroutine
!f> end interface
*/
void d_cannons_triang_rectangular_c(double* U, double* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, double *Res, int row_comm, int col_comm)
void cannons_triang_rectangular_c_impl(math_type* U, math_type* B, int local_rows, int local_cols,
int* u_desc, int* b_desc, math_type *Res, int row_comm, int col_comm)
{
#ifdef WITH_MPI
MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
@@ -470,7 +434,7 @@ void d_cannons_triang_rectangular_c(double* U, double* B, int local_rows, int lo
// What we usually call row_comm in ELPA is thus passed to the col_comm parameter of this function, and vice versa
// (the order is swapped in the following call).
// It is a bit unfortunate; maybe it should be changed in the Cannon algorithm to comply with standard ELPA notation?
d_cannons_triang_rectangular(U, B, np_rows, np_cols, my_prow, my_pcol, u_desc, b_desc, Res, c_col_comm, c_row_comm);
cannons_triang_rectangular_impl(U, B, np_rows, np_cols, my_prow, my_pcol, u_desc, b_desc, Res, c_col_comm, c_row_comm);
#else
printf("Internal error: Cannon's algorithm should not be called without MPI, stopping...\n");
exit(1);
@@ -72,14 +72,12 @@
!TODO tunable parameter?
BuffLevelInt = 1
call self%timer_start("cannons_reduction")
! BEWARE! even though tmp is output from the routine, it has to be zero on input!
tmp = 0.0_rck
call cannons_reduction_&
&ELPA_IMPL_SUFFIX&
&(a, b, self%local_nrows, self%local_ncols, &
sc_desc, tmp, BuffLevelInt, mpi_comm_rows, mpi_comm_cols)
&(a, b, self%local_nrows, self%local_ncols, sc_desc, tmp, BuffLevelInt, mpi_comm_rows, mpi_comm_cols)
call self%timer_stop("cannons_reduction")
a(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols)
@@ -149,9 +147,6 @@
call self%timer_start("transform_back_generalized()")
call self%get("cannon_for_generalized",use_cannon,error)
#if !defined(REALCASE) || !defined(DOUBLE_PRECISION)
use_cannon = 0
#endif
#if !defined(WITH_MPI)
use_cannon = 0
@@ -166,12 +161,13 @@
if(error .NE. ELPA_OK) return
if(use_cannon == 1) then
#if defined(REALCASE) && defined(DOUBLE_PRECISION)
call cannons_triang_rectangular(b, q, self%local_nrows, self%local_ncols, &
sc_desc, sc_desc_ev, tmp, mpi_comm_rows, mpi_comm_cols);
call self%timer_start("cannons_triang_rectangular")
call cannons_triang_rectangular_&
&ELPA_IMPL_SUFFIX&
&(b, q, self%local_nrows, self%local_ncols, sc_desc, sc_desc_ev, tmp, mpi_comm_rows, mpi_comm_cols);
call self%timer_stop("cannons_triang_rectangular")
q(1:self%local_nrows, 1:self%local_ncols) = tmp(1:self%local_nrows, 1:self%local_ncols)
#endif
else
call self%timer_start("scalapack multiply inv(U) * Q")
#ifdef WITH_MPI