// This file is part of ELPA.
//
// The ELPA library was originally created by the ELPA consortium,
// consisting of the following organizations:
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// - Max-Planck-Institut für Mathematik in den Naturwissenschaften,
// Leipzig, Abt. Komplexe Strukturen in Biologie und Kognition,
// and
// - IBM Deutschland GmbH
//
// This particular source code file has been developed within the ELPA-AEO
// project, which has been a joint effort of
//
// - Max Planck Computing and Data Facility (MPCDF), formerly known as
// Rechenzentrum Garching der Max-Planck-Gesellschaft (RZG),
// - Bergische Universität Wuppertal, Lehrstuhl für angewandte
// Informatik,
// - Technische Universität München, Lehrstuhl für Informatik mit
// Schwerpunkt Wissenschaftliches Rechnen,
// - Technische Universität München, Lehrstuhl für Theoretische Chemie,
// - Fritz-Haber-Institut, Berlin, Abt. Theorie,
// More information can be found here:
// http://elpa.mpcdf.mpg.de/ and
// http://elpa-aeo.mpcdf.mpg.de
//
// ELPA is free software: you can redistribute it and/or modify
// it under the terms of the version 3 of the license of the
// GNU Lesser General Public License as published by the Free
// Software Foundation.
//
// ELPA is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with ELPA. If not, see <http://www.gnu.org/licenses/>
//
// ELPA reflects a substantial effort on the part of the original
// ELPA consortium, and we ask you to respect the spirit of the
// license that we chose: i.e., please contribute any changes you
// may have back to the original ELPA library distribution, and keep
// any derivatives of ELPA under the same license that we chose for
// the original distribution, the GNU Lesser General Public License.
//
// Author: Valeriy Manin (Bergische Universität Wuppertal)
// integrated into the ELPA library by Pavel Kus, Andreas Marek (MPCDF)
#include <stdio.h>
#include <stdlib.h>
#ifdef WITH_MPI
#include <mpi.h>
#endif
#include <math.h>
// ELPA legacy C interface headers (declare elpa_get_communicators, elpa_cholesky_real_double
// and elpa_mult_at_b_real_double used below)
#include <elpa/elpa.h>
#include <elpa/elpa_generated.h>
#include <elpa/elpa_constants.h>
#include <elpa/elpa_generated_legacy.h>
#include <elpa/elpa_generic.h>
#include <elpa/elpa_legacy.h>
void dlacpy_(char*, int*, int*, double*, int*, double*, int*);
void dgemm_(char*, char*, int*, int*, int*, double*, double*, int*, double*, int*, double*, double*, int*);
void pdtran_(int*, int*, double*, double*, int*, int*, int*, double*, double*, int*, int*, int*);
void pdtrmm_(char*, char*, char*, char*, int*, int*, double*, double*, int*, int*, int*, double*, int*, int*, int*);
void descinit_(int*, int*, int*, int*, int*, int*, int*, int*, int*, int*);
int numroc_(int*, int*, int*, int*, int*);
void set_up_blacsgrid_f1(int, int*, int*, int*, int*, int*, int*, int*);
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////// My function for multiplication 2 //////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void d_Cannons_Mult2(double* L, double* U, int np_rows, int np_cols, int my_prow, int my_pcol, int* a_desc, double *Res, MPI_Comm row_comm, MPI_Comm col_comm)
{
// Input matrices:
// - L: local part of the distributed lower triangular matrix
// - U: local part of the distributed upper triangular matrix
// Output matrix:
// - Res: local part of the result; only the lower triangular part of L*U is computed
// row_comm: communicator along processor rows
// col_comm: communicator along processor columns
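// Requirements (checked below): np_cols must be a multiple of np_rows and np_cols >= np_rows;
// the matrices are assumed to be block-cyclically distributed according to a_desc.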
int na, nblk;
int i, j, ii, Size_send_L, Size_receive_L, Size_send_U, Size_receive_U, num_of_blocks_in_U_buffer, row_of_origin_U, col_of_origin_L, Nb, owner;
double *Buf_to_send_L, *Buf_to_receive_L, *Buf_to_send_U, *Buf_to_receive_U, *U_local_start_curr, *CopyFrom, *CopyTo;
int curr_col_loc, where_to_send_L, from_where_to_receive_L, where_to_send_U, from_where_to_receive_U, rows_in_block, cols_in_block, cols_in_buffer_L, cols_in_buffer_L_my_initial, rows_in_buffer_L, rows_in_buffer_L_my_initial, cols_in_buffer_U, rows_in_buffer_U;
double *L_local_start, *Buf_pos, *U_local_start, *double_ptr, *Res_ptr, *Buf_L;
int LDA_L, rows_in_block_U_curr, ratio, rows_in_buffer, proc_col_min, num_of_iters, rows_in_block_U, curr_row_loc;
int curr_col_loc_res, curr_row_loc_res, curr_row_loc_L, curr_col_loc_U, curr_col_glob_res, L_local_index, LDA_L_new, index_row_L_for_LDA, Size_receive_L_now, cols_in_buffer_L_now, rows_in_buffer_L_now, intNumber, Size_U_stored;
MPI_Status status;
int one = 1;
int zero = 0;
double done = 1.0;
double dzero = 0.0;
int na_rows, na_cols;
na = a_desc[2];
nblk = a_desc[4];
na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
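// a_desc is a standard 9-element ScaLAPACK descriptor: in this 0-based C view a_desc[2] holds
// the global matrix size and a_desc[4] the block size; numroc_ gives the number of local rows
// and columns owned by this process in the block-cyclic distribution.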
MPI_Request request_L_Recv;
MPI_Request request_L_Send;
MPI_Request request_U_Recv;
MPI_Request request_U_Send;
if (np_cols%np_rows != 0)
{
if((my_prow == 0)&& (my_pcol ==0))
printf("Error: np_cols must be a multiple of np_rows! Doing nothing!\n");
return;
}
if (np_cols < np_rows)
{
if((my_prow == 0)&& (my_pcol ==0))
printf("Error: np_cols must not be smaller than np_rows! Doing nothing!\n");
return;
}
ratio = np_cols/np_rows;
///////////////////////////////////////////////////////////// Start of algorithm ///////////////////////////////////////////////////////////////////////////////////////////////
/////////////////////////memory allocation area//////////////////////////////////////////////////////////////
intNumber = ceil((double)na/(double)(np_cols*nblk)); // max. possible number of the local block columns of U
Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2; // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.)
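// Rough derivation of Size_U_stored: local block-column k of U (k = 1..intNumber) holds at most
// about k*ratio blocks of nblk*nblk elements, since the row direction is spread over only
// np_rows = np_cols/ratio processes; summing this arithmetic progression gives
// ratio*nblk*nblk*intNumber*(intNumber+1)/2, and the "+ 2" reserves room for the bookkeeping
// values appended to the buffers further down.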
Buf_to_send_L = malloc(ratio*Size_U_stored*sizeof(double));
Buf_to_receive_L = malloc(ratio*Size_U_stored*sizeof(double));
Buf_to_send_U = malloc(Size_U_stored*sizeof(double));
Buf_to_receive_U = malloc(Size_U_stored*sizeof(double));
if(ratio != 1)
Buf_L = malloc(Size_U_stored*sizeof(double)); // in this case we will receive data into initial buffer and after place block-columns to the needed positions of buffer for calculation
/////////////////////////////////////////////////////////////// initial reordering of L /////////////////////////////////////////////////////////////////////////////////////////
// here we assume that np_rows <= np_cols; each process sends its panel of L to ratio processes in its process row, with a "leap" of np_rows between destination columns; the same holds for the receives
if((ratio != 1)||(my_prow != 0)) // if grid is rectangular or my_prow is not 0
Buf_pos = Buf_to_send_L; // I will copy to the send buffer
else
Buf_pos = Buf_to_receive_L; // if grid is square and my_prow is 0, then I will copy to the received buffer
// form array to send by block-columns; we need only lower triangular part
num_of_iters = ceil((double)na_cols/(double)nblk); // number of my block-columns
cols_in_buffer_L_my_initial = 0;
Size_send_L = 0;
if(my_pcol <= my_prow) // if I am from the lower part of grid
{
curr_row_loc = 0; // I will copy all my block-rows
rows_in_buffer_L_my_initial = na_rows;
}
else
{
curr_row_loc = ceil((double)(((double)my_pcol - (double)my_prow)/(double)np_rows))*nblk; // I will skip some of my block-rows
rows_in_buffer_L_my_initial = na_rows - curr_row_loc;
}
for(i = 0; i < num_of_iters; i++) // loop over my block-columns
{
curr_col_loc = i*nblk; // local index of start of the current block-column
rows_in_block = na_rows - curr_row_loc; // how many rows do I have in the lower part of the current block-column
if ((na_cols - curr_col_loc) < nblk)
cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the block-column
else
cols_in_block = nblk;
if((rows_in_block > 0)&&(cols_in_block > 0))
{
L_local_start = &L[curr_col_loc*na_rows + curr_row_loc];
dlacpy_("A", &rows_in_block, &cols_in_block, L_local_start, &na_rows, Buf_pos, &rows_in_block); // copy lower part of block-column in the buffer with LDA = length of the lower part of block-column
Buf_pos = Buf_pos + rows_in_block*cols_in_block;
Size_send_L = Size_send_L + rows_in_block*cols_in_block;
cols_in_buffer_L_my_initial = cols_in_buffer_L_my_initial + cols_in_block;
}
curr_row_loc = curr_row_loc + ratio*nblk;
}
*Buf_pos = (double)cols_in_buffer_L_my_initial; // write the number of columns at the end of the buffer; we will need this for further multiplications on the other processors
Size_send_L = Size_send_L + 1;
// now we have the local buffer to send
// find the lowest processor column among those who will send me
proc_col_min = np_cols;
for(i = 0; i < ratio; i++)
{
from_where_to_receive_L = (my_pcol + my_prow + i*np_rows)%np_cols;
if(from_where_to_receive_L < proc_col_min)
proc_col_min = from_where_to_receive_L;
}
// do communications and form local buffers for calculations
Size_receive_L = 0; // size of the accumulated buffer
cols_in_buffer_L = 0; // number of columns in the accumulated buffer
rows_in_buffer_L = 0; // number of rows in the accumulated buffer
for(i = 0; i < ratio; i++)
{
where_to_send_L = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
from_where_to_receive_L = (my_pcol + my_prow + i*np_rows)%np_cols;
// send and receive in the row_comm
if(ratio != 1) // if grid is not square
{
if(where_to_send_L != my_pcol) // if I need to send and receive on this step
{
MPI_Sendrecv(Buf_to_send_L, Size_send_L, MPI_DOUBLE, where_to_send_L, 0, Buf_L, Size_U_stored, MPI_DOUBLE, from_where_to_receive_L, 0, row_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_L_now);
Size_receive_L = Size_receive_L + Size_receive_L_now - 1; // we need only number of elements, so exclude information about cols_in_buffer_L
cols_in_buffer_L_now = Buf_L[Size_receive_L_now-1];
cols_in_buffer_L = cols_in_buffer_L + cols_in_buffer_L_now;
// determine number of rows in the received buffer
if(from_where_to_receive_L <= my_prow) // if source is from the lower part of grid
{
rows_in_buffer_L_now = na_rows;
}
else
{
rows_in_buffer_L_now = na_rows - ceil((double)(((double)from_where_to_receive_L - (double)my_prow)/(double)np_rows))*nblk; // some of the block-rows have been skipped
}
if(rows_in_buffer_L < rows_in_buffer_L_now)
rows_in_buffer_L = rows_in_buffer_L_now;
intNumber = from_where_to_receive_L/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks
if(proc_col_min <= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the lower part
CopyTo = &Buf_to_receive_L[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. progression
else
CopyTo = &Buf_to_receive_L[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block
CopyFrom = Buf_L;
}
else // if I need to send to myself on this step, then I will copy from Buf_to_send_L to Buf_to_receive_L
{
cols_in_buffer_L_now = cols_in_buffer_L_my_initial;
cols_in_buffer_L = cols_in_buffer_L + cols_in_buffer_L_now;
rows_in_buffer_L_now = rows_in_buffer_L_my_initial;
if(rows_in_buffer_L < rows_in_buffer_L_now)
rows_in_buffer_L = rows_in_buffer_L_now;
intNumber = my_pcol/np_rows; // how many processors will send me blocks, such that they will be placed before the current blocks
if(proc_col_min <= my_prow) // if among procs who will send me there is one with the full sets of block-rows in the lower part
CopyTo = &Buf_to_receive_L[nblk*(na_rows*intNumber - nblk*(intNumber-1)*intNumber/2)]; // here I will copy to; formula based on arithm. progression
else
CopyTo = &Buf_to_receive_L[nblk*(na_rows*intNumber - nblk*intNumber*(intNumber+1)/2)]; // otherwise, the first block-column will be shorter by one block
CopyFrom = Buf_to_send_L;
Size_receive_L = Size_receive_L + Size_send_L - 1;
}
// copy by block-columns
intNumber = ceil((double)cols_in_buffer_L_now/(double)nblk); // how many block-columns I have received on this iteration
rows_in_block = rows_in_buffer_L_now;
for(j = 0; j < intNumber; j++)
{
if((j+1)*nblk < cols_in_buffer_L_now)
cols_in_block = nblk;
else
cols_in_block = cols_in_buffer_L_now - j*nblk;
dlacpy_("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);
CopyFrom = CopyFrom + rows_in_block*cols_in_block;
CopyTo = CopyTo + nblk*(ratio*rows_in_block - nblk*(ratio-1)*ratio/2); // I need to leave place for ratio block-columns of the other procs. of the lengths rows_in_block, (rows_in_block-nblk), (rows_in_block-2*nblk) and so on
rows_in_block = rows_in_block - ratio*nblk; // number of rows in the next block-columns
}
}
else // if grid is square
{
if(my_prow > 0)
{
MPI_Sendrecv(Buf_to_send_L, Size_send_L, MPI_DOUBLE, where_to_send_L, 0, Buf_to_receive_L, Size_U_stored, MPI_DOUBLE, from_where_to_receive_L, 0, row_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_L);
cols_in_buffer_L = (int)Buf_to_receive_L[Size_receive_L-1];
if(from_where_to_receive_L <= my_prow) // if source is from the lower part of grid
{
rows_in_buffer_L = na_rows;
}
else
{
rows_in_buffer_L = na_rows - ceil((double)(((double)from_where_to_receive_L - (double)my_prow)/(double)np_rows))*nblk; // some of the block-rows have been skipped
}
}
else // if my_prow == 0, then I have already everything in my Buf_to_receive_L buffer
{
Size_receive_L = Size_send_L;
rows_in_buffer_L = rows_in_buffer_L_my_initial;
cols_in_buffer_L = cols_in_buffer_L_my_initial;
}
}
}
if(ratio != 1)
{
Buf_to_receive_L[Size_receive_L] = cols_in_buffer_L;
Buf_to_receive_L[Size_receive_L + 1] = rows_in_buffer_L;
Size_receive_L = Size_receive_L + 2;
}
else
{
Buf_to_receive_L[Size_receive_L] = rows_in_buffer_L;
Size_receive_L = Size_receive_L + 1;
}
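// At this point Buf_to_receive_L holds the lower triangular block-columns gathered from the
// ratio source columns of the grid, stored contiguously block-column by block-column; the
// bookkeeping counts (columns and rows for rectangular grids, rows only for square grids)
// have just been appended at the end of the buffer.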
////////////////////////////////////////////////////////////// initial reordering of U //////////////////////////////////////////////////////////////////////////////////////////
// form array to send by block-columns
num_of_iters = ceil((double)na_cols/(double)nblk); // number of my block-columns
where_to_send_U = (my_prow - my_pcol + np_cols)%np_rows; // shift = my_pcol; we assume that np_cols%np_rows = 0
from_where_to_receive_U = (my_pcol + my_prow)%np_rows;
if(where_to_send_U == my_prow) // if I will not need to send my local part of U, then copy my local data to the "received" buffer
Buf_pos = Buf_to_receive_U;
else
Buf_pos = Buf_to_send_U; // else form the array to send
Size_send_U = 0; // number of elements to be sent; the row count will be appended at the end of the buffer
// find the first local block belonging to the upper part of matrix U
if(my_pcol >= my_prow) // if I am in the upper part of proc. grid
curr_col_loc = 0; // my first local block-column has block from the upper part of matrix
else
curr_col_loc = 1; //ceil((double)(((double)my_prow - (double)my_pcol)/(double)np_cols)) always will give 1 since np_cols > np_rows
num_of_iters = num_of_iters - curr_col_loc; // I will exclude the first block-columns since they do not have blocks from the upper part of matrix U
curr_col_loc = curr_col_loc*nblk; // local index of the found block-column
if(my_pcol >= my_prow )
rows_in_block = ceil(((double)(my_pcol + 1) - (double)my_prow)/(double)np_rows)*nblk;
else
rows_in_block = ratio*nblk;
for(i = 0; i < num_of_iters; i++) // loop over my block-columns, which have blocks in the upper part of U
{
if(rows_in_block > na_rows)
rows_in_block = na_rows;
if ((na_cols - curr_col_loc) < nblk)
cols_in_block = na_cols - curr_col_loc; // how many columns do I have in the current block-column
else
cols_in_block = nblk;
if((rows_in_block > 0)&&(cols_in_block > 0))
{
double_ptr = &U[curr_col_loc*na_rows]; // pointer to start of the current block-column to be copied to buffer
dlacpy_("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block); // copy upper part of block-column in the buffer with LDA = length of the upper part of block-column
Buf_pos = Buf_pos + rows_in_block*cols_in_block; // go to the position where the next block-column will be copied
Size_send_U = Size_send_U + rows_in_block*cols_in_block;
}
curr_col_loc = curr_col_loc + nblk; // go to the next local block-column of my local array U
rows_in_block = rows_in_block + ratio*nblk;
}
rows_in_buffer = rows_in_block - ratio*nblk; // remove redundant addition from the previous loop
*Buf_pos = (double)rows_in_buffer; // write the number of rows at the end of the buffer; we will need this for further multiplications on the other processors
Size_send_U = Size_send_U + 1;
//send and receive
if(where_to_send_U != my_prow)
{
// send and receive in the col_comm
MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_DOUBLE, where_to_send_U, 0, Buf_to_receive_U, Size_U_stored, MPI_DOUBLE, from_where_to_receive_U, 0, col_comm, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_U); // find out how many elements I have received
}
else // if I do not need to send
Size_receive_U = Size_send_U; // how many rows I "have received"; the needed data I have already copied to the "receive" buffer
//////////////////////////////////////////////////////////////////////// main loop ////////////////////////////////////////////////////////////////////////////////
where_to_send_L = (my_pcol - 1 + np_cols)%np_cols;
from_where_to_receive_L = (my_pcol + 1)%np_cols;
where_to_send_U = (my_prow - 1 + np_rows)%np_rows;
from_where_to_receive_U = (my_prow + 1)%np_rows;
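// Each of the np_rows-1 iterations below shifts the current L panel one position to the left
// along the process row and the current U panel one position upwards along the process column;
// the non-blocking MPI_Isend/MPI_Irecv pairs overlap these shifts with the local dgemm_ updates,
// which always operate on the buffers received in the previous step.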
for(j = 1; j < np_rows; j++)
{
// at this moment I need to send to neighbour what I have in the "received" arrays; that is why exchange pointers of the "received" and "send" arrays
double_ptr = Buf_to_send_L;
Buf_to_send_L = Buf_to_receive_L;
Buf_to_receive_L = double_ptr;
double_ptr = Buf_to_send_U;
Buf_to_send_U = Buf_to_receive_U;
Buf_to_receive_U = double_ptr;
///// shift for L ////////////////////////////////////////////////////////////
Size_send_L = Size_receive_L;
MPI_Isend(Buf_to_send_L, Size_send_L, MPI_DOUBLE, where_to_send_L, 0, row_comm, &request_L_Send);
MPI_Irecv(Buf_to_receive_L, ratio*Size_U_stored, MPI_DOUBLE, from_where_to_receive_L, 0, row_comm, &request_L_Recv);
///// shift for U /////////////////////////////////////////////
Size_send_U = Size_receive_U;
MPI_Isend(Buf_to_send_U, Size_send_U, MPI_DOUBLE, where_to_send_U, 0, col_comm, &request_U_Send);
MPI_Irecv(Buf_to_receive_U, Size_U_stored, MPI_DOUBLE, from_where_to_receive_U, 0, col_comm, &request_U_Recv);
///// multiplication ////////////////////////////////////////////////////////////////////////////////////////////
rows_in_buffer_U = (int)Buf_to_send_U[Size_receive_U-1];
row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
if(my_pcol >= row_of_origin_U)
cols_in_buffer_U = na_cols;
else
cols_in_buffer_U = na_cols - nblk;
cols_in_buffer_L = (int)Buf_to_send_L[Size_receive_L-2];
rows_in_buffer_L = (int)Buf_to_send_L[Size_receive_L-1];
// find the minimal pcol among those who have sent L for this iteration
col_of_origin_L = np_cols;
for(i = 0; i < ratio; i++)
{
intNumber = (my_pcol + my_prow + i*np_rows + np_cols + j - 1)%np_cols;
if(intNumber < col_of_origin_L)
col_of_origin_L = intNumber;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// find block-column of the result to start update with
if (my_pcol >= row_of_origin_U) // if origin of U is from the upper part
curr_col_loc_res = 0; // then I update all columns of Result
else
curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column
num_of_blocks_in_U_buffer = ceil((double)((double)cols_in_buffer_U/(double)nblk));
if(my_pcol >= row_of_origin_U) // if origin of U is from the upper part
rows_in_block_U = ceil(((double)(my_pcol + 1) - (double)row_of_origin_U)/(double)np_rows)*nblk; // blocks in the first block-column of U buffer
else
rows_in_block_U = ratio*nblk;
U_local_start = &Buf_to_send_U[0];
for (i = 0; i < num_of_blocks_in_U_buffer; i++)
{
// find block-row of the result to start update with; we need to update only lower triangular part of result
curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated
// now we need to find the smallest local row index such that the corresponding global row index is larger than or equal to curr_col_glob_res
Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one
owner = Nb%np_rows; // process row index of the owner of the row with global index curr_col_glob_res (it is not necessarily me)
curr_row_loc_res = (Nb/np_rows)*nblk;
if(my_prow < owner)
curr_row_loc_res = curr_row_loc_res + nblk;
curr_row_loc_L = curr_row_loc_res; // it is impossible that both col_of_origin_L and row_of_origin_U are from the upper part
if(col_of_origin_L > my_prow)
curr_row_loc_L = curr_row_loc_L - nblk;
rows_in_block = rows_in_buffer_L - curr_row_loc_L; // rows in current block of L
curr_col_loc_U = i*nblk; // local index in the buffer U of the current column
if((curr_col_loc_U + nblk) <= cols_in_buffer_U)
cols_in_block = nblk; // number of columns in the block of U which will take part in this calculation
else
cols_in_block = cols_in_buffer_U - curr_col_loc_U;
if(rows_in_block_U > rows_in_buffer_U)
rows_in_block_U = rows_in_buffer_U; // rows in current column of U; also a leading dimension for U
L_local_index = curr_row_loc_L;
L_local_start = &Buf_to_send_L[L_local_index];
Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
LDA_L = rows_in_buffer_L;
LDA_L_new = LDA_L;
if ((rows_in_block > 0)&&(cols_in_block > 0))
{
U_local_start_curr = U_local_start;
if (my_prow >= col_of_origin_L)
index_row_L_for_LDA = 0;
else
index_row_L_for_LDA = 1;
// loop over block-columns of the "active" part of L buffer
for (ii = 0; ii < ceil((double)rows_in_block_U/(double)nblk); ii++)
{
if((ii+1)*nblk <= cols_in_buffer_L)
rows_in_block_U_curr = nblk;
else
rows_in_block_U_curr = cols_in_buffer_L - ii*nblk;
if((j == 1)&&(ii == 0))
dgemm_("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, L_local_start, &LDA_L, U_local_start_curr, &rows_in_block_U, &dzero, Res_ptr, &na_rows);
else
dgemm_("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, L_local_start, &LDA_L, U_local_start_curr, &rows_in_block_U, &done, Res_ptr, &na_rows);
if(np_rows*nblk*index_row_L_for_LDA + ((np_rows+my_prow)%np_rows)*nblk < np_cols*nblk*(ii + 1) + ((np_cols+col_of_origin_L)%np_cols)*nblk)
{
LDA_L_new = LDA_L_new - nblk;
index_row_L_for_LDA = index_row_L_for_LDA + 1;
}
U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
L_local_index = L_local_index - LDA_L + LDA_L*nblk + LDA_L_new;
L_local_start = &Buf_to_send_L[L_local_index];
LDA_L = LDA_L_new;
}
}
U_local_start = U_local_start + rows_in_block_U*cols_in_block;
curr_col_loc_res = curr_col_loc_res + nblk;
rows_in_block_U = rows_in_block_U + ratio*nblk;
}
MPI_Wait(&request_L_Send, &status);
MPI_Wait(&request_L_Recv, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_L); // find out how many elements I have received
MPI_Wait(&request_U_Send, &status);
MPI_Wait(&request_U_Recv, &status);
MPI_Get_count(&status, MPI_DOUBLE, &Size_receive_U); // find out how many elements I have received
}
/////// do the last multiplication //////////////
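// The panels received in the final iteration of the loop above have not been multiplied yet,
// so the same update is applied once more, this time reading from the receive buffers.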
rows_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-1];
row_of_origin_U = (my_pcol + my_prow + np_cols + j - 1)%np_rows;
if(my_pcol >= row_of_origin_U)
cols_in_buffer_U = na_cols;
else
cols_in_buffer_U = na_cols - nblk;
cols_in_buffer_L = (int)Buf_to_receive_L[Size_receive_L-2];
rows_in_buffer_L = (int)Buf_to_receive_L[Size_receive_L-1];
// find the minimal pcol among those who have sent L for this iteration
col_of_origin_L = np_cols;
for(i = 0; i < ratio; i++)
{
intNumber = (my_pcol + my_prow + i*np_rows + np_cols + np_rows - 1)%np_cols;
if(intNumber < col_of_origin_L)
col_of_origin_L = intNumber;
}
// find block-column of the result to start update with
if (my_pcol >= row_of_origin_U) // if origin of U is from the upper part
curr_col_loc_res = 0; // then I update all columns of Result
else
curr_col_loc_res = nblk; // the first block column of U corresponds to my second one and I do not need to update the first block-column
num_of_blocks_in_U_buffer = ceil((double)((double)cols_in_buffer_U/(double)nblk));
if(my_pcol >= row_of_origin_U) // if origin of U is from the upper part
rows_in_block_U = ceil(((double)(my_pcol + 1) - (double)row_of_origin_U)/(double)np_rows)*nblk; // blocks in the first block-column of U buffer
else
rows_in_block_U = ratio*nblk;
U_local_start = &Buf_to_receive_U[0];
for (i = 0; i < num_of_blocks_in_U_buffer; i++)
{
// find block-row of the result to start update with; we need to update only lower triangular part of result
curr_col_glob_res = np_cols*nblk*(curr_col_loc_res/nblk) + curr_col_loc_res%nblk + ((np_cols+my_pcol)%np_cols)*nblk; // global index of the first column to be updated
// now we need to find the smallest local row index such that the corresponding global row index is larger than or equal to curr_col_glob_res
Nb = curr_col_glob_res/nblk; // how many global block-rows are before the needed one
owner = Nb%np_rows; // process row index of the owner of the row with global index curr_col_glob_res (it is not necessarily me)
curr_row_loc_res = (Nb/np_rows)*nblk;
if(my_prow < owner)
curr_row_loc_res = curr_row_loc_res + nblk;
curr_row_loc_L = curr_row_loc_res; // it is impossible that both col_of_origin_L and row_of_origin_U are from the upper part
if(col_of_origin_L > my_prow)
curr_row_loc_L = curr_row_loc_L - nblk;
rows_in_block = rows_in_buffer_L - curr_row_loc_L; // rows in the current block of L
curr_col_loc_U = i*nblk; // local index in the buffer U of the current column
if((curr_col_loc_U + nblk) <= cols_in_buffer_U)
cols_in_block = nblk; // number of columns in the block of U which will take part in this calculation
else
cols_in_block = cols_in_buffer_U - curr_col_loc_U;
if(rows_in_block_U > rows_in_buffer_U)
rows_in_block_U = rows_in_buffer_U;
L_local_index = curr_row_loc_L;
L_local_start = &Buf_to_receive_L[L_local_index];
Res_ptr = &Res[curr_col_loc_res*na_rows + curr_row_loc_res];
LDA_L = rows_in_buffer_L;
LDA_L_new = LDA_L;
if ((rows_in_block > 0) &&(cols_in_block > 0))
{
U_local_start_curr = U_local_start;
if (my_prow >= col_of_origin_L)
index_row_L_for_LDA = 0;
else
index_row_L_for_LDA = 1;
// loop over block-columns of the "active" part of L buffer
for (ii = 0; ii < ceil((double)rows_in_block_U/(double)nblk); ii++)
{
if((ii+1)*nblk <= cols_in_buffer_L)
rows_in_block_U_curr = nblk;
else
rows_in_block_U_curr = cols_in_buffer_L - ii*nblk;
if((j == 1)&&(ii == 0))
dgemm_("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, L_local_start, &LDA_L, U_local_start_curr, &rows_in_block_U, &dzero, Res_ptr, &na_rows);
else
dgemm_("N", "N", &rows_in_block, &cols_in_block, &rows_in_block_U_curr, &done, L_local_start, &LDA_L, U_local_start_curr, &rows_in_block_U, &done, Res_ptr, &na_rows);
if(np_rows*nblk*index_row_L_for_LDA + ((np_rows+my_prow)%np_rows)*nblk < np_cols*nblk*(ii + 1) + ((np_cols+col_of_origin_L)%np_cols)*nblk)
{
LDA_L_new = LDA_L_new - nblk;
index_row_L_for_LDA = index_row_L_for_LDA + 1;
}
U_local_start_curr = U_local_start_curr + rows_in_block_U_curr;
L_local_index = L_local_index - (LDA_L - rows_in_block) + LDA_L*nblk + LDA_L_new - rows_in_block;
L_local_start = &Buf_to_receive_L[L_local_index];
LDA_L = LDA_L_new;
}
}
U_local_start = U_local_start + rows_in_block_U*cols_in_block;
curr_col_loc_res = curr_col_loc_res + nblk;
rows_in_block_U = rows_in_block_U + ratio*nblk;
}
free(Buf_to_send_L);
free(Buf_to_receive_L);
free(Buf_to_send_U);
free(Buf_to_receive_U);
if(ratio != 1)
free(Buf_L);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////// Start of main program //////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main(int argc, char** argv) {
int myid;
int nprocs;
#ifndef WITH_MPI
int MPI_COMM_WORLD;
#endif
int my_mpi_comm_world, mpi_comm_rows, mpi_comm_cols;
int na, nblk, np_cols, np_rows, np_colsStart, my_blacs_ctxt, nprow, npcol, my_prow, my_pcol;
int mpierr;
int info, i, j, na_rows, na_cols;
double startVal;
double *a, *b, *c, *a_copy, *b_copy, *c1, *c2, *a_t, *work;
int *a_desc, *b_desc, *c_desc;
double value, diff, diffSum;
double done = 1.0;
double dMinusOne = -1.0;
int one = 1;
double dzero = 0.0;
int zero = 0;
double startTime, endTime, localTime, avTime, maxTime;
int reallevel;
#ifdef WITH_MPI
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &reallevel);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
if(myid == 0)
printf("Threading level = %d \n", reallevel);
#else
nprocs = 1;
myid=0;
MPI_COMM_WORLD=1;
#endif
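// Defensive check added here (not in the original test, which assumes both arguments are
// always supplied): bail out cleanly if the matrix size and block size are missing.
if(argc < 3)
{
if(myid == 0)
printf("Usage: %s <matrix_size_na> <blocksize_nblk>\n", argv[0]);
#ifdef WITH_MPI
MPI_Finalize();
#endif
return 1;
}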
na = atoi(argv[1]);
nblk = atoi(argv[2]);
///////////// procs grids and communicators ///////////////////////////////////////////////
if (myid == 0)
printf("Matrix size: %d, blocksize: %d\n\n", na, nblk);
startVal = sqrt((double) nprocs);
np_colsStart = (int) round(startVal);
for (np_rows=np_colsStart;np_rows>1;np_rows--){
if (nprocs %np_rows ==0)
break;
}
if (nprocs == 3200)
np_rows = 40;
if (nprocs == 800)
np_rows = 20;
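// The loop above picks np_rows as the largest divisor of nprocs not exceeding round(sqrt(nprocs)),
// i.e. the most square grid possible; the 3200- and 800-process cases appear to be hard-coded
// overrides for specific benchmark runs. Note that d_Cannons_Mult2 additionally requires
// np_cols to be a multiple of np_rows, which not every process count satisfies.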
np_cols = nprocs/np_rows;
if (myid == 0)
printf("Number of processor rows %d, cols %d, total %d \n\n",np_rows,np_cols,nprocs);
/* set up blacs */
/* convert communicators before */
#ifdef WITH_MPI
my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD);
#else
my_mpi_comm_world = 1;
#endif
set_up_blacsgrid_f1(my_mpi_comm_world, &my_blacs_ctxt, &np_rows, &np_cols, &nprow, &npcol, &my_prow, &my_pcol);
/* get the ELPA row and col communicators. */
/* These are NOT usable in C without calling the MPI_Comm_f2c function on them !! */
#ifdef WITH_MPI
my_mpi_comm_world = MPI_Comm_c2f(MPI_COMM_WORLD);
#endif
mpierr = elpa_get_communicators(my_mpi_comm_world, my_prow, my_pcol, &mpi_comm_rows, &mpi_comm_cols);
////////////////////// descriptors area ///////////////////////////////////////////////
a_desc = malloc(9*sizeof(int));
b_desc = malloc(9*sizeof(int));
c_desc = malloc(9*sizeof(int));
na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
descinit_(a_desc, &na, &na, &nblk, &nblk, &zero, &zero, &my_blacs_ctxt, &na_rows, &info);
descinit_(b_desc, &na, &na, &nblk, &nblk, &zero, &zero, &my_blacs_ctxt, &na_rows, &info);
descinit_(c_desc, &na, &na, &nblk, &nblk, &zero, &zero, &my_blacs_ctxt, &na_rows, &info);
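// descinit_ fills the 9-element ScaLAPACK descriptors: global size na x na, block size
// nblk x nblk, first block owned by process (0,0), BLACS context my_blacs_ctxt and local
// leading dimension na_rows.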
/////////////////////////memory allocation area//////////////////////////////////////////////////////////////
a = malloc(na_rows*na_cols*sizeof(double));
b = malloc(na_rows*na_cols*sizeof(double));
c = malloc(na_rows*na_cols*sizeof(double));
a_copy = malloc(na_rows*na_cols*sizeof(double));
b_copy = malloc(na_rows*na_cols*sizeof(double));
c1 = malloc(na_rows*na_cols*sizeof(double));
c2 = malloc(na_rows*na_cols*sizeof(double));
a_t = malloc(na_rows*na_cols*sizeof(double));
work = malloc(na_cols*na_rows*sizeof(double));
//////////////////////////generate matrices//////////////////////////////////////////////////////////////////////////////
int i_global, j_global;
for(i = 0; i < na_rows; i++)
for(j = 0; j < na_cols; j++)
{
i_global = np_rows*nblk*(i/nblk) + i%nblk + ((np_rows+my_prow)%np_rows)*nblk + 1;
j_global = np_cols*nblk*(j/nblk) + j%nblk + ((np_cols+my_pcol)%np_cols)*nblk + 1;
a[i + j*na_rows] = (double)cos(i_global)*cos(j_global) + (double)sin(i_global)*sin(j_global);
if(i_global == j_global)
a[i + j*na_rows] = a[i + j*na_rows] + (double)(i_global + j_global)/na;
b[i + j*na_rows] = (double)sin(i_global)*(double)sin(j_global);
if(i_global == j_global)
b[i + j*na_rows] = b[i + j*na_rows] + 1;
}
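// By construction a(i,j) = cos(i_global - j_global) plus a diagonal shift, hence symmetric, and
// b = sin(i_global)*sin(j_global) + I is a rank-one update of the identity, hence symmetric
// positive definite and safe to pass to the Cholesky factorization below.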
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
for(i = 0; i < na_rows*na_cols; i++)
c[i] = 0;
for(i = 0; i < na_rows*na_cols; i++)
c2[i] = 0;
elpa_cholesky_real_double(na, b, na_rows, nblk, na_cols, mpi_comm_rows, mpi_comm_cols, 1); // now b = U
for(i = 0; i < na_rows; i++)
for(j = 0; j < na_cols; j++)
{
i_global = np_rows*nblk*(i/nblk) + i%nblk + ((np_rows+my_prow)%np_rows)*nblk + 1;
j_global = np_cols*nblk*(j/nblk) + j%nblk + ((np_cols+my_pcol)%np_cols)*nblk + 1;
if(i_global > j_global)
b[i + j*na_rows] = 0;
if(i_global < j_global)
a[i + j*na_rows] = 0;
}
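// After the Cholesky factorization only the upper triangle of b is meaningful, so its strictly
// lower part is zeroed (b now holds the upper triangular factor U); the strictly upper part of a
// is zeroed as well, so a serves as the lower triangular input L. The three variants below are
// then compared on the lower triangle of L*U.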
//make copies of a and b
for(i = 0; i < na_rows*na_cols; i++)
{
a_copy[i] = a[i];
b_copy[i] = b[i];
}
for(i = 0; i < na_rows*na_cols; i++)
c1[i] = a_copy[i];
if(myid == 0)
printf("\n\nTest1 ___________________________________________________________________ \n");
///// test Cannon's ///////////////////////////////////////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
d_Cannons_Mult2(a, b, np_rows, np_cols, my_prow, my_pcol, a_desc, c, mpi_comm_cols, mpi_comm_rows);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n Cannon's algorithm. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
///// test PDTRMM /////////////////////////////////////////////////////////////////////////////////////
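// Reference computation with ScaLAPACK: pdtrmm_ overwrites c1 (initialized above to the lower
// triangular a_copy) with c1*b_copy, i.e. L*U with the upper triangular factor applied from the
// right.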
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
pdtrmm_("R", "U", "N", "N", &na, &na, &done, b_copy, &one, &one, b_desc, c1, &one, &one, c_desc);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n PDTRMM from ScaLAPACK. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
///// test ELPA ///////////////////////////////////////////////////////////////////////////////////////////
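// ELPA path: a_t = a_copy^T = L^T is formed with pdtran_, elpa_mult_at_b_real_double then
// computes work = b^T * a_t = U^T * L^T = (L*U)^T (upper part only), and the pdtran_ call after
// the timing transposes work into c2, which then holds the lower triangular part of L*U.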
pdtran_(&na, &na, &done, a_copy, &one, &one, a_desc, &dzero, a_t, &one, &one, a_desc);
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
elpa_mult_at_b_real_double('U', 'U', na, na, b, na_rows, na_cols, a_t, na_rows, na_cols, nblk, mpi_comm_rows, mpi_comm_cols, work, na_rows, na_cols); // work has upper part of b(H)*A(H)
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n elpa_mult_at_b_real_double(U,U). 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
pdtran_(&na, &na, &done, work, &one, &one, a_desc, &dzero, c2, &one, &one, a_desc); // c2 has lower part of A*b
for(i = 0; i < na_rows; i++)
for(j = 0; j < na_cols; j++)
{
i_global = np_rows*nblk*(i/nblk) + i%nblk + ((np_rows+my_prow)%np_rows)*nblk + 1;
j_global = np_cols*nblk*(j/nblk) + j%nblk + ((np_cols+my_pcol)%np_cols)*nblk + 1;
if(i_global < j_global)
{
c[i + j*na_rows] = 0;
c1[i + j*na_rows] = 0;
c2[i + j*na_rows] = 0;
}
}
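// Only the lower triangles are compared: the strictly upper parts of c, c1 and c2 have just been
// zeroed, since d_Cannons_Mult2 only produces the lower triangle of the product anyway.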
/////check /////////////////////////////////////////////////////////////////////////////////////////////////
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c1[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and PDTRMM = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c1[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between ScaLAPACK and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
if(myid == 0)
printf("\n\nTest2 ___________________________________________________________________ \n");
for(i = 0; i < na_rows*na_cols; i++)
c[i] = 0;
for(i = 0; i < na_rows*na_cols; i++)
c2[i] = 0;
for(i = 0; i < na_rows*na_cols; i++)
c1[i] = a_copy[i];
///// test PDTRMM /////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
pdtrmm_("R", "U", "N", "N", &na, &na, &done, b_copy, &one, &one, b_desc, c1, &one, &one, c_desc);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n PDTRMM from ScaLAPACK. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
///// test Cannon's ///////////////////////////////////////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
d_Cannons_Mult2(a, b, np_rows, np_cols, my_prow, my_pcol, a_desc, c, mpi_comm_cols, mpi_comm_rows);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n Cannon's algorithm. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n ", localTime, avTime, maxTime);
///// test ELPA ///////////////////////////////////////////////////////////////////////////////////////////
pdtran_(&na, &na, &done, a_copy, &one, &one, a_desc, &dzero, a_t, &one, &one, a_desc);
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
elpa_mult_at_b_real_double('U', 'U', na, na, b, na_rows, na_cols, a_t, na_rows, na_cols, nblk, mpi_comm_rows, mpi_comm_cols, work, na_rows, na_cols);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n elpa_mult_at_b_real_double(U,U). 0 proc in: %lf, average over procs = %lf, max = %lf\n\n ", localTime, avTime, maxTime);
pdtran_(&na, &na, &done, work, &one, &one, a_desc, &dzero, c2, &one, &one, a_desc);
for(i = 0; i < na_rows; i++)
for(j = 0; j < na_cols; j++)
{
i_global = np_rows*nblk*(i/nblk) + i%nblk + ((np_rows+my_prow)%np_rows)*nblk + 1;
j_global = np_cols*nblk*(j/nblk) + j%nblk + ((np_cols+my_pcol)%np_cols)*nblk + 1;
if(i_global < j_global)
{
c[i + j*na_rows] = 0;
c1[i + j*na_rows] = 0;
c2[i + j*na_rows] = 0;
}
}
/////check /////////////////////////////////////////////////////////////////////////////////////////////////
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c1[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and PDTRMM = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c1[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between ScaLAPACK and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
if(myid == 0)
printf("\n\nTest3 ___________________________________________________________________ \n");
for(i = 0; i < na_rows*na_cols; i++)
c[i] = 0;
for(i = 0; i < na_rows*na_cols; i++)
c2[i] = 0;
for(i = 0; i < na_rows*na_cols; i++)
c1[i] = a_copy[i];
///// test PDTRMM /////////////////////////////////////////////////////////////////////////////////////
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
pdtrmm_("R", "U", "N", "N", &na, &na, &done, b_copy, &one, &one, b_desc, c1, &one, &one, c_desc);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n PDTRMM from ScaLAPACK. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
///// test ELPA ///////////////////////////////////////////////////////////////////////////////////////////
pdtran_(&na, &na, &done, a_copy, &one, &one, a_desc, &dzero, a_t, &one, &one, a_desc);
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
elpa_mult_at_b_real_double('U', 'U', na, na, b, na_rows, na_cols, a_t, na_rows, na_cols, nblk, mpi_comm_rows, mpi_comm_cols, work, na_rows, na_cols);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n elpa_mult_at_b_real_double(U,U). 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
///// test Cannon's ///////////////////////////////////////////////////////////////////////////////
for(i = 0; i < na_rows*na_cols; i++)
c[i] = 0;
MPI_Barrier(MPI_COMM_WORLD);
startTime = MPI_Wtime();
d_Cannons_Mult2(a, b, np_rows, np_cols, my_prow, my_pcol, a_desc, c, mpi_comm_cols, mpi_comm_rows);
MPI_Barrier(MPI_COMM_WORLD);
endTime = MPI_Wtime();
localTime = endTime - startTime;
MPI_Reduce(&localTime, &avTime, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
avTime = avTime/nprocs;
MPI_Reduce(&localTime, &maxTime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("\n Cannon's algorithm. 0 proc in: %lf, average over procs = %lf, max = %lf\n\n", localTime, avTime, maxTime);
pdtran_(&na, &na, &done, work, &one, &one, a_desc, &dzero, c2, &one, &one, a_desc);
for(i = 0; i < na_rows; i++)
for(j = 0; j < na_cols; j++)
{
i_global = np_rows*nblk*(i/nblk) + i%nblk + ((np_rows+my_prow)%np_rows)*nblk + 1;
j_global = np_cols*nblk*(j/nblk) + j%nblk + ((np_cols+my_pcol)%np_cols)*nblk + 1;
if(i_global < j_global)
{
c[i + j*na_rows] = 0;
c1[i + j*na_rows] = 0;
c2[i + j*na_rows] = 0;
}
}
/////check /////////////////////////////////////////////////////////////////////////////////////////////////
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c1[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and PDTRMM = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between Cannon's and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
diff = 0;
diffSum = 0;
for(i = 0; i < na_rows*na_cols; i++)
diff = diff + fabs(c2[i]-c1[i]);
MPI_Reduce(&diff, &diffSum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
if(myid == 0)
printf("Summed difference between ScaLAPACK and ELPA = %.15e, average = %.15e\n", diffSum, diffSum/(na*na));
////////////////////////////////////////////////////////////////////////////////////// free memory ///////////////////////////////////////////////////
free(a);
free(a_desc);
free(b);
free(b_desc);
free(c);
free(c_desc);
free(work);
free(a_copy);
free(b_copy);
free(c1);
free(c2);
free(a_t);
#ifdef WITH_MPI
MPI_Finalize();
#endif
return 0;
}