// cannon_back_template.c — Cannon's algorithm for Res = U*B (back transformation), precision-templated via ELPA_IMPL_SUFFIX
// it seems that we need those two levels of indirection to correctly expand macros
#define cannons_triang_rectangular_impl_expand2(SUFFIX) cannons_triang_rectangular_##SUFFIX
#define cannons_triang_rectangular_impl_expand1(SUFFIX) cannons_triang_rectangular_impl_expand2(SUFFIX)
#define cannons_triang_rectangular_impl cannons_triang_rectangular_impl_expand1(ELPA_IMPL_SUFFIX)

#define cannons_triang_rectangular_c_impl_expand2(SUFFIX) cannons_triang_rectangular_c_##SUFFIX
#define cannons_triang_rectangular_c_impl_expand1(SUFFIX) cannons_triang_rectangular_c_impl_expand2(SUFFIX)
#define cannons_triang_rectangular_c_impl cannons_triang_rectangular_c_impl_expand1(ELPA_IMPL_SUFFIX)

void cannons_triang_rectangular_impl(math_type* U, math_type* B, int np_rows, int np_cols, int my_prow, int my_pcol, int* U_desc, int*b_desc, math_type *Res, MPI_Comm row_comm, MPI_Comm col_comm)
{
   // Cannon's algorithm, non-blocking version.
   // Input:
   // - U is an upper triangular matrix (block-cyclic, descriptor U_desc)
   // - B is a rectangular matrix (block-cyclic, descriptor b_desc)
   // Output:
   // - Res is a full rectangular matrix, Res = U*B
   // row_comm: communicator along rows of the process grid
   // col_comm: communicator along columns of the process grid
   // Used for the back transformation of eigenvectors.
   //
   // The algorithm assumes np_rows <= np_cols (ratio = np_cols/np_rows >= 1).
   // Each send buffer for U carries two extra trailing elements holding, as
   // math_type values, the number of columns and rows stored in the buffer
   // (in-band metadata decoded on the receiving side).

   int na, nb, nblk, width, na_rows, na_cols, nb_cols, cols_in_buffer_U_my_initial, cols_in_buffer_U, rows_in_buffer_U, Size_receive_U_now, rows_in_buffer_U_now, cols_in_buffer_U_now, rows_in_buffer_U_my_initial;

   int i, j, Size_send_U, Size_receive_U, Size_send_B, Size_receive_B, intNumber, Buf_rows, Buf_cols_U, Buf_cols_B, curr_rows, num_of_iters, cols_in_buffer, rows_in_block, curr_col_loc, cols_in_block, num_of_blocks_in_U_buffer, col_of_origin_U, b_rows_mult, b_cols_mult;

   math_type *Buf_to_send_U, *Buf_to_receive_U, *Buf_to_send_B, *Buf_to_receive_B, *Buf_U, *PosBuff;

   int where_to_send_U, from_where_to_receive_U, where_to_send_B, from_where_to_receive_B, last_proc_col_B, last_proc_row_B, n, Size_U_stored, proc_col_min;

   math_type *U_local_start, *Buf_pos, *B_local_start, *double_ptr, *CopyTo, *CopyFrom;

   int ratio;

   MPI_Status status;

   // constants for BLAS/ScaLAPACK calls (passed by reference)
   // NOTE(review): 'width', 'Buf_cols_U', 'cols_in_buffer', 'n', 'b_cols_mult',
   // 'PosBuff', 'one' and 'dzero' are declared but unused in this template.
   int one = 1;
   int zero = 0;
   math_type done = 1.0;
   math_type dzero = 0.0;

   // global sizes and block size from the ScaLAPACK-style descriptors
   na = U_desc[2];
   nblk = U_desc[4];
   nb = b_desc[3];

   // local dimensions of U and B on this process
   na_rows = numroc_(&na, &nblk, &my_prow, &zero, &np_rows);
   na_cols = numroc_(&na, &nblk, &my_pcol, &zero, &np_cols);
   nb_cols = numroc_(&nb, &nblk, &my_pcol, &zero, &np_cols);

   MPI_Request request_U_Recv;
   MPI_Request request_U_Send;
   MPI_Request request_B_Recv;
   MPI_Request request_B_Send;

   ///////////////////////////////////////////////////////////////// Start of algorithm ///////////////////////////////////////////////////////////////////////////////////////////////

   // process row/column owning the last (possibly partial) block of B
   last_proc_col_B = ((nb-1)/nblk) % np_cols;
   last_proc_row_B = ((na-1)/nblk) % np_rows;

   ///////////////////////// memory allocation area //////////////////////////////////////////////////////////////
   // B receive buffers must be large enough for the largest local portion of
   // any process in the same row/column (incoming shifted data may be bigger
   // than our own local portion), hence the + nblk padding cases.
   if(nb%nblk == 0)
      if(my_pcol <= last_proc_col_B)
         Buf_cols_B = nb_cols;
      else
         Buf_cols_B = nb_cols + nblk;
   else
      if(my_pcol < last_proc_col_B)
         Buf_cols_B = nb_cols;
      else if(my_pcol > last_proc_col_B)
         Buf_cols_B = nb_cols + nblk;
      else  // my_pcol == last_proc_col_B
         Buf_cols_B = nb_cols + nblk - nb_cols%nblk;

   if(na%nblk == 0)
      if(my_prow <= last_proc_row_B)
         Buf_rows = na_rows;
      else
         Buf_rows = na_rows + nblk;
   else
      if(my_prow < last_proc_row_B)
         Buf_rows = na_rows;
      else if(my_prow > last_proc_row_B)
         Buf_rows = na_rows + nblk;
      else  // my_prow == last_proc_row_B
         Buf_rows = na_rows + nblk - na_rows%nblk;

   ratio = np_cols/np_rows;   // assumed >= 1 (np_cols >= np_rows)

   intNumber = ceil((math_type)na/(math_type)(np_cols*nblk));   // max. possible number of the local block columns of U
   Size_U_stored = ratio*nblk*nblk*intNumber*(intNumber+1)/2 + 2;   // number of local elements from the upper triangular part that every proc. has (max. possible value among all the procs.); +2 for the in-band metadata

   // NOTE(review): malloc results are not checked for NULL before use.
   Buf_to_send_U = malloc(ratio*Size_U_stored*sizeof(math_type));
   Buf_to_receive_U = malloc(ratio*Size_U_stored*sizeof(math_type));
   Buf_to_send_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
   Buf_to_receive_B = malloc(Buf_cols_B*Buf_rows*sizeof(math_type));
   if(ratio != 1)
      Buf_U = malloc(Size_U_stored*sizeof(math_type));   // in this case we will receive data into an initial buffer and then place block-rows at the needed positions of the buffer used for calculation

   // Res accumulates GEMM results (beta = 1.0), so it must start at zero.
   for(i = 0; i < na_rows*nb_cols; i++)
      Res[i] = 0;

   /////////////////////////////////////////////////////////////// initial reordering of U /////////////////////////////////////////////////////////////////////////////////////////
   // here we assume that np_rows < np_cols; then I will send to 'ratio' processors with a "leap" equal to np_rows; the same holds for receive
   if((ratio != 1)||(my_prow != 0))   // if grid is rectangular or my_prow is not 0
      Buf_pos = Buf_to_send_U;       // I will copy to the send buffer
   else
      Buf_pos = Buf_to_receive_U;    // if grid is square and my_prow is 0, then I will copy directly to the receive buffer (no communication needed)

   // form array to send by block-columns; we need only the upper triangular part
   // find the first local block belonging to the upper part of matrix U
   if(my_pcol >= my_prow)   // if I am in the upper part of the proc. grid
      curr_col_loc = 0;     // my first local block-column has a block from the upper part of the matrix
   else
      curr_col_loc = 1;     // ceil((my_prow - my_pcol)/np_cols) always gives 1 since np_cols > np_rows

   num_of_iters = ceil((math_type)na_cols/(math_type)nblk);   // number of my local block-columns
   num_of_iters = num_of_iters - curr_col_loc;                // exclude the first block-columns that have no blocks from the upper part of U
   curr_col_loc = curr_col_loc*nblk;                          // local (element) index of the found block-column

   // length of the upper-triangular part of my first relevant block-column
   if(my_pcol >= my_prow )
      rows_in_block = ceil(((math_type)(my_pcol + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;
   else
      rows_in_block = ratio*nblk;
   cols_in_buffer_U_my_initial = 0;
   Size_send_U = 0;
   for(i = 0; i < num_of_iters; i++)   // loop over my block-columns which have blocks in the upper part of U
   {
      if(rows_in_block > na_rows)
         rows_in_block = na_rows;

      if ((na_cols - curr_col_loc) < nblk)
         cols_in_block = na_cols - curr_col_loc;   // how many columns I have in the current (possibly partial) block-column
      else
         cols_in_block = nblk;

      if((rows_in_block > 0)&&(cols_in_block > 0))
      {
         double_ptr = &U[curr_col_loc*na_rows];   // pointer to the start of the current block-column to be copied to the buffer
         C_LACPY("A", &rows_in_block, &cols_in_block, double_ptr, &na_rows, Buf_pos, &rows_in_block);   // copy the upper part of the block-column into the buffer, with LDA = length of the upper part of the block-column (packed)
         Buf_pos = Buf_pos + rows_in_block*cols_in_block;   // advance to where the next block-column will be copied
         Size_send_U = Size_send_U + rows_in_block*cols_in_block;
         cols_in_buffer_U_my_initial = cols_in_buffer_U_my_initial + cols_in_block;
      }
      curr_col_loc = curr_col_loc + nblk;            // go to my next local block-column of U
      rows_in_block = rows_in_block + ratio*nblk;    // each subsequent block-column's upper part is ratio*nblk rows longer
   }
   rows_in_buffer_U_my_initial = rows_in_block - ratio*nblk;   // undo the redundant addition from the last loop pass
   // append in-band metadata: number of columns and rows stored in the buffer
   // (needed for the multiplications on the other processors)
   *Buf_pos = (math_type)cols_in_buffer_U_my_initial;
   Buf_pos = Buf_pos + 1;
   *Buf_pos = (math_type)rows_in_buffer_U_my_initial;
   Size_send_U = Size_send_U + 2;

   // now we have the local buffer to send
   // find the lowest processor column among those who will send to me
   proc_col_min = np_cols;
   for(i = 0; i < ratio; i++)
   {
      from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;
      if(from_where_to_receive_U < proc_col_min)
         proc_col_min = from_where_to_receive_U;
   }

   // do communications and form local buffers for calculations
   Size_receive_U = 0;     // size of the accumulated buffer
   cols_in_buffer_U = 0;   // number of columns in the accumulated buffer
   rows_in_buffer_U = 0;   // number of rows in the accumulated buffer (max over contributions)
   for(i = 0; i < ratio; i++)
   {
      where_to_send_U = (my_pcol - my_prow - i*np_rows + np_cols)%np_cols;
      from_where_to_receive_U = (my_pcol + my_prow + i*np_rows)%np_cols;

      // send and receive in the row_comm
      if(ratio != 1)   // if grid is not square
      {
         if(where_to_send_U != my_pcol)   // if I need to send and receive on this step
         {
            MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, Buf_U, Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U_now);
            Size_receive_U = Size_receive_U + Size_receive_U_now - 2;   // we need only the number of data elements, so exclude the cols_in_buffer_U / rows_in_buffer_U metadata

            // decode the in-band metadata appended by the sender
            cols_in_buffer_U_now = Buf_U[Size_receive_U_now - 2];
            cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;
            rows_in_buffer_U_now = Buf_U[Size_receive_U_now - 1];
            if(rows_in_buffer_U < rows_in_buffer_U_now)
               rows_in_buffer_U = rows_in_buffer_U_now;

            intNumber = from_where_to_receive_U/np_rows;   // how many processors will send me blocks that must be placed before the current blocks
            if(proc_col_min >= my_prow)   // if among the procs who will send to me there is one with the full set of block-rows in the upper part
               CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];   // destination offset; formula based on an arithmetic progression of block sizes
            else                          // if among the procs who will send to me there is one from the lower part of the grid
               if(from_where_to_receive_U < my_prow)   // if I have just received from such a processor from the lower part
                  CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];   // copy its first block after the first blocks of the other procs that will send later (its first block-column is in the lower part of the matrix)
               else
                  CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
            CopyFrom = Buf_U;
         }
         else   // if I "send to myself" on this step: copy from Buf_to_send_U to Buf_to_receive_U
         {
            cols_in_buffer_U_now = cols_in_buffer_U_my_initial;
            cols_in_buffer_U = cols_in_buffer_U + cols_in_buffer_U_now;

            rows_in_buffer_U_now = rows_in_buffer_U_my_initial;
            if(rows_in_buffer_U < rows_in_buffer_U_now)
               rows_in_buffer_U = rows_in_buffer_U_now;

            intNumber = my_pcol/np_rows;   // how many processors will send me blocks that must be placed before the current blocks
            if(proc_col_min >= my_prow)   // if among the procs who will send to me there is one with the full set of block-rows in the upper part
               CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber + 1)/2];   // destination offset; formula based on an arithmetic progression
            else                          // if among the procs who will send to me there is one from the lower part of the grid
               if(my_pcol < my_prow)   // if I have just "received" from such a processor (in this case it is me)
                  CopyTo = &Buf_to_receive_U[nblk*nblk*ratio*(ratio - 1)/2];   // copy my first block after the first blocks of the other procs that will send later
               else
                  CopyTo = &Buf_to_receive_U[nblk*nblk*intNumber*(intNumber - 1)/2];
            CopyFrom = Buf_to_send_U;

            Size_receive_U = Size_receive_U + Size_send_U - 2;
         }

         // copy by block-columns, interleaving contributions from the 'ratio' senders
         intNumber = ceil((math_type)cols_in_buffer_U_now/(math_type)nblk);   // how many block-columns I have received on this iteration
         if(from_where_to_receive_U >= my_prow)
            rows_in_block = ceil(((math_type)(from_where_to_receive_U + 1) - (math_type)my_prow)/(math_type)np_rows)*nblk;   // number of rows in the first block-column of the U buffer
         else
            rows_in_block = ratio*nblk;
         for(j = 0; j < intNumber; j++)
         {
            if((j+1)*nblk < cols_in_buffer_U_now)
               cols_in_block = nblk;
            else
               cols_in_block = cols_in_buffer_U_now - j*nblk;

            C_LACPY("A", &rows_in_block, &cols_in_block, CopyFrom, &rows_in_block, CopyTo, &rows_in_block);

            CopyFrom = CopyFrom + rows_in_block*cols_in_block;
            CopyTo = CopyTo + ratio*rows_in_block*nblk + nblk*nblk*ratio*(ratio-1)/2;   // leave room for 'ratio' block-columns of the other procs, of lengths rows_in_block, (rows_in_block+nblk), (rows_in_block+2*nblk) and so on
            rows_in_block = rows_in_block + ratio*nblk;   // number of rows in the next block-column
            if(rows_in_block > rows_in_buffer_U_now)
               rows_in_block = rows_in_buffer_U_now;
         }
      }
      else   // if grid is square
      {
         if(my_prow > 0)
         {
            MPI_Sendrecv(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, Buf_to_receive_U, Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &status);
            MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U);
            cols_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-2];
            rows_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-1];
         }
         else   // if my_prow == 0, then I already have everything in my Buf_to_receive_U buffer (copied there above)
         {
            Size_receive_U = Size_send_U;
            rows_in_buffer_U = rows_in_buffer_U_my_initial;
            cols_in_buffer_U = cols_in_buffer_U_my_initial;
         }
      }
   }
   if(ratio != 1)
   {
      // re-append the accumulated metadata for the merged buffer
      Buf_to_receive_U[Size_receive_U] = cols_in_buffer_U;
      Buf_to_receive_U[Size_receive_U + 1] = rows_in_buffer_U;
      Size_receive_U = Size_receive_U + 2;
   }

   ////////////////////////////////////////////////////////////// initial reordering of B /////////////////////////////////////////////////////////////////////////////////////////
   if(my_pcol > 0)
   {
      // NOTE(review): the offset uses np_cols but wraps modulo np_rows — this
      // matches the original template; verify the intent against the Cannon
      // skew formula before changing anything here.
      where_to_send_B = (my_prow - my_pcol + np_cols)%np_rows;   // shift = my_pcol
      from_where_to_receive_B = (my_pcol + my_prow)%np_rows;

      // send and receive in the col_comm
      if(where_to_send_B != my_prow)   // for rectangular proc grids it may happen that I need to "send to myself"; if that is not the case, then I send
      {
         // form the array to send
         C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_send_B, &na_rows);
         MPI_Sendrecv(Buf_to_send_B, nb_cols*na_rows, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_B, 0, Buf_to_receive_B, nb_cols*Buf_rows, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_B, 0, col_comm, &status);
         MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_B);   // find out how many elements I have received
         Size_receive_B = Size_receive_B/nb_cols;   // how many rows I have received
      }
      else
      {
         C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);   // otherwise copy the data as if I had "received" it
         Size_receive_B = na_rows;
      }
   }
   else
   {
      C_LACPY("A", &na_rows, &nb_cols, B, &na_rows, Buf_to_receive_B, &na_rows);   // I am in proc row 0, so no send needed; copy the data as if I had "received" it
      Size_receive_B = na_rows;
   }

   //////////////////////////////////////////////////////////////////////// main loop ////////////////////////////////////////////////////////////////////////////////
   // neighbours for the cyclic shifts: U moves left along rows, B moves up along columns
   where_to_send_U = (my_pcol - 1 + np_cols)%np_cols;
   from_where_to_receive_U = (my_pcol + 1)%np_cols;
   where_to_send_B = (my_prow - 1 + np_rows)%np_rows;
   from_where_to_receive_B = (my_prow + 1)%np_rows;

   for(i = 1; i < np_rows; i++)
   {
      // at this moment I need to send my neighbour what I have in the "receive" arrays;
      // therefore swap the pointers of the "receive" and "send" arrays
      double_ptr = Buf_to_send_U;
      Buf_to_send_U = Buf_to_receive_U;
      Buf_to_receive_U = double_ptr;

      double_ptr = Buf_to_send_B;
      Buf_to_send_B = Buf_to_receive_B;
      Buf_to_receive_B = double_ptr;

      Size_send_U = Size_receive_U;
      Size_send_B = Size_receive_B;

      ///// shift for U ////////////////////////////////////////////////////////////
      // non-blocking, so the GEMM below overlaps with communication
      MPI_Isend(Buf_to_send_U, Size_send_U, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_U, 0, row_comm, &request_U_Send);
      MPI_Irecv(Buf_to_receive_U, ratio*Size_U_stored, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_U, 0, row_comm, &request_U_Recv);

      ///// shift for B /////////////////////////////////////////////
      MPI_Isend(Buf_to_send_B, Size_send_B*nb_cols, MPI_MATH_DATATYPE_PRECISION_C, where_to_send_B, 0, col_comm, &request_B_Send);
      MPI_Irecv(Buf_to_receive_B, Buf_rows*nb_cols, MPI_MATH_DATATYPE_PRECISION_C, from_where_to_receive_B, 0, col_comm, &request_B_Recv);

      ///// multiplication ////////////////////////////////////////////////////////////////////////////////////////////
      // decode the metadata of the buffer we are about to multiply with
      // (Size_receive_U still describes Buf_to_send_U after the pointer swap)
      cols_in_buffer_U = (int)Buf_to_send_U[Size_receive_U-2];
      rows_in_buffer_U = (int)Buf_to_send_U[Size_receive_U-1];

      // find the minimal proc. column among those procs who contributed to the current U buffer
      proc_col_min = np_cols;
      for(j = 0; j < ratio; j++)
      {
         col_of_origin_U = (my_pcol + my_prow + i - 1 + j*np_rows)%np_cols;
         if(col_of_origin_U < proc_col_min)
            proc_col_min = col_of_origin_U;
      }
      col_of_origin_U = proc_col_min;

      num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);

      // if the buffer originates below the diagonal, its first block-row of B is skipped
      if (col_of_origin_U >= my_prow)
         B_local_start = Buf_to_send_B;
      else
         B_local_start = Buf_to_send_B + nblk;

      U_local_start = Buf_to_send_U;

      for(j = 0; j < num_of_blocks_in_U_buffer; j++)
      {
         // rows in the j-th packed block-column of U (capped by the buffer height)
         curr_rows = (j+1)*nblk;
         if (curr_rows > rows_in_buffer_U)
            curr_rows = rows_in_buffer_U;

         // rows of B that this block-column multiplies (last block may be partial)
         if((j+1)*nblk <= cols_in_buffer_U)
            b_rows_mult = nblk;
         else
            b_rows_mult = cols_in_buffer_U - j*nblk;

         // Res += U_block * B_block (beta = 1.0 accumulates across iterations)
         C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);

         U_local_start = U_local_start + nblk*curr_rows;
         B_local_start = B_local_start + nblk;
      }

      // complete the shifts before the next iteration reuses the buffers
      MPI_Wait(&request_U_Send, &status);
      MPI_Wait(&request_U_Recv, &status);
      MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_U);   // find out how many elements I have received

      MPI_Wait(&request_B_Send, &status);
      MPI_Wait(&request_B_Recv, &status);
      MPI_Get_count(&status, MPI_MATH_DATATYPE_PRECISION_C, &Size_receive_B);   // find out how many elements I have received
      Size_receive_B = Size_receive_B/nb_cols;   // how many rows I have received
   }

   // last iteration: multiply with the final received buffers (no further shift)
   cols_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-2];
   rows_in_buffer_U = (int)Buf_to_receive_U[Size_receive_U-1];

   // find the minimal proc. column among those procs who contributed to the current U buffer
   proc_col_min = np_cols;
   for(j = 0; j < ratio; j++)
   {
      col_of_origin_U = (my_pcol + my_prow + np_rows - 1 + j*np_rows)%np_cols;
      if(col_of_origin_U < proc_col_min)
         proc_col_min = col_of_origin_U;
   }
   col_of_origin_U = proc_col_min;

   num_of_blocks_in_U_buffer = ceil((math_type)cols_in_buffer_U/(math_type)nblk);

   if (col_of_origin_U >= my_prow)
      B_local_start = Buf_to_receive_B;
   else
      B_local_start = Buf_to_receive_B + nblk;

   U_local_start = Buf_to_receive_U;

   for(j = 0; j < num_of_blocks_in_U_buffer; j++)
   {
      curr_rows = (j+1)*nblk;
      if (curr_rows > rows_in_buffer_U)
         curr_rows = rows_in_buffer_U;

      if((j+1)*nblk <= cols_in_buffer_U)
         b_rows_mult = nblk;
      else
         b_rows_mult = cols_in_buffer_U - j*nblk;

      C_GEMM("N", "N", &curr_rows, &nb_cols, &b_rows_mult, &done, U_local_start, &curr_rows, B_local_start, &Size_receive_B, &done, Res, &na_rows);

      U_local_start = U_local_start + nblk*curr_rows;
      B_local_start = B_local_start + nblk;
   }

   free(Buf_to_send_U);
   free(Buf_to_receive_U);
   free(Buf_to_send_B);
   free(Buf_to_receive_B);
   if(ratio != 1)
      free(Buf_U);   // only allocated for rectangular grids
}

// C/Fortran interface wrapper: converts Fortran communicator handles, derives
// the grid coordinates, and calls the implementation above.
// NOTE(review): 'local_rows' and 'local_cols' are accepted but not used here.
void cannons_triang_rectangular_c_impl(math_type* U, math_type* B, int local_rows, int local_cols, int* u_desc, int* b_desc, math_type *Res, int row_comm, int col_comm)
{
#ifdef WITH_MPI
  MPI_Comm c_row_comm = MPI_Comm_f2c(row_comm);
  MPI_Comm c_col_comm = MPI_Comm_f2c(col_comm);

  int my_prow, my_pcol, np_rows, np_cols;
  MPI_Comm_rank(c_row_comm, &my_prow);
  MPI_Comm_size(c_row_comm, &np_rows);
  MPI_Comm_rank(c_col_comm, &my_pcol);
  MPI_Comm_size(c_col_comm, &np_cols);

  // BEWARE
  // in the Cannon algorithm, column and row communicators are exchanged:
  // what we usually call row_comm in ELPA is passed to the col_comm parameter
  // of the function and vice versa (the order is swapped in the following call).
  // It is a bit unfortunate; maybe it should be changed in the Cannon algorithm
  // to comply with the ELPA standard notation.
  cannons_triang_rectangular_impl(U, B, np_rows, np_cols, my_prow, my_pcol, u_desc, b_desc, Res, c_col_comm, c_row_comm);
#else
  printf("Internal error: Cannons algorithm should not be called without MPI, stopping...\n");
  exit(1);
#endif
}