#include <mpi.h> //#include <complex> #include "mkl.h" #include <stdio.h> #include <stdlib.h> #include <math.h> #include <string.h> #include <time.h> // float //#define BASE_PREC_FLOAT #define BASE_PREC_DOUBLE #if defined (BASE_PREC_FLOAT) #define T_base_precision float #elif defined (BASE_PREC_DOUBLE) #define T_base_precision double #endif #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? (a) : (b)) #if defined(USE_CUDA) #include <cuda_runtime.h> #define gpuErrCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line) { if (code != cudaSuccess) { fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); exit(code); } } #endif //______________________________________________ int debug_mode=0; int N; //problem size int nev; // warmup problem size enum matrix_types {Clement, Random_0_1, Random_m1_1, Sequence}; // symmetric Clement matrix enum matrix_types matrix_type = Random_0_1; /************ MPI ***************************/ int world_rank, world_size; int world_size_external; // for partial usage of the node /************ BLACS ***************************/ int NB=256; // Global column block size for partitioning the global matrix int ictxt; // ordinal number i of the "context", int nprow; // Number of total process rows in the process grid (equivalent to Pr) int npcol; // Number of total process columns in the process grid (equivalent to Pc) int myrow; // The calling process's row coordinate in the process grid int mycol; // The calling process's column coordinate in the process grid. int info=0; // Output integer argument of driver and computational routines indicating the success(0) or failure(1) of the routine int m_loc; // size of local matrices is m_loc x n_loc for NxN global matrix; size of local vectors is m_loc x 1 int n_loc; int m_loc_reduced; // size of local matrices is m_loc_reduced x n_loc_reduced for nev x nev global matrix int n_loc_reduced; int desc_N_N[9], desc_N_nev[9], desc_nev_nev[9]; #ifdef __cplusplus extern "C" { #endif // Cblacs declarations void Cblacs_pinfo(int*, int*); void Cblacs_get(int, int, int*); void Cblacs_gridinit(int*, const char*, int, int); void Cblacs_pcoord(int, int, int*, int*); void Cblacs_gridexit(int); void Cblacs_barrier(int, const char*); int numroc_(int*, int*, int*, int*, int*); void Cblacs_gridinfo(int, int*, int*, int*, int*); void descinit_ (MKL_INT *desc, const MKL_INT *m, const MKL_INT *n, const MKL_INT *mb, const MKL_INT *nb, const MKL_INT *irsrc, const MKL_INT *icsrc, const MKL_INT *ictxt, const MKL_INT *lld, MKL_INT *info); // P BLAS declarations void pdgemm_ ( const char *transa , const char *transb , const MKL_INT *m , const MKL_INT *n , const MKL_INT *k , const double *alpha , const double *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const double *b , const MKL_INT *ib , const MKL_INT *jb , const MKL_INT *descb , const double *beta , double *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc ); void pdscal_ ( const MKL_INT *n , const double *a , double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx ); void pdaxpy_ ( const MKL_INT *n , const double *a , const double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx , double *y , const MKL_INT *iy , const MKL_INT *jy , const MKL_INT *descy , const MKL_INT *incy ); void pdnrm2_ ( const MKL_INT *n , double *norm2 , const double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx ); void psgeadd_ ( const char *trans , const MKL_INT *m , const MKL_INT *n , const float *alpha , const float *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const float *beta , float *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc ); void pdgeadd_ ( const char *trans , const MKL_INT *m , const MKL_INT *n , const double *alpha , const double *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const double *beta , double *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc ); //void cblas_dgemm ( const CBLAS_LAYOUT Layout , const CBLAS_TRANSPOSE transa , const CBLAS_TRANSPOSE transb , const MKL_INT m , const MKL_INT n , const MKL_INT k , const double alpha , const double *a , const MKL_INT lda , const double *b , const MKL_INT ldb , const double beta , double *c , const MKL_INT ldc ); //void RefSyEv(T_high_precision *A, T_high_precision *X, T_high_precision *lambda, int N, int nev, int m_loc, int n_loc, int m_loc_reduced, int n_loc_reduced, int *desc_N_N, int *desc_N_nev, int *desc_nev_nev, int myrow, int mycol, int nprow, int npcol, int world_rank, int NB, int debug_mode); #ifdef __cplusplus } #endif void createCublasHandle(); //____________________________________________________________________________________________ int LoadCblacs(int N) // return 0 (for successfull) and 1 (for unsuccessful) initialization { /* if (world_size== 2) {nprow = 2; npcol = 1;} if (world_size==12) {nprow = 3; npcol = 4;} if (world_size==24) {nprow = 4; npcol = 6;} if (world_size==36) {nprow = 6; npcol = 6;} */ if (world_size_external== 2) {nprow = 1; npcol = 2;} if (world_size_external== 8) {nprow = 2; npcol = 4;} else if (world_size_external==72) {nprow = 8; npcol = 9;} else { nprow=sqrt(world_size_external); // Number of process rows in the process grid (equivalent to Pr) if (nprow>1 && NB*nprow>N) nprow = N/NB; npcol=nprow; // Number of process columns in the process grid (equivalent to Pc) } if (nprow==1 && npcol==1) NB=N; if (world_rank==0) printf("N= %d, nprow=%d, npcol=%d, NB=%d \n", N, nprow, npcol, NB); Cblacs_pinfo( &world_rank, &world_size ) ; // Routine is used when some initial system information is required before the BLACS are set up Cblacs_get( -1, 0, &ictxt ); Cblacs_gridinit( &ictxt, "Row", nprow, npcol ); Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol ); if (!((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol))) return 1; // NUMROC computes the NUMber of Rows Or Columns of a distributed (small local) matrix owned by the process indicated by myrow,mycol int iZERO=0; m_loc = numroc_( &N, &NB, &myrow, &iZERO, &nprow ); // size of local matrix is m_loc x n_loc n_loc = numroc_( &N, &NB, &mycol, &iZERO, &npcol ); //Cblacs_barrier(ictxt, "All"); // DESCINIT initializes the descriptor vector with the 8 input arguments: M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD. descinit_(desc_N_N, &N, &N , &NB, &NB, &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc -- is local leading dimension //descinit_(desc_N_nev, &N, &nev, &NB, &NB, &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc -- is local leading dimension nev = MIN(N, 10240); descinit_(desc_nev_nev, &nev, &nev, &NB, &NB, &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc_reduced -- is local leading dimension //descinit_(descVector, &N, &iONE, &NB, &iONE, &iZERO, &iZERO, &ictxt, &m_loc, &info); // print parameters of the processes if (debug_mode==1) { int iter_rank; for(iter_rank=0; iter_rank<world_size; iter_rank++) { if (iter_rank==world_rank) printf("load_cblacs.h: Proc-%d, m_loc= %d, n_loc=%d \n", world_rank, m_loc, n_loc); //Cblacs_barrier(ictxt, "All"); } } else if (world_rank==0) printf("load_cblacs.h: Proc-%d, m_loc= %d, n_loc=%d \n", world_rank, m_loc, n_loc); return 0; } //___________________________________________________________________________ void PrintMatrix(T_base_precision *Mat, int n_rows, int n_cols) { int i,j; for(i=0; i<n_rows; i++) { for(j=0; j<n_cols; j++) { printf("%.16g\t", Mat[i+j*n_rows]); } printf("\n"); } } //___________________________________________________________________________ T_base_precision function_H (int I_gl, int J_gl) { T_base_precision func_H=0; if (matrix_type==Clement) { if (I_gl==J_gl+1 || J_gl==I_gl+1) { int K = MIN(I_gl, J_gl); func_H = (T_base_precision) sqrtl((K+1) * (N - K-1)); } } else if (matrix_type==Sequence) { if (J_gl>I_gl) return function_H (J_gl, I_gl); return I_gl + N*J_gl + 1; } else if (matrix_type==Random_0_1) { func_H = (double)(rand())/RAND_MAX; //if (abs(I_gl-J_gl)!=1) func_H = double(rand())/RAND_MAX; //if (!(I_gl==0&&J_gl==1 || I_gl==1&&J_gl==0)) func_H = double(rand())/RAND_MAX; } else if (matrix_type==Random_m1_1) { func_H = (2.0*rand())/RAND_MAX - 1.0; } else { printf("matrix_type=%d is not supported", matrix_type); exit(1); } return func_H; } //___________________________________________________________________________ int main(int argc, char** argv) { /************ MPI ***************************/ MPI_Init( &argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &world_rank); MPI_Comm_size(MPI_COMM_WORLD, &world_size); if (world_rank==0) printf("world_size=%d \n", world_size); world_size_external = world_size; /************ --- ***************************/ if (argc==1) // one argument was provided: filename (default) { N = 10; } if (argc==2) // two arguments were provided: filename (default), N { N = atoi(argv[1]); } if (argc==3) // two arguments were provided: filename (default), N, NB { N = atoi(argv[1]); NB = atoi(argv[2]); } if (argc==4) // two arguments were provided: filename (default), N, NB, world_size_external { N = atoi(argv[1]); NB = atoi(argv[2]); world_size_external = atoi(argv[3]); } if (world_rank==0) printf("world_size_external=%d \n", world_size_external); if (N>=100) debug_mode=0; //____________________________________________ // write output into file for debugging //std::string OutputName = "output-"+std::to_string(world_rank)+".txt"; //if (debug_mode==1 && world_size>1) freopen(OutputName.c_str(), "w", stdout); //____________________________________________ clock_t t0, t1; //double t_gemm; //std::mt19937 gen(1337.0); //std::mt19937 gen(1.0); //std::normal_distribution<> d; //std::uniform_real_distribution<> uniform_distr(-1.0, 1.0); //std::uniform_real_distribution<> uniform_distr(0, 1.0); //srand(time(NULL)); srand(1+world_rank); // seeding //double r = double(rand())/RAND_MAX; // Returns a pseudo-random integer between 0 and 1 /*parameters of block-cyclic data layout*/ int irsrc = 0; int icsrc = 0; int dims[2]; dims[0] = dims[1] = 0; //MPI proc grid = dims[0] x dims[1] MPI_Dims_create(world_size, 2, dims); //---------------------------------------------------- // 0. Setup blacs LoadCblacs(N); if (!((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol))) // checks whether the process is in the grid. { printf("bye-bye from process-%d \n", world_rank); MPI_Finalize(); return 0; } #if defined (USE_CUDA) createCublasHandle(); #endif // 1. Fill the local matrix t0 = clock(); //vector<T_base_precision> H_loc = vector<T_base_precision>(m_loc*n_loc, 0); T_base_precision *A_loc, *B_loc, *C_loc; A_loc = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision)); B_loc = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision)); C_loc = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision)); int i_loc, j_loc; for(i_loc=0; i_loc<m_loc; i_loc++) // iteration over the first state { int l_1 = i_loc/NB; // local coord of the (NBxNB) block among other blocks int x_1 = i_loc%NB; // local coord within the block int I_gl= (l_1*nprow + myrow)*NB + x_1; // gl-global; nprow = "P_r"; myrow="p_r" (quoted-ScaLAPACK userguide notation, p.88-90) for(j_loc=0; j_loc<n_loc; j_loc++) // iteration over the second state { int l_2 = j_loc/NB; // local coord of the (NBxNB) block among other blocks int x_2 = j_loc%NB; // local coord within the block int J_gl= (l_2*npcol + mycol)*NB + x_2; //H_loc[i_loc+j_loc*m_loc] = H[I_gl+J_gl*N]; A_loc[i_loc+j_loc*m_loc] = function_H(I_gl, J_gl); B_loc[i_loc+j_loc*m_loc] = function_H(I_gl, J_gl); } } t1 = clock(); //t_gemm=(double)(t1-t0)/CLOCKS_PER_SEC; if (world_rank==0 || debug_mode==1) printf("Matrix creation time: %f sec.\n", (double)(t1-t0)/CLOCKS_PER_SEC); #ifdef PIN_MATRICES gpuErrCheck( cudaHostRegister(A_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) ); gpuErrCheck( cudaHostRegister(B_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) ); gpuErrCheck( cudaHostRegister(C_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) ); #endif if (debug_mode==1) { printf("A_loc: \n"); PrintMatrix(A_loc, m_loc, n_loc); printf("B_loc: \n"); PrintMatrix(B_loc, m_loc, n_loc); } // 2. Call ScaLAPACK routines (perform the actual matrix diagonalization) T_base_precision real_minus_ONE = (T_base_precision)(-1.0); T_base_precision real_ONE = (T_base_precision)( 1.0); T_base_precision real_ZERO = (T_base_precision)(0.0); int iONE = 1; t0 = clock(); #if defined (BASE_PREC_DOUBLE) pdgemm_ ("N", "N", &nev , &nev, &nev, &real_ONE, A_loc, &iONE, &iONE, desc_nev_nev, B_loc , &iONE, &iONE, desc_nev_nev, &real_ZERO, C_loc, &iONE, &iONE, desc_nev_nev); // C = A*B #endif t1 = clock(); double t_gemm_warmup=(double)(t1-t0)/CLOCKS_PER_SEC; if (world_rank==0 || debug_mode==1) printf("Warmup GEMM TN time: %f sec.\n", t_gemm_warmup); t0 = clock(); #if defined (BASE_PREC_DOUBLE) pdgemm_ ("T", "N", &N , &N, &N, &real_ONE, A_loc, &iONE, &iONE, desc_N_N, B_loc , &iONE, &iONE, desc_N_N, &real_ZERO, C_loc, &iONE, &iONE, desc_N_N); // C = A*B #endif t1 = clock(); double t_gemm_tn=(double)(t1-t0)/CLOCKS_PER_SEC; if (world_rank==0 || debug_mode==1) printf("GEMM TN time: %f sec.\n", t_gemm_tn); t0 = clock(); #if defined (BASE_PREC_DOUBLE) pdgemm_ ("N", "N", &N , &N, &N, &real_ONE, A_loc, &iONE, &iONE, desc_N_N, B_loc , &iONE, &iONE, desc_N_N, &real_ZERO, C_loc, &iONE, &iONE, desc_N_N); // C = A*B #endif t1 = clock(); double t_gemm_nn=(double)(t1-t0)/CLOCKS_PER_SEC; if (world_rank==0 || debug_mode==1) printf("GEMM NN time: %f sec.\n", t_gemm_nn); // 3. Finalize if (debug_mode==1) { printf("C:\n"); PrintMatrix(C_loc, N, N); } /* if (world_rank==0 || debug_mode==1) { printf("\n N, t_elpa_float, t_elpa_double, t_refinement(s)\n"); printf("%d,\t%f,\t%f", N, t_elpa_float, t_elpa_double); for(int iter=0; iter<n_iter_ref; iter++) printf(",\t%f", t_refinement[iter]); printf("\n\n Done! \n"); // write to file std::ofstream myfile; myfile.open ("s_data.txt", std::ios_base::app | std::ios_base::out); // add to the current file (not rewrite) myfile << std::fixed; myfile << N << ",\t\t" << t_elpa_float << ",\t" << t_elpa_double; for(int iter=0; iter<n_iter_ref; iter++) myfile << ",\t" << t_refinement[iter]; myfile << std::endl; myfile.close(); } */ #ifdef PIN_MATRICES gpuErrCheck( cudaHostUnregister(A_loc) ); gpuErrCheck( cudaHostUnregister(B_loc) ); gpuErrCheck( cudaHostUnregister(C_loc) ); if (world_rank==0) printf("ScaLAPACK matrices A_loc, B_loc, C_loc on CPUs are pinned\n"); #endif free(A_loc); free(B_loc); free(C_loc); if (world_rank==0 || debug_mode==1) { printf("\n N, world_size_external, t_gemm_nn, t_gemm_tn\n"); printf("%d,\t\t%d,\t\t%f,\t%f\n", N, world_size_external, t_gemm_nn, t_gemm_tn); FILE *f = fopen("s_data.txt", "a"); // a=append if (f == NULL) { printf("Error opening file!\n"); exit(1); } fprintf(f, "%d,\t\t%d,\t\t%f,\t%f\n", N, world_size_external, t_gemm_nn, t_gemm_tn); // write to file /* std::ofstream myfile; myfile.open ("s_data.txt", std::ios_base::app | std::ios_base::out); // add to the current file (not rewrite) myfile << std::fixed; myfile << N << ",\t\t" << t_gemm_nn << ",\t" << t_gemm_tn; myfile << std::endl; myfile.close(); */ } if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)) Cblacs_gridexit(0); // leave grid if we were in one MPI_Finalize(); }