Skip to content
Snippets Groups Projects
Select Git revision
  • 4a053716580ed55017388072200e9437546a887a
  • main default protected
2 results

main.c

Blame
  • Petr Karpov's avatar
    Petr Karpov authored
    6e037f30
    History
    main.c 15.95 KiB
    #include <mpi.h>
    //#include <complex>
    #include "mkl.h"
    
    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <string.h>
    #include <time.h>
    
    
    // float 
    //#define BASE_PREC_FLOAT
    
    #define BASE_PREC_DOUBLE
    
    #if defined (BASE_PREC_FLOAT)
    #define T_base_precision float
    #elif defined (BASE_PREC_DOUBLE)
    #define T_base_precision double
    #endif
    
    #define MAX(a,b) ((a) > (b) ? (a) : (b))
    #define MIN(a,b) ((a) < (b) ? (a) : (b))
    
    #if defined(USE_CUDA)
    #include <cuda_runtime.h>
    
    #define gpuErrCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    inline void gpuAssert(cudaError_t code, const char *file, int line)
    {
       if (code != cudaSuccess) 
       {
          fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
          exit(code);
       }
    }
    #endif
    //______________________________________________
    
    int debug_mode=0;
    
    int N; //problem size
    int nev; // warmup problem size
    
    enum matrix_types {Clement, Random_0_1, Random_m1_1, Sequence}; // symmetric Clement matrix
    enum matrix_types matrix_type = Random_0_1;
    
    
    /************   MPI  ***************************/
    
    int world_rank, world_size;
    int world_size_external; // for partial usage of the node
    
    /************  BLACS ***************************/
    
    int NB=256; // Global column block size for partitioning the global matrix
    int ictxt; // ordinal number i of the "context",
    int nprow; // Number of total process rows    in the process grid (equivalent to Pr)
    int npcol; // Number of total process columns in the process grid (equivalent to Pc)
    
    int myrow; // The calling process's row     coordinate in the process grid
    int mycol; // The calling process's column coordinate in the process grid.
    int info=0; // Output integer argument of driver and computational routines indicating the success(0) or failure(1) of the routine
    
    int m_loc; // size of local matrices is m_loc x n_loc for NxN global matrix; size of local vectors is m_loc x 1
    int n_loc;
    
    int m_loc_reduced; // size of local matrices is m_loc_reduced x n_loc_reduced for nev x nev global matrix
    int n_loc_reduced;
    
    int desc_N_N[9], desc_N_nev[9], desc_nev_nev[9];
    #ifdef __cplusplus
    extern "C" 
    	{
    #endif
    	// Cblacs declarations
    	void Cblacs_pinfo(int*, int*);
    	void Cblacs_get(int, int, int*);
    	void Cblacs_gridinit(int*, const char*, int, int);
    	void Cblacs_pcoord(int, int, int*, int*);
    	void Cblacs_gridexit(int);
    	void Cblacs_barrier(int, const char*);
    	int numroc_(int*, int*, int*, int*, int*);
    	void Cblacs_gridinfo(int, int*, int*, int*, int*);
    	void descinit_ (MKL_INT *desc, const MKL_INT *m, const MKL_INT *n, const MKL_INT *mb, const MKL_INT *nb, const MKL_INT *irsrc, const MKL_INT *icsrc, const MKL_INT *ictxt, const MKL_INT *lld, MKL_INT *info);
    	
       // P BLAS declarations
       void pdgemm_ ( const char *transa , const char *transb , const MKL_INT *m , const MKL_INT *n , const MKL_INT *k , const double *alpha , const double *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const double *b , const MKL_INT *ib , const MKL_INT *jb , const MKL_INT *descb , const double *beta , double *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc );
       void pdscal_ ( const MKL_INT *n , const double *a , double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx );
       void pdaxpy_ ( const MKL_INT *n , const double *a , const double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx , double *y , const MKL_INT *iy , const MKL_INT *jy , const MKL_INT *descy , const MKL_INT *incy );
       void pdnrm2_ ( const MKL_INT *n , double *norm2 , const double *x , const MKL_INT *ix , const MKL_INT *jx , const MKL_INT *descx , const MKL_INT *incx );
    	void psgeadd_ ( const char *trans , const MKL_INT *m , const MKL_INT *n , const float *alpha , const float *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const float *beta , float *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc );
       void pdgeadd_ ( const char *trans , const MKL_INT *m , const MKL_INT *n , const double *alpha , const double *a , const MKL_INT *ia , const MKL_INT *ja , const MKL_INT *desca , const double *beta , double *c , const MKL_INT *ic , const MKL_INT *jc , const MKL_INT *descc );
       
       //void cblas_dgemm ( const CBLAS_LAYOUT Layout , const CBLAS_TRANSPOSE transa , const CBLAS_TRANSPOSE transb , const MKL_INT m , const MKL_INT n , const MKL_INT k , const double alpha , const double *a , const MKL_INT lda , const double *b , const MKL_INT ldb , const double beta , double *c , const MKL_INT ldc );
    
       //void RefSyEv(T_high_precision *A, T_high_precision *X, T_high_precision *lambda, int N, int nev, int m_loc, int n_loc, int m_loc_reduced, int n_loc_reduced, int *desc_N_N, int *desc_N_nev, int *desc_nev_nev, int myrow, int mycol, int nprow, int npcol, int world_rank, int NB, int debug_mode);
    #ifdef __cplusplus
    	}
    #endif
    
    void createCublasHandle();
    
    //____________________________________________________________________________________________
    
    int LoadCblacs(int N) // return 0 (for successfull) and 1 (for unsuccessful) initialization
    	{
    	/*
       if (world_size== 2) {nprow = 2; npcol = 1;}
       if (world_size==12) {nprow = 3; npcol = 4;}
       if (world_size==24) {nprow = 4; npcol = 6;}
       if (world_size==36) {nprow = 6; npcol = 6;}
       */
       if (world_size_external== 2) {nprow = 1; npcol = 2;}
       if (world_size_external== 8) {nprow = 2; npcol = 4;}
       else if (world_size_external==72) {nprow = 8; npcol = 9;}
       
       else
          {      
          nprow=sqrt(world_size_external); // Number of process rows     in the process grid (equivalent to Pr)
          if (nprow>1 && NB*nprow>N) nprow = N/NB;
          npcol=nprow; // Number of process columns in the process grid (equivalent to Pc)
          }
          
    	if (nprow==1 && npcol==1) NB=N;
    	if (world_rank==0) printf("N= %d, nprow=%d, npcol=%d, NB=%d \n", N, nprow, npcol, NB);
    
    	Cblacs_pinfo( &world_rank, &world_size ) ; // Routine is used when some initial system information is required before the BLACS are set up
    	Cblacs_get( -1, 0, &ictxt );
    	Cblacs_gridinit( &ictxt, "Row", nprow, npcol );
    	Cblacs_gridinfo( ictxt, &nprow, &npcol, &myrow, &mycol );
    
    	if (!((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol))) return 1;
    
    	// NUMROC computes the NUMber of Rows Or Columns of a distributed (small local) matrix owned by the process indicated by myrow,mycol
    	int iZERO=0;
    	m_loc = numroc_( &N, &NB, &myrow, &iZERO, &nprow ); // size of local matrix is m_loc x n_loc
    	n_loc = numroc_( &N, &NB, &mycol, &iZERO, &npcol );
       
       //Cblacs_barrier(ictxt, "All");
    
    	// DESCINIT initializes the descriptor vector with the 8 input arguments: M, N, MB, NB, IRSRC, ICSRC, ICTXT, LLD.
    	descinit_(desc_N_N,   &N, &N  ,    &NB, &NB,   &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc -- is local leading dimension
    	//descinit_(desc_N_nev, &N, &nev,    &NB, &NB,   &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc -- is local leading dimension
    	nev = MIN(N, 10240);
       descinit_(desc_nev_nev, &nev, &nev,    &NB, &NB,   &iZERO, &iZERO, &ictxt, &m_loc, &info); // m_loc_reduced -- is local leading dimension
    	//descinit_(descVector,  &N, &iONE, &NB, &iONE, &iZERO, &iZERO, &ictxt, &m_loc, &info);
    
    	// print parameters of the processes
    	if (debug_mode==1)
          {
          int iter_rank;
          for(iter_rank=0; iter_rank<world_size; iter_rank++)
             {
             if (iter_rank==world_rank) printf("load_cblacs.h: Proc-%d, m_loc= %d, n_loc=%d \n", world_rank, m_loc, n_loc);
             //Cblacs_barrier(ictxt, "All");
             }
          }
       else if (world_rank==0) printf("load_cblacs.h: Proc-%d, m_loc= %d, n_loc=%d \n", world_rank, m_loc, n_loc);
    
       return 0;
    	}
    
    
    //___________________________________________________________________________
    
    void PrintMatrix(T_base_precision *Mat, int n_rows, int n_cols)
    	{
    	int i,j;
    	for(i=0; i<n_rows; i++) 
    		{
    		for(j=0; j<n_cols; j++)
    			{
    			printf("%.16g\t", Mat[i+j*n_rows]);
    			}
    		printf("\n");
    		}
    	}
       
    //___________________________________________________________________________
    
    T_base_precision function_H (int I_gl, int J_gl)
       {
       T_base_precision func_H=0;
       
       if (matrix_type==Clement)
          {
          if (I_gl==J_gl+1 || J_gl==I_gl+1) 
             {
             int K = MIN(I_gl, J_gl);
             func_H = (T_base_precision) sqrtl((K+1) * (N - K-1));
             }
          }
       else if (matrix_type==Sequence)
          {
          if (J_gl>I_gl) return function_H (J_gl, I_gl);
          return I_gl + N*J_gl + 1;
          }
       else if (matrix_type==Random_0_1)
          {
          func_H = (double)(rand())/RAND_MAX;
          //if (abs(I_gl-J_gl)!=1) func_H = double(rand())/RAND_MAX;
          //if (!(I_gl==0&&J_gl==1 || I_gl==1&&J_gl==0)) func_H = double(rand())/RAND_MAX;
          }
       else if (matrix_type==Random_m1_1)
          {
          func_H = (2.0*rand())/RAND_MAX - 1.0;
          }
       else
          {
          printf("matrix_type=%d is not supported", matrix_type);
          exit(1);
          }
       
       return func_H;
       }
    
    //___________________________________________________________________________
    
    
    int main(int argc, char** argv)
    {
    
    /************  MPI ***************************/
    MPI_Init( &argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    if (world_rank==0) printf("world_size=%d \n", world_size);
    world_size_external = world_size;
    
    /************  --- ***************************/
    
    if (argc==1) // one argument was provided: filename (default)
       {
       N = 10;
       }
       
    if (argc==2) // two arguments were provided: filename (default), N
       {
       N = atoi(argv[1]);
       }
    
    if (argc==3) // two arguments were provided: filename (default), N, NB
       {
       N = atoi(argv[1]);
       NB = atoi(argv[2]);
       }
    
    if (argc==4) // two arguments were provided: filename (default), N, NB, world_size_external
       {
       N = atoi(argv[1]);
       NB = atoi(argv[2]);
       world_size_external = atoi(argv[3]);
       }
       
    if (world_rank==0) printf("world_size_external=%d \n", world_size_external);   
    if (N>=100) debug_mode=0;
    //____________________________________________ 
    // write output into file for debugging
    //std::string OutputName = "output-"+std::to_string(world_rank)+".txt"; 
    //if (debug_mode==1 && world_size>1) freopen(OutputName.c_str(), "w", stdout);
    //____________________________________________    
    
    clock_t t0, t1;
    //double t_gemm;
       
    //std::mt19937 gen(1337.0);
    //std::mt19937 gen(1.0);
    //std::normal_distribution<> d;
    //std::uniform_real_distribution<> uniform_distr(-1.0, 1.0);
    //std::uniform_real_distribution<> uniform_distr(0, 1.0);
    	
    //srand(time(NULL));	
    srand(1+world_rank); // seeding
    //double r = double(rand())/RAND_MAX; // Returns a pseudo-random integer between 0 and 1
    	
    /*parameters of block-cyclic data layout*/
    int irsrc = 0; 
    int icsrc = 0;
    
      
    int dims[2];
    dims[0] = dims[1] = 0;
    //MPI proc grid = dims[0] x dims[1]
    MPI_Dims_create(world_size, 2, dims);
    
    	
    //----------------------------------------------------
    
    // 0. Setup blacs
    LoadCblacs(N);
    if (!((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol))) // checks whether the process is in the grid.
    	{
    	printf("bye-bye from process-%d \n", world_rank); 
    	MPI_Finalize();
    	return 0;
    	}
    
     #if defined (USE_CUDA)
       createCublasHandle();
     #endif
    
    // 1. Fill the local matrix
    t0 = clock();
    //vector<T_base_precision> H_loc = vector<T_base_precision>(m_loc*n_loc, 0);
    T_base_precision *A_loc, *B_loc, *C_loc;
    A_loc  = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision));
    B_loc  = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision));
    C_loc  = (T_base_precision *) calloc(m_loc*n_loc, sizeof(T_base_precision));
    
    int i_loc, j_loc;	
    for(i_loc=0; i_loc<m_loc; i_loc++) // iteration over the first state
    	{
    	int l_1 = i_loc/NB; // local coord of the (NBxNB) block among other blocks
    	int x_1 = i_loc%NB; // local coord within the block
    	int I_gl= (l_1*nprow + myrow)*NB + x_1; // gl-global; nprow = "P_r"; myrow="p_r"  (quoted-ScaLAPACK userguide notation, p.88-90)
    		
    	for(j_loc=0; j_loc<n_loc; j_loc++) // iteration over the second state
    		{
          int l_2 = j_loc/NB; // local coord of the (NBxNB) block among other blocks
          int x_2 = j_loc%NB; // local coord within the block
          int J_gl= (l_2*npcol + mycol)*NB + x_2;
        
    		//H_loc[i_loc+j_loc*m_loc] = H[I_gl+J_gl*N];
    		A_loc[i_loc+j_loc*m_loc] = function_H(I_gl, J_gl);
    		B_loc[i_loc+j_loc*m_loc] = function_H(I_gl, J_gl);
    		}
    	}
    
    t1 = clock();
    //t_gemm=(double)(t1-t0)/CLOCKS_PER_SEC;
    if (world_rank==0 || debug_mode==1) printf("Matrix creation time: %f sec.\n", (double)(t1-t0)/CLOCKS_PER_SEC);
       
    #ifdef PIN_MATRICES
       gpuErrCheck( cudaHostRegister(A_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) );
       gpuErrCheck( cudaHostRegister(B_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) );
       gpuErrCheck( cudaHostRegister(C_loc, m_loc*n_loc*sizeof(T_base_precision), cudaHostRegisterDefault) );
    #endif
       
    if (debug_mode==1) 
       {
       printf("A_loc: \n");
       PrintMatrix(A_loc, m_loc, n_loc);
       
       printf("B_loc: \n");
       PrintMatrix(B_loc, m_loc, n_loc);
       }
    
       
    // 2. Call ScaLAPACK routines (perform the actual matrix diagonalization)
    
    T_base_precision real_minus_ONE = (T_base_precision)(-1.0);
    T_base_precision real_ONE = (T_base_precision)( 1.0);
    T_base_precision real_ZERO = (T_base_precision)(0.0);
    int iONE = 1; 
    
    t0 = clock();
    #if defined (BASE_PREC_DOUBLE)
       pdgemm_ ("N", "N", &nev  , &nev, &nev, &real_ONE, A_loc, &iONE, &iONE, desc_nev_nev, B_loc , &iONE, &iONE, desc_nev_nev, &real_ZERO, C_loc, &iONE, &iONE, desc_nev_nev); // C = A*B
    #endif
    t1 = clock();
    double t_gemm_warmup=(double)(t1-t0)/CLOCKS_PER_SEC;
    if (world_rank==0 || debug_mode==1) printf("Warmup GEMM TN time: %f sec.\n", t_gemm_warmup);
    
    t0 = clock();
    #if defined (BASE_PREC_DOUBLE)
       pdgemm_ ("T", "N", &N  , &N, &N, &real_ONE, A_loc, &iONE, &iONE, desc_N_N, B_loc , &iONE, &iONE, desc_N_N, &real_ZERO, C_loc, &iONE, &iONE, desc_N_N); // C = A*B
    #endif
    t1 = clock();
    double t_gemm_tn=(double)(t1-t0)/CLOCKS_PER_SEC;
    if (world_rank==0 || debug_mode==1) printf("GEMM TN time: %f sec.\n", t_gemm_tn);
    
       
    t0 = clock();
    #if defined (BASE_PREC_DOUBLE)
       pdgemm_ ("N", "N", &N  , &N, &N, &real_ONE, A_loc, &iONE, &iONE, desc_N_N, B_loc , &iONE, &iONE, desc_N_N, &real_ZERO, C_loc, &iONE, &iONE, desc_N_N); // C = A*B
    #endif
    t1 = clock();
    double t_gemm_nn=(double)(t1-t0)/CLOCKS_PER_SEC;
    if (world_rank==0 || debug_mode==1) printf("GEMM NN time: %f sec.\n", t_gemm_nn);
       
    
     
    
    // 3. Finalize
    
    if (debug_mode==1) 
       {
       printf("C:\n");
       PrintMatrix(C_loc, N, N);
       }
    
    /*   
    if (world_rank==0 || debug_mode==1) 
       {
       printf("\n N, t_elpa_float, t_elpa_double, t_refinement(s)\n");
       printf("%d,\t%f,\t%f", N, t_elpa_float, t_elpa_double);
       for(int iter=0; iter<n_iter_ref; iter++) printf(",\t%f", t_refinement[iter]);
       printf("\n\n Done! \n");
       
       // write to file
       std::ofstream myfile;
       myfile.open ("s_data.txt", std::ios_base::app | std::ios_base::out); // add to the current file (not rewrite)
       myfile << std::fixed;
       myfile << N << ",\t\t" << t_elpa_float << ",\t" << t_elpa_double;
       for(int iter=0; iter<n_iter_ref; iter++) myfile << ",\t" << t_refinement[iter];
       myfile << std::endl;
       myfile.close();
       }
    */
    
    #ifdef PIN_MATRICES
       gpuErrCheck( cudaHostUnregister(A_loc) );
       gpuErrCheck( cudaHostUnregister(B_loc) );
       gpuErrCheck( cudaHostUnregister(C_loc) );
       if (world_rank==0) printf("ScaLAPACK matrices A_loc, B_loc, C_loc on CPUs are pinned\n");
    #endif
    
    free(A_loc);
    free(B_loc);
    free(C_loc);
    
    if (world_rank==0 || debug_mode==1) 
       {
       printf("\n N, world_size_external, t_gemm_nn, t_gemm_tn\n");
       printf("%d,\t\t%d,\t\t%f,\t%f\n", N, world_size_external, t_gemm_nn, t_gemm_tn);
       
       FILE *f = fopen("s_data.txt", "a"); // a=append
       if (f == NULL)
          {
          printf("Error opening file!\n");
          exit(1);
          }
       fprintf(f, "%d,\t\t%d,\t\t%f,\t%f\n", N, world_size_external, t_gemm_nn, t_gemm_tn);
       
       // write to file
       /*
       std::ofstream myfile;
       myfile.open ("s_data.txt", std::ios_base::app | std::ios_base::out); // add to the current file (not rewrite)
       myfile << std::fixed;
       myfile << N << ",\t\t" << t_gemm_nn << ",\t" << t_gemm_tn;
       myfile << std::endl;
       myfile.close();
       */
       }
       
    if ((myrow>-1)&(mycol>-1)&(myrow<nprow)&(mycol<npcol)) Cblacs_gridexit(0); // leave grid if we were in one
      
    MPI_Finalize();
    }