Commit f8e718f7 authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Update documentation for the C++ implementation

All documentation is up to date
parent 2f565ca5
/** @file descriptor_identifier/Model/Model.hpp
* @brief Object to store the models generated from SISSO
*
* Creates a Model generated from SISSO and the corresponding output file.
* It also has functionality to read in an output file to regenerate the model.
*
* @author Thomas A. R. Purcell (tpurcell)
* @bug No known bugs.
*/
#ifndef MODEL
#define MODEL
......@@ -31,84 +40,136 @@ class Model
std::vector<model_node_ptr> _feats; //!< List of features in the model
std::vector<std::vector<double>> _coefs; //!< Coefficients for the features
std::vector<double> _prop_train; //!< The property to be modeled
std::vector<double> _prop_test; //!< The property to be modeled
std::vector<double> _train_error; //!< The error of the model
std::vector<double> _test_error; //!< The error of the model
std::vector<double> _D_train; //!< The Descriptor matrix
std::vector<double> _D_test; //!< The Descriptor matrix
std::vector<double> _prop_train_est; //!< The estimated Property
std::vector<double> _prop_test_est; //!< The estimated Property
std::vector<int> _task_sizes_train; //!< Number of samples in each task
std::vector<int> _task_sizes_test; //!< Number of samples in each task
std::vector<double> _prop_train; //!< The property to be modeled (training data)
std::vector<double> _prop_test; //!< The property to be modeled (testing data)
std::vector<double> _train_error; //!< The error of the model (training)
std::vector<double> _test_error; //!< The error of the model (testing)
std::vector<double> _D_train; //!< The Descriptor matrix (training data)
std::vector<double> _D_test; //!< The Descriptor matrix (testing data)
std::vector<double> _prop_train_est; //!< The estimated Property (training data)
std::vector<double> _prop_test_est; //!< The estimated Property (testing data)
std::vector<int> _task_sizes_train; //!< Number of training samples in each task
std::vector<int> _task_sizes_test; //!< Number of testing samples in each task
public:
/**
* @brief Constructor for the model
*
* @param prop The property
* @param prop_train The property vector for the training samples
* @param prop_test The property vector for the test samples
* @param feats The features for the model
* @param task_sizes_train Number of samples per task in the training data
* @param task_sizes_test Number of samples per task in the test data
*/
Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test);
/**
* @brief Construct a model from a training output file
* @details Reads in all of the data from the output file and recreates the model object
*
* @param train_file Previously generated model file
*/
Model(std::string train_file);
/**
* @brief Construct a model from a training and testing output file
* @details Reads in all of the data from the output files and recreates the model object
*
* @param train_file Previously generated training model output file
* @param test_file Previously generated testing model output file
*/
Model(std::string train_file, std::string test_file);
/**
* @brief Read an output file and extract all relevant information
* @details Takes in an output file and extracts all data needed to recreate the model
*
* @param filename Name of the output file
* @param train If true then the output represents training data
*
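* @return A vector of strings extracted from the file (its exact contents are determined by the implementation, which is not shown in this header)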
*/
std::vector<std::string> populate_model(std::string filename, bool train);
/**
* @brief Convert the model to a string
* @return The string of the model
* @return The string representation of the model
*/
std::string toString() const;
/**
* @brief Accessor function to _prop_est
* @brief Accessor function to _prop_test_est
*/
inline std::vector<double> predict()
{
    // Returned by value: the caller receives a copy of the estimated property for the test samples
    return _prop_test_est;
}
/**
* @brief Accessor function to _prop_est
* @brief Accessor function to _prop_train_est
*/
inline std::vector<double> predict_train()
{
    // Returned by value: the caller receives a copy of the estimated property for the training samples
    return _prop_train_est;
}
/**
* @brief Copy the error into a new array
*
* @param res pointer to the beginning of the array
* @param res pointer to the beginning of the vector to store the residual
*/
inline void copy_error(double* res)
{
    // Copy the first _n_samp_train entries of the training error into the caller-provided buffer
    const double* err_begin = _train_error.data();
    std::copy_n(err_begin, _n_samp_train, res);
}
/**
* @brief The rmes of the model
* @brief The training rmse of the model
*/
inline double rmse()
{
    // norm of the training error (util_funcs::norm) divided by sqrt(number of training samples)
    const double err_norm = util_funcs::norm(_train_error.data(), _n_samp_train);
    const double sqrt_n_samp = std::sqrt(static_cast<double>(_n_samp_train));
    return err_norm / sqrt_n_samp;
}
/**
* @brief The testing rmse of the model
*/
inline double test_rmse()
{
    // norm of the testing error (util_funcs::norm) divided by sqrt(number of testing samples)
    const double err_norm = util_funcs::norm(_test_error.data(), _n_samp_test);
    const double sqrt_n_samp = std::sqrt(static_cast<double>(_n_samp_test));
    return err_norm / sqrt_n_samp;
}
/**
* @brief Total number of samples being trained on
*/
inline int n_samp_train()
{
    // Stored count of training samples
    return _n_samp_train;
}
/**
* @brief Total number of samples being tested
*/
inline int n_samp_test()
{
    // Stored count of testing samples
    return _n_samp_test;
}
/**
* @brief The dimensionality of the data
*/
inline int n_dim()
{
    // Stored dimensionality of the model
    return _n_dim;
}
/**
* @brief The max Absolute error of the array
* @brief The max Absolute error of the training data
*/
inline double max_ae()
{
    // With no samples std::max_element returns the end pointer; dereferencing it is
    // undefined behaviour, so report a maximum absolute error of 0.0 instead.
    if(_n_samp_train <= 0)
    {
        return 0.0;
    }

    const double* first = _train_error.data();
    const double* last = first + _n_samp_train;

    // Order by magnitude so the element with the largest |error| is selected
    const double* worst = std::max_element(first, last, [](double d1, double d2){return std::abs(d1) < std::abs(d2);});
    return std::abs(*worst);
}
/**
* @brief The max Absolute error of the testing data
*/
inline double test_max_ae()
{
    // A model may hold no test samples; std::max_element would then return the end
    // pointer and dereferencing it is undefined behaviour, so report 0.0 instead.
    if(_n_samp_test <= 0)
    {
        return 0.0;
    }

    const double* first = _test_error.data();
    const double* last = first + _n_samp_test;

    // Order by magnitude so the element with the largest |error| is selected
    const double* worst = std::max_element(first, last, [](double d1, double d2){return std::abs(d1) < std::abs(d2);});
    return std::abs(*worst);
}
/**
* @brief Print model to a file
* @brief Convert the Model into an output file
*
* @param filename The name of the file to output to
* @param train If true output the training data
* @param test_inds The indexes of the test set
*/
void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {});
#ifdef PY_BINDINGS
/**
* @brief Python Accessor functions to the coefficient array
* @return The coefficients as a python list
*/
inline py::list coefs()
{
py::list coef_lst;
......@@ -117,6 +178,10 @@ public:
return coef_lst;
}
/**
* @brief Python Accessor functions to the features
* @return A python list containing all of the features
*/
inline py::list feats()
{
py::list feat_lst;
......@@ -125,11 +190,40 @@ public:
return feat_lst;
}
/**
* @brief Python Accessor function to _prop_train_est
* @return _prop_train_est as a numpy ndarray
*/
inline np::ndarray prop_train_est()
{
    // Convert _prop_train_est with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop_train_est);
}
/**
* @brief Python Accessor function to _prop_test_est
* @return _prop_test_est as a numpy ndarray
*/
inline np::ndarray prop_test_est()
{
    // Convert _prop_test_est with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop_test_est);
}
/**
* @brief Python Accessor function to _prop_train
* @return _prop_train as a numpy ndarray
*/
inline np::ndarray prop_train()
{
    // Convert _prop_train with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop_train);
}
/**
* @brief Python Accessor function to _prop_test
* @return _prop_test as a numpy ndarray
*/
inline np::ndarray prop_test()
{
    // Convert _prop_test with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop_test);
}
/**
* @brief Python Accessor function to _train_error
* @return _train_error as a numpy ndarray
*/
inline np::ndarray train_error()
{
    // Convert _train_error with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_train_error);
}
/**
* @brief Python Accessor function to _test_error
* @return _test_error as a numpy ndarray
*/
inline np::ndarray test_error()
{
    // Convert _test_error with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_test_error);
}
#endif
};
......
/** @file descriptor_identifier/SISSORegressor.hpp
* @brief Perform SISSO on a previously generated Feature Space
*
* Takes in a feature space and performs SISSO on it while storing the selected features in _models
*
* @author Thomas A. R. Purcell (tpurcell)
* @bug No known bugs.
*/
#ifndef SISSO_REGRESSOR
#define SISSO_REGRESSOR
......@@ -11,7 +19,7 @@
#endif
/**
* @brief SISSO Regressor class, to find the best models, and store them
* @brief SISSO Regressor class, performs the SISSO algorithm and stores all selected models
*
*/
class SISSORegressor
......@@ -19,45 +27,41 @@ class SISSORegressor
protected:
std::vector<std::vector<Model>> _models; //!< List of models
std::vector<double> _prop; //!< Property array
std::vector<double> _prop_test; //!< Property array
std::vector<double> _error; //!< Array to calculate the residuals for the models
std::vector<double> _prop; //!< Property array (training data)
std::vector<double> _prop_test; //!< Property array (testing data)
std::vector<double> _error; //!< Array to calculate the residuals for the models (training data)
std::vector<double> _a; //!< A matrix for least squares
std::vector<double> _b; //!< Solution array for least squares
std::vector<double> _work; //!< The work array for least squares problems
std::vector<double> _s; //!< The S array for least squares problems
std::vector<int> _task_sizes_train;
std::vector<int> _task_sizes_test;
std::vector<int> _leave_out_inds;
std::vector<int> _task_sizes_train; //!< Number of training samples per task
std::vector<int> _task_sizes_test; //!< Number of testing samples per task
std::vector<int> _leave_out_inds; //!< List of indexes from the initial data file in the test set
std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem
std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator
int _n_samp; //!< the number of samples per feature
int _n_dim; //!< Number of dimensions to calculate
int _n_residual; //!< Number of residuals to pass to the next sis model
int _lwork; //!< size of the work array
int _lwork; //!< size of the work array (for dgels_)
int _rank; //!< Ranks for the least squares problem
public:
/**
* @brief Constructor for the Regressor
*
* @param prop Property to model
* @param n_dim Maximum dimension of the model
* @param feat_space The feature space to run SISSO on
* @param prop Vector storing all data to train the SISSO models with
* @param prop_test Vector storing all data to test the SISSO models with
* @param task_sizes_train Number of training samples per task
* @param task_sizes_test Number of testing samples per task
* @param leave_out_inds List of indexes from the initial data file in the test set
* @param n_dim Maximum dimensionality of the generated models
* @param n_residual Number of residuals to pass to the next SIS operation
*/
SISSORegressor(
std::shared_ptr<FeatureSpace> feat_space,
std::vector<double> prop,
std::vector<double> prop_test,
std::vector<int> task_sizes_train,
std::vector<int> task_sizes_test,
std::vector<int> leave_out_inds,
int n_dim,
int n_residual);
SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual);
/**
* @brief Get the optimal size of the working array
......@@ -72,6 +76,8 @@ public:
*
* @param inds Feature indexes to get the model of
* @param coeffs Coefficients for the model
* @param start The index in the property and feature vectors at which to start copying into _b and _a
* @param n_samp number of samples to perform least squares optimization on
*/
void least_squares(std::vector<int>& inds, double* coeffs, int start, int n_samp);
......@@ -80,6 +86,8 @@ public:
*
* @param inds indexes of the selected features
* @param coeffs Coefficients of the model
* @param start The index in the property and feature vectors at which to start copying into _b and _a
* @param n_samp number of samples to perform least squares optimization on
*/
void set_error(std::vector<int>& inds, double* coeffs, int start, int n_samp);
......@@ -87,18 +95,22 @@ public:
* @brief Set the A matrix for the least squares problem
*
* @param inds indexes of the selected features
* @param start The index in the property and feature vectors at which to start copying into _b and _a
* @param n_samp number of samples to perform least squares optimization on
*/
void set_a(std::vector<int>& inds, int start, int n_samp);
/**
* @brief Fit the models
* @brief Perform SISSO to generate the models
* @details Iteratively perform SISSO on the Feature space and property until the model dimensionality is equal to _n_dim
*/
void fit();
/**
* @brief Perform the l0 normalization for a property or the residual
*
* @param prop Property to fit
* @param prop The property to fit
* @param n_dim the dimensionality of the model
*/
void l0_norm(std::vector<double>& prop, int n_dim);
......@@ -119,7 +131,7 @@ public:
inline std::vector<double> prop_test()
{
    // Returned by value: the caller receives a copy of the testing property vector
    return _prop_test;
}
/**
* @brief Acessor function for {
* @brief Accessor function for the error vector
*/
inline std::vector<double> error()
{
    // Returned by value: the caller receives a copy of the residual vector
    return _error;
}
......@@ -145,6 +157,18 @@ public:
// Python interface functions
#ifdef PY_BINDINGS
/**
* @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSORegressor.cpp>)
*
* @param feat_space The feature space to run SISSO on
* @param prop Vector storing all data to train the SISSO models with
* @param prop_test Vector storing all data to test the SISSO models with
* @param task_sizes_train Number of training samples per task
* @param task_sizes_test Number of testing samples per task
* @param leave_out_inds List of indexes from the initial data file in the test set
* @param n_dim Maximum dimensionality of the generated models
* @param n_residual Number of residuals to pass to the next SIS operation
*/
SISSORegressor(
std::shared_ptr<FeatureSpace> feat_space,
np::ndarray prop,
......@@ -156,6 +180,18 @@ public:
int n_residual
);
/**
* @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSORegressor.cpp>)
*
* @param feat_space The feature space to run SISSO on
* @param prop Vector storing all data to train the SISSO models with
* @param prop_test Vector storing all data to test the SISSO models with
* @param task_sizes_train Number of training samples per task
* @param task_sizes_test Number of testing samples per task
* @param leave_out_inds List of indexes from the initial data file in the test set
* @param n_dim Maximum dimensionality of the generated models
* @param n_residual Number of residuals to pass to the next SIS operation
*/
SISSORegressor(
std::shared_ptr<FeatureSpace> feat_space,
py::list prop,
......@@ -167,11 +203,40 @@ public:
int n_residual
);
/**
* @brief Python Accessor function to models (cpp definition in <python/descriptor_identifier/SISSORegressor.cpp>)
* @return models as a python list
*/
py::list models_py();
/**
* @brief Python Accessor function to prop
* @return prop as a numpy array
*/
inline np::ndarray prop_py()
{
    // Convert _prop with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop);
}
/**
* @brief Python Accessor function to prop_test
* @return prop_test as a numpy array
*/
inline np::ndarray prop_test_py()
{
    // Convert _prop_test with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_prop_test);
}
/**
* @brief Python Accessor function to task_sizes_train
* @return task_sizes_train as a python list
*/
inline py::list task_sizes_train()
{
    // The converted list must be returned: flowing off the end of a function that
    // returns a value is undefined behaviour.
    return python_conv_utils::to_list<int>(_task_sizes_train);
}
/**
* @brief Python Accessor function to task_sizes_test
* @return task_sizes_test as a python list
*/
inline py::list task_sizes_test()
{
    // The converted list must be returned: flowing off the end of a function that
    // returns a value is undefined behaviour.
    return python_conv_utils::to_list<int>(_task_sizes_test);
}
/**
* @brief Python Accessor function to error
* @return error as a numpy array
*/
inline np::ndarray error_py()
{
    // Convert _error with python_conv_utils::to_ndarray<double> and hand the result to Python
    return python_conv_utils::to_ndarray<double>(_error);
}
#endif
};
......
/** @file feature_creation/feature_space/FeatureSpace.hpp
* @brief Create a feature space from an initial set of features and algebraic operators
*
* Use an initial set of features and combine them to generate more complicated algebraical features. SIS is also performed here
*
* @author Thomas A. R. Purcell (tpurcell)
* @bug No known bugs.
*/
#ifndef FEATURE_SPACE
#define FEATURE_SPACE
......@@ -37,33 +46,37 @@ class FeatureSpace
std::vector<double> _scores; //!< projection scores for each feature
std::vector<int> _task_sizes; //!< The number of elements in each task
std::vector<int> _start_gen; //!< list of starting index for each generation
std::string _feature_space_file; //!< File to store infromation on the feature space
std::vector<int> _task_sizes; //!< The number of elements in each task (training data)
std::vector<int> _start_gen; //!< list of the indexes where each generation starts in _phi
std::string _feature_space_file; //!< File to store information about the selected features
std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used for projection onto SIS
std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPi communicator
std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
double _l_bound; //!< lower bound for absolute value of the features
double _u_bound; //!< upper bound for absolute value of the features
int _max_phi; //!< Maximum rung for the feature creation
int _n_sis_select; //!< Number of features to select for each dimensions
int _n_samp; //!< Number of samples
int _n_samp; //!< Number of samples (training data)
int _n_feat; //!< Total number of features
int _n_rung_store; //!< Total rungs stored
int _n_rung_generate; //!< Total number of rungs to generate on the fly
int _max_temp_store;
public:
/**
* @brief Constructor for the feature space
* @details constructs the feature space from an initial set of features and a list of allowed operatiors
* @details constructs the feature space from an initial set of features and a list of allowed operators
*
* @param mpi_comm MPI communicator for the calculations
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
* @param max_phi highest rung value for the calculation
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
* @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
* @param min_abs_feat_val minimum absolute feature value
* @param max_abs_feat_val maximum absolute feature value
*/
FeatureSpace(
......@@ -113,51 +126,137 @@ public:
*/
inline std::shared_ptr<MPI_Interface> mpi_comm()
{
    // Returns a copy of the shared pointer, so the caller refers to the same communicator object
    return _mpi_comm;
}
/**
* @brief Accessor function for _task_sizes
*/
inline std::vector<int> task_sizes()
{
    // Returned by value: the caller receives a copy of the per-task sample counts
    return _task_sizes;
}
/**
* @brief Accessor function for _feature_space_file
*/
inline std::string feature_space_file()
{
    // Returned by value: the caller receives a copy of the stored file name
    return _feature_space_file;
}
/**
* @brief Accessor function for _l_bound
*/
inline double l_bound()
{
    // Stored lower bound for the absolute value of the features
    return _l_bound;
}
/**
* @brief Accessor function for _u_bound
*/
inline double u_bound()
{
    // Stored upper bound for the absolute value of the features
    return _u_bound;
}
/**
* @brief Accessor function for _max_phi
*/
inline int max_phi()
{
    // Stored maximum rung for the feature creation
    return _max_phi;
}
/**
* @brief Accessor function for _n_sis_select
*/
inline int n_sis_select()
{
    // Stored number of features to select for each dimension
    return _n_sis_select;
}
/**
* @brief Accessor function for _n_samp
*/
inline int n_samp()
{
    // Stored number of (training) samples
    return _n_samp;
}
/**
* @brief Accessor function for _n_feat
*/
inline int n_feat()
{
    // Stored total number of features
    return _n_feat;
}
/**
* @brief Accessor function for _n_rung_store
*/
inline int n_rung_store()
{
    // Stored total number of rungs whose feature values are kept
    return _n_rung_store;
}
/**
* @brief Accessor function for _n_rung_generate
*/
inline int n_rung_generate()
{
    // Stored number of rungs to generate on the fly
    return _n_rung_generate;
}
/**
* @brief Generate a new set of features from a single feature
* @details Take in the feature and perform all valid algebraic operations on it.
*
* @param feat The feature to spawn new features from
* @param feat_set The feature set to pull features from for combinations
* @param feat_ind starting index for the next feature generated
* @param l_bound lower bound for the absolute value of the feature
* @param u_bound upper bound for the absolute value of the feature
*/
void generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound=1e-50, double u_bound=1e50);
/**
* @brief Calculate the SIS Scores for feature generated on the fly
* @details Create the next rung of features and calculate their projection scores. Only keep those that can be selected by SIS.
*
* @param prop Pointer to the start of the vector storing the data to project the features onto
* @param size The size of the data to project over
* @param phi_selected The features that would be selected from the previous rungs
* @param scores_selected The projection scores of the features that would be selected from the previous rungs
* @param scores_comp vector to store temporary score comparisons
*/
void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
/**
* @brief Check if a feature overlaps with a feature previously selected in earlier SIS iterations
* @details Compares the projection score of the current candidate feature with all those of previously selected features (using the current prop) and
* if they are within 1e-10, then check the correlation between the features themselves
*
* @param val_ptr pointer to the candidate feature's data
* @param cur_score the projection score of the candidate feature
* @param scores_past The projection scores of the previous features
* @param scores_comp vector to temporarily store the comparison of projection scores
* @return True if the feature does not overlap with any previously selected
*/
bool valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp);
/**
* @brief Check if a feature overlaps with a feature previously selected in this SIS iteration
* @details Compares the projection score of the current candidate feature with all those of previously selected features in this iteration and
* if they are within 1e-10, then check the correlation between the features themselves
*
* @param end_check the end point to stop the comparison (the same as the current number of selected features)
* @param val_ptr pointer to the candidate feature's data
* @param cur_score the projection score of the candidate feature
* @param scores_selected The projection scores of the previous features
* @param scores_comp vector to temporarily store the comparison of projection scores
* @return True if the feature does not overlap with any previously selected
*/
bool valid_score_against_current(int end_check, double* val_ptr, double cur_score, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
/**
* @brief Perform SIS on a feature set with a specified property
* @details Perform sure-independence screening with either the correct property
* @details Perform sure-independence screening with either the correct property or the error
*
* @param prop The property to calculate SIS from
* @param prop The property to perform SIS over
*/
void sis(std::vector<double>& prop);
/**
* @brief Is a feature in this process' _phi?
*
* @param ind index
* @return True if feature is in this _phi