Commit 1d4b68ef authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Update documentation for classifiers and regressors

parent eb663b87
......@@ -28,12 +28,13 @@ void ConvexHull1D::initialize_prop(const std::vector<int>& sizes, double* prop)
{
_n_task = sizes.size();
// Get number of samples and resize the value and index vectors
int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
_sorted_value.resize(n_samp, 0.0);
_sorted_prop_inds.resize(n_samp, 0);
std::iota(_sorted_prop_inds.begin(), _sorted_prop_inds.end(), 0);
// Initialize and fill the vectors used for storing information of the number of samples in each class in each task
std::vector<std::vector<int>> cls_sz(sizes.size());
std::vector<std::vector<int>> cls_start(sizes.size());
......@@ -62,8 +63,11 @@ void ConvexHull1D::initialize_prop(const std::vector<int>& sizes, double* prop)
_task_scores.resize(_n_task, 0.0);
for(int tt = 0; tt < _n_task; ++tt)
{
_n_class = std::max(_n_class, static_cast<int>((cls_sz[tt].size())));
}
// Set up vectors used in calculating 1D convex hull
_cls_max.resize(_n_class * _n_task);
_cls_min.resize(_n_class * _n_task);
......@@ -82,10 +86,14 @@ void ConvexHull1D::initialize_prop(const std::vector<int>& sizes, double* prop)
double ConvexHull1D::overlap_1d(double* value, double width)
{
// Initialize scores and value arrays
std::fill_n(_task_scores.data(), _task_scores.size(), 0.0);
for(int ii = 0; ii < _sorted_prop_inds.size(); ++ii)
{
_sorted_value[ii] = value[_sorted_prop_inds[ii]];
}
// Get min/max for each class
for(int tt = 0; tt < _n_task; ++tt)
{
for(int cc = 0; cc < _n_class; ++cc)
......@@ -101,6 +109,7 @@ double ConvexHull1D::overlap_1d(double* value, double width)
}
}
// Calculate the score for the feature
for(int tt = 0; tt < _n_task; ++tt)
{
double cls_norm = 1.0 / (static_cast<double>(_n_class) * static_cast<double>(_n_class - 1));
......
......@@ -61,10 +61,9 @@ public:
/**
* @brief Calculate the overlap of the 1D convex hull regions of each class for a feature
*
* @param value The pointer to the value of the data
* @param width The buffer used for calculating the overlap
*
* @returns The convex hull overlap score for the particular feature
*/
double overlap_1d(double* value, double width = 0.0);
......
......@@ -50,10 +50,12 @@ public:
/**
* @brief The constructor for the LPWrapper
*
* @param samp_per_class number of samples per class
* @param task_num The task ID number to perform the calculation
* @param n_class Number of classes in the dataset
* @param n_dim Number of dimensions of the problem
* @param n_samp Number of samples in the dataset
* @param tol The tolerance used to have a fuzzy border around the convex hull
*/
LPWrapper(std::vector<int> samp_per_class, int task_num, int n_class, int n_dim, int n_samp, double tol);
......@@ -78,13 +80,6 @@ public:
*/
void copy_data(int cls, std::vector<double*> val_ptrs);
/**
* @brief Train the SVM model
*
* @param remap_coefs If true remap the final coefficients back to the unscaled feature space
*/
void train();
/**
* @brief Copy the data from a set of feature indexes (sorted_dmatrix) into the x_space and train the SVM model
*
......@@ -106,14 +101,14 @@ public:
inline int n_class(){return _n_class;}
/**
* @brief The number of classes in the training set
* @return The number of classes in the training set
* @brief The task id number
* @return The task id number
*/
inline int task_num(){return _task_num;}
/**
* @brief The number of dimensions of the SVM model
* @return The number of dimensions of the SVM model
* @brief The number of dimensions of the Convex Hulls
* @return The number of dimensions of the Convex Hulls
*/
inline int n_dim(){return _n_dim;}
......@@ -136,4 +131,4 @@ public:
inline int n_overlap(){return _n_overlap;}
};
#endif
\ No newline at end of file
#endif
......@@ -55,7 +55,6 @@ public:
*
* @param n_class Number of classes in the dataset
* @param n_dim Number of dimensions of the problem
* @param n_samp Number of samples in the dataset
* @param prop The property vector
*/
SVMWrapper(int n_class, int n_dim, std::vector<double> prop);
......@@ -77,7 +76,6 @@ public:
* @param C The C value for the SVM calculation
* @param n_class Number of classes in the dataset
* @param n_dim Number of dimensions of the problem
* @param n_samp Number of samples in the dataset
* @param prop The property vector
*/
SVMWrapper(double C, int n_class, int n_dim, std::vector<double> prop);
......@@ -214,4 +212,4 @@ public:
};
#endif
\ No newline at end of file
#endif
......@@ -49,7 +49,7 @@ namespace prop_sorted_d_mat
* @param task The task number
* @param cls the class number
*
* @return [description]
* @return the number of samples in the class
*/
inline int get_class_size(int task, int cls){return N_SAMPLES_PER_CLASS[task * N_CLASS + cls];}
......@@ -104,6 +104,7 @@ namespace prop_sorted_d_mat
/**
* @brief Access the sorted descriptor matrix by the sample index, class number, and task number
*
* @param sample_ind The index of the sample to point to
* @param task The task number
* @param cls The class number
* @return pointer to the element of the given sample within a class and task in the first feature
......
......@@ -74,6 +74,7 @@ public:
* @param feats The features for the model
* @param task_sizes_train Number of samples per task in the training data
* @param task_sizes_test Number of samples per task in the test data
* @param fix_intercept If true the intercept of the model is 0
*/
Model(
std::string prop_label,
......@@ -114,12 +115,52 @@ public:
*/
Model& operator= (Model&& o) = default;
/**
* @brief Evaluate the model for a new point
*
* @param x_in pointer to the new data point (order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
virtual double eval(double* x_in) = 0;
/**
* @brief Evaluate the model for a new point
*
* @param x_in The data point to evaluate the model (order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
double eval(std::vector<double> x_in);
/**
* @brief Evaluate the model for a new point
*
* @param x_in_dct Dictionary describing the new point ("feature expr": value)
* @return The prediction of the model for a given data point
*/
double eval(std::map<std::string, double> x_in_dct);
/**
* @brief Evaluate the model for a new set of new points
*
* @param x_in a vector of pointers to the set of values for new data points (one pointer per feature, and order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given set of data points
*/
virtual std::vector<double> eval(std::vector<double>* x_in) = 0;
/**
* @brief Evaluate the model for a set of new points
*
* @param x_in The data for a set of new data points (size of n_feature x n_points, and order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
std::vector<double> eval(std::vector<std::vector<double>> x_in);
/**
* @brief Evaluate the model for a set of new points
*
* @param x_in_dct The set of data points to evaluate the model. Keys must be strings representing feature expressions and vectors must be the same length
* @return The prediction of the model for a given data point
*/
std::vector<double> eval(std::map<std::string, std::vector<double>> x_in_dct);
// DocString: model_set_task_eval
......@@ -177,7 +218,7 @@ public:
* @param train If true output the training data
* @param test_inds The indexes of the test set
*/
virtual void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}) = 0;
virtual void to_file(std::string filename, bool train=true, std::vector<int> test_inds={}) = 0;
// DocString: model_fix_intercept
/**
......@@ -192,64 +233,97 @@ public:
inline std::vector<std::vector<double>> coefs(){return _coefs;}
#ifdef PY_BINDINGS
// DocString: model_coefs
/**
* @brief The coefficient array for the model
* @return The coefficients as a python list
*/
inline py::list coefs_py()
{
py::list coef_lst;
for(auto& task_coefs : _coefs)
coef_lst.append<py::list>(python_conv_utils::to_list<double>(task_coefs));
return coef_lst;
}
// DocString: model_feats
/**
* @brief The features of the model
* @return A python list containing all of the features
*/
inline py::list feats()
{
py::list feat_lst;
for(auto& feat : _feats)
feat_lst.append<ModelNode>(*feat);
return feat_lst;
}
// DocString: model_prop_train
/**
* @brief The training data property to be learned
* @return _prop_train as a numpy ndarray
*/
inline np::ndarray prop_train(){return python_conv_utils::to_ndarray<double>(_prop_train);}
// DocString: model_prop_test
/**
* @brief The test values for the property
* @return _prop_test as a numpy ndarray
*/
inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);}
// DocString: model_task_sizes_train
/**
* @brief Access the size of the training set for each task
*/
inline py::list task_sizes_train(){return python_conv_utils::to_list<int>(_task_sizes_train);}
// DocString: model_task_sizes_test
/**
* @brief Access the size of the test set for each task
*/
inline py::list task_sizes_test(){return python_conv_utils::to_list<int>(_task_sizes_test);}
inline double eval_py(np::ndarray x_in){return eval(python_conv_utils::from_ndarray<double>(x_in));}
inline double eval_py(py::list x_in){return eval(python_conv_utils::from_list<double>(x_in));}
inline double eval_py(py::dict x_in){return eval(python_conv_utils::from_dict<std::string, double>(x_in));}
np::ndarray eval_many_py(np::ndarray x_in);
np::ndarray eval_many_py(py::dict x_in);
// DocString: model_coefs
/**
* @brief The coefficient array for the model
* @return The coefficients as a python list
*/
inline py::list coefs_py()
{
py::list coef_lst;
for(auto& task_coefs : _coefs)
coef_lst.append<py::list>(python_conv_utils::to_list<double>(task_coefs));
return coef_lst;
}
// DocString: model_feats
/**
* @brief The features of the model
* @return A python list containing all of the features
*/
inline py::list feats()
{
py::list feat_lst;
for(auto& feat : _feats)
feat_lst.append<ModelNode>(*feat);
return feat_lst;
}
// DocString: model_prop_train
/**
* @brief The training data property to be learned
* @return _prop_train as a numpy ndarray
*/
inline np::ndarray prop_train(){return python_conv_utils::to_ndarray<double>(_prop_train);}
// DocString: model_prop_test
/**
* @brief The test values for the property
* @return _prop_test as a numpy ndarray
*/
inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);}
// DocString: model_task_sizes_train
/**
* @brief Access the size of the training set for each task
*/
inline py::list task_sizes_train(){return python_conv_utils::to_list<int>(_task_sizes_train);}
// DocString: model_task_sizes_test
/**
* @brief Access the size of the test set for each task
*/
inline py::list task_sizes_test(){return python_conv_utils::to_list<int>(_task_sizes_test);}
/**
* @brief Evaluate the model for a new point
*
* @param x_in The data point to evaluate the model (order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
inline double eval_py(np::ndarray x_in){return eval(python_conv_utils::from_ndarray<double>(x_in));}
/**
* @brief Evaluate the model for a new point
*
* @param x_in The data point to evaluate the model (order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
inline double eval_py(py::list x_in){return eval(python_conv_utils::from_list<double>(x_in));}
/**
* @brief Evaluate the model for a new point
*
* @param x_in_dct Dictionary describing the new point ("feature expr": value)
* @return The prediction of the model for a given data point
*/
inline double eval_py(py::dict x_in){return eval(python_conv_utils::from_dict<std::string, double>(x_in));}
/**
* @brief Evaluate the model for a set of new points
*
* @param x_in The data for a set of new data points (size of n_feature x n_points, and order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
np::ndarray eval_many_py(np::ndarray x_in);
/**
* @brief Evaluate the model for a set of new points
*
* @param x_in_dct The set of data points to evaluate the model. Keys must be strings representing feature expressions and vectors must be the same length
* @return The prediction of the model for a given data point
*/
np::ndarray eval_many_py(py::dict x_in);
#endif
};
......
......@@ -89,6 +89,8 @@ ModelClassifier::ModelClassifier(std::string train_file)
std::vector<double> feat_test_val = {};
std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
model_node_ptr feat;
// Legacy checks can be removed once everything is sufficiently settled for release
if((split_str[1][0] == '(') || (split_str[1][0] == '['))
{
std::string unit_str = split_str[2];
......@@ -147,6 +149,8 @@ ModelClassifier::ModelClassifier(std::string train_file, std::string test_file)
std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data());
model_node_ptr feat;
// Legacy checks can be removed once everything is sufficiently settled for release
if((split_str[1][0] == '(') || (split_str[1][0] == '['))
{
std::string unit_str = split_str[2];
......@@ -184,7 +188,19 @@ ModelClassifier::ModelClassifier(std::string train_file, std::string test_file)
set_train_test_error();
if((file_train_n_convex_overlap != _train_n_convex_overlap) || (file_test_n_convex_overlap != _test_n_convex_overlap))
throw std::logic_error("The file does not have the same convex overlap (" + std::to_string(file_train_n_convex_overlap) + ", " + std::to_string(file_test_n_convex_overlap) + ") as calculated here (" + std::to_string(_train_n_convex_overlap) + ", " + std::to_string(_test_n_convex_overlap) + ").");
{
throw std::logic_error(
"The file does not have the same convex overlap (" +
std::to_string(file_train_n_convex_overlap) +
", " +
std::to_string(file_test_n_convex_overlap) +
") as calculated here (" +
std::to_string(_train_n_convex_overlap) +
", " +
std::to_string(_test_n_convex_overlap) +
")."
);
}
}
double ModelClassifier::eval(double* x_in)
......@@ -213,7 +229,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b
std::string model_line;
std::getline(file_stream, model_line);
// Get the property unit and error
// Get the property unit and label and the error summary
std::string unit_line;
std::string error_line;
......@@ -284,6 +300,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b
std::getline(file_stream, line);
// Get task sizes
int n_samp = 0;
for(int tt = 0; tt < n_task; ++tt)
{
......@@ -300,6 +317,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b
}
}
// Get the data of the model
if(train)
{
_n_samp_train = n_samp;
......@@ -371,7 +389,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b
void ModelClassifier::set_train_test_error()
{
int n_row = _n_dim + 1;
// Set up Coin-LP objects
ClpSimplex lp_simplex;
lp_simplex.setLogLevel(0);
lp_simplex.setMoreSpecialOptions(2);
......@@ -401,6 +419,7 @@ void ModelClassifier::set_train_test_error()
for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
{
// Set up sorted data for LP calculations
std::vector<int> inds_train(_task_sizes_train[tt], 0);
std::iota(inds_train.begin(), inds_train.end(), task_start_train);
util_funcs::argsort(inds_train.data(), inds_train.data() + inds_train.size(), &_prop_train[task_start_train]);
......@@ -428,7 +447,6 @@ void ModelClassifier::set_train_test_error()
inds_class_train.push_back({inds_train[ii]});
}
}
for(auto& ind_lst : inds_class_train)
_n_class = int(std::round(static_cast<double>(inds_class_train.size()) / static_cast<double>(_task_sizes_train.size())));
for(int ii = 1; ii < inds_test.size(); ++ii)
{
......@@ -443,8 +461,11 @@ void ModelClassifier::set_train_test_error()
}
if(inds_class_test.size() == 0)
{
inds_class_test = std::vector<std::vector<int>>(inds_class_train.size());
}
// Perform the Linear programming
for(int c1 = 0; c1 < inds_class_train.size(); ++c1)
{
int n_col = inds_class_train[c1].size();
......@@ -573,13 +594,13 @@ void ModelClassifier::to_file(std::string filename, bool train, std::vector<int>
out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl;
if(train)
{
out_file_stream << "# Number of Samples in Convex Hull Overlap Region: " << _train_n_convex_overlap << ";";
out_file_stream << "Number of Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_train() << std::endl;
out_file_stream << "# # Samples in Convex Hull Overlap Region: " << _train_n_convex_overlap << ";";
out_file_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_train() << std::endl;
}
else
{
out_file_stream << "# Number of Samples in Convex Hull Overlap Region: " << _test_n_convex_overlap << ";";
out_file_stream << "Number of Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_test() << std::endl;
out_file_stream << "# # Samples in Convex Hull Overlap Region: " << _test_n_convex_overlap << ";";
out_file_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_test() << std::endl;
}
out_file_stream << "# Plane Divider" << std::endl;
......@@ -614,7 +635,7 @@ void ModelClassifier::to_file(std::string filename, bool train, std::vector<int>
out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl;
}
out_file_stream << "# Number of Samples Per Task" << std::endl;
out_file_stream << "# # Samples Per Task" << std::endl;
if(train)
{
out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl;
......
......@@ -56,12 +56,14 @@ public:
/**
* @brief Constructor for the model
*
* @param prop_label The label of the property from the csv
* @param prop_unit The unit of the property
* @param prop_train The property vector for the training samples
* @param prop_test The property vector for the test samples
* @param feats The features for the model
* @param task_sizes_train Number of samples per task in the training data
* @param task_sizes_test Number of samples per task in the test data
* @param fix_intercept if True then the intercept is 0.0
*/
ModelClassifier(
std::string prop_label,
......@@ -121,8 +123,21 @@ public:
*/
ModelClassifier& operator= (ModelClassifier&& o) = default;
virtual double eval(double* x_in);
virtual std::vector<double> eval(std::vector<double>* x_in);
/**
* @brief Evaluate the model for a new point
*
* @param x_in pointer to the new data point (order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given data point
*/
double eval(double* x_in);
/**
* @brief Evaluate the model for a new set of new points
*
* @param x_in a vector of pointers to the set of values for new data points (one pointer per feature, and order the same as appending the results of _feats[nn]->get_x_in_expr_list() for all features)
* @return The prediction of the model for a given set of data points
*/
std::vector<double> eval(std::vector<double>* x_in);
/**
* @brief Read an output file and extract all relevant information
......@@ -131,7 +146,7 @@ public:
* @param filename Name of the output file
* @param train If true then the output represents training data
*
* @return [description]
* @return The set of strings used to create the feature's meta-data
*/
std::vector<std::string> populate_model(std::string filename, bool train);
......@@ -165,9 +180,13 @@ public:
* @param train If true output the training data
* @param test_inds The indexes of the test set
*/
void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {});
void to_file(std::string filename, bool train=true, std::vector<int> test_inds={});
/**
* @brief Set the train/test error of the model using linear programming
*/
void set_train_test_error();
// DocString: model_classn_convex_overlap_train
/**
* @brief The number of samples in overlapping convex hull regions (training data)
......@@ -211,61 +230,64 @@ public:
}
#ifdef PY_BINDINGS
/**
* @brief Construct a new Model with updated coefficient
* @details Copy a model but update its coefficients
*
* @param o Model to be copied
* @param new_coefs The new coefficients
*/
ModelClassifier(const ModelClassifier& o, py::list new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est);
/**
* @brief Construct a new Model with updated coefficient
* @details Copy a model but update its coefficients
*
* @param o Model to be copied
* @param new_coefs The new coefficients
*/
ModelClassifier(const ModelClassifier& o, np::ndarray new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est);
// DocString: model_class_to_file
/**
* @brief Convert the ModelClassifier into an output file
*
* @param filename The name of the file to output to
* @param train If true output the training data
* @param test_inds The indexes of the test set
*/
inline void to_file_py(std::string filename, bool train = true){to_file(filename, train);}
// DocString: model_class_prop_train_est
/**
* @brief The estimation of the property
* @return _prop_train_est as a numpy ndarray
*/
inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);}
// DocString: model_class_prop_test_est
/**
* @brief The estimation of the properties test values
* @return _prop_test_est as a numpy ndarray