diff --git a/src/classification/LPWrapper.cpp b/src/classification/LPWrapper.cpp index f158befffc387a19edce066e5f9f029f465d8044..7b10870c165e0c0b2d53563259dbfb18284c95cd 100644 --- a/src/classification/LPWrapper.cpp +++ b/src/classification/LPWrapper.cpp @@ -45,6 +45,132 @@ LPWrapper::LPWrapper( } setup_constraints(); } +LPWrapper::LPWrapper(const LPWrapper& o) : + _elements(o._elements), + _row_lower(o._row_lower), + _row_upper(o._row_upper), + _row_lower_test(o._row_lower_test), + _row_upper_test(o._row_upper_test), + _objective(o._objective), + _column_lower(o._column_lower), + _column_upper(o._column_upper), + _column_start(o._column_start), + _row(o._row), + _row_test(o._row_test), + _n_row_per_class(o._n_row_per_class), + _n_row_per_class_test(o._n_row_per_class_test), + _tol(o._tol), + _n_dim(o._n_dim), + _n_samp(o._n_samp), + _n_samp_test(o._n_samp_test), + _n_class(o._n_class), + _n_row(o._n_row), + _task_num(o._task_num), + _n_col(o._n_col), + _n_overlap(o._n_overlap), + _n_overlap_test(o._n_overlap_test), + _n_pts_check(o._n_pts_check), + _n_pts_check_test(o._n_pts_check_test) +{ + setup_constraints(); +} + +LPWrapper::LPWrapper(LPWrapper&& o) : + _elements(std::move(o._elements)), + _row_lower(std::move(o._row_lower)), + _row_upper(std::move(o._row_upper)), + _row_lower_test(std::move(o._row_lower_test)), + _row_upper_test(std::move(o._row_upper_test)), + _objective(std::move(o._objective)), + _column_lower(std::move(o._column_lower)), + _column_upper(std::move(o._column_upper)), + _column_start(std::move(o._column_start)), + _row(std::move(o._row)), + _row_test(std::move(o._row_test)), + _n_row_per_class(std::move(o._n_row_per_class)), + _n_row_per_class_test(std::move(o._n_row_per_class_test)), + _tol(o._tol), + _n_dim(o._n_dim), + _n_samp(o._n_samp), + _n_samp_test(o._n_samp_test), + _n_class(o._n_class), + _n_row(o._n_row), + _task_num(o._task_num), + _n_col(o._n_col), + _n_overlap(o._n_overlap), + _n_overlap_test(o._n_overlap_test), + _n_pts_check(o._n_pts_check), + _n_pts_check_test(o._n_pts_check_test) +{ + setup_constraints(); +} + +LPWrapper& LPWrapper::operator= (const LPWrapper& o) +{ + _elements = o._elements; + _row_lower = o._row_lower; + _row_upper = o._row_upper; + _row_lower_test = o._row_lower_test; + _row_upper_test = o._row_upper_test; + _objective = o._objective; + _column_lower = o._column_lower; + _column_upper = o._column_upper; + _column_start = o._column_start; + _row = o._row; + _row_test = o._row_test; + _n_row_per_class = o._n_row_per_class; + _n_row_per_class_test = o._n_row_per_class_test; + _tol = o._tol; + _n_dim = o._n_dim; + _n_samp = o._n_samp; + _n_samp_test = o._n_samp_test; + _n_class = o._n_class; + _n_row = o._n_row; + _task_num = o._task_num; + _n_col = o._n_col; + _n_overlap = o._n_overlap; + _n_overlap_test = o._n_overlap_test; + _n_pts_check = o._n_pts_check; + _n_pts_check_test = o._n_pts_check_test; + + setup_constraints(); + + return *this; +} + +LPWrapper& LPWrapper::operator= (LPWrapper&& o) +{ + _elements = std::move(o._elements); + _row_lower = std::move(o._row_lower); + _row_upper = std::move(o._row_upper); + _row_lower_test = std::move(o._row_lower_test); + _row_upper_test = std::move(o._row_upper_test); + _objective = std::move(o._objective); + _column_lower = std::move(o._column_lower); + _column_upper = std::move(o._column_upper); + _column_start = std::move(o._column_start); + _row = std::move(o._row); + _row_test = std::move(o._row_test); + _n_row_per_class = std::move(o._n_row_per_class); + _n_row_per_class_test = std::move(o._n_row_per_class_test); + _tol = std::move(o._tol); + _n_dim = std::move(o._n_dim); + _n_samp = std::move(o._n_samp); + _n_samp_test = 
std::move(o._n_samp_test); + _n_class = std::move(o._n_class); + _n_row = std::move(o._n_row); + _task_num = std::move(o._task_num); + _n_col = std::move(o._n_col); + _n_overlap = std::move(o._n_overlap); + _n_overlap_test = std::move(o._n_overlap_test); + _n_pts_check = std::move(o._n_pts_check); + _n_pts_check_test = std::move(o._n_pts_check_test); + + setup_constraints(); + + return *this; +} + void LPWrapper::setup_constraints() { @@ -204,7 +330,6 @@ int LPWrapper::get_n_overlap(const std::vector<int> inds) { _simplex.chgRowLower(&_row_lower[ii * _n_row]); _simplex.chgRowUpper(&_row_upper[ii * _n_row]); - _simplex.dual(); _n_overlap += _simplex.primalFeasible(); } @@ -216,7 +341,6 @@ void LPWrapper::set_n_overlap(const std::vector<double*> val_ptrs, const std::ve { _n_overlap = 0; _n_overlap_test = 0; - for(int cc = 0; cc < _n_class; ++cc) { _n_col = _n_row_per_class[cc]; diff --git a/src/classification/LPWrapper.hpp b/src/classification/LPWrapper.hpp index c9adc8609a0b309d1676d7f85db9f285ae059bd5..c7df6cf243c973e55557c2c7f768974e76a4968c 100644 --- a/src/classification/LPWrapper.hpp +++ b/src/classification/LPWrapper.hpp @@ -10,6 +10,7 @@ #include <algorithm> #include <iostream> +#include <memory> #include <numeric> #include <coin/ClpSimplex.hpp> @@ -37,14 +38,14 @@ protected: std::vector<int> _n_row_per_class; //!< Number of rows per class std::vector<int> _n_row_per_class_test; //!< Number of rows per class in the test set - const double _tol; //!< tolerance value (\epsilon) for the final row constraint + double _tol; //!< tolerance value (\epsilon) for the final row constraint - const int _n_dim; //!< The number of dimensions for the SVM problem - const int _n_samp; //!< The number of samples per feature - const int _n_samp_test; //!< The number of samples per feature in the test set - const int _n_class; //!< The number of classes in the data set - const int _n_row; //!< The number of rows in the A matrix - const int _task_num; //!< The task number the LPWrapper is used for + int _n_dim; //!< The number of dimensions for the SVM problem + int _n_samp; //!< The number of samples per feature + int _n_samp_test; //!< The number of samples per feature in the test set + int _n_class; //!< The number of classes in the data set + int _n_row; //!< The number of rows in the A matrix + int _task_num; //!< The task number the LPWrapper is used for int _n_col; //!< The number of columns in the A matrix int _n_overlap; //!< The number of misclassified data points in the training set int _n_overlap_test; //!< The number of misclassified data points in the test set @@ -75,6 +76,34 @@ public: const int n_samp_test=0 ); + /** + * @brief Copy Constructor + * + * @param o LPWrapper to be copied + */ + LPWrapper(const LPWrapper& o); + + /** + * @brief Move Constructor + * + * @param o LPWrapper to be moved + */ + LPWrapper(LPWrapper&& o); + + /** + * @brief Copy Assignment operator + * + * @param o LPWrapper to be copied + */ + LPWrapper& operator= (const LPWrapper& o); + + /** + * @brief Move Assignment operator + * + * @param o LPWrapper to be moved + */ + LPWrapper& operator= (LPWrapper&& o); + /** * @brief Set up the constraint matrix for the LP problem */ diff --git a/src/classification/SVMWrapper.cpp b/src/classification/SVMWrapper.cpp index 97a0e1a04eef6e353f7ffcb207281832aaf9061e..b70d501733eb6cfd723f8d131776f5796d903360 100644 --- a/src/classification/SVMWrapper.cpp +++ b/src/classification/SVMWrapper.cpp @@ -15,7 +15,6 @@ SVMWrapper::SVMWrapper(const int n_class, const int n_dim, const int 
n_samp, con _n_samp(n_samp), _n_class(n_class) { - // std::copy_n(prop, _n_samp, _y.data()); setup_parameter_obj(_C); setup_x_space(); @@ -62,7 +61,6 @@ SVMWrapper::SVMWrapper(const double C, const int n_class, const int n_dim, const _n_samp(n_samp), _n_class(n_class) { - // std::copy_n(prop, _n_samp, _y.data()); setup_parameter_obj(_C); setup_x_space(); diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp index d6a9f2905b9338a0ec7d1ed25ea0de60764842a1..82ea272296b19cc8cfad53b2d80581e6f964a651 100644 --- a/src/descriptor_identifier/Model/Model.cpp +++ b/src/descriptor_identifier/Model/Model.cpp @@ -3,34 +3,33 @@ Model::Model( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ) : _n_samp_train(feats[0]->n_samp()), _n_samp_test(feats[0]->n_test_samp()), _n_dim(feats.size()), _feats(feats), - _prop_train(prop_train), - _prop_test(prop_test), - _D_train(_n_samp_train * (feats.size() + (1 - fix_intercept))), - _D_test(_n_samp_test * (feats.size() + (1 - fix_intercept))), - _prop_train_est(_prop_train), - _prop_test_est(_prop_test), - _train_error(_n_samp_train), - _test_error(_n_samp_test), + _prop_train(loss->prop_train()), + _prop_test(loss->prop_test()), + _D_train(_n_samp_train * (feats.size() + (1 - loss->fix_intercept()))), + _D_test(_n_samp_test * (feats.size() + (1 - loss->fix_intercept()))), + _prop_train_est(_n_samp_train, 0.0), + _prop_test_est(_n_samp_test, 0.0), + _train_error(_n_samp_train, 0.0), + _test_error(_n_samp_test, 0.0), _leave_out_inds(leave_out_inds), - _task_sizes_train(task_sizes_train), - _task_sizes_test(task_sizes_test), + _task_sizes_train(loss->task_sizes_train()), + _task_sizes_test(loss->task_sizes_test()), + _loss(loss), _prop_label(prop_label), _prop_unit(prop_unit), _task_eval(0), - _fix_intercept(fix_intercept) -{} + _fix_intercept(loss->fix_intercept()) +{ + _loss->set_nfeat(_feats.size()); +} Model::Model() : _task_eval(0) diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index dd1878e9d0c85420f84533f214876f3d49a1ab9e..fae885cca0302d39ce9429e752221a1288776e78 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -19,6 +19,9 @@ #include <iostream> #include "feature_creation/node/ModelNode.hpp" + +#include "loss_function/utils.hpp" + #include "utils/string_utils.hpp" #include "utils/vector_utils.hpp" @@ -59,6 +62,7 @@ protected: std::vector<int> _task_sizes_train; //!< Number of training samples in each task std::vector<int> _task_sizes_test; //!< Number of testing samples in each task + std::shared_ptr<LossFunction> _loss; //!< The loss function for the calculation std::string _prop_label; //!< label for the model Unit _prop_unit; //!< The Unit for the property @@ -88,13 +92,9 @@ public: Model( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ); /** 
@@ -125,6 +125,13 @@ public: */ Model& operator= (Model&& o) = default; + /** + * @brief Copy the error into a new array + * + * @param res pointer to the beginning of the vector to store the residual + */ + virtual void copy_error(double* res) const = 0; + /** * @brief Read an output file and extract all relevant information * @details Takes in an output file and extracts all data needed to recreate the model diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp index 8d4c73053ed1068899a9097a479b42aa0df6d18a..1be29cbbe90dba37aa2a26c452cc458e1f5598bb 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.cpp +++ b/src/descriptor_identifier/Model/ModelClassifier.cpp @@ -3,17 +3,14 @@ ModelClassifier::ModelClassifier( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ) : - Model(prop_label, prop_unit, prop_train, prop_test, feats, leave_out_inds, task_sizes_train, task_sizes_test, fix_intercept), + Model(prop_label, prop_unit, loss, feats, leave_out_inds), _train_n_convex_overlap(0), - _test_n_convex_overlap(0) + _test_n_convex_overlap(0), + _n_class(loss->n_class()) { _prop_train_est.reserve(_n_samp_train); _prop_test_est.reserve(_n_samp_test); @@ -80,6 +77,14 @@ ModelClassifier::ModelClassifier(const std::string train_file) int file_train_n_convex_overlap = _train_n_convex_overlap; _train_n_convex_overlap = 0; + _loss = std::make_shared<LossFunctionConvexHull>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); + set_train_test_error(); if((file_train_n_convex_overlap != _train_n_convex_overlap)) { @@ -106,6 +111,14 @@ ModelClassifier::ModelClassifier(const std::string train_file, const std::string int file_test_n_convex_overlap = _test_n_convex_overlap; _test_n_convex_overlap = 0; + _loss = std::make_shared<LossFunctionConvexHull>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); + set_train_test_error(); if((file_train_n_convex_overlap != _train_n_convex_overlap) || (file_test_n_convex_overlap != _test_n_convex_overlap)) { @@ -155,112 +168,11 @@ void ModelClassifier::set_error_from_file(std::string error_line, bool train) void ModelClassifier::set_train_test_error() { - // Calculate number of overlapping points by checking feasibility of a linear optimization problem - int task_start_train = 0; - int task_start_test = 0; - - _train_n_convex_overlap = 0; - _test_n_convex_overlap = 0; - - std::vector<double> unique_classes = vector_utils::unique<double>(_prop_train); - std::sort(unique_classes.begin(), unique_classes.end()); - _n_class = unique_classes.size(); - - std::map<double, int> cls_map; - for(int cc = 0; cc < _n_class; ++cc) - { - cls_map[unique_classes[cc]] = cc; - } - - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - { - // Set up sorted data for LP claculations - std::vector<int> inds_train(_task_sizes_train[tt], 0); - std::iota(inds_train.begin(), inds_train.end(), task_start_train); - util_funcs::argsort<double>(inds_train.data(), inds_train.data() + inds_train.size(), &_prop_train[task_start_train]); - - std::vector<int> 
inds_test(_task_sizes_test[tt], 0); - std::iota(inds_test.begin(), inds_test.end(), task_start_test); - util_funcs::argsort<double>(inds_test.data(), inds_test.data() + inds_test.size(), &_prop_test[task_start_test]); - - // Sort the value/test_value vectors based on the classes - std::vector<std::vector<double>> values(_n_dim, std::vector<double>(_task_sizes_train[tt])); - std::vector<std::vector<double>> test_values(_n_dim, std::vector<double>(_task_sizes_test[tt])); - - std::vector<double*> value_ptr_list(_n_dim); - std::vector<double*> test_value_ptr_list(_n_dim); - for(int ff = 0; ff < _n_dim; ++ff) - { - double* val_ptr = _feats[ff]->value_ptr(); - std::transform( - inds_train.begin(), - inds_train.end(), - values[ff].data(), - [val_ptr](int ind){return val_ptr[ind];} - ); - value_ptr_list[ff] = values[ff].data(); - - val_ptr = _feats[ff]->test_value_ptr(); - std::transform( - inds_test.begin(), - inds_test.end(), - test_values[ff].data(), - [val_ptr](int ind){return val_ptr[ind];} - ); - test_value_ptr_list[ff] = test_values[ff].data(); - } - - // Get the number of samples in each class - std::vector<int> n_samp_class_train(_n_class, 0); - std::for_each(inds_train.begin(), inds_train.end(), [&](int ind){++n_samp_class_train[cls_map[_prop_train[ind]]];}); - - std::vector<int> n_samp_class_test(_n_class); - std::for_each(inds_test.begin(), inds_test.end(), [&](int ind){++n_samp_class_test[cls_map[_prop_test[ind]]];}); - - // Perform the LP calculations - std::vector<double> err(_task_sizes_train[tt], 0.0); - std::vector<double> test_err(_task_sizes_test[tt], 0.0); - LPWrapper lp( - n_samp_class_train, - tt, - _n_class, - _n_dim, - _n_samp_train, - 1e-5, - n_samp_class_test, - _n_samp_test - ); - lp.set_n_overlap(value_ptr_list, test_value_ptr_list, err.data(), test_err.data()); - _train_n_convex_overlap += lp.n_overlap(); - _test_n_convex_overlap += lp.n_overlap_test(); - - for(int ii = 0; ii < inds_train.size(); ++ii) - { - _train_error[inds_train[ii]] = err[ii]; - } - for(int ii = 0; ii < inds_test.size(); ++ii) - { - _test_error[inds_test[ii]] = test_err[ii]; - } + _train_n_convex_overlap = (*_loss)(_feats); + _test_n_convex_overlap = _loss->test_loss(_feats); - task_start_train += _task_sizes_train[tt]; - task_start_test += _task_sizes_test[tt]; - } - - std::transform( - _train_error.begin(), - _train_error.end(), - _prop_train.begin(), - _train_error.begin(), - [](double err, double real){return (err < 1e-10) ? -1 : real;} - ); - std::transform( - _test_error.begin(), - _test_error.end(), - _prop_test.begin(), - _test_error.begin(), - [](double err, double real){return (err < 1e-10) ? 
-1 : real;} - ); + _train_error = _loss->error_train(); + _test_error = _loss->error_test(); } std::string ModelClassifier::toString() const diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp index 2a6ea860a34f7b2c0e5c6019b422e72955c221af..f45d830d80ce189057e8bf1c1032363a51936ff0 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.hpp +++ b/src/descriptor_identifier/Model/ModelClassifier.hpp @@ -64,13 +64,9 @@ public: ModelClassifier( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ); // DocString: model_class_init_str diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.cpp b/src/descriptor_identifier/Model/ModelLogRegressor.cpp index 0b1e0ce96f4de13a850303f1501e94396ae9cf9c..74e3137c5b506243595b7c3ca5644528229f17a3 100644 --- a/src/descriptor_identifier/Model/ModelLogRegressor.cpp +++ b/src/descriptor_identifier/Model/ModelLogRegressor.cpp @@ -6,15 +6,11 @@ ModelLogRegressor::ModelLogRegressor() ModelLogRegressor::ModelLogRegressor( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ) : - ModelRegressor(prop_label, prop_unit, prop_train, prop_test, feats, leave_out_inds, task_sizes_train, task_sizes_test, fix_intercept, false), + ModelRegressor(prop_label, prop_unit, loss, feats, leave_out_inds), _log_prop_train(_n_samp_train, 0.0), _log_prop_test(_n_samp_test, 0.0), _log_prop_train_est(_n_samp_train, 0.0), @@ -22,102 +18,32 @@ ModelLogRegressor::ModelLogRegressor( _log_train_error(_n_samp_train, 0.0), _log_test_error(_n_samp_test, 0.0) { - // Set up the property/log of the property - std::transform(_prop_train.begin(), _prop_train.end(), _log_prop_train.begin(),[](double p){return std::log(p);}); - std::transform(_prop_test.begin(), _prop_test.end(), _log_prop_test.begin(),[](double p){return std::log(p);}); - - _log_prop_train.reserve(_n_samp_train); - _log_prop_test.reserve(_n_samp_test); - - _log_prop_train_est.reserve(_n_samp_train); - _log_prop_test_est.reserve(_n_samp_test); - - std::vector<double> a(_D_train.size(), 1.0); - std::vector<double> work(_D_train.size(), 0.0); - - int info; - int start = 0; - // Perform least squares regression to get the coefficients and then get log property estimates - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - { - int sz = _task_sizes_train[tt]; - std::fill_n(a.data(), a.size(), 1.0); - std::fill_n(_D_train.data(), _D_train.size(), 1.0); - - for(int ff = 0; ff < feats.size(); ++ff) - { - std::transform( - feats[ff]->value_ptr() + start, - feats[ff]->value_ptr() + start + sz, - _D_train.data() + ff * sz, - [](double fv){return std::log(fv);} - ); - std::transform( - feats[ff]->value_ptr() + start, - feats[ff]->value_ptr() + start + sz, - a.data() + ff * sz, - [](double fv){return std::log(fv);} - ); - } - - dgels_('N', sz, _n_dim + 1 - 
_fix_intercept, 1, a.data(), sz, _log_prop_train.data() + start, sz, work.data(), work.size(), &info); - - _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0)); - std::copy_n(_log_prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs[tt].data()); - std::transform( - _prop_train.begin() + start, - _prop_train.begin() + start + sz, - _log_prop_train.begin() + start, - [](double p){return std::log(p);} - ); - - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _log_prop_train_est.data() + start, 1); - std::transform( - _log_prop_train_est.begin() + start, - _log_prop_train_est.begin() + start + sz, - _log_prop_train.data() + start, - _log_train_error.data() + start, - std::minus<double>() - ); - - start += sz; - } - - start = 0; - int ii = 0; - for(auto& sz : _task_sizes_test) - { - if(sz > 0) - { - std::fill_n(_D_test.data(), _D_test.size(), 1.0); - for(int ff = 0; ff < feats.size(); ++ff) - { - std::transform( - feats[ff]->test_value_ptr() + start, - feats[ff]->test_value_ptr() + start + sz, - _D_test.data() + ff * sz, - [](double fv){return std::log(fv);} - ); - } - - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _log_prop_test_est.data() + start, 1); - std::transform( - _log_prop_test_est.begin() + start, - _log_prop_test_est.begin() + start + sz, - _log_prop_test.data() + start, - _log_test_error.data() + start, - std::minus<double>() - ); - } - ++ii; - start += sz; - } - // Transform to get the error/property estimates - std::transform(_log_prop_train_est.begin(), _log_prop_train_est.end(), _prop_train_est.begin(), [](double lp){return std::exp(lp);}); - std::transform(_log_prop_test_est.begin(), _log_prop_test_est.end(), _prop_test_est.begin(), [](double lp){return std::exp(lp);}); - - std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), _train_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); - std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), _test_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); + std::copy_n(_prop_train.begin(), _prop_train.size(), _log_prop_train.begin()); + std::copy_n(_prop_test.begin(), _prop_test.size(), _log_prop_test.begin()); + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train.begin(), [](double pt){return std::exp(pt);}); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test.begin(), [](double pt){return std::exp(pt);}); + + std::copy_n(_prop_train_est.begin(), _prop_train_est.size(), _log_prop_train_est.begin()); + std::copy_n(_prop_test_est.begin(), _prop_test_est.size(), _log_prop_test_est.begin()); + std::transform(_prop_train_est.begin(), _prop_train_est.end(), _prop_train_est.begin(), [](double pt){return std::exp(pt);}); + std::transform(_prop_test_est.begin(), _prop_test_est.end(), _prop_test_est.begin(), [](double pt){return std::exp(pt);}); + + std::copy_n(_train_error.begin(), _train_error.size(), _log_train_error.begin()); + std::copy_n(_test_error.begin(), _test_error.size(), _log_test_error.begin()); + std::transform( + _prop_train.begin(), + _prop_train.end(), + _prop_train_est.begin(), + _train_error.begin(), + [](double p, double p_est){return p - p_est;} + ); + std::transform( + _prop_test.begin(), + _prop_test.end(), + _prop_test_est.begin(), + _test_error.begin(), + [](double p, double p_est){return p - p_est;} + ); } ModelLogRegressor::ModelLogRegressor(const std::string train_file) @@ 
-126,6 +52,14 @@ ModelLogRegressor::ModelLogRegressor(const std::string train_file) std::vector<std::string> feature_expr_train = populate_model(train_file, true); create_model_nodes_from_expressions(feature_expr_train); + _loss = std::make_shared<LossFunctionLogPearsonRMSE>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); + _log_prop_train.resize(_n_samp_train, 0.0); _log_prop_test.resize(_n_samp_test, 0.0); _log_prop_train_est.resize(_n_samp_train, 0.0); @@ -153,6 +87,14 @@ ModelLogRegressor::ModelLogRegressor(const std::string train_file, const std::st create_model_nodes_from_expressions(feature_expr_train); + _loss = std::make_shared<LossFunctionLogPearsonRMSE>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); + _log_prop_train.resize(_n_samp_train, 0.0); _log_prop_test.resize(_n_samp_test, 0.0); _log_prop_train_est.resize(_n_samp_train, 0.0); diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.hpp b/src/descriptor_identifier/Model/ModelLogRegressor.hpp index dad7ee4b741883b43f1b66277953f1ace8f4f005..991d460fd6047f213bcae50935a7fcff46e245e0 100644 --- a/src/descriptor_identifier/Model/ModelLogRegressor.hpp +++ b/src/descriptor_identifier/Model/ModelLogRegressor.hpp @@ -59,13 +59,9 @@ public: ModelLogRegressor( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept + const std::vector<int> leave_out_inds ); // DocString: model_log_reg_init_str diff --git a/src/descriptor_identifier/Model/ModelRegressor.cpp b/src/descriptor_identifier/Model/ModelRegressor.cpp index 781f8a3199c84883f0c6fc2a01d040fc550ea3eb..0bf44e0248846c97830656735bc737fbf951bc1b 100644 --- a/src/descriptor_identifier/Model/ModelRegressor.cpp +++ b/src/descriptor_identifier/Model/ModelRegressor.cpp @@ -6,79 +6,41 @@ ModelRegressor::ModelRegressor() ModelRegressor::ModelRegressor( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept, - const bool fill_vecs + const std::vector<int> leave_out_inds ) : - Model(prop_label, prop_unit, prop_train, prop_test, feats, leave_out_inds, task_sizes_train, task_sizes_test, fix_intercept) + Model(prop_label, prop_unit, loss, feats, leave_out_inds) { _prop_train_est.reserve(_n_samp_train); _prop_test_est.reserve(_n_samp_test); - // Perform least squares regression to get the coefficient then calculate an estimate of the property - if(fill_vecs) + double rmse = (*_loss)(_feats); + _train_error = _loss->error_train(); + _test_error = _loss->error_test(); + + std::transform( + _prop_train.begin(), + _prop_train.end(), + _train_error.begin(), + _prop_train_est.begin(), + [](double prop, double err){return -1.0 * (err - prop);} + ); + + std::transform( + _prop_test.begin(), + _prop_test.end(), + _test_error.begin(), + _prop_test_est.begin(), + [](double prop, double err){return -1.0 * (err - prop);} + ); + + std::vector<double> coefs = 
_loss->coefs(); + int start = 0; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) { - std::vector<double> a(_D_train.size(), 1.0); - std::vector<double> work(_D_train.size(), 0.0); - int info; - int start = 0; - - std::copy_n(_D_train.begin(), _D_train.size(), a.begin()); - for(auto& sz : _task_sizes_train) - { - std::fill_n(a.data(), a.size(), 1.0); - std::fill_n(_D_train.data(), _D_train.size(), 1.0); - for(int ff = 0; ff < feats.size(); ++ff) - { - std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz); - std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz); - } - - dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, prop_train.data() + start, sz, work.data(), work.size(), &info); - - _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0)); - - std::copy_n(prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs.back().data()); - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1); - std::transform( - _prop_train_est.begin() + start, - _prop_train_est.begin() + start + sz, - _prop_train.data() + start, - _train_error.data() + start, - std::minus<double>() - ); - - start += sz; - } - - start = 0; - int ii = 0; - for(auto& sz : _task_sizes_test) - { - if(sz > 0) - { - std::fill_n(_D_test.data(), _D_test.size(), 1.0); - for(int ff = 0; ff < feats.size(); ++ff) - { - std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz); - } - - dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1); - std::transform( - _prop_test_est.begin() + start, - _prop_test_est.begin() + start + sz, - _prop_test.data() + start, - _test_error.data() + start, std::minus<double>() - ); - } - ++ii; - start += sz; - } + _coefs.push_back(std::vector<double>(_loss->n_dim(), 0.0)); + std::copy_n(coefs.begin() + tt * _loss->n_dim(), _loss->n_dim(), _coefs.back().begin()); } } @@ -88,6 +50,14 @@ ModelRegressor::ModelRegressor(const std::string train_file) std::vector<std::string> feature_expr_train = populate_model(train_file, true); create_model_nodes_from_expressions(feature_expr_train); + + _loss = std::make_shared<LossFunctionPearsonRMSE>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); } ModelRegressor::ModelRegressor(const std::string train_file, std::string test_file) @@ -96,6 +66,14 @@ ModelRegressor::ModelRegressor(const std::string train_file, std::string test_fi std::vector<std::string> feature_expr_test = populate_model(test_file, false); create_model_nodes_from_expressions(feature_expr_train); + + _loss = std::make_shared<LossFunctionPearsonRMSE>( + _prop_train, + _prop_test, + _task_sizes_train, + _task_sizes_test + ); + _loss->set_nfeat(_feats.size()); } double ModelRegressor::eval(double* x_in) const diff --git a/src/descriptor_identifier/Model/ModelRegressor.hpp b/src/descriptor_identifier/Model/ModelRegressor.hpp index 318e8a33b40ea5f708d6158a74ee99cff3398ce2..13537a8d62dc1b1f45fbbf96878d51d259242a13 100644 --- a/src/descriptor_identifier/Model/ModelRegressor.hpp +++ b/src/descriptor_identifier/Model/ModelRegressor.hpp @@ -58,14 +58,9 @@ public: ModelRegressor( const std::string prop_label, const Unit prop_unit, - const std::vector<double> prop_train, - const std::vector<double> prop_test, + const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const 
std::vector<int> leave_out_inds, - const std::vector<int> task_sizes_train, - const std::vector<int> task_sizes_test, - const bool fix_intercept, - const bool fill_vecs=true + const std::vector<int> leave_out_inds ); // DocString: model_reg_init_str diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp index 3ad3c65b7c2e47a6b6d02dc0d708c5021e809b01..5e711859284484e779ba58274ae4c4d69a12153c 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp @@ -13,62 +13,33 @@ SISSOClassifier::SISSOClassifier( const int n_residual, const int n_models_store ): - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(1000.0), - _width(1.0e-5), - _n_class(1) + _width(1.0e-5) { - check_prop_test(prop, prop_test); - setup_prop(prop, prop_test); + _prop = _loss->prop_train(); + _prop_test = _loss->prop_test(); + setup_d_mat_transfer(); } -void SISSOClassifier::check_prop_test(const std::vector<double> prop, const std::vector<double> prop_test) const +void SISSOClassifier::setup_d_mat_transfer() { - for(auto& pt : prop_test) - { - if(std::none_of(prop.begin(), prop.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;})) - { - throw std::logic_error("A class in the property vector (test set) is not in the training set."); - } - } -} - -void SISSOClassifier::setup_prop(std::vector<double> prop, std::vector<double> prop_test) -{ - std::vector<double> unique_classes = vector_utils::unique<double>(prop); - std::map<double, double> class_map; - _n_class = unique_classes.size(); - for(int cc = 0; cc < _n_class; ++cc) - { - class_map[unique_classes[cc]] = static_cast<double>(cc); - } - - std::transform(prop.begin(), prop.end(), prop.begin(), [&class_map](double pp){return class_map[pp];}); - std::transform(prop_test.begin(), prop_test.end(), prop_test.begin(), [&class_map](double pp){return class_map[pp];}); - - std::vector<bool> not_int(prop.size()); - std::transform(prop.begin(), prop.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); - if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) - { - throw std::logic_error("For classification the property (training set) must be an integer"); - } - - not_int.resize(prop_test.size()); - std::transform(prop_test.begin(), prop_test.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}); - if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;})) - { - throw std::logic_error("For classification the property (test set) must be an integer"); - } - - std::copy_n(prop.begin(), prop.size(), _prop.begin()); - std::copy_n(prop_test.begin(), prop_test.size(), _prop_test.begin()); -} - -std::vector<LPWrapper> SISSOClassifier::setup_lp(const int n_dim) -{ - std::vector<int> n_samp_per_class; + _n_class = vector_utils::unique<double>(_prop).size(); int task_start = 0; - std::vector<LPWrapper> lp = {}; for(int tt = 0; tt < _n_task; ++tt) { std::vector<int> inds(_task_sizes_train[tt]); @@ -81,27 +52,9 @@ std::vector<LPWrapper> SISSOClassifier::setup_lp(const int n_dim) for(int ii = 1; ii < inds.size(); ++ii) { 
_sample_inds_to_sorted_dmat_inds[inds[ii]] = ii + task_start; - if(_prop[inds[ii]] != _prop[inds[ii - 1]]) - { - n_samp_per_class.push_back(ii - cls_start); - cls_start = ii; - } } - n_samp_per_class.push_back(inds.size() - cls_start); - - if(n_samp_per_class.size() != (tt + 1) * _n_class) - { - throw std::logic_error("A class is not represented in task " + std::to_string(tt) + "."); - } - task_start += _task_sizes_train[tt]; - std::vector<int> samp_per_class(_n_class); - std::copy_n(n_samp_per_class.begin() + tt * _n_class, _n_class, samp_per_class.begin()); - lp.push_back(LPWrapper(samp_per_class, tt, _n_class, n_dim, std::accumulate(samp_per_class.begin(), samp_per_class.end(), 0), _width)); } - prop_sorted_d_mat::initialize_sroted_d_matrix_arr(0, _n_task, _n_class, n_samp_per_class); - - return lp; } std::array<double, 2> SISSOClassifier::svm_error(std::vector<SVMWrapper>& svm, const std::vector<int>& feat_inds) const @@ -162,7 +115,6 @@ void SISSOClassifier::transfer_d_mat_to_sorted() const void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) { - int n_get_models = std::max(_n_residual, _n_models_store); std::vector<double> coefs(n_dim + 1, 0.0); @@ -186,13 +138,13 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) util_funcs::iterate(inds, inds.size(), _mpi_comm->rank()); int max_error_ind = 0; - std::vector<LPWrapper> lp_vec = setup_lp(n_dim); transfer_d_mat_to_sorted(); if(inds.back() >= 0) { - #pragma omp parallel firstprivate(max_error_ind, lp_vec) + #pragma omp parallel firstprivate(max_error_ind) { + std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(_loss); std::vector<int> min_inds_private(min_inds); std::vector<int> min_n_convex_overlap_private(min_n_convex_overlap); std::vector<double> min_svm_score_private(min_svm_score); @@ -215,11 +167,7 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) for(unsigned long long int ii = 0; ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size())) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - int n_convex_overlap = 0; - for(auto& lp : lp_vec) - { - n_convex_overlap += lp.get_n_overlap(inds); - } + int n_convex_overlap = (*loss_copy)(inds); if(n_convex_overlap <= min_n_convex_overlap_private[max_error_ind]) { @@ -312,7 +260,7 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) ); inds = util_funcs::argsort<double>(scores); - std::vector<model_node_ptr> min_nodes(n_dim); + std::vector<std::vector<model_node_ptr>> min_nodes(n_get_models, std::vector<model_node_ptr>(n_dim)); std::vector<ModelClassifier> models; for(int rr = 0; rr < n_get_models; ++rr) @@ -321,12 +269,13 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) for(int ii = 0; ii < n_dim; ++ii) { int index = all_min_inds[inds[rr] * n_dim + ii]; - min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); + min_nodes[rr][ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); } - models.push_back(ModelClassifier(_prop_label, _prop_unit, _prop, _prop_test, min_nodes, _leave_out_inds, _task_sizes_train, _task_sizes_test, _fix_intercept)); + models.push_back(ModelClassifier(_prop_label, _prop_unit, _loss, min_nodes[rr], _leave_out_inds)); } _models.push_back(models); + _loss->reset_projection_prop(min_nodes); } void SISSOClassifier::fit() diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp index 
1cc7b9b8924fcce0ced087828abd18ff50be421b..87094adfb8275e5e63823df57f30c895bd03533c 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp @@ -72,27 +72,6 @@ public: const int n_models_store ); - /** - * @brief Check to ensure all classes in prop_test are in prop - * - * @param prop The property vector for the training data - * @param prop_test The property vector for the test data - */ - void check_prop_test(const std::vector<double> prop, const std::vector<double> prop_test) const; - - /** - * @brief Convert the property vectors into something that SharkML's SVM expects - * - * @param prop The property vector for the training data - * @param prop_test The property vector for the test data - */ - void setup_prop(std::vector<double> prop, std::vector<double> prop_test); - - /** - * @brief set up the data structures for finding the number of overlap points for a linear programming problem - */ - std::vector<LPWrapper> setup_lp(const int n_dim = 1); - /** * @brief Transfer data from the node value array D-matrix into a property sorted array */ @@ -120,6 +99,11 @@ public: */ int get_max_error_ind(const int n_models, const int* n_convex_overlap, const double* svm_score, const double* svm_margin, double* scores) const; + /** + * @brief Set up _sample_inds_to_sorted_dmat_inds + */ + void setup_d_mat_transfer(); + /** * @brief Perform the l0 normalization for a property or the residual * diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp index 44b3d33371ef47617999bdfc66237e2fb1574c3a..68e4d1918c624c0ebc1b09ed01a059adf0a7e1cc 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp @@ -18,41 +18,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); -} - -void SISSOLogRegressor::set_a(const std::vector<int>& inds, const int start, const int n_samp, double* a) -{ - for(int ii = 0; ii < inds.size(); ++ii) - { - std::transform( - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, - a + ii * n_samp, - [](double v){return std::log(v);} - ); - } - - if(!_fix_intercept) - { - std::fill_n(a + inds.size() * n_samp, n_samp, 1.0); - } -} - -void SISSOLogRegressor::set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const -{ - std::fill_n(error + start, n_samp, -1 * (!_fix_intercept) * coefs[inds.size()]); - for(int ii = 0; ii < inds.size(); ++ii) - { - std::transform( - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, - error + start, - error + start, - [&coefs, &ii](double fv, double e){return e - coefs[ii] * std::log(fv);} - ); - } - - daxpy_(n_samp, 1.0, _prop.data() + start, 1, error + start, 1); + _loss = loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } void SISSOLogRegressor::output_models(std::vector<double>& residual) @@ -76,26 +49,29 @@ void SISSOLogRegressor::output_models(std::vector<double>& residual) } } -void SISSOLogRegressor::add_model(const std::vector<int> indexes) +void SISSOLogRegressor::add_models(const std::vector<std::vector<int>> indexes) { - if(_models.size() < 
indexes.size()) - { - _models.push_back({}); - } - std::vector<model_node_ptr> min_nodes(indexes.size()); - for(int ii = 0; ii < indexes.size(); ++ii) + _models.push_back({}); + std::vector<std::vector<model_node_ptr>> min_nodes; + + for(auto& inds: indexes) { - int index = indexes[ii]; - min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); + min_nodes.push_back(std::vector<model_node_ptr>(inds.size())); + for(int ii = 0; ii < inds.size(); ++ii) + { + int index = inds[ii]; + min_nodes.back()[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); + } + ModelLogRegressor model(_prop_label, _prop_unit, _loss, min_nodes.back(), _leave_out_inds); + _models.back().push_back(model); } - ModelLogRegressor model(_prop_label, _prop_unit, _prop_no_log, _prop_test, min_nodes, _leave_out_inds, _task_sizes_train, _task_sizes_test, _fix_intercept); - _models.back().push_back(model); + + _loss->reset_projection_prop(min_nodes); } void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) { int n_get_models = std::max(_n_residual, _n_models_store); - std::vector<double> coefs(n_dim + 1, 0.0); std::vector<int> inds(n_dim, 0); std::vector<int> min_inds(n_get_models * n_dim, -1); @@ -113,19 +89,13 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) if(inds.back() >= 0) { - #pragma omp parallel shared(min_inds, min_errors, n_interactions, n_get_models, n_dim) firstprivate(inds, coefs) + #pragma omp parallel shared(min_inds, min_errors, n_interactions, n_get_models, n_dim) firstprivate(inds) { + std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(_loss); int max_error_ind = 0; std::vector<int> min_inds_private(min_inds); std::vector<double> min_errors_private(min_errors); - std::vector<double> error(_prop.size()); - std::vector<double> a((n_dim + (1 - _fix_intercept)) * _prop.size()); - std::vector<double> b(_prop.size()); - - _lwork = get_opt_lwork(n_dim + (1 - _fix_intercept), a.data(), b.data()); - std::vector<double> work(_lwork, 0.0); - unsigned long long int ii_prev = 0; #ifdef OMP45 @@ -136,13 +106,28 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size())) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - least_squares(inds, coefs.data(), 0, _n_samp, a.data(), b.data(), work.data()); - set_error(inds, coefs.data(), 0, _n_samp, error.data()); - - double rmse = std::sqrt(ddot_(_n_samp, error.data(), 1, error.data(), 1) / _n_samp); - if(rmse < min_errors_private[max_error_ind]) + double loss = (*loss_copy)(inds); + if(loss < -1.0) + { + std::string err_msg = "A parameter passed to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); + } + else if(loss < 0.0) + { + std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." 
<< std::endl; + } + else if(loss < min_errors_private[max_error_ind]) { - min_errors_private[max_error_ind] = rmse; + min_errors_private[max_error_ind] = loss; std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim); max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); } @@ -171,14 +156,14 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) mpi::all_gather(*_mpi_comm, min_inds.data(), n_get_models * n_dim, all_min_inds); inds = util_funcs::argsort<double>(all_min_error); + std::vector<std::vector<int>> indexes(n_get_models, std::vector<int>(n_dim)); for(int rr = 0; rr < n_get_models; ++rr) { - std::vector<int> indexes(n_dim); node_value_arrs::clear_temp_test_reg(); for(int ii = 0; ii < n_dim; ++ii) { - indexes[ii] = all_min_inds[inds[rr] * n_dim + ii]; + indexes[rr][ii] = all_min_inds[inds[rr] * n_dim + ii]; } - add_model(indexes); } + add_models(indexes); } diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp index 21af9913f67a76653f74531527a8c80ea72df883..8838fda306c12e082b64834272f37798bf0fc576 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp @@ -60,32 +60,12 @@ public: const bool fix_intercept=false ); - /** - * @brief Set the A matrix for the least squares problem - * - * @param inds indexes of the selected features - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - */ - void set_a(const std::vector<int>& inds, const int start, const int n_samp, double* a); - - /** - * @brief Set the residual for the next step - * - * @param inds indexes of the selected features - * @param coeffs Coefficients of the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param error pointer to the error vector - */ - virtual void set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const; - /** * @brief Adds a model to the model list * * @param indexes Vector storing all of the indexes of features in _phi_selected to use for the model */ - void add_model(const std::vector<int> indexes); + void add_models(const std::vector<std::vector<int>> indexes); /** * @brief Output the models to files and copy the residuals diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp index bd3f8c19274c0ca7fea3c85b7bb511d4d57b47a6..f52f757ad6c0f8217daa6360f3d8936d6f9c8efb 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp @@ -15,6 +15,7 @@ SISSORegressor::SISSORegressor( const bool fix_intercept ): SISSO_DI( + "regression", feat_space, prop_label, prop_unit, @@ -30,97 +31,24 @@ SISSORegressor::SISSORegressor( ) {} -void SISSORegressor::set_a(const std::vector<int>& inds, int start, int n_samp, double* a) +void SISSORegressor::add_models(const std::vector<std::vector<int>> indexes) { - for(int ii = 0; ii < inds.size(); ++ii) - { - std::copy_n(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, n_samp, a + ii * n_samp); - } + _models.push_back({}); + std::vector<std::vector<model_node_ptr>> min_nodes; - 
if(!_fix_intercept) + for(auto& inds: indexes) { - std::fill_n(a + inds.size() * n_samp, n_samp, 1.0); - } -} - -void SISSORegressor::least_squares(const std::vector<int>& inds, double* coeffs, const int start, const int n_samp, double* a, double*b, double* work) -{ - int info; - int n_dim = inds.size() + (1 - _fix_intercept); - - set_a(inds, start, n_samp, a); - std::copy_n(_prop.data() + start, n_samp, b); - - dgels_('N', n_samp, n_dim, 1, a, n_samp, b, n_samp, work, _lwork, &info); - - if(info == 0) - { - std::copy_n(b, n_dim, coeffs); - } - else if(info > 0) - { - std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; - for(auto& ind : inds) + min_nodes.push_back(std::vector<model_node_ptr>(inds.size())); + for(int ii = 0; ii < inds.size(); ++ii) { - err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + int index = inds[ii]; + min_nodes.back()[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); } - std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl; - std::fill_n(coeffs, n_dim, std::numeric_limits<double>::infinity()); - } - else - { - std::string err_msg = "The " + std::to_string(-1*info) + - " parameter to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: "; - for(auto& ind : inds) - { - err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; - } - throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); - } -} - -int SISSORegressor::get_opt_lwork(const int n_dim, double* a, double* b) -{ - std::vector<double> work(1, 0.0); - int info = 0, rank = 0; - - dgels_('N', _n_samp, n_dim, 1, a, _n_samp, b, _n_samp, work.data(), -1, &info); - - if(info == 0) - { - return static_cast<int>(std::round(work[0])); - } - else - { - throw std::logic_error("Failed to get lwork."); + ModelRegressor model(_prop_label, _prop_unit, _loss, min_nodes.back(), _leave_out_inds); + _models.back().push_back(model); } -} - -void SISSORegressor::set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const -{ - std::fill_n(error + start, n_samp, -1 * (!_fix_intercept) * coefs[inds.size()]); - for(int ii = 0; ii < inds.size(); ++ii) - { - daxpy_(n_samp, -1.0 * coefs[ii], node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, 1, error + start, 1); - } - daxpy_(n_samp, 1.0, _prop.data() + start, 1, error + start, 1); - -} -void SISSORegressor::add_model(const std::vector<int> indexes) -{ - if(_models.size() < indexes.size()) - { - _models.push_back({}); - } - - std::vector<model_node_ptr> min_nodes(indexes.size()); - for(int ii = 0; ii < indexes.size(); ++ii) - { - int index = indexes[ii]; - min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]); - } - _models.back().push_back(ModelRegressor(_prop_label, _prop_unit, _prop, _prop_test, min_nodes, _leave_out_inds, _task_sizes_train, _task_sizes_test, _fix_intercept)); + _loss->reset_projection_prop(min_nodes); } void SISSORegressor::output_models(std::vector<double>& residual) @@ -168,21 +96,14 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) { #pragma omp parallel shared(min_inds, min_errors, n_interactions, n_get_models, n_dim) firstprivate(inds, coefs) { + std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(_loss); + int max_error_ind = 0; std::vector<int> min_inds_private(min_inds); std::vector<double> min_errors_private(min_errors); - - 
std::vector<double> error(_prop.size()); - std::vector<double> a((n_dim + (1 - _fix_intercept)) * _prop.size()); - std::vector<double> b(_prop.size()); - - _lwork = get_opt_lwork(n_dim + (1 - _fix_intercept), a.data(), b.data()); - std::vector<double> work(_lwork, 0.0); - unsigned long long int ii_prev = 0; - #ifdef OMP45 #pragma omp for schedule(monotonic: dynamic) #else @@ -191,24 +112,34 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size())) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - int start = 0; - for(auto& sz : _task_sizes_train) + double loss = (*loss_copy)(inds); + if(loss < -1.0) + { + std::string err_msg = "A parameter passed to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); + } + else if(loss < 0.0) { - least_squares(inds, coefs.data(), start, sz, a.data(), b.data(), work.data()); - set_error(inds, coefs.data(), start, sz, error.data()); - start += sz; + std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl; } - double rmse = std::sqrt(ddot_(_n_samp, error.data(), 1, error.data(), 1) / _n_samp); - if(rmse < min_errors_private[max_error_ind]) + else if(loss < min_errors_private[max_error_ind]) { - min_errors_private[max_error_ind] = rmse; + min_errors_private[max_error_ind] = loss; std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim); max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); } ii_prev = ii; } - #pragma omp critical { max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin(); @@ -232,17 +163,16 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) mpi::all_gather(*_mpi_comm, min_inds.data(), n_get_models * n_dim, all_min_inds); inds = util_funcs::argsort<double>(all_min_error); + std::vector<std::vector<int>> indexes(n_get_models, std::vector<int>(n_dim)); for(int rr = 0; rr < n_get_models; ++rr) { - std::vector<int> indexes(n_dim); node_value_arrs::clear_temp_test_reg(); - node_value_arrs::clear_temp_reg(); for(int ii = 0; ii < n_dim; ++ii) { - indexes[ii] = all_min_inds[inds[rr] * n_dim + ii]; + indexes[rr][ii] = all_min_inds[inds[rr] * n_dim + ii]; } - add_model(indexes); } + add_models(indexes); } void SISSORegressor::fit() diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp index 8216394db2691b23f12f96e2d394debc06086c1e..ffc78b7761686cdae0a4b293882fca675a267d4f 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp @@ -30,9 +30,6 @@ class SISSORegressor: public SISSO_DI private: std::vector<std::vector<ModelRegressor>> _models; //!< List of models -protected: - int _lwork; //!< size of the work array (for dgels_) - public: /** * @brief Constructor for the Regressor @@ -65,56 +62,12 @@ public: const bool fix_intercept=false ); - 
/** - * @brief Set the residual for the next step - * - * @param inds indexes of the selected features - * @param coeffs Coefficients of the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param error pointer to the error vector - */ - virtual void set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const; - - /** - * @brief Get the optimal size of the working array for dgels (solve Ax = b) - * - * @param n_dim Dimension of the least squares matrix - * @param a the pointer to the A matrix - * @param b the pointer to the b vector - * @return Optimal size of the working array - */ - int get_opt_lwork(const int n_dim, double* a, double* b); - - /** - * @brief Preform Least squares optimization using dgels (solve Ax = b) - * - * @param inds Feature indexes to get the model of - * @param coefs Coefficients for the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param a the pointer to the A matrix - * @param b the pointer to the b vector - * @param work pointer to the working space for dgels - */ - void least_squares(const std::vector<int>& inds, double* coefs, const int start, const int n_samp, double* a, double*b, double* work); - - /** - * @brief Set the A matrix for the least squares problem - * - * @param inds indexes of the selected features - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param a the pointer to the A matrix - */ - virtual void set_a(const std::vector<int>& inds, int start, int n_samp, double* a); - /** * @brief Adds a model to the model list * * @param indexes Vector storing all of the indexes of features in _phi_selected to use for the model */ - virtual void add_model(const std::vector<int> indexes); + virtual void add_models(const std::vector<std::vector<int>> indexes); /** * @brief Output the models to files and copy the residuals diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp index 7c60edf7cf1359904f5cf165daf6cedb14d8149b..d8e861806c4262206531e960944928a1bb0f55e9 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp @@ -1,6 +1,7 @@ #include "descriptor_identifier/SISSO_DI/SISSO_DI.hpp" SISSO_DI::SISSO_DI( + const std::string loss_type, const std::shared_ptr<FeatureSpace> feat_space, const std::string prop_label, const Unit prop_unit, @@ -30,4 +31,13 @@ SISSO_DI::SISSO_DI( _n_residual(n_residual), _n_models_store(n_models_store), _fix_intercept(fix_intercept) -{} +{ + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); +} diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp index 0b232ae92d81061de0727b45c347df0d20a5c9e3..ba99cb13c29b679cb92115ab0d33ecc075dc57a6 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp @@ -41,6 +41,7 @@ protected: const std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator + 
std::shared_ptr<LossFunction> _loss; //!< The loss function for the calculation const int _n_task; //!< The number of tasks const int _n_samp; //!< the number of samples per feature @@ -67,6 +68,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + const std::string loss_type, const std::shared_ptr<FeatureSpace> feat_space, const std::string prop_label, const Unit prop_unit, @@ -158,6 +160,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -190,6 +193,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index bec9ac7b3fce024452fbb6396b64ecfde7f80003..c89598eeddc65ddc5b6a59b6060b27272b99d1a2 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -183,7 +183,6 @@ void FeatureSpace::initialize_fs() } initialize_fs_output_files(); - project_funcs::set_project_fxn(_project_type, _task_sizes.size(), _project, _project_no_omp); comp_feats::set_is_valid_fxn(_project_type, _cross_cor_max, _n_samp, _is_valid, _is_valid_feat_list); set_op_lists(); @@ -970,21 +969,23 @@ void FeatureSpace::generate_feature_space() _n_feat = _phi.size(); } -void FeatureSpace::project_generated(const double* prop, const int size, std::vector<node_ptr>& phi_sel, std::vector<double>& scores_sel) +// void FeatureSpace::project_generated(const double* prop, const int size, std::vector<node_ptr>& phi_sel, std::vector<double>& scores_sel) +void FeatureSpace::project_generated(std::shared_ptr<LossFunction> loss, std::vector<node_ptr>& phi_sel, std::vector<double>& scores_sel) { std::vector<double> scores_sel_all(node_value_arrs::N_SELECTED); if(node_value_arrs::N_SELECTED > _n_sis_select) { scores_sel_all.resize(_phi_selected.size()); - _project(prop, scores_sel_all.data(), _phi_selected, _task_sizes, size / _n_samp); + project_funcs::project_loss(loss, _phi_selected, scores_sel_all.data()); } std::copy_n(scores_sel.data(), scores_sel.size(), &scores_sel_all[node_value_arrs::N_SELECTED - _n_sis_select]); int worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin(); double worst_score = scores_sel[worst_score_ind]; - #pragma omp parallel firstprivate(worst_score, worst_score_ind, prop, size, scores_sel_all) + #pragma omp parallel firstprivate(worst_score, worst_score_ind, scores_sel_all) { + std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(loss); std::vector<node_ptr> phi_sel_private(phi_sel); std::vector<double> scores_sel_private(scores_sel); int index_base = _phi.size() + _n_sis_select * (omp_get_thread_num() + _mpi_comm->size()); @@ -994,8 +995,7 @@ void FeatureSpace::project_generated(const double* prop, const int size, std::ve std::shared_ptr<NLOptimizer> reparam_optimizer; if(_reparam_residual) { - std::vector<double> prop_vec(size, 0.0); - std::copy_n(prop, size, prop_vec.data()); + std::vector<double> prop_vec(loss_copy->prop_project()); reparam_optimizer = nlopt_wrapper::get_optimizer(_project_type, _task_sizes, prop_vec, _max_phi, _max_param_depth); } else @@ -1037,7 +1037,7 @@ void FeatureSpace::project_generated(const double* prop, const int size, std::ve 
node_value_arrs::clear_temp_reg_thread(); std::vector<double> scores(generated_phi.size()); - _project_no_omp(prop, scores.data(), generated_phi, _task_sizes, size / _n_samp); + project_funcs::project_loss_no_omp(loss_copy, generated_phi, scores.data()); std::vector<int> inds = util_funcs::argsort<double>(scores); @@ -1128,15 +1128,29 @@ void FeatureSpace::project_generated(const double* prop, const int size, std::ve void FeatureSpace::sis(const std::vector<double>& prop) { - // Reparameterize for the residuals + sis( + loss_function_util::get_loss_function( + _project_type, + prop, + {}, + _task_sizes, + std::vector<int>(_task_sizes.size(), 0), + false + ) + ); +} + +void FeatureSpace::sis(std::shared_ptr<LossFunction> loss) +{ #ifdef PARAMETERIZE + // Reparameterize for the residuals if(_reparam_residual && (_phi_selected.size() > 0)) { double start_time = omp_get_wtime(); _phi.resize(_n_feat); _phi_reparam.resize(0); _start_gen_reparam.resize(0); - generate_reparam_feature_set(prop); + generate_reparam_feature_set(loss->prop_project()); _phi.insert(_phi.end(), _phi_reparam.begin(), _phi_reparam.end()); _scores.resize(_phi.size()); } @@ -1163,7 +1177,7 @@ void FeatureSpace::sis(const std::vector<double>& prop) // Get projection scores double start_time = omp_get_wtime(); - _project(prop.data(), _scores.data(), _phi, _task_sizes, prop.size() / _n_samp); + project_funcs::project_loss(loss, _phi, _scores.data()); _mpi_comm->barrier(); if(_mpi_comm->rank() == 0) @@ -1187,7 +1201,8 @@ void FeatureSpace::sis(const std::vector<double>& prop) std::vector<double> scores_sel_all(node_value_arrs::N_SELECTED, 0.0); if(node_value_arrs::N_SELECTED > _n_sis_select) { - _project(prop.data(), scores_sel_all.data(), _phi_selected, _task_sizes, prop.size() / _n_samp); + project_funcs::project_loss(loss, _phi_selected, scores_sel_all.data()); + // _project(prop.data(), scores_sel_all.data(), _phi_selected, _task_sizes, prop.size() / _n_samp); } // Get the best features currently generated on this rank @@ -1234,7 +1249,7 @@ void FeatureSpace::sis(const std::vector<double>& prop) phi_sel.resize(cur_feat_local); scores_sel.resize(cur_feat_local); - project_generated(prop.data(), prop.size(), phi_sel, scores_sel); + project_generated(loss, phi_sel, scores_sel); _mpi_comm->barrier(); if(_mpi_comm->rank() == 0) diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index fdf3652355177786438589a15fb760bf8644e7a2..7a7cd26db1b408e589fb99c27aa6bc94566e204e 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -17,13 +17,17 @@ #include <iomanip> #include <utility> -#include "mpi_interface/MPI_Interface.hpp" -#include "mpi_interface/MPI_ops.hpp" -#include "mpi_interface/serialize_tuple.h" #include "feature_creation/node/ModelNode.hpp" #include "feature_creation/node/operator_nodes/allowed_ops.hpp" #include "feature_creation/node/utils.hpp" #include "feature_creation/node/value_storage/nodes_value_containers.hpp" + +#include "loss_function/utils.hpp" + +#include "mpi_interface/MPI_Interface.hpp" +#include "mpi_interface/MPI_ops.hpp" +#include "mpi_interface/serialize_tuple.h" + #include "utils/compare_features.hpp" #include "utils/project.hpp" @@ -69,8 +73,6 @@ class FeatureSpace const std::string _feature_space_file; //!< File to store information about the selected features const std::string _feature_space_summary_file; //!< File to store information about the selected features - 
std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)> _project; //!< Function used to calculate the scores for SIS - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)> _project_no_omp; //!< Function used to calculate the scores for SIS without changing omp environment std::function<bool(const double*, const int, const double, const std::vector<double>&, const double, const int, const int)> _is_valid; //!< Function used to calculate the scores for SIS std::function<bool(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)> _is_valid_feat_list; //!< Function used to calculate the scores for SIS without changing omp environment @@ -350,7 +352,7 @@ public: * @param phi_selected The features that would be selected from the previous rungs * @param scores_selected The projection scores of the features that would be selected from the previous rungs */ - void project_generated(const double* prop, const int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected); + void project_generated(std::shared_ptr<LossFunction> loss, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected); /** * @brief Perform SIS on a feature set with a specified property @@ -360,6 +362,14 @@ public: */ void sis(const std::vector<double>& prop); + /** + * @brief Perform SIS on a feature set with a specified loss function + * @details Perform sure-independence screening with either the correct property or the error + * + * @param loss The LossFunction to project over + */ + void sis(std::shared_ptr<LossFunction> loss); + // DocString: feat_space_feat_in_phi /** * @brief Is a feature in this process' _phi? 
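[Editor's note, not part of the patch: the FeatureSpace hunks above replace the _project/_project_no_omp function pointers with a LossFunction object, and sis(prop) now simply wraps the new sis(loss) overload. Below is a minimal sketch of driving the new entry point directly, assuming an already-initialized FeatureSpace; the helper name run_sis_with_loss and all variable names are illustrative only.]

#include "feature_creation/feature_space/FeatureSpace.hpp"
#include "loss_function/utils.hpp"

// Hypothetical helper: build a loss function and run SIS with it, mirroring
// what FeatureSpace::sis(prop) and the SISSO_DI constructor now do internally.
void run_sis_with_loss(
    std::shared_ptr<FeatureSpace> feat_space,
    const std::vector<double>& prop_train,
    const std::vector<double>& prop_test,
    const std::vector<int>& task_sizes_train,
    const std::vector<int>& task_sizes_test
)
{
    // "regression" -> LossFunctionPearsonRMSE; "log_regression" and
    // "classification" select the other two concrete loss types.
    std::shared_ptr<LossFunction> loss = loss_function_util::get_loss_function(
        "regression", prop_train, prop_test, task_sizes_train, task_sizes_test, false
    );

    // Projection scores and residual bookkeeping now live in the loss object.
    feat_space->sis(loss);
}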
diff --git a/src/loss_function/LossFunction.cpp b/src/loss_function/LossFunction.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b442c6db2d22838934f0b8d6df11fada64d9cba5 --- /dev/null +++ b/src/loss_function/LossFunction.cpp @@ -0,0 +1,44 @@ +#include "loss_function/LossFunction.hpp" + +LossFunction::LossFunction( + std::vector<double> prop_train, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + bool fix_intercept, + int n_feat +) : + _projection_prop(prop_train), + _prop_train(prop_train), + _prop_test(prop_test), + _error_train(prop_train.size(), 0.0), + _error_test(prop_test.size(), 0.0), + _task_sizes_train(task_sizes_train), + _task_sizes_test(task_sizes_test), + _n_samp(std::accumulate(task_sizes_train.begin(), task_sizes_train.end(), 0)), + _n_samp_test(std::accumulate(task_sizes_test.begin(), task_sizes_test.end(), 0)), + _n_task(task_sizes_train.size()), + _n_project_prop(_projection_prop.size() / _n_samp), + _n_feat(n_feat), + _n_dim(n_feat + (!fix_intercept)), + _fix_intercept(fix_intercept) +{ + set_nfeat(_n_feat); +} + +LossFunction::LossFunction(std::shared_ptr<LossFunction> o) : + _projection_prop(o->prop_project()), + _prop_train(o->prop_train()), + _prop_test(o->prop_test()), + _error_train(_prop_train.size(), 0.0), + _error_test(_prop_test.size(), 0.0), + _task_sizes_train(o->task_sizes_train()), + _task_sizes_test(o->task_sizes_test()), + _n_samp(std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0)), + _n_samp_test(std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0)), + _n_task(_task_sizes_train.size()), + _n_project_prop(_projection_prop.size() / _n_samp), + _n_feat(o->n_feat()), + _n_dim(o->n_dim()), + _fix_intercept(o->fix_intercept()) +{} diff --git a/src/loss_function/LossFunction.hpp b/src/loss_function/LossFunction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..becf5f499d36d22f3e346e3f07981d6d940bc02a --- /dev/null +++ b/src/loss_function/LossFunction.hpp @@ -0,0 +1,154 @@ +/** @file loss_function/LossFunction.hpp + * @brief A class used to calculate the projection score and l0-regularization error + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. 
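+ *
+ *  Editor's note (not in the original patch): LossFunction is the abstract
+ *  interface; the concrete losses are LossFunctionPearsonRMSE ("regression"),
+ *  LossFunctionLogPearsonRMSE ("log_regression") and LossFunctionConvexHull
+ *  ("classification"), constructed via loss_function_util::get_loss_function.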
+ */
+
+#ifndef LOSS_FUNCTION
+#define LOSS_FUNCTION
+
+#include "feature_creation/node/ModelNode.hpp"
+typedef std::shared_ptr<ModelNode> model_node_ptr;
+
+class LossFunction
+{
+protected:
+    std::vector<double> _projection_prop; //!< The property vector used for projection
+    std::vector<double> _prop_train; //!< The value of the property to evaluate the loss function against for the training set
+    std::vector<double> _prop_test; //!< The value of the property to evaluate the loss function against for the test set
+    std::vector<double> _error_train; //!< The error vector for the training set
+    std::vector<double> _error_test; //!< The error vector for the test set
+
+    const std::vector<int> _task_sizes_train; //!< Number of training samples per task
+    const std::vector<int> _task_sizes_test; //!< Number of testing samples per task
+
+    const int _n_samp; //!< Number of samples in the training set
+    const int _n_samp_test; //!< Number of samples in the test set
+    int _n_project_prop; //!< Number of properties to project over
+    int _n_feat; //!< Number of features in the model
+    int _n_dim; //!< Dimension of the least squares problem (_n_feat + 1 if the intercept is not fixed)
+    const int _n_task; //!< Number of tasks
+    const bool _fix_intercept; //!< If true the bias term is fixed at 0.0
+public:
+
+    /**
+     * @brief Constructor
+     *
+     * @param prop_train The value of the property to evaluate the loss function against for the training set
+     * @param prop_test The value of the property to evaluate the loss function against for the test set
+     * @param task_sizes_train Number of training samples per task
+     * @param task_sizes_test Number of testing samples per task
+     * @param fix_intercept If true the bias term is fixed at 0.0
+     * @param n_feat Number of features in the model
+     */
+    LossFunction(
+        std::vector<double> prop_train,
+        std::vector<double> prop_test,
+        std::vector<int> task_sizes_train,
+        std::vector<int> task_sizes_test,
+        bool fix_intercept = false,
+        int n_feat = 1
+    );
+
+    LossFunction(std::shared_ptr<LossFunction> o);
+
+    /**
+     * @brief Copy Constructor
+     *
+     * @param o LossFunction to be copied
+     */
+    LossFunction(const LossFunction& o) = default;
+
+    /**
+     * @brief Move Constructor
+     *
+     * @param o LossFunction to be moved
+     */
+    LossFunction(LossFunction&& o) = default;
+
+    /**
+     * @brief Copy Assignment operator
+     *
+     * @param o LossFunction to be copied
+     */
+    LossFunction& operator= (const LossFunction& o) = default;
+
+    /**
+     * @brief Move Assignment operator
+     *
+     * @param o LossFunction to be moved
+     */
+    LossFunction& operator= (LossFunction&& o) = default;
+
+    /**
+     * @brief Perform set up needed to calculate the projection scores
+     */
+    virtual void prepare_project() = 0;
+
+    /**
+     * @brief Calculate the projection score of a feature
+     *
+     * @param feat Feature to calculate the projection score of
+     * @return The projection score for the feature
+     */
+    virtual double project(const node_ptr& feat) = 0;
+
+    /**
+     * @brief Evaluate the loss function for a set of features whose data is stored in a central location
+     *
+     * @param inds Indexes to access the central storage area
+     * @return The value of the loss function
+     */
+    virtual double operator()(const std::vector<int>& inds) = 0;
+
+    /**
+     * @brief Evaluate the loss function for a set of features
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function
+     */
+    virtual double operator()(const std::vector<model_node_ptr>& feats) = 0;
+
+    /**
+     * @brief Evaluate the loss function for the test set
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function for the test data
+     */
+    virtual double test_loss(const std::vector<model_node_ptr>& feats) = 0;
+
+    /**
+     * @brief Set the model list from the previous generation
+     * @details Resets _projection_prop to the training errors (residuals) of the models selected in the previous iteration
+     *
+     * @param models The models from the previous iteration used to calculate the residuals
+     */
+    virtual void reset_projection_prop(const std::vector<std::vector<model_node_ptr>>& models) = 0;
+
+    virtual LOSS_TYPE type() const = 0;
+    virtual std::vector<double> coefs() const = 0;
+
+    inline std::vector<int> task_sizes_train() const {return _task_sizes_train;}
+    inline std::vector<int> task_sizes_test() const {return _task_sizes_test;}
+
+    inline const std::vector<double>& prop_train() const {return _prop_train;}
+    inline const std::vector<double>& prop_test() const {return _prop_test;}
+    inline const std::vector<double>& prop_project() const {return _projection_prop;}
+
+    inline const double* prop_pointer() const {return _prop_train.data();}
+    inline const double* prop_test_pointer() const {return _prop_test.data();}
+    inline const double* prop_project_pointer() const {return _projection_prop.data();}
+
+    inline std::vector<double> error_train() const {return _error_train;}
+    inline std::vector<double> error_test() const {return _error_test;}
+
+    inline bool fix_intercept() const {return _fix_intercept;}
+    inline int n_feat() const {return _n_feat;}
+    inline int n_dim() const {return _n_dim;}
+
+    virtual inline void set_nfeat(int n_feat){_n_feat = n_feat; _n_dim = n_feat + (!_fix_intercept);}
+
+    virtual inline int n_class(){return 0;}
+
+};
+
+#endif
diff --git a/src/loss_function/LossFunctionConvexHull.cpp b/src/loss_function/LossFunctionConvexHull.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8bbf319d09ff77aa705c2221a0501daf696a1d8e
--- /dev/null
+++ b/src/loss_function/LossFunctionConvexHull.cpp
@@ -0,0 +1,317 @@
+#include "loss_function/LossFunctionConvexHull.hpp"
+
+LossFunctionConvexHull::LossFunctionConvexHull(
+    std::vector<double> prop_train,
+    std::vector<double> prop_test,
+    std::vector<int> task_sizes_train,
+    std::vector<int> task_sizes_test,
+    int n_feat
+) :
+    LossFunction(prop_train, prop_test, task_sizes_train, task_sizes_test, false, n_feat),
+    _width(1e-5),
+    _n_class(0)
+{
+    --_n_dim;
+
+    for(auto& pt : prop_test)
+    {
+        if(std::none_of(prop_train.begin(), prop_train.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;}))
+        {
+            throw std::logic_error("A class in the property vector (test set) is not in the training set.");
+        }
+    }
+
+    std::vector<double> unique_classes = vector_utils::unique<double>(prop_train);
+    std::map<double, double> class_map;
+    _n_class = unique_classes.size();
+    for(int cc = 0; cc < _n_class; ++cc)
+    {
+        class_map[unique_classes[cc]] = static_cast<double>(cc);
+    }
+
+    std::transform(
+        prop_train.begin(),
+        prop_train.end(),
+        prop_train.begin(),
+        [&class_map](double pp){return class_map[pp];}
+    );
+    std::transform(
+        prop_test.begin(),
+        prop_test.end(),
+        prop_test.begin(),
+        [&class_map](double pp){return class_map[pp];}
+    );
+
+    std::vector<bool> not_int(prop_train.size());
+    std::transform(
+        prop_train.begin(),
+        prop_train.end(),
+        not_int.begin(),
+        [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}
+    );
+    if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+    {
+        throw std::logic_error("For classification the property (training set) must be an integer");
+    }
+
+    not_int.resize(prop_test.size());
+    std::transform(
+        prop_test.begin(),
+        prop_test.end(),
+        not_int.begin(),
+        [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;}
+    );
+    if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+    {
+        throw std::logic_error("For classification the 
property (test set) must be an integer"); + } + + std::copy_n(prop_train.begin(), prop_train.size(), _prop_train.begin()); + std::copy_n(prop_test.begin(), prop_test.size(), _prop_test.begin()); + + setup_lp(); +} + +LossFunctionConvexHull::LossFunctionConvexHull(std::shared_ptr<LossFunction> o) : + LossFunction(o), + _width(1e-5), + _n_class(vector_utils::unique<double>(_prop_train).size()) +{ + setup_lp(); +} + +void LossFunctionConvexHull::set_nfeat(int n_feat) +{ + _n_feat = n_feat; + _n_dim = n_feat; + setup_lp(); +} + +void LossFunctionConvexHull::prepare_project() +{ + _scores.resize(_n_project_prop); + _convex_hull.resize(_n_project_prop); + for(int pp = 0; pp < _n_project_prop; ++pp) + { + _convex_hull[pp] = ConvexHull1D(_task_sizes_train, &_projection_prop[pp * _n_samp]); + } +} + +double LossFunctionConvexHull::project(const node_ptr& feat) +{ + double* val_ptr = feat->value_ptr(-1, true); + for(int pp = 0; pp < _n_project_prop; ++pp) + { + _scores[pp] = _convex_hull[pp].overlap_1d(val_ptr); + } + + return *std::min_element(_scores.begin(), _scores.end()); +} + +void LossFunctionConvexHull::setup_lp() +{ + int task_start = 0; + int task_start_test = 0; + _lp = {}; + + std::vector<int> n_samp_per_class; + for(int tt = 0; tt < _n_task; ++tt) + { + std::map<double, int> rep_class; + + std::vector<int> inds(_task_sizes_train[tt]); + std::iota(inds.begin(), inds.end(), task_start); + util_funcs::argsort<double>(inds.data(), inds.data() + inds.size(), &_prop_train[task_start]); + + int cls_start = 0; + _sample_inds_to_sorted_dmat_inds[inds[0]] = task_start; + rep_class[_prop_train[inds[0]]] = 0; + for(int ii = 1; ii < inds.size(); ++ii) + { + _sample_inds_to_sorted_dmat_inds[inds[ii]] = ii + task_start; + if(_prop_train[inds[ii]] != _prop_train[inds[ii - 1]]) + { + n_samp_per_class.push_back(ii - cls_start); + rep_class[_prop_train[inds[ii]]] = n_samp_per_class.size(); + cls_start = ii; + } + } + n_samp_per_class.push_back(inds.size() - cls_start); + if(n_samp_per_class.size() != (tt + 1) * _n_class) + { + throw std::logic_error("A class is not represented in task " + std::to_string(tt) + "."); + } + + std::vector<int> samp_per_class(_n_class); + std::copy_n(n_samp_per_class.begin() + tt * _n_class, _n_class, samp_per_class.begin()); + task_start += _task_sizes_train[tt]; + + if(_task_sizes_test[tt] > 0) + { + std::vector<int> n_test_samp_per_class(n_samp_per_class.size(), 0); + inds.resize(_task_sizes_test[tt]); + std::iota(inds.begin(), inds.end(), task_start_test); + util_funcs::argsort<double>(inds.data(), inds.data() + inds.size(), &_prop_test[task_start_test]); + + cls_start = 0; + _test_sample_inds_to_sorted_dmat_inds[inds[0]] = task_start_test; + for(int ii = 1; ii < inds.size(); ++ii) + { + _test_sample_inds_to_sorted_dmat_inds[inds[ii]] = ii + task_start_test; + if(_prop_test[inds[ii]] != _prop_test[inds[ii - 1]]) + { + n_test_samp_per_class[ + rep_class[_prop_test[inds[ii - 1]]] + ] = ii - cls_start; + cls_start = ii; + } + } + n_test_samp_per_class[rep_class[_prop_test[inds.back()]]] = inds.size() - cls_start; + + task_start_test += _task_sizes_test[tt]; + _lp.push_back( + LPWrapper( + samp_per_class, + tt, + _n_class, + _n_dim, + std::accumulate(n_samp_per_class.begin(), n_samp_per_class.end(), 0), + _width, + n_test_samp_per_class, + std::accumulate(n_test_samp_per_class.begin(), n_test_samp_per_class.end(), 0) + ) + ); + } + else + { + _lp.push_back( + LPWrapper( + samp_per_class, + tt, + _n_class, + _n_dim, + std::accumulate(n_samp_per_class.begin(), 
n_samp_per_class.end(), 0), + _width + ) + ); + } + } + prop_sorted_d_mat::initialize_sroted_d_matrix_arr(0, _n_task, _n_class, n_samp_per_class); +} + +void LossFunctionConvexHull::reset_projection_prop(const std::vector<std::vector<model_node_ptr>>& models) +{ + _n_project_prop = models.size(); + _projection_prop.resize(_n_samp * _n_project_prop); + for(int mm = 0; mm < _n_project_prop; ++mm) + { + double loss = (*this)(models[mm]); + std::copy_n(_error_train.data(), _error_train.size(), &_projection_prop[mm * _n_samp]); + } + + _n_feat = models.back().size() + 1; + _n_dim = _n_feat; + setup_lp(); +} + +double LossFunctionConvexHull::operator()(const std::vector<int>& inds) +{ + double n_convex_overlap = 0.0; + for(auto& lp: _lp) + { + n_convex_overlap += static_cast<double>(lp.get_n_overlap(inds)); + } + return n_convex_overlap; +} + +double LossFunctionConvexHull::operator()(const std::vector<model_node_ptr>& feats) +{ + std::vector<std::vector<double>> sorted_values(feats.size(), std::vector<double>(_n_samp, 0.0)); + std::vector<std::vector<double>> sorted_test_values(feats.size(), std::vector<double>(_n_samp_test, 0.0)); + + for(int ff = 0; ff < _n_dim; ++ff) + { + double* val_ptr = feats[ff]->value_ptr(); + std::for_each( + _sample_inds_to_sorted_dmat_inds.begin(), + _sample_inds_to_sorted_dmat_inds.end(), + [&sorted_values, ff, val_ptr](auto& iter){sorted_values[ff][iter.second] = val_ptr[iter.first];} + ); + + val_ptr = feats[ff]->test_value_ptr(); + + std::for_each( + _test_sample_inds_to_sorted_dmat_inds.begin(), + _test_sample_inds_to_sorted_dmat_inds.end(), + [&sorted_test_values, ff, val_ptr](auto& iter){sorted_test_values[ff][iter.second] = val_ptr[iter.first];} + ); + } + + int start = 0; + int start_test = 0; + double n_convex_overlap = 0.0; + std::vector<double> err_train(_n_samp); + std::vector<double> err_test(_n_samp_test); + for(int tt = 0; tt < _n_task; ++tt) + { + std::vector<double*> val_ptrs(feats.size()); + std::vector<double*> test_val_ptrs(feats.size()); + std::transform( + sorted_values.begin(), + sorted_values.end(), + val_ptrs.begin(), + [start](std::vector<double>& sv){return &sv[start];} + ); + + std::transform( + sorted_test_values.begin(), + sorted_test_values.end(), + test_val_ptrs.begin(), + [start_test](std::vector<double>& sv){return &sv[start_test];} + ); + _lp[tt].set_n_overlap(val_ptrs, test_val_ptrs, &err_train[start], &err_test[start_test]); + n_convex_overlap += static_cast<double>(_lp[tt].n_overlap()); + + start += _task_sizes_train[tt]; + start_test += _task_sizes_test[tt]; + } + + std::for_each( + _sample_inds_to_sorted_dmat_inds.begin(), + _sample_inds_to_sorted_dmat_inds.end(), + [&err_train, this](auto& iter){_error_train[iter.first] = err_train[iter.second];} + ); + std::transform( + _error_train.begin(), + _error_train.end(), + _prop_train.begin(), + _error_train.begin(), + [](double err, double real){return (err < 1e-10) ? -1 : real;} + ); + + std::for_each( + _test_sample_inds_to_sorted_dmat_inds.begin(), + _test_sample_inds_to_sorted_dmat_inds.end(), + [&err_test, this](auto& iter){_error_test[iter.first] = err_test[iter.second];} + ); + std::transform( + _error_test.begin(), + _error_test.end(), + _prop_test.begin(), + _error_test.begin(), + [](double err, double real){return (err < 1e-10) ? 
-1 : real;} + ); + + return n_convex_overlap; +} + +double LossFunctionConvexHull::test_loss(const std::vector<model_node_ptr>& feats) +{ + double n_convex_overlap = (*this)(feats); + double n_convex_overlap_test = 0.0; + for(auto& lp : _lp) + { + n_convex_overlap_test += lp.n_overlap_test(); + } + return n_convex_overlap_test; +} diff --git a/src/loss_function/LossFunctionConvexHull.hpp b/src/loss_function/LossFunctionConvexHull.hpp new file mode 100644 index 0000000000000000000000000000000000000000..0726e8f1b90e22b7260f2f1e2dae413e4e085920 --- /dev/null +++ b/src/loss_function/LossFunctionConvexHull.hpp @@ -0,0 +1,143 @@ +/** @file loss_function/LossFunctionConvexHull.hpp + * @brief A class used to calculate the projection score and l0-regularization error + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. + */ + +#ifndef LOSS_FUNCTION_CONVEX_HULL +#define LOSS_FUNCTION_CONVEX_HULL + +#include "classification/ConvexHull1D.hpp" +#include "classification/LPWrapper.hpp" +#include "classification/prop_sorted_d_mat.hpp" + +#include "loss_function/LossFunction.hpp" + +#include "utils/vector_utils.hpp" + +class LossFunctionConvexHull: public LossFunction +{ +protected: + std::vector<ConvexHull1D> _convex_hull; + std::vector<LPWrapper> _lp; + std::vector<double> _scores; + std::map<int, int> _sample_inds_to_sorted_dmat_inds; //!< map from input sample inds to the SORTED_D_MATRIX_INDS + std::map<int, int> _test_sample_inds_to_sorted_dmat_inds; //!< map from input sample inds to the SORTED_D_MATRIX_INDS + + const double _width; + int _n_class; + +public: + + /** + * @brief Constructor + * + * @param prop_train The value of the property to evaluate the loss function against for the training set + * @param prop_test The value of the property to evaluate the loss function against for the test set + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + */ + LossFunctionConvexHull( + std::vector<double> prop_train, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + int n_feat = 1 + ); + + LossFunctionConvexHull(std::shared_ptr<LossFunction> o); + + /** + * @brief Copy Constructor + * + * @param o LossFunctionConvexHull to be copied + */ + LossFunctionConvexHull(const LossFunctionConvexHull& o) = default; + + /** + * @brief Move Constructor + * + * @param o LossFunctionConvexHull to be moved + */ + LossFunctionConvexHull(LossFunctionConvexHull&& o) = default; + + /** + * @brief Copy Assignment operator + * + * @param o LossFunctionConvexHull to be copied + */ + LossFunctionConvexHull& operator= (const LossFunctionConvexHull& o) = default; + + /** + * @brief Move Assignment operator + * + * @param o LossFunctionConvexHull to be moved + */ + LossFunctionConvexHull& operator= (LossFunctionConvexHull&& o) = default; + + /** + * @brief Setup the _lp objects + * @details This is used for calculating the LossFunction + */ + void setup_lp(); + + /** + * @brief Perform set up needed to calculate the projection scores + */ + void prepare_project(); + + /** + * @brief Calculate the projection score of a feature + * + * @param feat Feature to calculate the projection score of + * @return The projection score for the feature + */ + virtual double project(const node_ptr& feat); + + /** + * @brief Evaluate the loss function for a set of features whose data is stored in a central location + * + * @param inds Indexes to access the central storage area + * @return The value 
of the loss function
+     */
+    double operator()(const std::vector<int>& inds);
+
+    /**
+     * @brief Evaluate the loss function for a set of features
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function
+     */
+    double operator()(const std::vector<model_node_ptr>& feats);
+
+    /**
+     * @brief Evaluate the loss function for the test set
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function for the test data
+     */
+    virtual double test_loss(const std::vector<model_node_ptr>& feats);
+
+    /**
+     * @brief Set the model list from the previous generation
+     * @details Resets _projection_prop to the training errors of the models from the previous iteration and rebuilds the LP problems
+     *
+     * @param models The models from the previous iteration used to calculate the residuals
+     */
+    void reset_projection_prop(const std::vector<std::vector<model_node_ptr>>& models);
+
+    inline LOSS_TYPE type() const {return LOSS_TYPE::CONVEX_HULL;}
+    inline std::vector<double> coefs() const
+    {
+        throw std::logic_error("Coefficient vector not obtainable from ConvexHull Loss Function.");
+        return {};
+    }
+
+    void set_nfeat(int n_feat);
+
+    inline int n_class(){return _n_class;}
+
+};
+
+#endif
diff --git a/src/loss_function/LossFunctionLogPearsonRMSE.cpp b/src/loss_function/LossFunctionLogPearsonRMSE.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..222a7ef74a529eec4207d73ae575adb2207bcee2
--- /dev/null
+++ b/src/loss_function/LossFunctionLogPearsonRMSE.cpp
@@ -0,0 +1,129 @@
+#include "loss_function/LossFunctionLogPearsonRMSE.hpp"
+
+LossFunctionLogPearsonRMSE::LossFunctionLogPearsonRMSE(
+    std::vector<double> prop_train,
+    std::vector<double> prop_test,
+    std::vector<int> task_sizes_train,
+    std::vector<int> task_sizes_test,
+    bool fix_intercept,
+    int n_feat
+) :
+    LossFunctionPearsonRMSE(prop_train, prop_test, task_sizes_train, task_sizes_test, fix_intercept, n_feat),
+    _log_feat(_n_samp, 0.0)
+{
+    set_nfeat(_n_feat);
+}
+
+LossFunctionLogPearsonRMSE::LossFunctionLogPearsonRMSE(std::shared_ptr<LossFunction> o) :
+    LossFunctionPearsonRMSE(o),
+    _log_feat(_n_samp, 0.0)
+{
+    set_nfeat(_n_feat);
+}
+
+double LossFunctionLogPearsonRMSE::project(const node_ptr& feat)
+{
+    std::fill_n(_scores.begin(), _scores.size(), 0.0);
+
+    double* val_ptr = feat->value_ptr(-1, true);
+    std::transform(
+        val_ptr,
+        val_ptr + _n_samp,
+        _log_feat.data(),
+        [](double val){return std::log(val);}
+    );
+
+    double r2 = calc_max_pearson(_log_feat.data());
+    return std::isfinite(r2) ? 
r2 : 0.0;
+}
+
+void LossFunctionLogPearsonRMSE::set_a(const std::vector<int>& inds, int taskind, int start)
+{
+    for(int ff = 0; ff < inds.size(); ++ff)
+    {
+        double* val_ptr = node_value_arrs::get_d_matrix_ptr(inds[ff]) + start;
+        std::transform(
+            val_ptr,
+            val_ptr + _task_sizes_train[taskind],
+            &_a[ff * _task_sizes_train[taskind] + start * _n_dim],
+            [](double val){return std::log(val);}
+        );
+    }
+}
+
+void LossFunctionLogPearsonRMSE::set_a(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        double* val_ptr = feats[ff]->value_ptr() + start;
+        std::transform(
+            val_ptr,
+            val_ptr + _task_sizes_train[taskind],
+            &_a[ff * _task_sizes_train[taskind] + start * _n_dim],
+            [](double val){return std::log(val);}
+        );
+    }
+}
+
+void LossFunctionLogPearsonRMSE::set_error(const std::vector<int>& inds, int taskind, int start)
+{
+    std::fill_n(
+        _error_train.begin() + start,
+        _task_sizes_train[taskind],
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < inds.size(); ++ff)
+    {
+        std::transform(
+            &_error_train[start],
+            &_error_train[start + _task_sizes_train[taskind]],
+            node_value_arrs::get_d_matrix_ptr(inds[ff]) + start,
+            &_error_train[start],
+            [this, &taskind, &ff](double err, double val){return err - _coefs[taskind * _n_dim + ff] * std::log(val);}
+        );
+    }
+    daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1);
+}
+
+void LossFunctionLogPearsonRMSE::set_error(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    std::fill_n(
+        _error_train.begin() + start,
+        _task_sizes_train[taskind],
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        std::transform(
+            &_error_train[start],
+            &_error_train[start + _task_sizes_train[taskind]],
+            feats[ff]->value_ptr() + start,
+            &_error_train[start],
+            [this, &taskind, &ff](double err, double val){return err - _coefs[taskind * _n_dim + ff] * std::log(val);}
+        );
+    }
+    daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1);
+}
+
+void LossFunctionLogPearsonRMSE::set_test_error(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    std::fill_n(
+        _error_test.begin() + start,
+        _task_sizes_test[taskind],
+        // Index the intercept with _n_dim (not _n_feat): the bias term is the
+        // last of the _n_dim coefficients stored for each task.
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        std::transform(
+            &_error_test[start],
+            &_error_test[start + _task_sizes_test[taskind]],
+            // Use the test-set values here; the training value_ptr() would
+            // compute the test errors from the wrong samples.
+            feats[ff]->test_value_ptr() + start,
+            &_error_test[start],
+            [this, &taskind, &ff](double err, double val){return err - _coefs[taskind * _n_dim + ff] * std::log(val);}
+        );
+    }
+    daxpy_(_task_sizes_test[taskind], 1.0, &_prop_test[start], 1, &_error_test[start], 1);
+}
diff --git a/src/loss_function/LossFunctionLogPearsonRMSE.hpp b/src/loss_function/LossFunctionLogPearsonRMSE.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b970de4811c2a5bb5c48e678c7693799d1f55a6a
--- /dev/null
+++ b/src/loss_function/LossFunctionLogPearsonRMSE.hpp
@@ -0,0 +1,116 @@
+/** @file loss_function/LossFunctionLogPearsonRMSE.hpp
+ * @brief A class used to calculate the projection score and l0-regularization error
+ *
+ * @author Thomas A. R. Purcell (tpurcell)
+ * @bug No known bugs.
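+ *
+ *  Editor's note (not in the original patch): this loss fits the model
+ *  prop = c_0 + sum_i c_i * log(f_i); project(), set_a() and set_error()
+ *  apply std::log to every feature value before the dgels_ solve, while the
+ *  property itself is expected to arrive already log-transformed (see
+ *  SISSOLogRegressor, which logs _prop before building the loss).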
+ */ + +#ifndef LOSS_FUNCTION_LOG_PEARSON_RMSE +#define LOSS_FUNCTION_LOG_PEARSON_RMSE + +#include "loss_function/LossFunctionPearsonRMSE.hpp" + +class LossFunctionLogPearsonRMSE: public LossFunctionPearsonRMSE +{ +protected: + std::vector<double> _log_feat; + +public: + + /** + * @brief Constructor + * + * @param prop_train The value of the property to evaluate the loss function against for the training set + * @param prop_test The value of the property to evaluate the loss function against for the test set + * @param task_sizes_train Number of training samples per task + * @param task_sizes_test Number of testing samples per task + */ + LossFunctionLogPearsonRMSE( + std::vector<double> prop_train, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + bool fix_intercept = false, + int n_feat = 1 + ); + + LossFunctionLogPearsonRMSE(std::shared_ptr<LossFunction> o); + + /** + * @brief Copy Constructor + * + * @param o LossFunctionLogPearsonRMSE to be copied + */ + LossFunctionLogPearsonRMSE(const LossFunctionLogPearsonRMSE& o) = default; + + /** + * @brief Move Constructor + * + * @param o LossFunctionLogPearsonRMSE to be moved + */ + LossFunctionLogPearsonRMSE(LossFunctionLogPearsonRMSE&& o) = default; + + /** + * @brief Copy Assignment operator + * + * @param o LossFunctionLogPearsonRMSE to be copied + */ + LossFunctionLogPearsonRMSE& operator= (const LossFunctionLogPearsonRMSE& o) = default; + + /** + * @brief Move Assignment operator + * + * @param o LossFunctionLogPearsonRMSE to be moved + */ + LossFunctionLogPearsonRMSE& operator= (LossFunctionLogPearsonRMSE&& o) = default; + + /** + * @brief Calculate the projection score of a feature + * + * @param feat Feature to calculate the projection score of + * @return The projection score for the feature + */ + double project(const node_ptr& feat); + + /** + * @brief Set the _a matrix + * + * @param inds Indexes to access the central storage area + */ + void set_a(const std::vector<int>& inds, int taskind, int start); + + /** + * @brief Set the _a matrix + * + * @param feats The features used to evaluate the loss function + */ + void set_a(const std::vector<model_node_ptr>& feats, int taskind, int start); + + /** + * @brief Set the error and return the RMSE + * + * @param inds Indexes to access the central storage area + * @return The RMSE of the model + */ + void set_error(const std::vector<int>& inds, int taskind, int start); + + /** + * @brief Set the error and return the RMSE + * + * @param feats The features used to evaluate the loss function for the test set + * @return The test RMSE of the feature + */ + void set_error(const std::vector<model_node_ptr>& feats, int taskind, int start); + + /** + * @brief Set the error and return the RMSE + * + * @param feats The features used to evaluate the loss function + * @return The RMSE of the model + */ + void set_test_error(const std::vector<model_node_ptr>& feats, int taskind, int start); + + inline LOSS_TYPE type() const {return LOSS_TYPE::LOG_PEARSON_RMSE;} +}; + +#endif diff --git a/src/loss_function/LossFunctionPearsonRMSE.cpp b/src/loss_function/LossFunctionPearsonRMSE.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fe517c9e73bf6585a3bb680c5d4b1a83e1afe37b --- /dev/null +++ b/src/loss_function/LossFunctionPearsonRMSE.cpp @@ -0,0 +1,353 @@ +#include "loss_function/LossFunctionPearsonRMSE.hpp" + +LossFunctionPearsonRMSE::LossFunctionPearsonRMSE( + std::vector<double> prop_train, + std::vector<double> prop_test, + 
std::vector<int> task_sizes_train,
+    std::vector<int> task_sizes_test,
+    bool fix_intercept,
+    int n_feat
+) :
+    LossFunction(prop_train, prop_test, task_sizes_train, task_sizes_test, fix_intercept, n_feat),
+    _a(_n_dim * _n_samp, 1.0),
+    _b(prop_train),
+    _c(_n_task, 0.0),
+    _coefs(_n_task * _n_dim, 0.0),
+    _lwork(0)
+{
+    set_nfeat(_n_feat);
+    set_opt_lwork();
+}
+
+LossFunctionPearsonRMSE::LossFunctionPearsonRMSE(std::shared_ptr<LossFunction> o) :
+    LossFunction(o),
+    _a(_n_dim * _n_samp, 1.0),
+    _b(_prop_train),
+    _c(_n_task, 0.0),
+    _coefs(_n_task * _n_dim, 0.0),
+    _lwork(0)
+{
+    set_nfeat(_n_feat);
+    set_opt_lwork();
+}
+
+void LossFunctionPearsonRMSE::set_nfeat(int n_feat)
+{
+    _n_feat = n_feat;
+    _n_dim = n_feat + (!_fix_intercept);
+
+    _a.resize(_n_samp * _n_dim);
+    _c.resize(_n_project_prop * _n_task, 0.0);
+    _coefs.resize(_n_dim * _n_task);
+    _scores.resize(_n_project_prop);
+
+    set_opt_lwork();
+}
+
+void LossFunctionPearsonRMSE::reset_projection_prop(const std::vector<std::vector<model_node_ptr>>& models)
+{
+    // Update the number of projection properties before sizing and filling
+    // _projection_prop; otherwise the stale value from the previous iteration
+    // is used for the resize and the loop bound.
+    _n_project_prop = models.size();
+
+    _projection_prop.resize(_n_samp * _n_project_prop);
+    for(int mm = 0; mm < _n_project_prop; ++mm)
+    {
+        // Evaluate the model to populate _error_train with its residuals
+        (*this)(models[mm]);
+        std::copy_n(_error_train.data(), _error_train.size(), &_projection_prop[mm * _n_samp]);
+    }
+
+    set_nfeat(models.back().size() + 1);
+}
+
+void LossFunctionPearsonRMSE::set_opt_lwork()
+{
+    std::vector<double> work(1, 0.0);
+    int info = 0;
+
+    dgels_('N', _n_samp, _n_dim, 1, _a.data(), _n_samp, _b.data(), _n_samp, work.data(), -1, &info);
+
+    if(info == 0)
+    {
+        _lwork = static_cast<int>(std::round(work[0]));
+        _work.resize(_lwork, 0.0);
+    }
+    else
+    {
+        throw std::logic_error("Failed to get lwork.");
+    }
+}
+
+void LossFunctionPearsonRMSE::prepare_project()
+{
+    std::vector<double> temp_prop(_projection_prop);
+
+    for(int pp = 0; pp < _n_project_prop; ++pp)
+    {
+        int start = 0;
+        for(int tt = 0; tt < _n_task; ++tt)
+        {
+            double prop_mean = util_funcs::mean(
+                temp_prop.data() + pp * _n_samp + start,
+                _task_sizes_train[tt]
+            );
+
+            double prop_std = util_funcs::stand_dev(
+                temp_prop.data() + pp * _n_samp + start,
+                _task_sizes_train[tt]
+            );
+
+            std::transform(
+                temp_prop.begin() + pp * _n_samp + start,
+                temp_prop.begin() + pp * _n_samp + start + _task_sizes_train[tt],
+                _projection_prop.begin() + pp * _task_sizes_train[tt] + start * _n_project_prop,
+                [prop_mean, prop_std](double p){return (p - prop_mean) / prop_std;}
+            );
+            start += _task_sizes_train[tt];
+        }
+    }
+}
+
+double LossFunctionPearsonRMSE::project(const node_ptr& feat)
+{
+    std::fill_n(_scores.begin(), _scores.size(), 0.0);
+    return calc_max_pearson(feat->value_ptr(-1, true));
+}
+
+double LossFunctionPearsonRMSE::calc_max_pearson(double* feat_val_ptr)
+{
+    int start = 0;
+    for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+    {
+        double feat_std = util_funcs::stand_dev(feat_val_ptr + start, _task_sizes_train[tt]);
+        dgemm_(
+            'N',
+            'N',
+            _n_project_prop,
+            1,
+            _task_sizes_train[tt],
+            1.0 / feat_std / static_cast<double>(_task_sizes_train[tt]),
+            &_projection_prop[start * _n_project_prop],
+            _n_project_prop,
+            feat_val_ptr + start,
+            _task_sizes_train[tt],
+            0.0,
+            &_c[tt * _n_project_prop],
+            _n_project_prop
+        );
+        std::transform(
+            _scores.begin(),
+            _scores.end(),
+            &_c[tt * _n_project_prop],
+            _scores.begin(),
+            [](double score, double cc){return score + cc * cc;}
+        );
+
+        start += _task_sizes_train[tt];
+    }
+    return *std::max_element(_scores.begin(), _scores.end()) / static_cast<double>(-1 * _n_task);
+}
+
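+// Editor's note (not in the original patch): the two operator() overloads
+// below solve one least-squares problem per task with dgels_ and fold any
+// dgels failure into the returned loss as a negative sentinel: -1.5 when
+// info < 0 (an invalid argument, e.g. a NaN in the descriptor matrix) and
+// -0.5 when info > 0 (a rank-deficient A matrix). SISSORegressor::l0_norm
+// relies on exactly this encoding when it tests `loss < -1.0` and
+// `loss < 0.0`. A hypothetical call pattern, assuming `loss` came from
+// loss_function_util::get_loss_function("regression", ...):
+//
+//     std::vector<int> inds = {3, 17};  // selected descriptor-matrix columns
+//     double rmse = (*loss)(inds);      // train RMSE, or a negative sentinel
+//     if(rmse < 0.0) { /* handle the dgels failure */ }
+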
+double LossFunctionPearsonRMSE::operator()(const std::vector<int>& inds)
+{
+    std::copy_n(_prop_train.data(), _n_samp, _b.data());
+    std::fill_n(_a.begin(), _a.size(), 1.0);
+    int start = 0;
+    int tt = 0;
+    int info = 0;
+
+    while((tt < _n_task) && (info == 0))
+    {
+        set_a(inds, tt, start);
+        info = least_squares(tt, start);
+        set_error(inds, tt, start);
+
+        start += _task_sizes_train[tt];
+        ++tt;
+    }
+
+    if(info != 0)
+    {
+        return -1.0 + info / std::abs(info) * 0.5;
+    }
+
+    return std::sqrt(1.0 / static_cast<double>(_n_samp) *
+        std::accumulate(
+            _error_train.begin(),
+            _error_train.end(),
+            0.0,
+            [](double total, double te){return total + te * te;}
+        )
+    );
+}
+
+double LossFunctionPearsonRMSE::operator()(const std::vector<model_node_ptr>& feats)
+{
+    std::copy_n(_prop_train.data(), _n_samp, _b.data());
+    std::fill_n(_a.begin(), _a.size(), 1.0);
+    int start = 0;
+    int tt = 0;
+    int info = 0;
+    while((tt < _n_task) && (info == 0))
+    {
+        set_a(feats, tt, start);
+        info = least_squares(tt, start);
+        set_error(feats, tt, start);
+
+        start += _task_sizes_train[tt];
+        ++tt;
+    }
+
+    if(info != 0)
+    {
+        return -1.0 + info / std::abs(info) * 0.5;
+    }
+    return std::sqrt(1.0 / static_cast<double>(_n_samp) *
+        std::accumulate(
+            _error_train.begin(),
+            _error_train.end(),
+            0.0,
+            [](double total, double te){return total + te * te;}
+        )
+    );
+}
+
+double LossFunctionPearsonRMSE::test_loss(const std::vector<model_node_ptr>& feats)
+{
+    double train_rmse = (*this)(feats);
+    if(train_rmse < 0.0)
+    {
+        throw std::logic_error("Failed to solve least squares problem.");
+    }
+
+    int start = 0;
+    for(int tt = 0; tt < _task_sizes_test.size(); ++tt)
+    {
+        set_test_error(feats, tt, start);
+        start += _task_sizes_test[tt];
+    }
+
+    return std::sqrt(1.0 / static_cast<double>(_n_samp_test) *
+        std::accumulate(
+            _error_test.begin(),
+            _error_test.end(),
+            0.0,
+            [](double total, double te){return total + te * te;}
+        )
+    );
+}
+
+void LossFunctionPearsonRMSE::set_a(const std::vector<int>& inds, int taskind, int start)
+{
+    for(int ff = 0; ff < inds.size(); ++ff)
+    {
+        std::copy_n(
+            node_value_arrs::get_d_matrix_ptr(inds[ff]) + start,
+            _task_sizes_train[taskind],
+            &_a[ff * _task_sizes_train[taskind] + start * _n_dim]
+        );
+    }
+}
+
+void LossFunctionPearsonRMSE::set_a(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        std::copy_n(
+            feats[ff]->value_ptr() + start,
+            _task_sizes_train[taskind],
+            &_a[ff * _task_sizes_train[taskind] + start * _n_dim]
+        );
+    }
+}
+
+int LossFunctionPearsonRMSE::least_squares(int taskind, int start)
+{
+    int info;
+    dgels_(
+        'N',
+        _task_sizes_train[taskind],
+        _n_dim,
+        1,
+        &_a[start * _n_dim],
+        _task_sizes_train[taskind],
+        &_b[start],
+        _task_sizes_train[taskind],
+        _work.data(),
+        _lwork,
+        &info
+    );
+    std::copy_n(&_b[start], _n_dim, &_coefs[taskind * _n_dim]);
+    return info;
+}
+
+void LossFunctionPearsonRMSE::set_error(const std::vector<int>& inds, int taskind, int start)
+{
+    std::fill_n(
+        _error_train.begin() + start,
+        _task_sizes_train[taskind],
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < inds.size(); ++ff)
+    {
+        daxpy_(
+            _task_sizes_train[taskind],
+            -1.0 * _coefs[taskind * _n_dim + ff],
+            node_value_arrs::get_d_matrix_ptr(inds[ff]) + start,
+            1,
+            &_error_train[start],
+            1
+        );
+    }
+    daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1);
+}
+
+void LossFunctionPearsonRMSE::set_error(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    std::fill_n(
+        _error_train.begin() + start,
+        _task_sizes_train[taskind],
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        daxpy_(
+            _task_sizes_train[taskind],
+            -1.0 * _coefs[taskind * _n_dim + ff],
+            feats[ff]->value_ptr() + start,
+            1,
+            &_error_train[start],
+            1
+        );
+    }
+    daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1);
+}
+
+void LossFunctionPearsonRMSE::set_test_error(const std::vector<model_node_ptr>& feats, int taskind, int start)
+{
+    std::fill_n(
+        _error_test.begin() + start,
+        _task_sizes_test[taskind],
+        // Index the intercept with _n_dim (not _n_feat): it is the last of the
+        // _n_dim coefficients stored for each task, matching set_error above.
+        -1.0 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1]
+    );
+
+    for(int ff = 0; ff < feats.size(); ++ff)
+    {
+        daxpy_(
+            _task_sizes_test[taskind],
+            -1.0 * _coefs[taskind * _n_dim + ff],
+            feats[ff]->test_value_ptr() + start,
+            1,
+            &_error_test[start],
+            1
+        );
+    }
+    daxpy_(_task_sizes_test[taskind], 1.0, &_prop_test[start], 1, &_error_test[start], 1);
+}
diff --git a/src/loss_function/LossFunctionPearsonRMSE.hpp b/src/loss_function/LossFunctionPearsonRMSE.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..32e0d32c79e8fe1a77bfb4a71eb3faabf3db2af2
--- /dev/null
+++ b/src/loss_function/LossFunctionPearsonRMSE.hpp
@@ -0,0 +1,183 @@
+/** @file loss_function/LossFunctionPearsonRMSE.hpp
+ * @brief A class used to calculate the projection score and l0-regularization error
+ *
+ * @author Thomas A. R. Purcell (tpurcell)
+ * @bug No known bugs.
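+ *
+ *  Editor's note (not in the original patch): _a and _b are scratch buffers;
+ *  dgels_ overwrites both during the solve, so every call to operator()
+ *  refills them from the descriptor matrix and _prop_train, and the task's
+ *  coefficients are read back out of _b afterwards.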
+ */
+
+#ifndef LOSS_FUNCTION_PEARSON_RMSE
+#define LOSS_FUNCTION_PEARSON_RMSE
+
+#include <math.h>
+#include "loss_function/LossFunction.hpp"
+
+class LossFunctionPearsonRMSE: public LossFunction
+{
+protected:
+    std::vector<double> _a; //!< The A matrix used for calculating the least squares solution of the problem
+    std::vector<double> _b; //!< The property vector used for solving the least squares problem
+    std::vector<double> _c; //!< Vector used to temporarily store the per-task correlation values
+    std::vector<double> _scores; //!< Vector used to temporarily store the projection scores
+    std::vector<double> _work; //!< Work vector for dgels
+    std::vector<double> _coefs; //!< Coefficients for the least squares solution
+
+    int _lwork; //!< Optimal size of the work vector for dgels
+
+public:
+
+    /**
+     * @brief Constructor
+     *
+     * @param prop_train The value of the property to evaluate the loss function against for the training set
+     * @param prop_test The value of the property to evaluate the loss function against for the test set
+     * @param task_sizes_train Number of training samples per task
+     * @param task_sizes_test Number of testing samples per task
+     * @param fix_intercept If true the bias term is fixed at 0.0
+     * @param n_feat Number of features in the model
+     */
+    LossFunctionPearsonRMSE(
+        std::vector<double> prop_train,
+        std::vector<double> prop_test,
+        std::vector<int> task_sizes_train,
+        std::vector<int> task_sizes_test,
+        bool fix_intercept = false,
+        int n_feat = 1
+    );
+
+    LossFunctionPearsonRMSE(std::shared_ptr<LossFunction> o);
+
+    /**
+     * @brief Copy Constructor
+     *
+     * @param o LossFunctionPearsonRMSE to be copied
+     */
+    LossFunctionPearsonRMSE(const LossFunctionPearsonRMSE& o) = default;
+
+    /**
+     * @brief Move Constructor
+     *
+     * @param o LossFunctionPearsonRMSE to be moved
+     */
+    LossFunctionPearsonRMSE(LossFunctionPearsonRMSE&& o) = default;
+
+    /**
+     * @brief Copy Assignment operator
+     *
+     * @param o LossFunctionPearsonRMSE to be copied
+     */
+    LossFunctionPearsonRMSE& operator= (const LossFunctionPearsonRMSE& o) = default;
+
+    /**
+     * @brief Move Assignment operator
+     *
+     * @param o LossFunctionPearsonRMSE to be moved
+     */
+    LossFunctionPearsonRMSE& operator= (LossFunctionPearsonRMSE&& o) = default;
+
+    /**
+     * @brief Perform set up needed to calculate the projection scores
+     */
+    void prepare_project();
+
+    /**
+     * @brief Calculate the projection score of a feature
+     *
+     * @param feat Feature to calculate the projection score of
+     * @return The projection score for the feature
+     */
+    virtual double project(const node_ptr& feat);
+
+    /**
+     * @brief Calculate the max Pearson correlation for all features
+     *
+     * @param feat_val_ptr pointer to the data of the feature
+     * @return The max Pearson correlation between the feature and properties
+     */
+    double calc_max_pearson(double* feat_val_ptr);
+
+    /**
+     * @brief Evaluate the loss function for a set of features whose data is stored in a central location
+     *
+     * @param inds Indexes to access the central storage area
+     * @return The value of the loss function
+     */
+    double operator()(const std::vector<int>& inds);
+
+    /**
+     * @brief Evaluate the loss function for a set of features
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function
+     */
+    double operator()(const std::vector<model_node_ptr>& feats);
+
+    /**
+     * @brief Evaluate the loss function for the test set
+     *
+     * @param feats The features used to evaluate the loss function
+     * @return The value of the loss function for the test data
+     */
+    double test_loss(const std::vector<model_node_ptr>& feats);
+
+    /**
+     * @brief Get the optimal size of the work vector and resize _work
+     */
+    void set_opt_lwork();
+
+    /**
+     * @brief Set the _a matrix
+     *
+     * @param inds Indexes to access the central storage area
+     * @param taskind Index of the task whose block of _a is set
+     * @param start Index of the first sample of the task
+     */
+    virtual void set_a(const std::vector<int>& inds, int taskind, int start);
+
+    /**
+     * @brief Set the _a matrix
+     *
+     * @param feats The features used to evaluate the loss function
+     * @param taskind Index of the task whose block of _a is set
+     * @param start Index of the first sample of the task
+     */
+    virtual void set_a(const std::vector<model_node_ptr>& feats, int taskind, int start);
+
+    /**
+     * @brief Set the training error vector for one task
+     *
+     * @param inds Indexes to access the central storage area
+     * @param taskind Index of the task whose errors are set
+     * @param start Index of the first sample of the task
+     */
+    virtual void set_error(const std::vector<int>& inds, int taskind, int start);
+
+    /**
+     * @brief Set the training error vector for one task
+     *
+     * @param feats The features used to evaluate the loss function
+     * @param taskind Index of the task whose errors are set
+     * @param start Index of the first sample of the task
+     */
+    virtual void set_error(const std::vector<model_node_ptr>& feats, int taskind, int start);
+
+    /**
+     * @brief Set the test error vector for one task
+     *
+     * @param feats The features used to evaluate the loss function for the test set
+     * @param taskind Index of the task whose errors are set
+     * @param start Index of the first sample of the task
+     */
+    virtual void set_test_error(const std::vector<model_node_ptr>& feats, int taskind, int start);
+
+    /**
+     * @brief Perform least squares regression for one task
+     *
+     * @param taskind Index of the task to solve for
+     * @param start Index of the first sample of the task
+     * @return The final info value from dgels
+     */
+    int least_squares(int taskind, int start);
+
+    /**
+     * @brief Set the model list from the previous generation
+     * @details Resets _projection_prop to the training errors (residuals) of the models selected in the previous iteration
+     *
+     * @param models The models from the previous iteration used to calculate the residuals
+     */
+    void reset_projection_prop(const std::vector<std::vector<model_node_ptr>>& models);
+
+    virtual inline LOSS_TYPE type() const {return LOSS_TYPE::PEARSON_RMSE;}
+    inline std::vector<double> coefs() const {return _coefs;}
+    void set_nfeat(int n_feat);
+};
+
+#endif
diff --git a/src/loss_function/utils.cpp b/src/loss_function/utils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c17fb3b00bcd79eef1b9e16b229b4f36cdb4d693
--- /dev/null
+++ b/src/loss_function/utils.cpp
@@ -0,0 +1,61 @@
+#include "loss_function/utils.hpp"
+
+std::shared_ptr<LossFunction> loss_function_util::get_loss_function(
+    std::string type,
+    std::vector<double> prop_train,
+    std::vector<double> prop_test,
+    std::vector<int> task_sizes_train,
+    std::vector<int> task_sizes_test,
+    bool fix_intercept
+)
+{
+    if(type == "regression")
+    {
+        return std::make_shared<LossFunctionPearsonRMSE>(
+            prop_train,
+            prop_test,
+            task_sizes_train,
+            task_sizes_test,
+            fix_intercept
+        );
+    }
+    else if(type == "log_regression")
+    {
+        return std::make_shared<LossFunctionLogPearsonRMSE>(
+            prop_train,
+            prop_test,
+            task_sizes_train,
+            task_sizes_test,
+            fix_intercept
+        );
+    }
+    else if(type == "classification")
+    {
+        return std::make_shared<LossFunctionConvexHull>(
+            prop_train,
+            prop_test,
+            task_sizes_train,
+            task_sizes_test
+        );
+    }
+
+    throw std::logic_error("LossFunction type was not defined.");
+    return nullptr;
+}
+
+std::shared_ptr<LossFunction> loss_function_util::copy(std::shared_ptr<LossFunction> loss)
+{
+    if(loss->type() == LOSS_TYPE::PEARSON_RMSE)
+    {
+        return std::make_shared<LossFunctionPearsonRMSE>(loss);
+    }
+    else if(loss->type() == LOSS_TYPE::LOG_PEARSON_RMSE)
+    {
+        return std::make_shared<LossFunctionLogPearsonRMSE>(loss);
+    }
+    else if(loss->type() == LOSS_TYPE::CONVEX_HULL)
+    {
+        return std::make_shared<LossFunctionConvexHull>(loss);
+    }
+    throw std::logic_error("LossFunction type was not defined in the copy function.");
+    return nullptr;
+}
diff --git a/src/loss_function/utils.hpp b/src/loss_function/utils.hpp
new file
mode 100644 index 0000000000000000000000000000000000000000..dc8683306aef7abca7b3505f17fb7e2fd76b19da --- /dev/null +++ b/src/loss_function/utils.hpp @@ -0,0 +1,31 @@ +/** @file loss_function/utils.hpp + * @brief utilities for generating loss_functions + * + * @author Thomas A. R. Purcell (tpurcell) + * @bug No known bugs. + */ + +#ifndef LOSS_FUNCTION_UTILS +#define LOSS_FUNCTION_UTILS + +#include "loss_function/LossFunctionConvexHull.hpp" +#include "loss_function/LossFunctionLogPearsonRMSE.hpp" +#include "loss_function/LossFunctionPearsonRMSE.hpp" + +namespace loss_function_util +{ + +std::shared_ptr<LossFunction> get_loss_function( + std::string type, + std::vector<double> prop_train, + std::vector<double> prop_test, + std::vector<int> task_sizes_train, + std::vector<int> task_sizes_test, + bool fix_intercept = false +); + +std::shared_ptr<LossFunction> copy(std::shared_ptr<LossFunction> loss); + +}; + +#endif diff --git a/src/python/descriptor_identifier/SISSOClassifier.cpp b/src/python/descriptor_identifier/SISSOClassifier.cpp index a4857b067905b1bd1bd6f9834ce06d9703227a4e..dbda781625f0c78bb724fbcb8ac69b2ad11d7b1b 100644 --- a/src/python/descriptor_identifier/SISSOClassifier.cpp +++ b/src/python/descriptor_identifier/SISSOClassifier.cpp @@ -13,16 +13,28 @@ SISSOClassifier::SISSOClassifier( int n_residual, int n_models_store ) : - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(100.0), _width(1.0e-5), _n_class(1) { - std::vector<double> prop_vec = python_conv_utils::from_ndarray<double>(prop); - std::vector<double> prop_test_vec = python_conv_utils::from_ndarray<double>(prop_test); - - check_prop_test(prop_vec, prop_test_vec); - setup_prop(prop_vec, prop_test_vec); + _prop = _loss->prop_train(); + _prop_test = _loss->prop_test(); + setup_d_mat_transfer(); } SISSOClassifier::SISSOClassifier( @@ -38,16 +50,28 @@ SISSOClassifier::SISSOClassifier( int n_residual, int n_models_store ) : - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(100.0), _width(1.0e-5), _n_class(1) { - std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop); - std::vector<double> prop_test_vec = python_conv_utils::from_list<double>(prop_test); - - check_prop_test(prop_vec, prop_test_vec); - setup_prop(prop_vec, prop_test_vec); + _prop = _loss->prop_train(); + _prop_test = _loss->prop_test(); + setup_d_mat_transfer(); } py::list SISSOClassifier::models_py() diff --git a/src/python/descriptor_identifier/SISSOLogRegressor.cpp b/src/python/descriptor_identifier/SISSOLogRegressor.cpp index 97ff8ad17589a54182a26eba22554b2f0d317f6c..cf7d1b193a2fba03c368482c69d9d12483f50dc1 100644 --- a/src/python/descriptor_identifier/SISSOLogRegressor.cpp +++ b/src/python/descriptor_identifier/SISSOLogRegressor.cpp @@ -31,6 +31,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(_prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); + _loss = 
loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } SISSOLogRegressor::SISSOLogRegressor( @@ -64,6 +72,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(_prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); + _loss = loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } py::list SISSOLogRegressor::models_log_reg_py() diff --git a/src/python/descriptor_identifier/SISSORegressor.cpp b/src/python/descriptor_identifier/SISSORegressor.cpp index 98c507d15a6726b931572f98a466b75b44423ff6..530af82be3fa6d01395a4bc46d7453c9a46803c4 100644 --- a/src/python/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/descriptor_identifier/SISSORegressor.cpp @@ -15,6 +15,7 @@ SISSORegressor::SISSORegressor( bool fix_intercept ) : SISSO_DI( + "regression", feat_space, prop_label, prop_unit, @@ -45,6 +46,7 @@ SISSORegressor::SISSORegressor( bool fix_intercept ) : SISSO_DI( + "regression", feat_space, prop_label, prop_unit, diff --git a/src/python/descriptor_identifier/SISSO_DI.cpp b/src/python/descriptor_identifier/SISSO_DI.cpp index 4d8998cbaffab2c6bbefcf13cfddd8c6c96fe38e..c3b54f070724129b9949d63651b12395cd20b6ab 100644 --- a/src/python/descriptor_identifier/SISSO_DI.cpp +++ b/src/python/descriptor_identifier/SISSO_DI.cpp @@ -1,6 +1,7 @@ #include "descriptor_identifier/SISSO_DI/SISSO_DI.hpp" SISSO_DI::SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -32,9 +33,19 @@ SISSO_DI::SISSO_DI( _fix_intercept(fix_intercept) { node_value_arrs::initialize_d_matrix_arr(); + + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } SISSO_DI::SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -66,4 +77,13 @@ SISSO_DI::SISSO_DI( _fix_intercept(fix_intercept) { node_value_arrs::initialize_d_matrix_arr(); + + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp index a739122394a3e34da53a3510f0f4b02225e5a786..c31f8f7b08b05b06f2c66c19c26ccaf4d05596b2 100644 --- a/src/python/feature_creation/FeatureSpace.cpp +++ b/src/python/feature_creation/FeatureSpace.cpp @@ -202,7 +202,6 @@ FeatureSpace::FeatureSpace( _max_param_depth(-1), _reparam_residual(false) { - project_funcs::set_project_fxn(project_type, _task_sizes.size(), _project, _project_no_omp); comp_feats::set_is_valid_fxn(project_type, _cross_cor_max, _n_samp, _is_valid, _is_valid_feat_list); mpi_reduce_op::set_op(_project_type, _cross_cor_max, _n_sis_select); @@ -318,7 +317,6 @@ FeatureSpace::FeatureSpace( _end_no_params.push_back( std::count_if(feat_n_params.begin(), feat_n_params.end(), [](int n_param){return n_param == 0;}) ); - std::cout << "out" << std::endl; } #endif _scores.resize(_n_feat); @@ -352,7 +350,6 @@ FeatureSpace::FeatureSpace( _max_param_depth(-1), _reparam_residual(false) { - project_funcs::set_project_fxn(project_type, _task_sizes.size(), _project, _project_no_omp); comp_feats::set_is_valid_fxn(project_type, _cross_cor_max, _n_samp, _is_valid, _is_valid_feat_list); 
mpi_reduce_op::set_op(_project_type, _cross_cor_max, _n_sis_select); diff --git a/src/utils/enum.hpp b/src/utils/enum.hpp index 629ebfc0ef400b5aeed0775e8b56337b7c272a2e..cc24940a55d0889718b199a3d8a2193068653712 100644 --- a/src/utils/enum.hpp +++ b/src/utils/enum.hpp @@ -61,4 +61,11 @@ REG, LOG_REG }; + + enum class LOSS_TYPE + { + CONVEX_HULL, + PEARSON_RMSE, + LOG_PEARSON_RMSE + }; #endif diff --git a/src/utils/mkl_interface.hpp b/src/utils/mkl_interface.hpp index 6382c7eff7c7662d28d613faa65c531b98f43ae4..0ce0dcb79422aeeb2e10e697847aba37f1b6b44b 100644 --- a/src/utils/mkl_interface.hpp +++ b/src/utils/mkl_interface.hpp @@ -850,4 +850,4 @@ namespace } -#endif \ No newline at end of file +#endif diff --git a/src/utils/project.cpp b/src/utils/project.cpp index 376e9bbf39d908d8c08dcfdacfc2fb5d5b8f13fa..b57c027a3650374cf268e217c6620fc98f61ecf9 100644 --- a/src/utils/project.cpp +++ b/src/utils/project.cpp @@ -1,34 +1,5 @@ #include "utils/project.hpp" -void project_funcs::set_project_fxn( - const std::string project_type, - const int n_task, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project_no_omp -) -{ - if(project_type.compare("regression") == 0) - { - project = project_r2; - project_no_omp = project_r2_no_omp; - } - else if(project_type.compare("classification") == 0) - { - project = project_classify; - project_no_omp = project_classify_no_omp; - } - else if(project_type.compare("log_regression") == 0) - { - if(n_task > 1) - throw std::logic_error("Log Regression can not be done using multiple tasks."); - project = project_log_r2; - project_no_omp = project_log_r2_no_omp; - } - else - throw std::logic_error("Wrong projection type passed to FeatureSpace constructor."); - -} - void project_funcs::project_r( const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop ) @@ -78,140 +49,6 @@ void project_funcs::project_r( } } -void project_funcs::project_r2( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - #pragma omp parallel firstprivate(prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::fill_n(scores + start, end - start, 0.0); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&prop, &sizes, &mean_prop, &std_prop](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::r2(prop, feat->value_ptr(-1, true), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } - ); - prop += n_samp; - } - std::transform(scores + start, scores + end, scores + 
start, [](double score){return std::isnan(score) ? 0.0 : score;}); - - #pragma omp barrier - } -} - -void project_funcs::project_log_r2( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - #pragma omp parallel firstprivate(prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::vector<double> log_feat(n_samp, 0.0); - - std::fill_n(scores + start, end - start, 0.0); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&prop, &sizes, &mean_prop, &std_prop, &log_feat](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::log_r2(feat->value_ptr(-1, true), prop, log_feat.data(), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } - ); - prop += n_samp; - } - std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? 0.0 : score;}); - - #pragma omp barrier - } -} - -void project_funcs::project_classify( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - std::fill_n(scores, phi.size(), std::numeric_limits<double>::max()); - #pragma omp parallel firstprivate(sizes, prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - for(int pp = 0; pp < n_prop; ++pp) - { - ConvexHull1D convex_hull(sizes, prop); - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&convex_hull](node_ptr feat, double score){return std::min(convex_hull.overlap_1d(feat->value_ptr(-1, true)), score);} - ); - prop += n_samp; - #pragma omp barrier - } - std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;}); - } -} - void project_funcs::project_r_no_omp( const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop ) @@ -249,99 +86,42 @@ void project_funcs::project_r_no_omp( std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 
0.0 : score;}); } -void project_funcs::project_r2_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) +void project_funcs::project_loss(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores) { - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - - std::fill_n(scores, phi.size(), 0.0); - - for(int pp = 0; pp < n_prop; ++pp) + std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); + #pragma omp parallel { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&prop, &sizes, &mean_prop, &std_prop](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::r2(prop, feat->value_ptr(-1, true), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } + std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(loss); + loss_copy->prepare_project(); + int start = ( + omp_get_thread_num() * (feats.size() / omp_get_num_threads()) + + std::min(omp_get_thread_num(), static_cast<int>(feats.size()) % omp_get_num_threads()) ); - prop += n_samp; - } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 0.0 : score;}); -} - -void project_funcs::project_log_r2_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::vector<double> log_feat(n_samp, 0.0); - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::fill_n(scores, phi.size(), 0.0); + int end = ( + (omp_get_thread_num() + 1) * (feats.size() / omp_get_num_threads()) + + std::min(omp_get_thread_num() + 1, static_cast<int>(feats.size()) % omp_get_num_threads()) + ); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&prop, &sizes, &mean_prop, &std_prop, &log_feat](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::log_r2(feat->value_ptr(-1, true), prop, log_feat.data(), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } + feats.begin() + start, + feats.begin() + end, + scores + start, + [&loss_copy](node_ptr feat){return loss_copy->project(feat);} ); - prop += n_samp; + + #pragma omp barrier } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 
0.0 : score;}); } -void project_funcs::project_classify_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) +void project_funcs::project_loss_no_omp(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores) { - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::fill_n(scores, phi.size(), std::numeric_limits<double>::max()); - for(int pp = 0; pp < n_prop; ++pp) - { - ConvexHull1D convex_hull(sizes, prop); - std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&convex_hull](node_ptr feat, double score){return std::min(convex_hull.overlap_1d(feat->value_ptr(-1, true)), score);} - ); - prop += n_samp; - } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;}); + loss->prepare_project(); + std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); + std::transform( + feats.begin(), + feats.end(), + scores, + [&loss](node_ptr feat){return loss->project(feat);} + ); } diff --git a/src/utils/project.hpp b/src/utils/project.hpp index f3ac91513bf79a8fa58a3adffe6e7850b757a9e8..3a59b51616ae955049271cb453fed3e09e3aa61a 100644 --- a/src/utils/project.hpp +++ b/src/utils/project.hpp @@ -13,27 +13,13 @@ #include "classification/ConvexHull1D.hpp" #include "feature_creation/node/Node.hpp" +#include "loss_function/utils.hpp" #ifdef PARAMETERIZE #include "nl_opt/NLOptWrapper.hpp" #endif namespace project_funcs { - /** - * @brief Set the project functions for SIS - * - * @param project_type Type of projection to perform - * @param n_task number of tasks in calculation - * @param poroject The _poroject function from the feature_space - * @param project_no_omp The _project_no_omp function from the feature_space - */ - void set_project_fxn( - const std::string project_type, - const int n_task, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project_no_omp - ); - /** * @brief Calculate the projection scores of a set of features to a vector via Pearson correlation * @@ -45,40 +31,6 @@ namespace project_funcs */ void project_r(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop); - /** - * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination - * - * @param prop The pointer to the property vector to calculate the Coefficient of Determination against - * @param scores The pointer of the score vector - * @param phi The set of features to calculate the Coefficient of Determination of - * @param size Vector of the size of all of the tasks - * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of - */ - void project_r2(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop); - - /** - * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transformed problem) - * - * @param prop The pointer to the property vector to calculate the Coefficient of Determination against - * @param scores The pointer of the score vector - * @param phi The set of features to calculate the Coefficient of Determination of 
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
-     */
-    void project_log_r2(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
-    /**
-     * @brief Calculate projection scores for classification
-     * @details Create a 1D-convex hull for each class and calculate the projection score
-     *
-     * @param prop The classes to separate the property
-     * @param scores the vector to store the projection scores
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size list of the sizes for each task
-     * @param n_prop number of properties
-     */
-    void project_classify(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
     /**
      * @brief Calculate the projection scores of a set of features to a vector via Pearson correlation
      *
@@ -91,37 +43,22 @@
     void project_r_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);

     /**
-     * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination
-     *
-     * @param prop The pointer to the property vector to calculate the Coefficient of Determination against
-     * @param scores The pointer of the score vector
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
-     */
-    void project_r2_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
-    /**
-     * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transform problem)
+     * @brief Calculate the projection score of a set of features
      *
-     * @param prop The pointer to the property vector to calculate the Coefficient of Determination against
-     * @param scores The pointer of the score vector
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
+     * @param loss The LossFunction used for the projection
+     * @param feats The set of features to calculate the projection scores of
+     * @param scores Vector to output the projection scores to
      */
-    void project_log_r2_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
+    void project_loss(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores);

     /**
-     * @brief Calculate projection scores for classification
-     * @details Create a 1D-convex hull for each class and calculate the projection score
+     * @brief Calculate the projection score of a set of features without using OpenMP
      *
-     * @param prop The classes to separate the property
-     * @param scores the vector to store the projection scores
-     * @param size list of the sizes for each task
-     * @param n_prop number of properties
+     * @param loss The LossFunction used for the projection
+     * @param feats The set of features to calculate the projection scores of
+     * @param scores Vector to output the projection scores to
      */
-    void project_classify_no_omp(const
double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop); + void project_loss_no_omp(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores); } #endif diff --git a/tests/exec_test/classification/data.csv b/tests/exec_test/classification/data.csv index 711bcdb56da1d6e97e6645b854912d9574b7d4f8..3fa9f64bd0b9d2133040ea4849a71789ac8f078a 100644 --- a/tests/exec_test/classification/data.csv +++ b/tests/exec_test/classification/data.csv @@ -1,5 +1,5 @@ index,prop,A,B,C,D,E,F,G,H,I,J -0,1,0.1,-0.1,10,10,-0.492825179098274,0.173025977694162,0.598942935224295,-0.298754475196825,-0.581254909010269,-0.110656494210556 +0,1,0.1,-0.3,10,10,-0.492825179098274,0.173025977694162,0.598942935224295,-0.298754475196825,-0.581254909010269,-0.110656494210556 1,1,-1.89442810374214,-1.31996134398007,0.955713896876243,0.478117201427488,0.777586191100329,0.655369716778557,0.174914171427966,-0.288498877530604,-0.045316536149489,-0.606586193752411 2,1,-1.47460150711424,-1.22614964523433,0.330140292484796,-0.389505498689912,0.138935265824808,-0.871869282167555,0.37472462048701,0.16418591189513,0.293560701443717,0.285608940220021 3,1,-1.30213414336735,-1.82621262418812,-0.71381302228685,0.968769585007681,0.683148179202864,0.799125092538796,0.309479173526504,0.728052031003468,0.725495580994886,-0.676576302804248 @@ -19,7 +19,7 @@ index,prop,A,B,C,D,E,F,G,H,I,J 17,1,-1.29142309507315,-1.9506961526212,0.859669461054104,0.485772819254089,0.268883598825009,0.253553321699552,-0.045743087569395,0.66793403278249,0.308591963919947,0.771084301464027 18,1,-1.23404787121001,-1.68173519287847,-0.118871100462413,0.159000937768132,0.2985428841756,-0.203829205332538,-0.637945695251352,0.658049690810909,0.949529589134008,-0.577812553880056 19,1,-1.13513050029551,-1.3119036669604,0.74037411093045,0.558782660077505,-0.096052126354583,0.529119817360537,0.372190604770144,0.688656466253683,-0.819433165315505,-0.12814415930811 -20,1,-0.1,0.1,-10,-10,0.785237349732891,-0.387217730495401,-0.942409218899448,0.160806577297675,-0.723969983661972,-0.452650134415823 +20,1,-0.2,0.132,-10,-10,0.785237349732891,-0.387217730495401,-0.942409218899448,0.160806577297675,-0.723969983661972,-0.452650134415823 21,1,1.1507658618081,1.7260505392724,-0.92290734251321,0.465751384219632,-0.81727500527083,-0.182472640926628,0.887526070620356,0.111592883978488,0.386435078880162,-0.440017211221272 22,1,1.90389768224701,1.71880759316591,0.28033979546451,0.379365407838544,0.634843008192624,0.371753918780839,-0.611653305369863,0.732567927874185,0.85803611350317,-0.577973441708411 23,1,1.77751976452381,1.28697050370578,0.222850898945077,-0.424057088828287,-0.27619426781836,0.616941667680694,-0.696779972923147,0.23612770730498,0.760705889780527,0.34004139732033 @@ -39,7 +39,7 @@ index,prop,A,B,C,D,E,F,G,H,I,J 37,1,1.50455818499044,1.19094974349673,-0.653263607332762,0.359359450868376,0.30131719114182,0.649581794356589,0.942268955633086,0.884659894489377,-0.473171239344398,0.039635066570717 38,1,1.00833361547154,1.98150630000827,-0.812352457176761,0.219766101590983,-0.65021067790289,0.423621690291556,-0.58865099275791,0.061487886019891,-0.237737474016087,0.641284347380825 39,1,1.60179185724619,1.12508599627141,-0.829819386940741,-0.345104687573802,0.485166070545119,-0.258839727448056,-0.920615208326881,0.275498215871427,-0.629075534110342,-0.642527887960687 -40,0,0.1,0.1,10,-10,0.041519856511361,0.23303461629095,-0.497233246191187,-0.023544587617095,-0.418540837770003,-0.550233932792512 
+40,0,0.2,0.58,10,-10,0.041519856511361,0.23303461629095,-0.497233246191187,-0.023544587617095,-0.418540837770003,-0.550233932792512 41,0,-1.09147574370355,1.70418701701285,-0.480316474702795,-0.753784710340632,-0.613234235616998,0.167955573662474,0.455636631315042,-0.380227635953206,0.48021383007369,-0.453674929885108 42,0,-1.9425392252915,1.59311394144654,0.310098050913387,-0.835007082906627,0.407580140850853,0.556924247596553,-0.388616604639346,0.60215104751412,-0.984322198098753,-0.996332888983337 43,0,-1.40302421044915,1.05041379743038,-0.898456453446964,-0.797029924245349,0.47491891024478,0.480193220538417,-0.750856163558686,-0.043960372032018,-0.242651391805662,-0.109239061054006 @@ -59,7 +59,7 @@ index,prop,A,B,C,D,E,F,G,H,I,J 57,0,-1.79333947783294,1.64615611570236,0.593670368718185,0.74125415566331,-0.835056311664806,-0.128283340965351,0.795769070113583,0.338062872249377,0.961610282279288,-0.519755961049099 58,0,-1.68562328688306,1.79136645116331,-0.917792004629201,-0.224807652067029,0.751172530954049,0.744925497765574,0.054821387540181,-0.268146122719043,-0.373795753322288,-0.023619900695578 59,0,-1.70325116873164,1.56173898398367,0.937331444475048,-0.189146596668676,0.726757528139029,0.571196020214809,0.150478496659529,0.716370904753891,0.645947936391794,-0.096512499841381 -60,0,-0.1,-0.1,-10,10,0.303748234076738,0.094684069184242,0.846651908762107,0.505710991097032,-0.664846620425076,-0.722934785670171 +60,0,-0.31,-0.164,-10,10,0.303748234076738,0.094684069184242,0.846651908762107,0.505710991097032,-0.664846620425076,-0.722934785670171 61,0,1.51747503460744,-1.57976833969122,-0.313853456471656,-0.670641690437042,0.337481189036041,-0.695059667580877,0.382512664766648,-0.754051294565859,-0.540912893771664,-0.152736592481289 62,0,1.36729416399966,-1.54942606995245,0.746279765035798,0.320667909398266,0.075712278316126,0.557089028326803,-0.314459962457274,-0.091179395352991,-0.712572618352738,-0.862523770264919 63,0,1.87551859565403,-1.01245024447758,0.961634242304571,0.99902517180177,0.428576472620752,0.790254229843056,-0.162732148014183,0.057108415575022,0.099625367521191,-0.41779573726667 diff --git a/tests/exec_test/classification/sisso.json b/tests/exec_test/classification/sisso.json index 2f7c29bf64d22cb2b1108f4726bf7e15b58b8a8b..8a758b7f1589278c8c11bcefa7a6a39b9922942e 100644 --- a/tests/exec_test/classification/sisso.json +++ b/tests/exec_test/classification/sisso.json @@ -1,6 +1,6 @@ { "desc_dim": 2, - "n_sis_select": 1, + "n_sis_select": 5, "max_rung": 1, "n_residual": 1, "data_file": "data.csv", diff --git a/tests/googletest/descriptor_identification/model/test_model_classifier.cc b/tests/googletest/descriptor_identification/model/test_model_classifier.cc index f7f68ed81f2ef1cc36c706113dc6968004fe128d..5165db2478a356ff1824d9acf261bc580027ee6d 100644 --- a/tests/googletest/descriptor_identification/model/test_model_classifier.cc +++ b/tests/googletest/descriptor_identification/model/test_model_classifier.cc @@ -22,6 +22,13 @@ namespace _prop = {0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0}; _prop_test = {0.0, 1.0}; + _loss = std::make_shared<LossFunctionConvexHull>( + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + 1 + ); } std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; @@ -31,6 +38,7 @@ namespace std::vector<double> _prop_test; std::vector<model_node_ptr> _features; + std::shared_ptr<LossFunction> _loss; }; TEST_F(ModelClassifierTests, NodesTest) @@ -38,13 +46,9 @@ namespace ModelClassifier model( "Property", 
Unit("m"), - _prop, - _prop_test, + _loss, _features, - _leave_out_inds, - _task_sizes_train, - _task_sizes_test, - false + _leave_out_inds ); EXPECT_STREQ(model.toString().c_str(), "[A]"); EXPECT_EQ(model.n_convex_overlap_train(), 0); diff --git a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc index 1f9e6a37e8ba5d10b8414ec65137e470ccf7686c..29e3ac5584484e9b1474a4654b330c15970ba06b 100644 --- a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc @@ -27,8 +27,8 @@ namespace _prop = std::vector<double>(10, 0.0); _prop_test = std::vector<double>(2, 0.0); - std::transform(value_1.begin(), value_1.end(), value_2.begin(), _prop.begin(), [](double v1, double v2){return 0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1);}); - std::transform(test_value_1.begin(), test_value_1.end(), test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return 0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1);}); + std::transform(value_1.begin(), value_1.end(), value_2.begin(), _prop.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); + std::transform(test_value_1.begin(), test_value_1.end(), test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); } std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; @@ -38,20 +38,26 @@ namespace std::vector<double> _prop_test; std::vector<model_node_ptr> _features; + std::shared_ptr<LossFunction> _loss; }; TEST_F(ModelLogRegssorTests, FixInterceptFalseTest) { - ModelLogRegressor model( - "Property", - Unit("m"), + _loss = std::make_shared<LossFunctionLogPearsonRMSE>( _prop, _prop_test, - _features, - _leave_out_inds, _task_sizes_train, _task_sizes_test, - false + false, + 2 + ); + + ModelLogRegressor model( + "Property", + Unit("m"), + _loss, + _features, + _leave_out_inds ); EXPECT_STREQ(model.toString().c_str(), "exp(c0) * (A)^a0 * (B)^a1"); EXPECT_LT(model.rmse(), 1e-10); @@ -120,19 +126,24 @@ namespace TEST_F(ModelLogRegssorTests, FixInterceptTrueTest) { - std::transform(_features[0]->value_ptr(), _features[0]->value_ptr() + 10, _features[1]->value_ptr(), _prop.begin(), [](double v1, double v2){return std::pow(v1, 0.1) * std::pow(v2, -2.1);}); - std::transform(_features[0]->test_value_ptr(), _features[0]->test_value_ptr() + 2, _features[1]->test_value_ptr(), _prop_test.begin(), [](double v1, double v2){return std::pow(v1, 0.1) * std::pow(v2, -2.1);}); + std::transform(_features[0]->value_ptr(), _features[0]->value_ptr() + 10, _features[1]->value_ptr(), _prop.begin(), [](double v1, double v2){return std::log(std::pow(v1, 0.1) * std::pow(v2, -2.1));}); + std::transform(_features[0]->test_value_ptr(), _features[0]->test_value_ptr() + 2, _features[1]->test_value_ptr(), _prop_test.begin(), [](double v1, double v2){return std::log(std::pow(v1, 0.1) * std::pow(v2, -2.1));}); - ModelLogRegressor model( - "Property", - Unit("m"), + _loss = std::make_shared<LossFunctionLogPearsonRMSE>( _prop, _prop_test, - _features, - _leave_out_inds, _task_sizes_train, _task_sizes_test, - true + true, + 2 + ); + + ModelLogRegressor model( + "Property", + Unit("m"), + _loss, + _features, + _leave_out_inds ); EXPECT_STREQ(model.toString().c_str(), "(A)^a0 * (B)^a1"); diff --git 
a/tests/googletest/descriptor_identification/model/test_model_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_regressor.cc index d0d394823947bc5c25467c8a3b29ab3a0df74313..5e0814ea40fa98f00aa8e013991745f7da79811a 100644 --- a/tests/googletest/descriptor_identification/model/test_model_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_regressor.cc @@ -41,21 +41,28 @@ namespace std::vector<double> _prop_test; std::vector<model_node_ptr> _features; + std::shared_ptr<LossFunction> _loss; }; TEST_F(ModelRegssorTests, FixInterceptFalseTest) { - ModelRegressor model( - "Property", - Unit("m"), + _loss = std::make_shared<LossFunctionPearsonRMSE>( _prop, _prop_test, - _features, - _leave_out_inds, _task_sizes_train, _task_sizes_test, - false + false, + 2 ); + + ModelRegressor model( + "Property", + Unit("m"), + _loss, + _features, + _leave_out_inds + ); + EXPECT_STREQ(model.toString().c_str(), "c0 + a0 * A + a1 * B"); EXPECT_LT(model.rmse(), 1e-10); EXPECT_LT(model.test_rmse(), 1e-10); @@ -135,17 +142,23 @@ namespace std::transform(_features[0]->test_value_ptr(), _features[0]->test_value_ptr() + 1, _features[1]->test_value_ptr(), _prop_test.begin(), [](double v1, double v2){return v1 + v2;}); std::transform(_features[0]->test_value_ptr() + 1, _features[0]->test_value_ptr() + 2, _features[1]->test_value_ptr() + 1, _prop_test.begin() + 1, [](double v1, double v2){return 1.25 * v1 - 0.4 * v2;}); - ModelRegressor model( - "Property", - Unit("m"), + _loss = std::make_shared<LossFunctionPearsonRMSE>( _prop, _prop_test, - _features, - _leave_out_inds, _task_sizes_train, _task_sizes_test, - true + true, + 2 ); + + ModelRegressor model( + "Property", + Unit("m"), + _loss, + _features, + _leave_out_inds + ); + EXPECT_STREQ(model.toString().c_str(), "a0 * A + a1 * B"); EXPECT_LT(model.rmse(), 1e-10); EXPECT_LT(model.test_rmse(), 1e-10); diff --git a/tests/googletest/utils/test_project.cc b/tests/googletest/utils/test_project.cc index c0c1bb7df2e404e0dc7d04a26fc1c6ba0ba373b9..e3136978a02c2f9550b35d032a57fff07f182d14 100644 --- a/tests/googletest/utils/test_project.cc +++ b/tests/googletest/utils/test_project.cc @@ -20,33 +20,8 @@ namespace { EXPECT_LT(std::abs(-0.9028289727756884 - scores[0]), 1e-10); scores[0] = 0.0; - project_funcs::project_r2(prop.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.8151001540832047 - scores[0]), 1e-10); - scores[0] = 0.0; - - project_funcs::project_log_r2(prop.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.8437210425744424 - scores[0]), 1e-10); - scores[0] = 0.0; - - project_funcs::project_classify(prop_class.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.5 - scores[0]), 1e-4); - scores[0] = 0.0; - project_funcs::project_r_no_omp(prop.data(), scores.data(), phi, sizes, 1); EXPECT_LT(std::abs(-0.9028289727756884 - scores[0]), 1e-10); scores[0] = 0.0; - - project_funcs::project_r2_no_omp(prop.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.8151001540832047 - scores[0]), 1e-10); - scores[0] = 0.0; - - project_funcs::project_log_r2_no_omp(prop.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.8437210425744424 - scores[0]), 1e-10); - scores[0] = 0.0; - - project_funcs::project_classify_no_omp(prop_class.data(), scores.data(), phi, sizes, 1); - EXPECT_LT(std::abs(-0.5 - scores[0]), 1e-4); - scores[0] = 0.0; - } }
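
For reference, a minimal sketch of how the two loss_function_util entry points introduced in this patch fit together. Only get_loss_function and copy come from the patch; the property values, task sizes, and the wrapping function are illustrative assumptions.

// Sketch (assumes the signatures added in src/loss_function/utils.hpp above).
#include "loss_function/utils.hpp"

void example_loss_setup()  // hypothetical driver, not part of the patch
{
    std::vector<double> prop_train = {1.0, 2.0, 3.0, 4.0};  // made-up property values
    std::vector<double> prop_test = {2.5};
    std::vector<int> task_sizes_train = {4};                // one task, four samples
    std::vector<int> task_sizes_test = {1};

    // The factory maps the calculation type string onto the matching
    // LossFunction subclass and throws std::logic_error for unknown types.
    std::shared_ptr<LossFunction> loss = loss_function_util::get_loss_function(
        "regression", prop_train, prop_test, task_sizes_train, task_sizes_test, false
    );

    // copy() dispatches on loss->type() so each OpenMP thread can score
    // features on its own instance instead of sharing internal buffers.
    std::shared_ptr<LossFunction> per_thread = loss_function_util::copy(loss);
}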
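Similarly, a sketch of how the consolidated projection entry points replace the removed per-type project_r2/project_log_r2/project_classify functions. The feature set phi is assumed to exist already and is not constructed here.

// Sketch (assumes the project_loss declarations added in src/utils/project.hpp above).
#include "utils/project.hpp"

void example_projection(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& phi)
{
    std::vector<double> scores(phi.size());

    // Threaded path: each thread copies the loss function, calls
    // prepare_project() once, and scores a contiguous slice of phi.
    project_funcs::project_loss(loss, phi, scores.data());

    // Serial path, for callers that are already inside a parallel region.
    project_funcs::project_loss_no_omp(loss, phi, scores.data());
}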