From c16acd5fbeae843c33cdedbe1b7648ad8ca711c9 Mon Sep 17 00:00:00 2001 From: Thomas Purcell <purcell@fhi-berlin.mpg.de> Date: Mon, 6 Jul 2020 09:52:54 +0200 Subject: [PATCH] Refactor of Model Standardized the output file and added constructors to recreate model from the output files --- src/descriptor_identifier/Model/Model.cpp | 295 ++++++++++++++---- src/descriptor_identifier/Model/Model.hpp | 40 ++- src/descriptor_identifier/SISSORegressor.cpp | 30 +- src/descriptor_identifier/SISSORegressor.hpp | 7 +- .../feature_space/FeatureSpace.cpp | 19 +- .../feature_space/FeatureSpace.hpp | 8 +- src/feature_creation/node/FeatureNode.cpp | 9 +- src/feature_creation/node/FeatureNode.hpp | 2 +- src/feature_creation/node/ModelNode.cpp | 2 +- src/feature_creation/node/Node.cpp | 4 +- src/main.cpp | 14 +- src/utils/string_utils.cpp | 11 + src/utils/string_utils.hpp | 18 ++ 13 files changed, 357 insertions(+), 102 deletions(-) create mode 100644 src/utils/string_utils.cpp create mode 100644 src/utils/string_utils.hpp diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp index bc313a73..01122e53 100644 --- a/src/descriptor_identifier/Model/Model.cpp +++ b/src/descriptor_identifier/Model/Model.cpp @@ -67,6 +67,175 @@ Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std: } } +Model::Model(std::string train_file) +{ + _n_samp_test = 0; + + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string expr = split_str[2]; + + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val = {}; + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + + model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, feat_val, feat_test_val, Unit(unit_str)); + _feats.push_back(feat); + } + +} + +Model::Model(std::string train_file, std::string test_file) +{ + std::vector<std::string> split_str; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + std::vector<std::string> feature_expr_test = populate_model(test_file, false); + + for(int ff = 0; ff < feature_expr_train.size(); ++ff) + { + if(feature_expr_train[ff] != feature_expr_test[ff]) + throw std::logic_error("Features for train and test file do not agree"); + + split_str = str_utils::split_string_trim(feature_expr_train[ff]); + + int rung = std::stoi(split_str[0]); + std::string unit_str = split_str[1]; + std::string expr = split_str[2]; + std::vector<double> feat_val(_n_samp_train); + std::vector<double> feat_test_val(_n_samp_test); + + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); + + _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, feat_val, feat_test_val, Unit(unit_str))); + } +} + +std::vector<std::string> Model::populate_model(std::string filename, bool train) +{ + + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + + std::vector<std::string> feature_expr; + std::vector<std::string> split_line; + + // Store model line + std::string model_line; + std::getline(file_stream, model_line); + + // Get the error + std::string error_line; + std::getline(file_stream, error_line); + split_line = str_utils::split_string_trim(error_line); + double rmse = std::stod(split_line[1]); + double max_ae = std::stod(split_line[3]); + + // Get coefficients + std::string line; + std::getline(file_stream, line); + std::getline(file_stream, line); + + int n_task = 0; + int _n_dim = 0; + std::getline(file_stream, line); + + do + { + ++n_task; + split_line = str_utils::split_string_trim(line); + _n_dim = split_line.size() - 3; + if(train) + { + _coefs.push_back(std::vector<double>(_n_dim + 1, 0.0)); + std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); + } + std::getline(file_stream, line); + } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0); + + std::getline(file_stream, line); + for(int ff = 0; ff < _n_dim; ++ff) + { + feature_expr.push_back(line.substr(6)); + std::getline(file_stream, line); + } + + std::getline(file_stream, line); + + int n_samp = 0; + for(int tt = 0; tt < n_task; ++tt) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + n_samp += std::stoi(split_line[1]); + if(train) + _task_sizes_train.push_back(std::stoi(split_line[1])); + else + _task_sizes_test.push_back(std::stoi(split_line[1])); + } + if(train) + { + _n_samp_train = n_samp; + _prop_train.resize(n_samp); + _prop_train_est.resize(n_samp); + _train_error.resize(n_samp); + } + else + { + _n_samp_test = n_samp; + _prop_test.resize(n_samp); + _prop_test_est.resize(n_samp); + _test_error.resize(n_samp); + } + std::getline(file_stream, line); + std::getline(file_stream, line); + if(!train) + std::getline(file_stream, line); + std::vector<std::vector<double>> feat_vals(_n_dim, std::vector<double>(n_samp, 0.0)); + for(int ns = 0; ns < n_samp; ++ns) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + if(train) + { + _prop_train[ns] = std::stod(split_line[0]); + _prop_train_est[ns] = std::stod(split_line[1]); + _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; + + } + else + { + _prop_test[ns] = std::stod(split_line[0]); + _prop_test_est[ns] = std::stod(split_line[1]); + _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; + } + for(int nf = 0; nf < _n_dim; ++nf) + { + feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + } + } + if(train) + { + _D_train.resize(_n_dim * n_samp); + for(int nf = 0; nf < _n_dim; ++nf) + std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); + } + else + { + _D_test.resize(_n_dim * n_samp); + for(int nf = 0; nf < _n_dim; ++nf) + std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); + } + return feature_expr; +} + std::string Model::toString() const { std::stringstream unit_rep; @@ -75,14 +244,14 @@ std::string Model::toString() const unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr(); return unit_rep.str(); } -// + std::ostream& operator<< (std::ostream& outStream, const Model& model) { outStream << model.toString(); return outStream; } -void Model::train_to_file(std::string filename) +void Model::to_file(std::string filename, bool train, std::vector<int> test_inds) { boost::filesystem::path p(filename.c_str()); boost::filesystem::create_directories(p.remove_filename()); @@ -91,78 +260,74 @@ void Model::train_to_file(std::string filename) out_file_stream.open(filename); out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# RMSE: " << rmse() << "; Max AE: " << max_ae() << std::endl; + if(train) + out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl; + else + out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; out_file_stream << "# Coefficients" << std::endl; - out_file_stream << std::setw(10) << std::left << "# Task,"; + out_file_stream << std::setw(10) << std::left << "# Task;"; + for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) - out_file_stream << std::setw(24) << "a" + std::to_string(cc); - out_file_stream << std::setw(24) << "c0" << std::endl; + out_file_stream << std::setw(24) << " a" + std::to_string(cc); + + out_file_stream << " c0" << std::endl; for(int cc = 0; cc < _coefs.size(); ++cc) { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc); + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; for(auto& coeff : _coefs[cc]) - out_file_stream << std::setw(24) << std::setprecision(18) << coeff; + out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; out_file_stream << "\n"; } - out_file_stream << "\n" << std::setw(24) << std::left << "# Property Value" << std::setw(24) << "Property Value (EST)"; + out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; for(int ff = 0; ff < _feats.size(); ++ff) - out_file_stream << std::setw(24) << "Feature " + std::to_string(ff) + " Value"; - out_file_stream << std::endl; + out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->expr() << std::endl; - for(int ss = 0; ss < _n_samp_train; ++ss) + out_file_stream << "# Number of Samples Per Task" << std::endl; + if(train) { - out_file_stream << std::setw(24) << std::setprecision(18) << _prop_train[ss] << std::setw(24) << std::setprecision(18) << _prop_train_est[ss]; - for(int ff = 0; ff < _n_dim - 1; ++ff) - out_file_stream << std::setw(24) << std::setprecision(18) << _feats[ff]->value()[ss]; - out_file_stream << std::endl; + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; } - out_file_stream.close(); -} - -void Model::test_to_file(std::string filename, std::vector<int> test_inds) -{ - boost::filesystem::path p(filename.c_str()); - boost::filesystem::create_directories(p.remove_filename()); - - std::ofstream out_file_stream = std::ofstream(); - out_file_stream.open(filename); - - out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# RMSE: " << rmse() << "; Max AE: " << max_ae() << std::endl; - - out_file_stream << "# Coefficients" << std::endl; - out_file_stream << std::setw(10) << std::left << "# Task"; - for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) - out_file_stream << std::setw(24) << "a" + std::to_string(cc); - out_file_stream << std::setw(24) << "c0" << std::endl; - - for(int cc = 0; cc < _coefs.size(); ++cc) + else { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc); - for(auto& coeff : _coefs[cc]) - out_file_stream << std::setw(24) << std::setprecision(18) << coeff; - out_file_stream << "\n"; - } + out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl; + for(int tt = 0; tt < _task_sizes_test.size(); ++tt) + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; - out_file_stream << "# Test Indexes: [ " << test_inds[0]; - for(int ii = 1; ii < test_inds.size(); ++ii) - out_file_stream << ", " << test_inds[ii]; - out_file_stream << " ]" << std::endl; + out_file_stream << "# Test Indexes: [ " << test_inds[0]; + for(int ii = 1; ii < test_inds.size(); ++ii) + out_file_stream << ", " << test_inds[ii]; + out_file_stream << " ]" << std::endl; + } - out_file_stream << "\n" << std::setw(24) << std::left << "# Property Value" << std::setw(24) << "Property Value (EST)"; + out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)"; for(int ff = 0; ff < _feats.size(); ++ff) - out_file_stream << std::setw(24) << "Feature " + std::to_string(ff) + " Value"; + out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value"; out_file_stream << std::endl; - for(int ss = 0; ss < _n_samp_test; ++ss) + if(train) { - out_file_stream << std::setw(24) << std::setprecision(18) << _prop_test[ss] << std::setw(24) << std::setprecision(18) << _prop_test_est[ss]; - for(int ff = 0; ff < _n_dim - 1; ++ff) - out_file_stream << std::setw(24) << std::setprecision(18) << _feats[ff]->test_value()[ss]; - out_file_stream << std::endl; + for(int ss = 0; ss < _n_samp_train; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss]; + for(int ff = 0; ff < _n_dim - 1; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; + out_file_stream << std::endl; + } + } + else + { + for(int ss = 0; ss < _n_samp_test; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss]; + for(int ff = 0; ff < _n_dim - 1; ++ff) + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; + out_file_stream << std::endl; + } } out_file_stream.close(); } @@ -171,6 +336,8 @@ void Model::register_python() { using namespace boost::python; class_<Model>("Model", init<std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>>()) + .def(init<std::string>()) + .def(init<std::string, std::string>()) .def("predict", &Model::predict) .def("fit", &Model::predict_train) .def("__str__", &Model::toString) @@ -178,20 +345,16 @@ void Model::register_python() .def_readonly("_n_samp_train", &Model::_n_samp_train) .def_readonly("_n_samp_test", &Model::_n_samp_test) .def_readonly("_n_dim", &Model::_n_dim) - .def_readonly("_feats", &Model::_feats) - .def_readonly("_coefs", &Model::_coefs) - .def_readonly("_prop_train", &Model::_prop_train) - .def_readonly("_prop_test", &Model::_prop_test) - .def_readonly("_train_error", &Model::_train_error) - .def_readonly("_test_error", &Model::_test_error) - .def_readonly("_D_train", &Model::_D_train) - .def_readonly("_D_test", &Model::_D_test) - .def_readonly("_prop_train_est", &Model::_prop_train_est) - .def_readonly("_prop_test_est", &Model::_prop_test_est) - .def_readonly("_task_sizes_train", &Model::_task_sizes_train) - .def_readonly("_task_sizes_test", &Model::_task_sizes_test) + .add_property("prop_train_est", &Model::prop_train_est) + .add_property("prop_test_est", &Model::prop_test_est) + .add_property("prop_train", &Model::prop_train) + .add_property("prop_test", &Model::prop_test) + .add_property("train_error", &Model::train_error) + .add_property("test_error", &Model::test_error) + .add_property("feats", &Model::feats) + .add_property("coefs", &Model::coefs) .add_property("rmse", &Model::rmse) .add_property("test_rmse", &Model::test_rmse) .add_property("max_ae", &Model::max_ae) .add_property("test_max_ae", &Model::test_max_ae); -} \ No newline at end of file +} diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index 7f13c1c3..e0a47000 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -1,6 +1,8 @@ #ifndef MODEL #define MODEL +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/trim.hpp> #include <boost/filesystem.hpp> #include <boost/python.hpp> @@ -9,6 +11,10 @@ #include<iostream> #include <feature_creation/node/ModelNode.hpp> +#include <utils/string_utils.hpp> + +namespace python = boost::python; +namespace np = boost::python::numpy; typedef std::shared_ptr<ModelNode> model_node_ptr; /** @@ -23,7 +29,7 @@ class Model std::vector<model_node_ptr> _feats; //!< List of features in the model - std::vector<std::vector<double>> _coefs; //!< Coefficients for teh features + std::vector<std::vector<double>> _coefs; //!< Coefficients for the features std::vector<double> _prop_train; //!< The property to be modeled std::vector<double> _prop_test; //!< The property to be modeled std::vector<double> _train_error; //!< The error of the model @@ -45,6 +51,10 @@ public: */ Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test); + Model(std::string train_file); + Model(std::string train_file, std::string test_file); + + std::vector<std::string> populate_model(std::string filename, bool train); /** * @brief Convert the model to a string @@ -89,16 +99,34 @@ public: return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);})); } + inline python::list coefs() + { + python::list coef_lst; + for(auto& task_coefs : _coefs) + coef_lst.append<python::list>(python_conv_utils::to_list<double>(task_coefs)); + return coef_lst; + } - /** - * @brief Print model to a file - */ - void test_to_file(std::string filename, std::vector<int> test_inds); + inline python::list feats() + { + python::list feat_lst; + for(auto& feat : _feats) + feat_lst.append<ModelNode>(*feat); + return feat_lst; + } + + inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);} + inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);} + inline np::ndarray prop_train(){return python_conv_utils::to_ndarray<double>(_prop_train);} + inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);} + inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);} + inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);} /** * @brief Print model to a file */ - void train_to_file(std::string filename); + void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}); + static void register_python(); }; diff --git a/src/descriptor_identifier/SISSORegressor.cpp b/src/descriptor_identifier/SISSORegressor.cpp index c7fdca74..8d124478 100644 --- a/src/descriptor_identifier/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSORegressor.cpp @@ -1,6 +1,7 @@ #include <descriptor_identifier/SISSORegressor.hpp> -SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, int n_dim, int n_residual): + +SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual): _prop(prop), _prop_test(prop_test), _a((n_dim + 1) * prop.size()), @@ -9,6 +10,7 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::ve _s(n_dim + 1), _task_sizes_train(task_sizes_train), _task_sizes_test(task_sizes_test), + _leave_out_inds(leave_out_inds), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), _n_samp(prop.size()), @@ -28,7 +30,7 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::ve _work = std::vector<double>(_lwork, 0.0); } -SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::ndarray prop, np::ndarray prop_test, python::list task_sizes_train, python::list task_sizes_test, int n_dim, int n_residual) : +SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::ndarray prop, np::ndarray prop_test, python::list task_sizes_train, python::list task_sizes_test, python::list leave_out_inds, int n_dim, int n_residual) : _prop(python_conv_utils::from_ndarray<double>(prop)), _prop_test(python_conv_utils::from_ndarray<double>(prop_test)), _a((n_dim + 1) * prop.shape(0)), @@ -37,6 +39,7 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::nda _s(n_dim + 1), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), + _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), _n_samp(prop.shape(0)), @@ -56,7 +59,7 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::nda _work = std::vector<double>(_lwork, 0.0); } -SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, python::list prop, python::list prop_test, python::list task_sizes_train, python::list task_sizes_test, int n_dim, int n_residual) : +SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, python::list prop, python::list prop_test, python::list task_sizes_train, python::list task_sizes_test, python::list leave_out_inds, int n_dim, int n_residual) : _prop(python_conv_utils::from_list<double>(prop)), _prop_test(python_conv_utils::from_list<double>(prop_test)), _a((n_dim + 1) * boost::python::len(prop)), @@ -65,6 +68,7 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, python: _s(n_dim + 1), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), + _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), _n_samp(boost::python::len(prop)), @@ -158,6 +162,12 @@ void SISSORegressor::fit() model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit()); models.push_back(Model(_prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test)); models.back().copy_error(&residual[rr * _n_samp]); + if(_mpi_comm->rank() == 0) + { + models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } } _models.push_back(models); @@ -182,10 +192,16 @@ void SISSORegressor::fit() _mpi_comm->barrier(); duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC; if(_mpi_comm->rank() == 0) + { std::cout << "Time for l0-norm: " << duration << std::endl; - for(int rr = 0; rr < _n_residual; ++rr) - _models.back()[rr].copy_error(&residual[rr * _n_samp]); + for(int rr = 0; rr < _n_residual; ++rr) + { + _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat"); + if(_leave_out_inds.size() > 0) + _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds); + } + } } } @@ -275,8 +291,8 @@ python::list SISSORegressor::models_py() void SISSORegressor::register_python() { using namespace boost::python; - class_<SISSORegressor>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, np::ndarray, np::ndarray, python::list, python::list, int, int>()) - .def(init<std::shared_ptr<FeatureSpace>, python::list, python::list, python::list, python::list, int, int>()) + class_<SISSORegressor>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, np::ndarray, np::ndarray, python::list, python::list, python::list, int, int>()) + .def(init<std::shared_ptr<FeatureSpace>, python::list, python::list, python::list, python::list, python::list, int, int>()) .def("fit", &SISSORegressor::fit) .add_property("prop", &SISSORegressor::prop_py) .add_property("prop_test", &SISSORegressor::prop_test_py) diff --git a/src/descriptor_identifier/SISSORegressor.hpp b/src/descriptor_identifier/SISSORegressor.hpp index da4b400f..ec1e49ac 100644 --- a/src/descriptor_identifier/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSORegressor.hpp @@ -27,6 +27,7 @@ protected: std::vector<int> _task_sizes_train; std::vector<int> _task_sizes_test; + std::vector<int> _leave_out_inds; std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator @@ -47,9 +48,11 @@ public: */ SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, int n_dim, int n_residual); - SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::ndarray prop, np::ndarray prop_test, python::list task_sizes_train, python::list task_sizes_test, int n_dim, int n_residual); + SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual); - SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, python::list prop, python::list prop_test, python::list task_sizes_train, python::list task_sizes_test, int n_dim, int n_residual); + SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, np::ndarray prop, np::ndarray prop_test, python::list task_sizes_train, python::list task_sizes_test, python::list leave_out_inds, int n_dim, int n_residual); + + SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, python::list prop, python::list prop_test, python::list task_sizes_train, python::list task_sizes_test, python::list leave_out_inds, int n_dim, int n_residual); /** * @brief Get the optimal size of the working array diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index b853bc08..fc168e42 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -116,6 +116,22 @@ FeatureSpace::FeatureSpace( initialize_fs(python_conv_utils::from_ndarray<double>(prop)); } +boost::python::list FeatureSpace::phi0_py() +{ + python::list feat_lst; + for(auto& feat : _phi_0) + feat_lst.append<FeatureNode>(FeatureNode(feat->feat_ind(), feat->expr(), feat->value(), feat->test_value(), feat->unit())); + return feat_lst; +} + +boost::python::list FeatureSpace::phi_selected_py() +{ + python::list feat_lst; + for(auto& feat : _phi_selected) + feat_lst.append<ModelNode>(ModelNode(feat->d_mat_ind(), feat->rung(), feat->expr(), feat->value(), feat->test_value(), feat->unit())); + return feat_lst; +} + void FeatureSpace::initialize_fs(std::vector<double> prop) { if(_n_rung_store == -1) @@ -147,7 +163,6 @@ void FeatureSpace::initialize_fs(std::vector<double> prop) generate_feature_space(prop); _scores.reserve(_phi.size()); _scores.resize(_phi.size()); - } void FeatureSpace::generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound, double u_bound) @@ -795,7 +810,6 @@ void FeatureSpace::sis(std::vector<double>& prop) } if(_mpi_comm->rank() == 0) out_file_stream.close(); - } void FeatureSpace::register_python() @@ -810,7 +824,6 @@ void FeatureSpace::register_python() .def("sis", sis_ndarray) .def("feat_in_phi", &FeatureSpace::feat_in_phi) .add_property("phi_selected", &FeatureSpace::phi_selected_py) - .add_property("phi", &FeatureSpace::phi_py) .add_property("phi0", &FeatureSpace::phi0_py) .add_property("scores", &FeatureSpace::scores_py) .add_property("task_sizes", &FeatureSpace::task_sizes_py) diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 02bde78c..3b727b07 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -3,6 +3,7 @@ #include <mpi_interface/MPI_Interface.hpp> #include <feature_creation/node/FeatureNode.hpp> +#include <feature_creation/node/ModelNode.hpp> #include <feature_creation/node/operator_nodes/allowed_ops.hpp> #include <feature_creation/node/value_storage/nodes_value_containers.hpp> #include <utils/project.hpp> @@ -137,22 +138,19 @@ public: */ inline std::vector<node_ptr> phi_selected(){return _phi_selected;}; - inline boost::python::list phi_selected_py(){return python_conv_utils::to_list<node_ptr>(_phi_selected);}; + boost::python::list phi_selected_py(); /** * @brief Accessor function for _phi */ inline std::vector<node_ptr> phi(){return _phi;}; - inline boost::python::list phi_py(){return python_conv_utils::to_list<node_ptr>(_phi);}; - /** * @brief Accessor function for _phi_0 */ inline std::vector<node_ptr> phi0(){return _phi_0;}; - inline boost::python::list phi0_py(){return python_conv_utils::to_list<node_ptr>(_phi_0);}; - + boost::python::list phi0_py(); /** * @brief Accessor function for _scores */ diff --git a/src/feature_creation/node/FeatureNode.cpp b/src/feature_creation/node/FeatureNode.cpp index f68cf773..6184a141 100644 --- a/src/feature_creation/node/FeatureNode.cpp +++ b/src/feature_creation/node/FeatureNode.cpp @@ -3,15 +3,18 @@ FeatureNode::FeatureNode() {} -FeatureNode::FeatureNode(int feat_ind, std::string expr, std::vector<double> value, std::vector<double> test_value, Unit unit) : +FeatureNode::FeatureNode(int feat_ind, std::string expr, std::vector<double> value, std::vector<double> test_value, Unit unit, bool set_val) : Node(feat_ind, value.size(), test_value.size()), _value(value), _test_value(test_value), _unit(unit), _expr(expr) { - set_value(); - set_test_value(); + if(set_val) + { + set_value(); + set_test_value(); + } } FeatureNode::FeatureNode(int feat_ind, std::string expr, np::ndarray value, np::ndarray test_value, Unit unit) : diff --git a/src/feature_creation/node/FeatureNode.hpp b/src/feature_creation/node/FeatureNode.hpp index 1ef04770..7979fdee 100644 --- a/src/feature_creation/node/FeatureNode.hpp +++ b/src/feature_creation/node/FeatureNode.hpp @@ -60,7 +60,7 @@ public: * @param value Value of the feature for each test sample * @param unit Unit of the feature */ - FeatureNode(int feat_ind, std::string expr, std::vector<double> value, std::vector<double> test_value, Unit unit); + FeatureNode(int feat_ind, std::string expr, std::vector<double> value, std::vector<double> test_value, Unit unit, bool set_val = true); FeatureNode(int feat_ind, std::string expr, np::ndarray value, np::ndarray test_value, Unit unit); FeatureNode(int feat_ind, std::string expr, python::list value, python::list test_value, Unit unit); diff --git a/src/feature_creation/node/ModelNode.cpp b/src/feature_creation/node/ModelNode.cpp index 75d25160..246751e8 100644 --- a/src/feature_creation/node/ModelNode.cpp +++ b/src/feature_creation/node/ModelNode.cpp @@ -4,7 +4,7 @@ ModelNode::ModelNode() {} ModelNode::ModelNode(int feat_ind, int rung, std::string expr, std::vector<double> value, std::vector<double> test_value, Unit unit) : - FeatureNode(feat_ind, expr, value, test_value, unit), + FeatureNode(feat_ind, expr, value, test_value, unit, false), _rung(rung) {} diff --git a/src/feature_creation/node/Node.cpp b/src/feature_creation/node/Node.cpp index 3bde8ded..851b6b07 100644 --- a/src/feature_creation/node/Node.cpp +++ b/src/feature_creation/node/Node.cpp @@ -97,6 +97,8 @@ void Node::register_python() class_<NodeWrap, boost::noncopyable>("Node", no_init) .def("reindex", reindex_1) .def("reindex", reindex_2) + .def("__str__", &Node::expr) + .def("__repr__", &Node::expr) .add_property("n_samp", &Node::n_samp) .add_property("n_test_samp", &Node::n_test_samp) .add_property("feat_ind", &Node::feat_ind) @@ -112,7 +114,7 @@ void Node::register_python() .def("is_nan", pure_virtual(&Node::is_nan)) .def("is_const", pure_virtual(&Node::is_const)) .def("rung", pure_virtual(&Node::rung)) - ; + ; } BOOST_SERIALIZATION_ASSUME_ABSTRACT(Node) diff --git a/src/main.cpp b/src/main.cpp index b1535fa8..26df5217 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -36,7 +36,7 @@ int main(int argc, char const *argv[]) std::cout<< "time input_parsing/Feature space generation: "<< duration << std::endl; node_value_arrs::initialize_d_matrix_arr(); - SISSORegressor sisso(IP._feat_space, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._n_dim, IP._n_residuals); + SISSORegressor sisso(IP._feat_space, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._leave_out_inds, IP._n_dim, IP._n_residuals); sisso.fit(); if(mpi_setup::comm->rank() == 0) @@ -49,12 +49,12 @@ int main(int argc, char const *argv[]) else std::cout << std::endl; std::cout << sisso.models()[ii][0] << "\n" << std::endl; - for(int jj = 0; jj < sisso.models()[ii].size(); ++jj) - { - sisso.models()[ii][jj].train_to_file("models/train_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat"); - if(IP._prop_test.size() > 0) - sisso.models()[ii][jj].test_to_file("models/test_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat", IP._leave_out_inds); - } + // for(int jj = 0; jj < sisso.models()[ii].size(); ++jj) + // { + // sisso.models()[ii][jj].to_file("models/train_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat"); + // if(IP._prop_test.size() > 0) + // sisso.models()[ii][jj].to_file("models/test_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat", false, IP._leave_out_inds); + // } } } diff --git a/src/utils/string_utils.cpp b/src/utils/string_utils.cpp new file mode 100644 index 00000000..9691d2fb --- /dev/null +++ b/src/utils/string_utils.cpp @@ -0,0 +1,11 @@ +#include <utils/string_utils.hpp> + +std::vector<std::string> str_utils::split_string_trim(std::string str, std::string split_tokens) +{ + std::vector<std::string> split_str; + boost::algorithm::split(split_str, str, boost::algorithm::is_any_of(split_tokens)); + for(auto& str_sec : split_str) + boost::algorithm::trim(str_sec); + + return split_str; +} diff --git a/src/utils/string_utils.hpp b/src/utils/string_utils.hpp new file mode 100644 index 00000000..d7a56e72 --- /dev/null +++ b/src/utils/string_utils.hpp @@ -0,0 +1,18 @@ +#ifndef STRING_UTILS +#define STRING_UTILS + +#include <cmath> +#include <string> +#include <vector> +#include <iostream> + +#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string/trim.hpp> + +namespace str_utils +{ + std::vector<std::string> split_string_trim(std::string str, std::string split_tokens = ",;:"); +} + + +#endif \ No newline at end of file -- GitLab