diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp index 9cc1384013ff547404d76433633882c3b4b34f90..88615fc7db535d28be34a26b7866ceb68955c144 100644 --- a/src/descriptor_identifier/Model/Model.cpp +++ b/src/descriptor_identifier/Model/Model.cpp @@ -18,6 +18,10 @@ Model::Model( _prop_test(prop_test), _D_train(_n_samp_train * (feats.size() + (1 - fix_intercept))), _D_test(_n_samp_test * (feats.size() + (1 - fix_intercept))), + _prop_train_est(_prop_train), + _prop_test_est(_prop_test), + _train_error(_n_samp_train), + _test_error(_n_samp_test), _task_sizes_train(task_sizes_train), _task_sizes_test(task_sizes_test), _prop_label(prop_label), @@ -67,7 +71,6 @@ double Model::eval(std::map<std::string, double> x_in_dct) const return eval(x_in); } - std::vector<double> Model::eval(std::vector<std::vector<double>> x_in) const { int n_leaves_tot = std::accumulate(_feats.begin(), _feats.end(), 0, [](int tot, model_node_ptr feat){return tot + feat->n_leaves();}); @@ -113,6 +116,109 @@ std::vector<double> Model::eval(std::map<std::string, std::vector<double>> x_in_ return eval(x_in); } +void Model::to_file(const std::string filename, const bool train, const std::vector<int> test_inds) const +{ + boost::filesystem::path p(filename.c_str()); + boost::filesystem::path parent = p.remove_filename(); + if(parent.string().size() > 0) + { + boost::filesystem::create_directories(parent); + } + + std::ofstream out_file_stream = std::ofstream(); + out_file_stream.open(filename); + + out_file_stream << "# " << toString() << std::endl; + out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl; + + out_file_stream << error_summary_string(train); + out_file_stream << coefs_header(); + + for(int cc = 0; cc < _coefs.size(); ++cc) + { + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; + for(auto& coeff : _coefs[cc]) + { + out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + } + out_file_stream << "\n"; + } + + out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; + for(int ff = 0; ff < _feats.size(); ++ff) + { + out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + "; "; + out_file_stream << std::to_string(_feats[ff]->rung()) + "; "; + out_file_stream << std::setw(50) << _feats[ff]->unit().toString() + "; "; + out_file_stream << _feats[ff]->postfix_expr() + "; " << _feats[ff]->expr() + "; "; + out_file_stream << _feats[ff]->latex_expr() + "; "; + out_file_stream << _feats[ff]->matlab_fxn_expr() + "; "; + out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl; + } + + out_file_stream << "# Number of Samples Per Task" << std::endl; + if(train) + { + out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_train" << std::endl; + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + { + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; + } + } + else + { + out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_test" << std::endl; + for(int tt = 0; tt < _task_sizes_test.size(); ++tt) + { + out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; + } + + out_file_stream << "# Test Indexes: [ " << test_inds[0]; + for(int ii = 1; ii < test_inds.size(); ++ii) + { + out_file_stream << ", " << test_inds[ii]; + } + out_file_stream << " ]" << std::endl; + } + + out_file_stream << "\n" << std::setw(22) << std::left << "# Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; + for(int ff = 0; ff < _feats.size(); ++ff) + { + out_file_stream << std::setw(2) << ", " << std::setw(22) << " Feature " + std::to_string(ff) + " Value"; + } + out_file_stream << std::endl; + + if(train) + { + for(int ss = 0; ss < _n_samp_train; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", "; + out_file_stream << std::setw(22) << _prop_train_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + { + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; + } + out_file_stream << std::endl; + } + } + else + { + for(int ss = 0; ss < _n_samp_test; ++ss) + { + out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", "; + out_file_stream << std::setw(22) << _prop_test_est[ss]; + for(int ff = 0; ff < _n_dim; ++ff) + { + out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; + } + out_file_stream << std::endl; + } + } + out_file_stream.close(); +} + void Model::write_matlab_fxn(std::string fxn_filename) { if(fxn_filename.substr(fxn_filename.size() - 2, 2).compare(".m") != 0) @@ -189,3 +295,228 @@ void Model::write_matlab_fxn(std::string fxn_filename) out_file_stream << "P = reshape(" << matlab_expr() << ", [], 1);\nend\n"; out_file_stream.close(); } + +std::vector<std::string> Model::populate_model(const std::string filename, const bool train) +{ + + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + + std::vector<std::string> feature_expr; + std::vector<std::string> split_line; + + // Store model line + std::string model_line; + std::getline(file_stream, model_line); + + _fix_intercept = has_fixed_intercept(model_line); + + // Get the property unit and error + std::string prop_desc_line; + std::string error_line; + + std::getline(file_stream, prop_desc_line); + if(!is_error_line(prop_desc_line)) + { + split_line = str_utils::split_string_trim(prop_desc_line); + if(split_line.size() > 2) + { + _prop_label = split_line[1]; + _prop_unit = Unit(split_line.back()); + } + else + { + _prop_label = "Property"; + _prop_unit = Unit(split_line.back()); + } + std::getline(file_stream, error_line); + } + else + { + _prop_label = "Property"; + _prop_unit = Unit(); + error_line = prop_desc_line; + } + + set_error_from_file(error_line, train); + + // Get coefficients + std::string line; + std::getline(file_stream, line); + std::getline(file_stream, line); + + int n_task = 0; + int n_dim = 0; + std::getline(file_stream, line); + do + { + ++n_task; + split_line = str_utils::split_string_trim(line); + n_dim = split_line.size() - 3; + if(train) + { + _coefs.push_back(std::vector<double>(n_dim + 1, 0.0)); + std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); + } + std::getline(file_stream, line); + } while(line.substr(0, 14).compare("# Feature Rung") != 0); + + _n_dim = n_dim + _fix_intercept; + std::getline(file_stream, line); + for(int ff = 0; ff < _n_dim; ++ff) + { + feature_expr.push_back(line.substr(6)); + std::getline(file_stream, line); + } + + std::getline(file_stream, line); + + // Get task sizes + int n_samp = 0; + for(int tt = 0; tt < n_task; ++tt) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + n_samp += std::stoi(split_line[1]); + if(train) + { + _task_sizes_train.push_back(std::stoi(split_line[1])); + } + else + { + _task_sizes_test.push_back(std::stoi(split_line[1])); + } + } + + if(train) + { + _n_samp_train = n_samp; + _prop_train.resize(n_samp); + _prop_train_est.resize(n_samp); + _train_error.resize(n_samp); + } + else + { + _n_samp_test = n_samp; + _prop_test.resize(n_samp); + _prop_test_est.resize(n_samp); + _test_error.resize(n_samp); + } + + std::getline(file_stream, line); + std::getline(file_stream, line); + if(!train) + { + std::getline(file_stream, line); + } + + std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0)); + for(int ns = 0; ns < n_samp; ++ns) + { + std::getline(file_stream, line); + split_line = str_utils::split_string_trim(line); + if(train) + { + _prop_train[ns] = std::stod(split_line[0]); + _prop_train_est[ns] = std::stod(split_line[1]); + _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; + + } + else + { + _prop_test[ns] = std::stod(split_line[0]); + _prop_test_est[ns] = std::stod(split_line[1]); + _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; + } + for(int nf = 0; nf < n_dim; ++nf) + { + feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + } + } + + if(train) + { + _D_train.resize((_n_dim + !_fix_intercept) * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + { + std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); + } + } + else + { + _D_test.resize((_n_dim + !_fix_intercept) * n_samp); + for(int nf = 0; nf < n_dim; ++nf) + { + std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); + } + } + + file_stream.close(); + return feature_expr; +} + +void Model::create_model_nodes_from_expressions(std::vector<std::string> feat_exprs) +{ + std::vector<std::string> split_str; + + for(int ff = 0; ff < feat_exprs.size(); ++ff) + { + split_str = str_utils::split_string_trim(feat_exprs[ff], ";"); + int rung = std::stoi(split_str[0]); + + std::vector<double> feat_val(_n_samp_train); + std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); + + std::vector<double> feat_test_val(_n_samp_test); + std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); + + model_node_ptr feat; + // Legacy code for previous versions + if((split_str[1][0] == '(') || (split_str[1][0] == '[')) + { + std::string unit_str = split_str[2]; + std::string postfix_expr = split_str[3]; + std::string expr = split_str[4]; + std::string latex_expr = "Property"; + std::string matlab_expr; + std::vector<std::string> x_in_expr_list; + if(split_str.size() > 5) + { + x_in_expr_list = str_utils::split_string_trim(split_str.back()); + } + if(split_str.size() > 6) + { + matlab_expr = split_str[5]; + } + else + { + matlab_expr = "NaN"; + } + feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); + } + else + { + std::string unit_str = split_str[1]; + std::string postfix_expr = split_str[2]; + std::string expr = split_str[3]; + std::string latex_expr = split_str[4]; + std::string matlab_expr; + std::vector<std::string> x_in_expr_list; + if(split_str.size() > 5) + { + x_in_expr_list = str_utils::split_string_trim(split_str.back()); + } + if(split_str.size() > 6) + { + matlab_expr = split_str[5]; + } + else + { + matlab_expr = "NaN"; + } + feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); + } + _feats.push_back(feat); + } +} + diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index 1210caaa17fcc321c4df8bb60ec9745927450b7a..e5197ceb44f6242822e7da3c28531581bd201c6e 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -49,6 +49,12 @@ protected: std::vector<double> _D_train; //!< The Descriptor matrix (training data) std::vector<double> _D_test; //!< The Descriptor matrix (testing data) + std::vector<double> _prop_train_est; //!< The estimated Property (training data) + std::vector<double> _prop_test_est; //!< The estimated Property (testing data) + + std::vector<double> _train_error; //!< The error of the model (training) + std::vector<double> _test_error; //!< The error of the model (testing) + std::vector<int> _task_sizes_train; //!< Number of training samples in each task std::vector<int> _task_sizes_test; //!< Number of testing samples in each task @@ -116,6 +122,48 @@ public: */ Model& operator= (Model&& o) = default; + /** + * @brief Read an output file and extract all relevant information + * @details Takes in an output file and extracts all data needed to recreate the model + * + * @param filename Name of the output file + * @param train If true then the output represents training data + * + * @return Vector of strings describing feature meta data + */ + std::vector<std::string> populate_model(const std::string filename, const bool train); + + /** + * @brief Based off of the model line in a model file determine if the intercept is fixed + * + * @param model_line the model line from the file + * @return True if the intercept should be fixed + */ + virtual bool has_fixed_intercept(std::string model_line) = 0; + + /** + * @brief Set error summary variables from a file + * + * @param error_line the line to set the error from + * @param train True if file is for training data + */ + virtual void set_error_from_file(std::string error_line, bool train) = 0; + + /** + * @brief Check if a given line summarizes the error of a model + * + * @param line to see if it contains information about the error + * @return True if the line summarizes the error + */ + virtual bool is_error_line(std::string line) = 0; + + /** + * @brief Fills _feats with the correct features based on a set of expressions + * + * @param feat_exprs Vectors containing the features to be created + */ + void create_model_nodes_from_expressions(std::vector<std::string> feat_exprs); + /** * @brief Evaluate the model for a new point * @@ -219,7 +267,19 @@ public: * @param train If true output the training data * @param test_inds The indexes of the test set */ - virtual void to_file(const std::string filename, const bool train=true, const std::vector<int> test_inds={}) const = 0; + void to_file(const std::string filename, const bool train=true, const std::vector<int> test_inds={}) const; + + /** + * @brief Get the block used to summarize the error in the output files + * + * @param train True if summary is for the training file + */ + virtual std::string error_summary_string(bool train) const = 0; + + /** + * @brief Get the coefficients list header for output file + */ + virtual std::string coefs_header() const = 0; // DocString: model_fix_intercept /** @@ -346,6 +406,33 @@ public: * @return The prediction of the model for a given data point */ np::ndarray eval_many_py(py::dict x_in) const ; + + // DocString model_to_file_ndarr + /** + * @brief Convert the Model into an output file + * + * @param filename The name of the file to output to + * @param train If true output the training data + * @param test_inds The indexes of the test set + */ + inline void to_file(const std::string filename, const bool train, py::list test_inds) const + { + to_file(filename, train, python_conv_utils::from_list<int>(test_inds)); + } + + // DocString model_to_file_list + /** + * @brief Convert the Model into an output file + * + * @param filename The name of the file to output to + * @param train If true output the training data + * @param test_inds The indexes of the test set + */ + inline void to_file(const std::string filename, const bool train, np::ndarray test_inds) const + { + to_file(filename, train, python_conv_utils::from_ndarray<int>(test_inds)); + } + #endif }; diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp index 18b7c9ee8057d5d5bbfb50576aa3dfa1635e2b24..7f7005123b26ab62555888cf4e14a4d5ec946d70 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.cpp +++ b/src/descriptor_identifier/Model/ModelClassifier.cpp @@ -11,10 +11,6 @@ ModelClassifier::ModelClassifier( const bool fix_intercept ) : Model(prop_label, prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept), - _prop_train_est(_prop_train), - _prop_test_est(_prop_test), - _train_error(_n_samp_train), - _test_error(_n_samp_test), _train_n_convex_overlap(0), _test_n_convex_overlap(0) { @@ -75,146 +71,33 @@ ModelClassifier::ModelClassifier( ModelClassifier::ModelClassifier(const std::string train_file) { _n_samp_test = 0; - - std::vector<std::string> split_str; std::vector<std::string> feature_expr_train = populate_model(train_file, true); _task_sizes_test = std::vector<int>(_task_sizes_train.size(), 0); - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - split_str = str_utils::split_string_trim(feature_expr_train[ff], ";"); - int rung = std::stoi(split_str[0]); - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val = {}; - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - model_node_ptr feat; - - // Legacy checks can be removed once everything is sufficiently settled for release - if((split_str[1][0] == '(') || (split_str[1][0] == '[')) - { - std::string unit_str = split_str[2]; - std::string postfix_expr = split_str[3]; - std::string expr = split_str[4]; - std::string latex_expr = "Property"; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - else - { - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - std::string latex_expr = split_str[4]; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - _feats.push_back(feat); - } + create_model_nodes_from_expressions(feature_expr_train); int file_train_n_convex_overlap = _train_n_convex_overlap; _train_n_convex_overlap = 0; set_train_test_error(); if((file_train_n_convex_overlap != _train_n_convex_overlap)) - throw std::logic_error("The file does not have the same convex overlap (" + std::to_string(file_train_n_convex_overlap) + ") as calculated here (" + std::to_string(_train_n_convex_overlap) + ")."); + { + throw std::logic_error( + "The file does not have the same convex overlap (" + + std::to_string(file_train_n_convex_overlap) + + ") as calculated here (" + + std::to_string(_train_n_convex_overlap) + + ")." + ); + } } ModelClassifier::ModelClassifier(const std::string train_file, const std::string test_file) { - std::vector<std::string> split_str; std::vector<std::string> feature_expr_train = populate_model(train_file, true); std::vector<std::string> feature_expr_test = populate_model(test_file, false); - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - if(feature_expr_train[ff] != feature_expr_test[ff]) - { - throw std::logic_error("Features for train and test file do not agree"); - } - - split_str = str_utils::split_string_trim(feature_expr_train[ff], ";"); - int rung = std::stoi(split_str[0]); - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val(_n_samp_test); - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); - model_node_ptr feat; - - // Legacy checks can be removed once everything is sufficiently settled for release - if((split_str[1][0] == '(') || (split_str[1][0] == '[')) - { - std::string unit_str = split_str[2]; - std::string postfix_expr = split_str[3]; - std::string expr = split_str[4]; - std::string latex_expr = "Property"; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - else - { - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - std::string latex_expr = split_str[4]; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - _feats.push_back(feat); - } + create_model_nodes_from_expressions(feature_expr_train); int file_train_n_convex_overlap = _train_n_convex_overlap; _train_n_convex_overlap = 0; @@ -253,46 +136,9 @@ std::vector<double> ModelClassifier::eval(std::vector<double>* x_in) const return result; } -std::vector<std::string> ModelClassifier::populate_model(const std::string filename, const bool train) +void ModelClassifier::set_error_from_file(std::string error_line, bool train) { - std::ifstream file_stream; - file_stream.open(filename, std::ios::in); - - std::vector<std::string> feature_expr; - std::vector<std::string> split_line; - - // Store model line - std::string model_line; - std::getline(file_stream, model_line); - - // Get the property unit and label and the error summary - std::string unit_line; - std::string error_line; - - std::getline(file_stream, unit_line); - if(unit_line.substr(0, 42).compare("# # Samples in Convex Hull Overlap Region:") != 0) - { - split_line = str_utils::split_string_trim(unit_line); - if(split_line.size() > 2) - { - _prop_label = split_line[1]; - _prop_unit = Unit(split_line.back()); - } - else - { - _prop_label = "Property"; - _prop_unit = Unit(split_line.back()); - } - std::getline(file_stream, error_line); - } - else - { - _prop_label = "Property"; - _prop_unit = Unit(); - error_line = unit_line; - } - - split_line = str_utils::split_string_trim(error_line); + std::vector<std::string> split_line = str_utils::split_string_trim(error_line); if(train) { @@ -304,122 +150,6 @@ std::vector<std::string> ModelClassifier::populate_model(const std::string filen _test_n_convex_overlap = std::stod(split_line[1]); _test_n_svm_misclassified = std::stod(split_line[3]); } - - // Get coefficients - std::string line; - std::getline(file_stream, line); - std::getline(file_stream, line); - - int n_task = 0; - _n_dim = 0; - std::getline(file_stream, line); - do - { - ++n_task; - split_line = str_utils::split_string_trim(line); - _n_dim = split_line.size() - 3; - if(train) - { - _coefs.push_back(std::vector<double>(_n_dim + 1, 0.0)); - std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); - } - std::getline(file_stream, line); - } while(line.substr(0, 14).compare("# Feature Rung") != 0); - - _fix_intercept = false; - std::getline(file_stream, line); - for(int ff = 0; ff < _n_dim; ++ff) - { - feature_expr.push_back(line.substr(6)); - std::getline(file_stream, line); - } - - std::getline(file_stream, line); - - // Get task sizes - int n_samp = 0; - for(int tt = 0; tt < n_task; ++tt) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - n_samp += std::stoi(split_line[1]); - if(train) - { - _task_sizes_train.push_back(std::stoi(split_line[1])); - } - else - { - _task_sizes_test.push_back(std::stoi(split_line[1])); - } - } - - // Get the data of the model - if(train) - { - _n_samp_train = n_samp; - _prop_train.resize(n_samp); - _prop_train_est.resize(n_samp); - _train_error.resize(n_samp); - } - else - { - _n_samp_test = n_samp; - _prop_test.resize(n_samp); - _prop_test_est.resize(n_samp); - _test_error.resize(n_samp); - } - - std::getline(file_stream, line); - std::getline(file_stream, line); - if(!train) - { - std::getline(file_stream, line); - } - - std::vector<std::vector<double>> feat_vals(_n_dim, std::vector<double>(n_samp, 0.0)); - for(int ns = 0; ns < n_samp; ++ns) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - if(train) - { - _prop_train[ns] = std::stod(split_line[0]); - _prop_train_est[ns] = std::stod(split_line[1]); - _train_error[ns] = 0; - - } - else - { - _prop_test[ns] = std::stod(split_line[0]); - _prop_test_est[ns] = std::stod(split_line[1]); - _test_error[ns] = 0; - } - for(int nf = 0; nf < _n_dim; ++nf) - { - feat_vals[nf][ns] = std::stod(split_line[2 + nf]); - } - } - - if(train) - { - _D_train.resize(_n_dim * n_samp); - for(int nf = 0; nf < _n_dim; ++nf) - { - std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); - } - } - else - { - _D_test.resize(_n_dim * n_samp); - for(int nf = 0; nf < _n_dim; ++nf) - { - std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); - } - } - - file_stream.close(); - - return feature_expr; } void ModelClassifier::set_train_test_error() @@ -544,6 +274,7 @@ void ModelClassifier::set_train_test_error() for(auto& ind : inds_class_test[c2]) { + std::transform(_feats.begin(), _feats.end(), lp_row_up.begin(), [&ind](model_node_ptr feat){return feat->test_value_ptr()[ind];}); std::transform(_feats.begin(), _feats.end(), lp_row_low.begin(), [&ind](model_node_ptr feat){return feat->test_value_ptr()[ind];}); lp_simplex.loadProblem( @@ -568,6 +299,7 @@ void ModelClassifier::set_train_test_error() task_start_train += _task_sizes_train[tt]; task_start_test += _task_sizes_test[tt]; } + std::transform( _train_error.begin(), _train_error.end(), @@ -625,122 +357,33 @@ std::ostream& operator<< (std::ostream& outStream, const ModelClassifier& model) return outStream; } -void ModelClassifier::to_file(std::string filename, bool train, std::vector<int> test_inds) const +std::string ModelClassifier::error_summary_string(bool train) const { - boost::filesystem::path p(filename.c_str()); - boost::filesystem::path parent = p.remove_filename(); - if(parent.string().size() > 0) - { - boost::filesystem::create_directories(parent); - } - - std::ofstream out_file_stream = std::ofstream(); - out_file_stream.open(filename); - - out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl; + std::stringstream error_stream; if(train) { - out_file_stream << "# # Samples in Convex Hull Overlap Region: " << _train_n_convex_overlap << ";"; - out_file_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_train() << std::endl; + error_stream << "# # Samples in Convex Hull Overlap Region: " << _train_n_convex_overlap << ";"; + error_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_train() << std::endl; } else { - out_file_stream << "# # Samples in Convex Hull Overlap Region: " << _test_n_convex_overlap << ";"; - out_file_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_test() << std::endl; + error_stream << "# # Samples in Convex Hull Overlap Region: " << _test_n_convex_overlap << ";"; + error_stream << "# Samples SVM Misclassified: " << std::setprecision(15) << n_svm_misclassified_test() << std::endl; } - out_file_stream << "# Plane Divider" << std::endl; - out_file_stream << std::setw(10) << std::left << "# Task"; + return error_stream.str(); +} +std::string ModelClassifier::coefs_header() const +{ + std::stringstream coef_head_stream; + coef_head_stream << "# Plane Divider" << std::endl; + coef_head_stream << std::setw(10) << std::left << "# Task"; for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) { - out_file_stream << std::setw(24) << " w" + std::to_string(cc); - } - - out_file_stream << " b" << std::endl; - - for(int cc = 0; cc < _coefs.size(); ++cc) - { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; - for(auto& coeff : _coefs[cc]) - { - out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; - } - out_file_stream << "\n"; - } - - out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; - for(int ff = 0; ff < _feats.size(); ++ff) - { - out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + "; "; - out_file_stream << std::to_string(_feats[ff]->rung()) + "; "; - out_file_stream << std::setw(50) << _feats[ff]->unit().toString() + "; "; - out_file_stream << _feats[ff]->postfix_expr() + "; "; - out_file_stream << _feats[ff]->expr() + "; "; - out_file_stream << _feats[ff]->latex_expr() + "; "; - out_file_stream << _feats[ff]->matlab_fxn_expr() + "; "; - out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl; - } - - out_file_stream << "# # Samples Per Task" << std::endl; - if(train) - { - out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl; - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + "; "; - out_file_stream << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; - } - } - else - { - out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_test" << std::endl; - for(int tt = 0; tt < _task_sizes_test.size(); ++tt) - { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + "; "; - out_file_stream << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; - } - out_file_stream << "# Test Indexes: [ " << test_inds[0]; - for(int ii = 1; ii < test_inds.size(); ++ii) - { - out_file_stream << ", " << test_inds[ii]; - } - out_file_stream << " ]" << std::endl; - } - - out_file_stream << "\n" << std::setw(22) << std::left << "# Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; - for(int ff = 0; ff < _feats.size(); ++ff) - { - out_file_stream << std::setw(2) << ", " << std::setw(22) << " Feature " + std::to_string(ff) + " Value"; + coef_head_stream << std::setw(24) << " w" + std::to_string(cc); } - out_file_stream << std::endl; + coef_head_stream << " b" << std::endl; - if(train) - { - for(int ss = 0; ss < _n_samp_train; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", "; - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - { - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; - } - out_file_stream << std::endl; - } - } - else - { - for(int ss = 0; ss < _n_samp_test; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", "; - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - { - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; - } - out_file_stream << std::endl; - } - } - out_file_stream.close(); + return coef_head_stream.str(); } diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp index 13169d09911f376e6976cb32347914a31ef7a0e6..ebe9cfe80da2f34d8abd6626e063fa7d98ae36c0 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.hpp +++ b/src/descriptor_identifier/Model/ModelClassifier.hpp @@ -39,12 +39,6 @@ typedef std::shared_ptr<ModelNode> model_node_ptr; */ class ModelClassifier : public Model { - std::vector<double> _prop_train_est; //!< The estimated Property (training data) - std::vector<double> _prop_test_est; //!< The estimated Property (testing data) - - std::vector<double> _train_error; //!< The error of the model (training) - std::vector<double> _test_error; //!< The error of the model (testing) - int _n_class; //!< The number of classes int _train_n_convex_overlap; //!< Number of data points in regions where the convex hulls overlap in the training set @@ -123,6 +117,36 @@ public: */ ModelClassifier& operator= (ModelClassifier&& o) = default; + /** + * @brief Based off of the model line in a model file determine if the intercept is fixed + * + * @param model_line the model line from the file + * @return True if the intercept should be fixed + */ + inline bool has_fixed_intercept(std::string model_line) + { + return false; + } + + /** + * @brief Check if a given line summarizes the error of a model + * + * @param line to see if it contains information about the error + * @return True if the line summarizes the error + */ + inline bool is_error_line(std::string line) + { + return line.substr(0, 42).compare("# # Samples in Convex Hull Overlap Region:") == 0; + } + + /** + * @brief Set error summary variables from a file + * + * @param error_line the line to set the error from + * @param train True if file is for training data + */ + void set_error_from_file(std::string error_line, bool train); + /** * @brief Evaluate the model for a new point * @@ -139,17 +163,6 @@ public: */ std::vector<double> eval(std::vector<double>* x_in) const; - /** - * @brief Read an output file and extract all relevant information - * @details Takes in an output file and extracts all data needed to recreate the model - * - * @param filename Name of the output file - * @param train If true then the output represents training data - * - * @return The set of strings used to create the feature's meta-data - */ - std::vector<std::string> populate_model(std::string filename, bool train); - // DocString: model_class_str /** * @brief Convert the model to a string @@ -174,20 +187,23 @@ public: std::string matlab_expr() const; /** - * @brief Copy the error into a new array + * @brief Get the block used to summarize the error in the output files * - * @param res pointer to the beginning of the vector to store the residual + * @param train True if summary is for the training file */ - inline void copy_error(double* res) const {std::copy_n(_train_error.data(), _n_samp_train, res);} + std::string error_summary_string(bool train) const; + + /** + * @brief Get the coefficients list header for output file + */ + std::string coefs_header() const; /** - * @brief Convert the ModelClassifier into an output file + * @brief Copy the error into a new array * - * @param filename The name of the file to output to - * @param train If true output the training data - * @param test_inds The indexes of the test set + * @param res pointer to the beginning of the vector to store the residual */ - void to_file(const std::string filename, const bool train=true, const std::vector<int> test_inds={}) const; + inline void copy_error(double* res) const {std::copy_n(_train_error.data(), _n_samp_train, res);} /** * @brief Set the train/test error of the model using linear programming @@ -259,15 +275,6 @@ public: */ ModelClassifier(const ModelClassifier& o, np::ndarray new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est); - // DocString: model_class_to_file - /** - * @brief Convert the ModelClassifier into an output file - * - * @param filename The name of the file to output to - * @param train If true output the training data - */ - inline void to_file_py(std::string filename, bool train=true){to_file(filename, train);} - // DocString: model_class_prop_train_est /** * @brief The estimation of the property diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.cpp b/src/descriptor_identifier/Model/ModelLogRegressor.cpp index 64b6b347a8e75380080943cff83f0e077813c394..9af779d42f350be04a1319dea9685d3575edef3d 100644 --- a/src/descriptor_identifier/Model/ModelLogRegressor.cpp +++ b/src/descriptor_identifier/Model/ModelLogRegressor.cpp @@ -119,15 +119,19 @@ ModelLogRegressor::ModelLogRegressor( std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), _test_error.begin(), [](double pt, double pt_est){return pt - pt_est;}); } -ModelLogRegressor::ModelLogRegressor(const std::string train_file) : - ModelRegressor(train_file), - _log_prop_train(_n_samp_train, 0.0), - _log_prop_test(_n_samp_test, 0.0), - _log_prop_train_est(_n_samp_train, 0.0), - _log_prop_test_est(_n_samp_test, 0.0), - _log_train_error(_n_samp_train), - _log_test_error(_n_samp_test) +ModelLogRegressor::ModelLogRegressor(const std::string train_file) { + _n_samp_test = 0; + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + create_model_nodes_from_expressions(feature_expr_train); + + _log_prop_train.resize(_n_samp_train, 0.0); + _log_prop_test.resize(_n_samp_test, 0.0); + _log_prop_train_est.resize(_n_samp_train, 0.0); + _log_prop_test_est.resize(_n_samp_test, 0.0); + _log_train_error.resize(_n_samp_train); + _log_test_error.resize(_n_samp_test); + std::transform(_D_train.begin(), _D_train.begin() + _n_samp_train, _D_train.begin(), [](double dt){return std::log(dt);}); std::transform(_prop_train.begin(), _prop_train.end(), _log_prop_train.begin(), [](double pt){return std::log(pt);}); std::transform(_prop_train_est.begin(), _prop_train_est.end(), _log_prop_train_est.begin(), [](double pt){return std::log(pt);}); @@ -141,15 +145,20 @@ ModelLogRegressor::ModelLogRegressor(const std::string train_file) : ); } -ModelLogRegressor::ModelLogRegressor(const std::string train_file, const std::string test_file) : - ModelRegressor(train_file, test_file), - _log_prop_train(_n_samp_train, 0.0), - _log_prop_test(_n_samp_test, 0.0), - _log_prop_train_est(_n_samp_train, 0.0), - _log_prop_test_est(_n_samp_test, 0.0), - _log_train_error(_n_samp_train), - _log_test_error(_n_samp_test) +ModelLogRegressor::ModelLogRegressor(const std::string train_file, const std::string test_file) { + std::vector<std::string> feature_expr_train = populate_model(train_file, true); + std::vector<std::string> feature_expr_test = populate_model(test_file, false); + + create_model_nodes_from_expressions(feature_expr_train); + + _log_prop_train.resize(_n_samp_train, 0.0); + _log_prop_test.resize(_n_samp_test, 0.0); + _log_prop_train_est.resize(_n_samp_train, 0.0); + _log_prop_test_est.resize(_n_samp_test, 0.0); + _log_train_error.resize(_n_samp_train); + _log_test_error.resize(_n_samp_test); + std::transform(_D_train.begin(), _D_train.begin() + _n_samp_train, _D_train.begin(), [](double dt){return std::log(dt);}); std::transform(_D_test.begin(), _D_test.begin() + _n_samp_test, _D_test.begin(), [](double dt){return std::log(dt);}); diff --git a/src/descriptor_identifier/Model/ModelLogRegressor.hpp b/src/descriptor_identifier/Model/ModelLogRegressor.hpp index ae5375795dd07278c5dcab1f60fe56043c4bc407..a3f866c33706204e7135959d66809444cfe40b7c 100644 --- a/src/descriptor_identifier/Model/ModelLogRegressor.hpp +++ b/src/descriptor_identifier/Model/ModelLogRegressor.hpp @@ -113,6 +113,17 @@ public: */ ModelLogRegressor& operator= (ModelLogRegressor&& o) = default; + /** + * @brief Based off of the model line in a model file determine if the intercept is fixed + * + * @param model_line the model line from the file + * @return True if the intercept should be fixed + */ + inline bool has_fixed_intercept(std::string model_line) + { + return model_line.substr(0, 9).compare("# exp(c0)") != 0; + } + /** * @brief Evaluate the model for a new point * diff --git a/src/descriptor_identifier/Model/ModelRegressor.cpp b/src/descriptor_identifier/Model/ModelRegressor.cpp index f60220bc02116f08a513d9cc7e4ceca297aecbe7..f39ef7859dc338ce6827ee5e0f44cca4c87e23e7 100644 --- a/src/descriptor_identifier/Model/ModelRegressor.cpp +++ b/src/descriptor_identifier/Model/ModelRegressor.cpp @@ -14,11 +14,7 @@ ModelRegressor::ModelRegressor( const bool fix_intercept, const bool fill_vecs ) : - Model(prop_label, prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept), - _prop_train_est(_n_samp_train, 0.0), - _prop_test_est(_n_samp_test, 0.0), - _train_error(_n_samp_train), - _test_error(_n_samp_test) + Model(prop_label, prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept) { _prop_train_est.reserve(_n_samp_train); _prop_test_est.reserve(_n_samp_test); @@ -88,137 +84,17 @@ ModelRegressor::ModelRegressor( ModelRegressor::ModelRegressor(const std::string train_file) { _n_samp_test = 0; - - std::vector<std::string> split_str; std::vector<std::string> feature_expr_train = populate_model(train_file, true); - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - split_str = str_utils::split_string_trim(feature_expr_train[ff], ";"); - int rung = std::stoi(split_str[0]); - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val = {}; - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - model_node_ptr feat; - // Legacy code for previous versions - if((split_str[1][0] == '(') || (split_str[1][0] == '[')) - { - std::string unit_str = split_str[2]; - std::string postfix_expr = split_str[3]; - std::string expr = split_str[4]; - std::string latex_expr = "Property"; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - else - { - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - std::string latex_expr = split_str[4]; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - _feats.push_back(feat); - } + create_model_nodes_from_expressions(feature_expr_train); } ModelRegressor::ModelRegressor(const std::string train_file, std::string test_file) { - std::vector<std::string> split_str; std::vector<std::string> feature_expr_train = populate_model(train_file, true); std::vector<std::string> feature_expr_test = populate_model(test_file, false); - for(int ff = 0; ff < feature_expr_train.size(); ++ff) - { - if(feature_expr_train[ff] != feature_expr_test[ff]) - { - throw std::logic_error("Features for train and test file do not agree"); - } - - split_str = str_utils::split_string_trim(feature_expr_train[ff], ";"); - int rung = std::stoi(split_str[0]); - - - std::vector<double> feat_val(_n_samp_train); - std::vector<double> feat_test_val(_n_samp_test); - std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data()); - std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data()); - model_node_ptr feat; - // Legacy code for previous versions - if((split_str[1][0] == '(') || (split_str[1][0] == '[')) - { - std::string unit_str = split_str[2]; - std::string postfix_expr = split_str[3]; - std::string expr = split_str[4]; - std::string latex_expr = "Property"; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - else - { - std::string unit_str = split_str[1]; - std::string postfix_expr = split_str[2]; - std::string expr = split_str[3]; - std::string latex_expr = split_str[4]; - std::string matlab_expr; - std::vector<std::string> x_in_expr_list; - if(split_str.size() > 5) - { - x_in_expr_list = str_utils::split_string_trim(split_str.back()); - } - if(split_str.size() > 6) - { - matlab_expr = split_str[5]; - } - else - { - matlab_expr = "NaN"; - } - feat = std::make_shared<ModelNode>(ff, rung, expr, latex_expr, postfix_expr, matlab_expr, feat_val, feat_test_val, x_in_expr_list, Unit(unit_str)); - } - _feats.push_back(feat); - } + create_model_nodes_from_expressions(feature_expr_train); } double ModelRegressor::eval(double* x_in) const @@ -249,161 +125,6 @@ std::vector<double> ModelRegressor::eval(std::vector<double>* x_in) const return result; } -std::vector<std::string> ModelRegressor::populate_model(const std::string filename, const bool train) -{ - std::ifstream file_stream; - file_stream.open(filename, std::ios::in); - - std::vector<std::string> feature_expr; - std::vector<std::string> split_line; - - // Store model line - std::string model_line; - std::getline(file_stream, model_line); - - _fix_intercept = (model_line.substr(0, 4).compare("# c0") != 0) && (model_line.substr(0, 9).compare("# exp(c0)") != 0); - - // Get the property unit and error - std::string unit_line; - std::string error_line; - - std::getline(file_stream, unit_line); - if(unit_line.substr(0, 8).compare("# RMSE: ") != 0) - { - split_line = str_utils::split_string_trim(unit_line); - if(split_line.size() > 2) - { - _prop_label = split_line[1]; - _prop_unit = Unit(split_line.back()); - } - else - { - _prop_label = "Property"; - _prop_unit = Unit(split_line.back()); - } - std::getline(file_stream, error_line); - } - else - { - _prop_label = "Property"; - _prop_unit = Unit(); - error_line = unit_line; - } - - split_line = str_utils::split_string_trim(error_line); - double rmse = std::stod(split_line[1]); - double max_ae = std::stod(split_line[3]); - - // Get coefficients - std::string line; - std::getline(file_stream, line); - std::getline(file_stream, line); - - int n_task = 0; - int n_dim = 0; - std::getline(file_stream, line); - do - { - ++n_task; - split_line = str_utils::split_string_trim(line); - n_dim = split_line.size() - 3; - if(train) - { - _coefs.push_back(std::vector<double>(n_dim + 1, 0.0)); - std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);}); - } - std::getline(file_stream, line); - } while(line.substr(0, 14).compare("# Feature Rung") != 0); - - _n_dim = n_dim + _fix_intercept; - std::getline(file_stream, line); - for(int ff = 0; ff < _n_dim; ++ff) - { - feature_expr.push_back(line.substr(6)); - std::getline(file_stream, line); - } - - std::getline(file_stream, line); - - int n_samp = 0; - for(int tt = 0; tt < n_task; ++tt) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - n_samp += std::stoi(split_line[1]); - if(train) - { - _task_sizes_train.push_back(std::stoi(split_line[1])); - } - else - { - _task_sizes_test.push_back(std::stoi(split_line[1])); - } - } - - if(train) - { - _n_samp_train = n_samp; - _prop_train.resize(n_samp); - _prop_train_est.resize(n_samp); - _train_error.resize(n_samp); - } - else - { - _n_samp_test = n_samp; - _prop_test.resize(n_samp); - _prop_test_est.resize(n_samp); - _test_error.resize(n_samp); - } - - std::getline(file_stream, line); - std::getline(file_stream, line); - if(!train) - { - std::getline(file_stream, line); - } - - std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0)); - for(int ns = 0; ns < n_samp; ++ns) - { - std::getline(file_stream, line); - split_line = str_utils::split_string_trim(line); - if(train) - { - _prop_train[ns] = std::stod(split_line[0]); - _prop_train_est[ns] = std::stod(split_line[1]); - _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; - - } - else - { - _prop_test[ns] = std::stod(split_line[0]); - _prop_test_est[ns] = std::stod(split_line[1]); - _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; - } - for(int nf = 0; nf < n_dim; ++nf) - { - feat_vals[nf][ns] = std::stod(split_line[2 + nf]); - } - } - - if(train) - { - _D_train.resize((_n_dim + !_fix_intercept) * n_samp); - for(int nf = 0; nf < n_dim; ++nf) - std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]); - } - else - { - _D_test.resize((_n_dim + !_fix_intercept) * n_samp); - for(int nf = 0; nf < n_dim; ++nf) - std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]); - } - - file_stream.close(); - return feature_expr; -} - std::string ModelRegressor::toString() const { std::stringstream model_rep; @@ -467,127 +188,41 @@ std::ostream& operator<< (std::ostream& outStream, const ModelRegressor& model) return outStream; } -void ModelRegressor::to_file(const std::string filename, const bool train, const std::vector<int> test_inds) const +std::string ModelRegressor::error_summary_string(bool train) const { - boost::filesystem::path p(filename.c_str()); - boost::filesystem::path parent = p.remove_filename(); - if(parent.string().size() > 0) - { - boost::filesystem::create_directories(parent); - } - - std::ofstream out_file_stream = std::ofstream(); - out_file_stream.open(filename); - - out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl; + std::stringstream error_stream; if(train) { - out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl; + error_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl; } else { - out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; + error_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; } - out_file_stream << "# Coefficients" << std::endl; - out_file_stream << std::setw(10) << std::left << "# Task"; - - for(int cc = 0; cc < _coefs[0].size() - (!_fix_intercept); ++cc) - out_file_stream << std::setw(24) << " a" + std::to_string(cc); + return error_stream.str(); +} - if(!_fix_intercept) - { - out_file_stream << " c0" << std::endl; - } - else - { - out_file_stream << std::endl; - } +std::string ModelRegressor::coefs_header() const +{ + std::stringstream coef_head_stream; - for(int cc = 0; cc < _coefs.size(); ++cc) - { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; - for(auto& coeff : _coefs[cc]) - { - out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; - } - out_file_stream << "\n"; - } + coef_head_stream << "# Coefficients" << std::endl; + coef_head_stream << std::setw(10) << std::left << "# Task"; - out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; - for(int ff = 0; ff < _feats.size(); ++ff) - { - out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + "; "; - out_file_stream << std::to_string(_feats[ff]->rung()) + "; "; - out_file_stream << std::setw(50) << _feats[ff]->unit().toString() + "; "; - out_file_stream << _feats[ff]->postfix_expr() + "; " << _feats[ff]->expr() + "; "; - out_file_stream << _feats[ff]->latex_expr() + "; "; - out_file_stream << _feats[ff]->matlab_fxn_expr() + "; "; - out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl; - } + for(int cc = 0; cc < _coefs[0].size() - (!_fix_intercept); ++cc) + coef_head_stream << std::setw(24) << " a" + std::to_string(cc); - out_file_stream << "# Number of Samples Per Task" << std::endl; - if(train) + if(!_fix_intercept) { - out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_train" << std::endl; - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; - out_file_stream << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl; - } + coef_head_stream << " c0" << std::endl; } else { - out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_test" << std::endl; - for(int tt = 0; tt < _task_sizes_test.size(); ++tt) - { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; - out_file_stream << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl; - } - - out_file_stream << "# Test Indexes: [ " << test_inds[0]; - for(int ii = 1; ii < test_inds.size(); ++ii) - { - out_file_stream << ", " << test_inds[ii]; - } - out_file_stream << " ]" << std::endl; - } - - out_file_stream << "\n" << std::setw(22) << std::left << "# Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; - for(int ff = 0; ff < _feats.size(); ++ff) - { - out_file_stream << std::setw(2) << ", " << std::setw(22) << " Feature " + std::to_string(ff) + " Value"; + coef_head_stream << std::endl; } - out_file_stream << std::endl; - if(train) - { - for(int ss = 0; ss < _n_samp_train; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", "; - out_file_stream << std::setw(22) << _prop_train_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - { - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss]; - } - out_file_stream << std::endl; - } - } - else - { - for(int ss = 0; ss < _n_samp_test; ++ss) - { - out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", "; - out_file_stream << std::setw(22) << _prop_test_est[ss]; - for(int ff = 0; ff < _n_dim; ++ff) - { - out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss]; - } - out_file_stream << std::endl; - } - } - out_file_stream.close(); + return coef_head_stream.str(); } std::vector<double> ModelRegressor::sorted_error() const diff --git a/src/descriptor_identifier/Model/ModelRegressor.hpp b/src/descriptor_identifier/Model/ModelRegressor.hpp index 4bdbece93f487fd1fb7ba45ece68c7d662d399eb..09059105e6c81d6abac0668e934cad1defe612dc 100644 --- a/src/descriptor_identifier/Model/ModelRegressor.hpp +++ b/src/descriptor_identifier/Model/ModelRegressor.hpp @@ -36,13 +36,6 @@ typedef std::shared_ptr<ModelNode> model_node_ptr; */ class ModelRegressor : public Model { -protected: - std::vector<double> _prop_train_est; //!< The estimated Property (training data) - std::vector<double> _prop_test_est; //!< The estimated Property (testing data) - - std::vector<double> _train_error; //!< The error of the model (training) - std::vector<double> _test_error; //!< The error of the model (testing) - public: /** * @brief Default Constructor @@ -137,15 +130,31 @@ public: ModelRegressor& operator= (ModelRegressor&& o) = default; /** - * @brief Read an output file and extract all relevant information - * @details Takes in an output file and extracts all data needed to recreate the model + * @brief Based off of the model line in a model file determine if the intercept is fixed * - * @param filename Name of the output file - * @param train If true then the output represents training data + * @param model_line the model line from the file + * @return True if the intercept should be fixed + */ + virtual inline bool has_fixed_intercept(std::string model_line) + { + return model_line.substr(0, 4).compare("# c0") != 0; + } + + /** + * @brief Check if a given line summarizes the error of a model + * + * @param line to see if it contains information about the error + * @return True if the line summarizes the error + */ + inline bool is_error_line(std::string line){return line.substr(0, 8).compare("# RMSE: ") == 0;} + + /** + * @brief Set error summary variables from a file * - * @return Vector of strings describing feature meta data + * @param error_line the line to set the error from + * @param train True if file is for training data */ - virtual std::vector<std::string> populate_model(const std::string filename, const bool train); + inline void set_error_from_file(std::string error_line, bool train){}; // DocString: model_reg_str /** @@ -170,6 +179,18 @@ public: */ virtual std::string matlab_expr() const; + /** + * @brief Get the block used to summarize the error in the output files + * + * @param train True if summary is for the training file + */ + std::string error_summary_string(bool train) const; + + /** + * @brief Get the coefficients list header for output file + */ + std::string coefs_header() const; + /** * @brief Copy the error into a new array * @@ -382,15 +403,6 @@ public: return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))]; } - /** - * @brief Convert the ModelRegressor into an output file - * - * @param filename The name of the file to output to - * @param train If true output the training data - * @param test_inds The indexes of the test set - */ - virtual void to_file(const std::string filename, const bool train=true, const std::vector<int> test_inds={}) const; - #ifdef PY_BINDINGS // DocString: model_reg_prop_train_est /** diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index 55cf3be330bdce2a2c54b0370db9cb35a39a8299..6f2faf7b93d2a4a8944e0e686f9865f911c8261e 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -831,6 +831,9 @@ void sisso::descriptor_identifier::registerModel() double (Model::*eval_ndarr)(np::ndarray) const = &Model::eval_py; double (Model::*eval_list)(py::list) const = &Model::eval_py; double (Model::*eval_dict)(py::dict) const = &Model::eval_py; + + void (Model::*to_file_list)(const std::string filename, const bool train, py::list test_inds) const = &Model::to_file; + void (Model::*to_file_arr)(const std::string filename, const bool train, np::ndarray test_inds) const = &Model::to_file; class_<sisso::descriptor_identifier::Model_Wrap, boost::noncopyable>("Model", no_init) .def("eval_many", eval_many_dict) .def("eval_many", eval_many_ndarr) @@ -838,6 +841,8 @@ void sisso::descriptor_identifier::registerModel() .def("eval", eval_list) .def("eval", eval_dict) .def("write_matlab_fxn", &Model::write_matlab_fxn) + .def("to_file", to_file_arr, "@DocString_model_to_file_ndarr") + .def("to_file", to_file_list, "@DocString_model_to_file_list") .add_property("n_samp_train", &Model::n_samp_train, "@DocString_model_n_samp_train@") .add_property("n_samp_test", &Model::n_samp_test, "@DocString_model_n_samp_test@") .add_property("n_dim", &Model::n_dim, "@DocString_model_n_dim@") @@ -907,7 +912,6 @@ void sisso::descriptor_identifier::registerModelClassifier() .def(init<ModelClassifier, np::ndarray, np::ndarray, np::ndarray>()) .def("__str__", &ModelClassifier::toString, "@DocString_model_class_str@") .def("__repr__", &ModelClassifier::toString, "@DocString_model_class_str@") - .def("to_file", &ModelClassifier::to_file_py, "@DocString_model_class_to_file@") .add_property("latex_str", &ModelClassifier::toLatexString, "@DocString_model_class_latex_str@") .add_property("fit", &ModelClassifier::prop_train_est, "@DocString_model_class_prop_train_est@") .add_property("predict", &ModelClassifier::prop_test_est, "@DocString_model_class_prop_test_est@") diff --git a/src/python/bindings_docstring_keyed.hpp b/src/python/bindings_docstring_keyed.hpp index 412c5e2f783c1cec7e24eb4294da6753d792ec3c..71ee6d47e49f8471d5f5585c2702b8e52606fd81 100644 --- a/src/python/bindings_docstring_keyed.hpp +++ b/src/python/bindings_docstring_keyed.hpp @@ -426,7 +426,6 @@ namespace sisso struct Model_Wrap : Model, py::wrapper<Model> { - inline void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}){this->get_override("to_file")();} inline void matlab_expr(){this->get_override("matlab_expr")();} }; diff --git a/src/python/descriptor_identifier/ModelClassifier.cpp b/src/python/descriptor_identifier/ModelClassifier.cpp index a6dd3c52cf1dab3106234575f94de49bdd31808b..2c924669b064a791d6e3172f4822a95ea7557254 100644 --- a/src/python/descriptor_identifier/ModelClassifier.cpp +++ b/src/python/descriptor_identifier/ModelClassifier.cpp @@ -2,10 +2,6 @@ ModelClassifier::ModelClassifier(const ModelClassifier& o, py::list new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est) : Model(o), - _prop_train_est(python_conv_utils::from_ndarray<double>(prop_train_est)), - _prop_test_est(python_conv_utils::from_ndarray<double>(prop_test_est)), - _train_error(o._train_error), - _test_error(o._test_error), _train_n_convex_overlap(o._fix_intercept), _test_n_convex_overlap(o._test_n_convex_overlap) { @@ -26,10 +22,6 @@ ModelClassifier::ModelClassifier(const ModelClassifier& o, py::list new_coefs, n ModelClassifier::ModelClassifier(const ModelClassifier& o, np::ndarray new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est) : Model(o), - _prop_train_est(python_conv_utils::from_ndarray<double>(prop_train_est)), - _prop_test_est(python_conv_utils::from_ndarray<double>(prop_test_est)), - _train_error(o._train_error), - _test_error(o._test_error), _train_n_convex_overlap(o._fix_intercept), _test_n_convex_overlap(o._test_n_convex_overlap) {