From 2894acc8e98da342b283c801feef0c9c4b4b1077 Mon Sep 17 00:00:00 2001 From: Thomas <purcell@fhi-berlin.mpg.de> Date: Sun, 15 Aug 2021 20:35:51 +0200 Subject: [PATCH] Add the task name and sample ID column to model files Also backwards compatible --- src/descriptor_identifier/model/Model.cpp | 89 ++++++++++++++----- src/descriptor_identifier/model/Model.hpp | 14 ++- .../model/ModelClassifier.cpp | 30 +++++-- .../model/ModelClassifier.hpp | 10 ++- .../model/ModelLogRegressor.cpp | 7 +- .../model/ModelLogRegressor.hpp | 8 +- .../model/ModelRegressor.cpp | 28 +++++- .../model/ModelRegressor.hpp | 10 ++- .../solver/SISSOClassifier.cpp | 9 +- .../solver/SISSOClassifier.hpp | 12 +-- .../solver/SISSOLogRegressor.cpp | 9 +- .../solver/SISSOLogRegressor.hpp | 12 +-- .../solver/SISSORegressor.cpp | 9 +- .../solver/SISSORegressor.hpp | 12 +-- .../solver/SISSOSolver.cpp | 4 +- .../solver/SISSOSolver.hpp | 14 +-- src/inputs/InputParser.cpp | 11 ++- src/inputs/InputParser.hpp | 2 +- src/main.cpp | 6 +- .../bindings_docstring_keyed.cpp | 12 +-- .../descriptor_identifier/SISSOClassifier.cpp | 8 +- .../SISSOLogRegressor.cpp | 8 +- .../descriptor_identifier/SISSORegressor.cpp | 8 +- .../descriptor_identifier/SISSOSolver.cpp | 8 +- src/python/py_interface/get_solver.py | 10 +-- src/python/py_interface/import_dataframe.py | 18 ++-- .../model/test_model_classifier.cc | 23 +++-- .../model/test_model_log_regressor.cc | 18 +++- .../model/test_model_regressor.cc | 18 +++- .../test_sisso_log_regressor.cc | 10 +++ .../sisso_regressor/test_sisso_regressor.cc | 11 +++ 31 files changed, 321 insertions(+), 127 deletions(-) diff --git a/src/descriptor_identifier/model/Model.cpp b/src/descriptor_identifier/model/Model.cpp index 7ebc8e61..a0ff6139 100644 --- a/src/descriptor_identifier/model/Model.cpp +++ b/src/descriptor_identifier/model/Model.cpp @@ -26,8 +26,14 @@ Model::Model( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : + _sample_ids_train(sample_ids_train), + _sample_ids_test(sample_ids_test), + _task_names(task_names), _n_samp_train(feats[0]->n_samp()), _n_samp_test(feats[0]->n_samp_test()), _n_dim(feats.size()), @@ -161,17 +167,7 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl; out_file_stream << error_summary_string(train); - out_file_stream << coefs_header(); - - for(int cc = 0; cc < _coefs.size(); ++cc) - { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; - for(auto& coeff : _coefs[cc]) - { - out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; - } - out_file_stream << "\n"; - } + out_file_stream << write_coefs(); out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; for(int ff = 0; ff < _feats.size(); ++ff) @@ -185,13 +181,17 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl; } + int task_header_w = std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1,
std::string s2){return s1.size() <= s2.size();})->size()) + ); out_file_stream << "# Number of Samples Per Task" << std::endl; if(train) { - out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_train" << std::endl; + out_file_stream << std::setw(task_header_w) << std::left << "# Task" << std::setw(2) << ", " << std::setw(24) << "n_mats_train" << std::endl; for(int tt = 0; tt < task_sizes_train_vec.size(); ++tt) { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(task_header_w) << "# " + _task_names[tt] << std::setw(2) << ", "; out_file_stream << std::left << std::setw(22) << task_sizes_train_vec[tt] << std::endl; } } @@ -200,7 +200,7 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_test" << std::endl; for(int tt = 0; tt < task_sizes_test_vec.size(); ++tt) { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(10) << "# " + _task_names[tt] + ", "; out_file_stream << std::left << std::setw(22) << task_sizes_test_vec[tt] << std::endl; } @@ -212,7 +212,23 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << " ]" << std::endl; } - out_file_stream << "\n" << std::setw(22) << std::left << "# Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; + int max_sample_id_len = 12; + if(train) + { + max_sample_id_len = std::max( + max_sample_id_len, + static_cast<int>(std::max_element(_sample_ids_train.begin(), _sample_ids_train.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + } + else + { + max_sample_id_len = std::max( + max_sample_id_len, + static_cast<int>(std::max_element(_sample_ids_test.begin(), _sample_ids_test.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + } + out_file_stream << "\n" << std::setw(max_sample_id_len) << std::left << "# Sample ID" << std::setw(2) << ", "; + out_file_stream << std::setw(22) << std::left << "Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; for(int ff = 0; ff < _feats.size(); ++ff) { out_file_stream << std::setw(2) << ", " << std::setw(22) << " Feature " + std::to_string(ff) + " Value"; @@ -223,6 +239,7 @@ void Model::to_file(const std::string filename, const bool train) const { for(int ss = 0; ss < _n_samp_train; ++ss) { + out_file_stream << std::left << std::setw(max_sample_id_len) << _sample_ids_train[ss] << std::setw(2) << ", "; out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << prop_train_vec[ss] << std::setw(2) << ", "; out_file_stream << std::setw(22) << prop_train_est_vec[ss]; for(int ff = 0; ff < _n_dim; ++ff) @@ -236,6 +253,7 @@ void Model::to_file(const std::string filename, const bool train) const { for(int ss = 0; ss < _n_samp_test; ++ss) { + out_file_stream << std::left << std::setw(max_sample_id_len) << _sample_ids_test[ss] << std::setw(2) << ", "; out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << prop_test_vec[ss] << std::setw(2) << ", "; out_file_stream << std::setw(22) << prop_test_est_vec[ss]; for(int ff = 0; ff < _n_dim; ++ff) @@ -435,11 +453,16 @@ void Model::populate_model(const std::string train_filename, const std::string t { ++n_task; split_line = 
str_utils::split_string_trim(line); + _task_names.push_back(split_line[0]); _n_samp_train += std::stoi(split_line[1]); task_sizes_train.push_back(std::stoi(split_line[1])); if(with_test) { split_line = str_utils::split_string_trim(test_line); + if(split_line[0].compare(_task_names.back()) != 0) + { + throw std::logic_error("The task names for the test and train files are not in the same order."); + } _n_samp_test += std::stoi(split_line[1]); task_sizes_test.push_back(std::stoi(split_line[1])); std::getline(test_file_stream, test_line); @@ -454,6 +477,9 @@ void Model::populate_model(const std::string train_filename, const std::string t std::vector<double> prop_train(_n_samp_train); std::vector<double> prop_test(_n_samp_test); + _sample_ids_train.resize(_n_samp_train); + _sample_ids_test.resize(_n_samp_test); + if(with_test) { split_line = str_utils::split_string_trim(test_line, "[]"); @@ -479,16 +505,28 @@ void Model::populate_model(const std::string train_filename, const std::string t std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(_n_samp_train, 0.0)); std::vector<std::vector<double>> feat_test_vals(n_dim, std::vector<double>(_n_samp_test, 0.0)); + + bool with_samp_id = false; for(int ns = 0; ns < _n_samp_train; ++ns) { std::getline(train_file_stream, line); split_line = str_utils::split_string_trim(line); + if((split_line.size() > _n_dim + 2)) + { + with_samp_id = true; + _sample_ids_train[ns] = split_line[0]; + } + else + { + with_samp_id = false; + _sample_ids_train[ns] = std::to_string(ns); + } - prop_train[ns] = std::stod(split_line[0]); + prop_train[ns] = std::stod(split_line[with_samp_id]); for(int nf = 0; nf < n_dim; ++nf) { - feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + feat_vals[nf][ns] = std::stod(split_line[2 + nf + with_samp_id]); } } for(int ns = 0; ns < _n_samp_test; ++ns) @@ -496,11 +534,22 @@ void Model::populate_model(const std::string train_filename, const std::string t std::getline(test_file_stream, test_line); split_line = str_utils::split_string_trim(test_line); - prop_test[ns] = std::stod(split_line[0]); + if((split_line.size() > _n_dim + 2)) + { + with_samp_id = true; + _sample_ids_test[ns] = split_line[0]; + } + else + { + with_samp_id = false; + _sample_ids_test[ns] = std::to_string(ns); + } + + prop_test[ns] = std::stod(split_line[with_samp_id]); for(int nf = 0; nf < n_dim; ++nf) { - feat_test_vals[nf][ns] = std::stod(split_line[2 + nf]); + feat_test_vals[nf][ns] = std::stod(split_line[2 + nf + with_samp_id]); } } train_file_stream.close(); diff --git a/src/descriptor_identifier/model/Model.hpp b/src/descriptor_identifier/model/Model.hpp index 61a1e0c1..3940ba76 100644 --- a/src/descriptor_identifier/model/Model.hpp +++ b/src/descriptor_identifier/model/Model.hpp @@ -42,6 +42,10 @@ class Model { protected: + std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples + std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples + std::vector<std::string> _task_names; //!< Vector storing the ID of the task names + int _n_samp_train; //!< The number of samples per feature int _n_samp_test; //!< The number of test samples per feature int _n_dim; //!< The number of dimensions of the model @@ -72,13 +76,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the 
training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ Model( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); /** @@ -289,7 +299,7 @@ public: /** * @brief Get the coefficients list header for output file */ - virtual std::string coefs_header() const = 0; + virtual std::string write_coefs() const = 0; // DocString: model_fix_intercept /** diff --git a/src/descriptor_identifier/model/ModelClassifier.cpp b/src/descriptor_identifier/model/ModelClassifier.cpp index 480c04fc..74e75873 100644 --- a/src/descriptor_identifier/model/ModelClassifier.cpp +++ b/src/descriptor_identifier/model/ModelClassifier.cpp @@ -26,9 +26,12 @@ ModelClassifier::ModelClassifier( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : - Model(prop_label, prop_unit, loss, feats, leave_out_inds), + Model(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names), _train_n_convex_overlap(0), _test_n_convex_overlap(0), _n_class(loss->n_class()) @@ -209,11 +212,16 @@ std::string ModelClassifier::error_summary_string(bool train) const return error_stream.str(); } -std::string ModelClassifier::coefs_header() const +std::string ModelClassifier::write_coefs() const { std::stringstream coef_head_stream; - coef_head_stream << "# Plane Divider" << std::endl; - coef_head_stream << std::setw(10) << std::left << "# Task"; + coef_head_stream << "# Decision Boundaries" << std::endl; + int n_db = _n_class * (_n_class - 1) / 2; + int task_header_w = 1 + static_cast<int>(std::floor(std::log10(n_db))) + std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + coef_head_stream << std::setw(task_header_w + 2) << std::left << "# Task"; for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) { @@ -221,5 +229,17 @@ std::string ModelClassifier::coefs_header() const } coef_head_stream << " b" << std::endl; + for(int tt = 0; tt < _task_names.size(); ++tt) + { + for(int db = 0; db < n_db; ++db) + { + coef_head_stream << std::setw(task_header_w) << std::left << "# " + _task_names[tt] + "_" + std::to_string(db) << std::setw(2) << ", "; + for(auto& coeff : _coefs[tt * n_db + db]) + { + coef_head_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + } + coef_head_stream << "\n"; + } + } return coef_head_stream.str(); } diff --git a/src/descriptor_identifier/model/ModelClassifier.hpp b/src/descriptor_identifier/model/ModelClassifier.hpp index fe29e171..28ef2d60 100644 --- a/src/descriptor_identifier/model/ModelClassifier.hpp +++ b/src/descriptor_identifier/model/ModelClassifier.hpp @@ -53,13 +53,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the 
samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelClassifier( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_class_init_train @@ -195,7 +201,7 @@ public: /** * @brief Get the coefficients list header for output file */ - std::string coefs_header() const; + std::string write_coefs() const; /** * @brief Copy the training error into a different vector diff --git a/src/descriptor_identifier/model/ModelLogRegressor.cpp b/src/descriptor_identifier/model/ModelLogRegressor.cpp index 6bec56f6..a7519adc 100644 --- a/src/descriptor_identifier/model/ModelLogRegressor.cpp +++ b/src/descriptor_identifier/model/ModelLogRegressor.cpp @@ -29,9 +29,12 @@ ModelLogRegressor::ModelLogRegressor( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : - ModelRegressor(prop_label, prop_unit, loss, feats, leave_out_inds) + ModelRegressor(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names) {} ModelLogRegressor::ModelLogRegressor(const std::string train_file) diff --git a/src/descriptor_identifier/model/ModelLogRegressor.hpp b/src/descriptor_identifier/model/ModelLogRegressor.hpp index 1f6bce56..da7f5e09 100644 --- a/src/descriptor_identifier/model/ModelLogRegressor.hpp +++ b/src/descriptor_identifier/model/ModelLogRegressor.hpp @@ -51,13 +51,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelLogRegressor( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_log_reg_init_train diff --git a/src/descriptor_identifier/model/ModelRegressor.cpp b/src/descriptor_identifier/model/ModelRegressor.cpp index fd7de70a..4d809f64 100644 --- a/src/descriptor_identifier/model/ModelRegressor.cpp +++ b/src/descriptor_identifier/model/ModelRegressor.cpp @@ -29,9 +29,12 @@ ModelRegressor::ModelRegressor( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + 
const std::vector<std::string> task_names ) : - Model(prop_label, prop_unit, loss, feats, leave_out_inds) + Model(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names) { double rmse = (*_loss)(_feats); } @@ -168,15 +171,22 @@ std::string ModelRegressor::error_summary_string(bool train) const return error_stream.str(); } -std::string ModelRegressor::coefs_header() const +std::string ModelRegressor::write_coefs() const { std::stringstream coef_head_stream; + int task_header_w = std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + coef_head_stream << "# Coefficients" << std::endl; - coef_head_stream << std::setw(10) << std::left << "# Task"; + coef_head_stream << std::setw(task_header_w + 2) << std::left << "# Task"; for(int cc = 0; cc < _coefs[0].size() - (!_fix_intercept); ++cc) + { coef_head_stream << std::setw(24) << " a" + std::to_string(cc); + } if(!_fix_intercept) { @@ -187,6 +197,16 @@ std::string ModelRegressor::coefs_header() const coef_head_stream << std::endl; } + for(int cc = 0; cc < _coefs.size(); ++cc) + { + coef_head_stream << std::setw(task_header_w) << std::left << "# " + _task_names[cc] << std::setw(2) << ", "; + for(auto& coeff : _coefs[cc]) + { + coef_head_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + } + coef_head_stream << "\n"; + } + return coef_head_stream.str(); } diff --git a/src/descriptor_identifier/model/ModelRegressor.hpp b/src/descriptor_identifier/model/ModelRegressor.hpp index af386356..80484280 100644 --- a/src/descriptor_identifier/model/ModelRegressor.hpp +++ b/src/descriptor_identifier/model/ModelRegressor.hpp @@ -50,13 +50,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelRegressor( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_reg_init_train @@ -195,7 +201,7 @@ public: /** * @brief Get the coefficients list header for output file */ - std::string coefs_header() const; + std::string write_coefs() const; /** * @brief Copy the training error into a different vector diff --git a/src/descriptor_identifier/solver/SISSOClassifier.cpp b/src/descriptor_identifier/solver/SISSOClassifier.cpp index 91f7dc92..28726599 100644 --- a/src/descriptor_identifier/solver/SISSOClassifier.cpp +++ b/src/descriptor_identifier/solver/SISSOClassifier.cpp @@ -35,7 +35,7 @@ SISSOClassifier::SISSOClassifier( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys + const std::vector<std::string> task_names ): SISSOSolver( "classification", @@ -52,7 +52,7 @@ SISSOClassifier::SISSOClassifier( n_models_store, sample_ids_train, sample_ids_test, - 
task_keys, + task_names, false ), _c(1000.0), @@ -310,7 +310,10 @@ void SISSOClassifier::l0_regularization(const int n_dim) _prop_unit, loss_function_util::copy(_loss), min_nodes[rr], - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ) ); } diff --git a/src/descriptor_identifier/solver/SISSOClassifier.hpp b/src/descriptor_identifier/solver/SISSOClassifier.hpp index bd854553..16ab12d0 100644 --- a/src/descriptor_identifier/solver/SISSOClassifier.hpp +++ b/src/descriptor_identifier/solver/SISSOClassifier.hpp @@ -67,7 +67,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( const std::shared_ptr<FeatureSpace> feat_space, @@ -82,7 +82,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys + const std::vector<std::string> task_names ); /** @@ -153,7 +153,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( std::shared_ptr<FeatureSpace> feat_space, @@ -169,7 +169,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ); // DocString: sisso_class_init_list @@ -189,7 +189,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( std::shared_ptr<FeatureSpace> feat_space, @@ -205,7 +205,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ); // DocString: sisso_class_models_py diff --git a/src/descriptor_identifier/solver/SISSOLogRegressor.cpp b/src/descriptor_identifier/solver/SISSOLogRegressor.cpp index ab36f0ed..9d502966 100644 --- a/src/descriptor_identifier/solver/SISSOLogRegressor.cpp +++ b/src/descriptor_identifier/solver/SISSOLogRegressor.cpp @@ -35,7 +35,7 @@ SISSOLogRegressor::SISSOLogRegressor( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): SISSORegressor( @@ -52,7 +52,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { @@ -106,7 +106,10 @@ void SISSOLogRegressor::add_models(const std::vector<std::vector<int>> indexes) _prop_unit, loss_function_util::copy(_loss), min_nodes.back(), - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ); _models.back().push_back(model); } diff --git 
a/src/descriptor_identifier/solver/SISSOLogRegressor.hpp b/src/descriptor_identifier/solver/SISSOLogRegressor.hpp index 2f8ce7d5..34c63511 100644 --- a/src/descriptor_identifier/solver/SISSOLogRegressor.hpp +++ b/src/descriptor_identifier/solver/SISSOLogRegressor.hpp @@ -57,7 +57,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -74,7 +74,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -121,7 +121,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -138,7 +138,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -159,7 +159,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -176,7 +176,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/descriptor_identifier/solver/SISSORegressor.cpp b/src/descriptor_identifier/solver/SISSORegressor.cpp index 7705db94..e6853bdc 100644 --- a/src/descriptor_identifier/solver/SISSORegressor.cpp +++ b/src/descriptor_identifier/solver/SISSORegressor.cpp @@ -35,7 +35,7 @@ SISSORegressor::SISSORegressor( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): SISSOSolver( @@ -53,7 +53,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} @@ -76,7 +76,10 @@ void SISSORegressor::add_models(const std::vector<std::vector<int>> indexes) _prop_unit, loss_function_util::copy(_loss), min_nodes.back(), - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ); _models.back().push_back(model); } diff --git a/src/descriptor_identifier/solver/SISSORegressor.hpp b/src/descriptor_identifier/solver/SISSORegressor.hpp index 897298ef..c7f8ff71 100644 --- a/src/descriptor_identifier/solver/SISSORegressor.hpp +++ b/src/descriptor_identifier/solver/SISSORegressor.hpp @@ -57,7 
+57,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSORegressor( @@ -74,7 +74,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -128,7 +128,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSORegressor( @@ -145,7 +145,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -166,7 +166,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSORegressor( @@ -183,7 +183,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/descriptor_identifier/solver/SISSOSolver.cpp b/src/descriptor_identifier/solver/SISSOSolver.cpp index 9039b0aa..33019c72 100644 --- a/src/descriptor_identifier/solver/SISSOSolver.cpp +++ b/src/descriptor_identifier/solver/SISSOSolver.cpp @@ -36,12 +36,12 @@ SISSOSolver::SISSOSolver( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): _sample_ids_train(sample_ids_train), _sample_ids_test(sample_ids_test), - _task_keys(task_keys), + _task_names(task_names), _task_sizes_train(task_sizes_train), _task_sizes_test(task_sizes_test), _leave_out_inds(leave_out_inds), diff --git a/src/descriptor_identifier/solver/SISSOSolver.hpp b/src/descriptor_identifier/solver/SISSOSolver.hpp index a7b279d6..939b8285 100644 --- a/src/descriptor_identifier/solver/SISSOSolver.hpp +++ b/src/descriptor_identifier/solver/SISSOSolver.hpp @@ -40,7 +40,7 @@ class SISSOSolver protected: const std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples const std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples - const std::vector<std::string> _task_keys; //!< Vector storing the ID of the task names + const std::vector<std::string> _task_names; //!< Vector storing the ID of the task names const std::vector<int> _task_sizes_train; //!< Number of training samples per task const 
std::vector<int> _task_sizes_test; //!< Number of testing samples per task @@ -77,7 +77,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A vector storing all sample ids for the training samples * @param sample_ids_test A vector storing all sample ids for the test samples - * @param task_keys A vector storing the ID of the task names + * @param task_names A vector storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -95,7 +95,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -175,7 +175,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -193,7 +193,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -214,7 +214,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -232,7 +232,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index 36565188..13f0183a 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -119,8 +119,8 @@ InputParser::InputParser(pt::ptree ip, std::string fn, std::shared_ptr<MPI_Inter { ++_n_samp; } - tasks["none"] = std::vector<int>(_n_samp); - std::iota(tasks["none"].begin(), tasks["none"].end(), 0); + tasks["all"] = std::vector<int>(_n_samp); + std::iota(tasks["all"].begin(), tasks["all"].end(), 0); } else { @@ -151,7 +151,6 @@ InputParser::InputParser(pt::ptree ip, std::string fn, std::shared_ptr<MPI_Inter int start = 0; for(auto& el : tasks) { - _task_keys.push_back(el.first); _task_sizes_test.push_back(static_cast<int>(std::round(leave_out_frac * el.second.size()))); _task_sizes_train.push_back(el.second.size() - _task_sizes_test.back()); @@ -269,6 +268,11 @@ void InputParser::generate_feature_space( int n_train_samp = 0; int n_samp_test = 0; + for(auto& task : tasks) + { + _task_names.push_back(task.first); + } + while (std::getline(data_stream, line)) { std::vector<std::string> split_line; @@ -289,6 +293,7 @@ void InputParser::generate_feature_space( n_samp_test = 0; for(auto& task : tasks) { + int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin(); for(int ii = 0; ii < task_ind; ++ii) { diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 6c72773a..21df1c8f 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ 
-50,7 +50,7 @@ class InputParser public: std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples - std::vector<std::string> _task_keys; //!< Vector storing the ID of the task names + std::vector<std::string> _task_names; //!< Vector storing the ID of the task names std::vector<std::string> _param_opset; //!< Vector containing all allowed operators strings for operators with free parameters std::vector<std::string> _opset; //!< Vector containing all allowed operators strings diff --git a/src/main.cpp b/src/main.cpp index bd1959f2..2f85c884 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -86,7 +86,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys, + ip._task_names, ip._fix_intercept ); sisso.fit(); @@ -124,7 +124,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys, + ip._task_names, ip._fix_intercept ); sisso.fit(); @@ -162,7 +162,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys + ip._task_names ); sisso.fit(); diff --git a/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp b/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp index 62907e4a..cbb996bc 100644 --- a/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp +++ b/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp @@ -1414,7 +1414,7 @@ void sisso::descriptor_identifier::registerSISSORegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_reg_init_arr@" ) ) @@ -1436,7 +1436,7 @@ void sisso::descriptor_identifier::registerSISSORegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_reg_init_list@" ) ) @@ -1467,7 +1467,7 @@ void sisso::descriptor_identifier::registerSISSOLogRegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), 
arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_log_reg_init_arr@" ) ) @@ -1489,7 +1489,7 @@ void sisso::descriptor_identifier::registerSISSOLogRegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_log_reg_init_list@" ) ) @@ -1503,13 +1503,13 @@ void sisso::descriptor_identifier::registerSISSOClassifier() "SISSOClassifier", "@DocString_cls_sisso_class@", init<std::shared_ptr<FeatureSpace>, std::string, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, py::list, py::list, py::list>( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names")), "@DocString_sisso_class_init_arr@" ) ) .def( init<std::shared_ptr<FeatureSpace>, std::string, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, py::list, py::list, py::list>( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names")), "@DocString_sisso_class_init_list@" ) ) diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp index 220a520a..4ae3b8ba 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp @@ -35,7 +35,7 @@ SISSOClassifier::SISSOClassifier( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ) : SISSOSolver( "classification", @@ -52,7 +52,7 @@ SISSOClassifier::SISSOClassifier( 
n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, false ), _c(100.0), @@ -76,7 +76,7 @@ SISSOClassifier::SISSOClassifier( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ) : SISSOSolver( "classification", @@ -93,7 +93,7 @@ SISSOClassifier::SISSOClassifier( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, false ), _c(100.0), diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp index f910cde5..53ccf1ac 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp @@ -35,7 +35,7 @@ SISSOLogRegressor::SISSOLogRegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSORegressor( @@ -52,7 +52,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { @@ -86,7 +86,7 @@ SISSOLogRegressor::SISSOLogRegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSORegressor( @@ -103,7 +103,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp index 6f9f8b03..4b023f75 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp @@ -35,7 +35,7 @@ SISSORegressor::SISSORegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSOSolver( @@ -53,7 +53,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} @@ -72,7 +72,7 @@ SISSORegressor::SISSORegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSOSolver( @@ -90,7 +90,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp index 4c5cb8e4..3d6ef29a 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp @@ -36,12 +36,12 @@ SISSOSolver::SISSOSolver( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : _sample_ids_train(python_conv_utils::from_list<std::string>(sample_ids_train)), _sample_ids_test(python_conv_utils::from_list<std::string>(sample_ids_test)), - _task_keys(python_conv_utils::from_list<std::string>(task_keys)), + _task_names(python_conv_utils::from_list<std::string>(task_names)), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), 
_leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), @@ -83,12 +83,12 @@ SISSOSolver::SISSOSolver( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : _sample_ids_train(python_conv_utils::from_list<std::string>(sample_ids_train)), _sample_ids_test(python_conv_utils::from_list<std::string>(sample_ids_test)), - _task_keys(python_conv_utils::from_list<std::string>(task_keys)), + _task_names(python_conv_utils::from_list<std::string>(task_names)), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), diff --git a/src/python/py_interface/get_solver.py b/src/python/py_interface/get_solver.py index 8e61c0a6..b5dea564 100644 --- a/src/python/py_interface/get_solver.py +++ b/src/python/py_interface/get_solver.py @@ -153,7 +153,7 @@ def get_fs_solver( leave_out_inds, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) = read_csv( df, prop_key, @@ -178,7 +178,7 @@ def get_fs_solver( if loss_type.lower() == "regression": print(sample_ids_train) print(sample_ids_test) - print(task_keys) + print(task_names) solver = SISSORegressor( fs, prop_label, @@ -193,7 +193,7 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) elif loss_type.lower() == "log_regression": solver = SISSOLogRegressor( @@ -210,7 +210,7 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) else: solver = SISSOClassifier( @@ -227,6 +227,6 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) return fs, solver diff --git a/src/python/py_interface/import_dataframe.py b/src/python/py_interface/import_dataframe.py index d4dbdd91..a80a1a71 100644 --- a/src/python/py_interface/import_dataframe.py +++ b/src/python/py_interface/import_dataframe.py @@ -138,7 +138,7 @@ def read_csv( - leave_out_inds (list): Indices to use as the test set - sample_ids_train (list): List of sample id's for the training data - sample_ids_test (list): List of sample id's for the test data - - task_keys (list): List of all task id names + - task_names (list): List of all task id names """ if not max_rung: raise ValueError("Maximum rung for the calculation is not defined.") @@ -152,14 +152,14 @@ def read_csv( if task_key: task, _, _ = extract_col(df, task_key) else: - task = np.zeros(prop.shape, dtype=np.int64).astype(str) + task = np.array(["all"] * len(prop)) # Map out which index belongs to which task and get the size of each task task_map = {} - task_keys, task_sizes = np.unique(task, return_counts=True) + task_names, task_sizes = np.unique(task, return_counts=True) task_sizes = task_sizes.astype(np.int32) - for kk, key in enumerate(task_keys): + for kk, key in enumerate(task_names): task_map[key] = np.where(task == key)[0].astype(np.int32) assert task_sizes[kk] == len(task_map[key]) @@ -172,21 +172,21 @@ def read_csv( if leave_out_frac > 0.0: task_sizes_test = [int(math.ceil(ts * leave_out_frac)) for ts in task_sizes] - for kk, key in enumerate(task_keys): + for kk, key in enumerate(task_names): leave_out_inds += list( np.random.choice(task_map[key], task_sizes_test[kk], False).astype( np.int32 ) ) else: - task_sizes_test = list(np.zeros(len(task_keys), dtype=np.int32)) + task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32)) else: assert 
(leave_out_frac == 0.0) or ( int(round(len(df) * leave_out_frac)) == len(leave_out_inds) ) - task_sizes_test = list(np.zeros(len(task_keys), dtype=np.int32)) - for kk, key in enumerate(task_keys): + task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32)) + for kk, key in enumerate(task_names): left_out = [ind for ind in leave_out_inds if ind in task_map[key]] task_sizes_test[kk] = len(left_out) @@ -231,5 +231,5 @@ def read_csv( leave_out_inds, list(df.index[train_inds].to_numpy().astype(str)), list(df.index[leave_out_inds].to_numpy().astype(str)), - list(task_keys), + list(task_names), ) diff --git a/tests/googletest/descriptor_identification/model/test_model_classifier.cc b/tests/googletest/descriptor_identification/model/test_model_classifier.cc index 405b7bdf..71086ca7 100644 --- a/tests/googletest/descriptor_identification/model/test_model_classifier.cc +++ b/tests/googletest/descriptor_identification/model/test_model_classifier.cc @@ -42,16 +42,24 @@ namespace _task_sizes_test, 1 ); + + _task_keys = {"all"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}; + _sample_ids_test = {"20", "21"}; } - std::vector<int> _leave_out_inds; - std::vector<int> _task_sizes_train; - std::vector<int> _task_sizes_test; + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; std::vector<double> _prop; std::vector<double> _prop_test; std::vector<model_node_ptr> _features; std::shared_ptr<LossFunction> _loss; + + std::vector<int> _task_sizes_train; + std::vector<int> _task_sizes_test; + std::vector<int> _leave_out_inds; }; TEST_F(ModelClassifierTests, NodesTest) @@ -61,7 +69,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "[A]"); EXPECT_EQ(model.n_convex_overlap_train(), 0); @@ -96,7 +107,7 @@ namespace EXPECT_EQ(model.n_dim(), 1); EXPECT_EQ(model.prop_unit(), Unit("m")); - // boost::filesystem::remove("train_class_mods.dat"); - // boost::filesystem::remove("test_class_mods.dat"); + boost::filesystem::remove("train_class_mods.dat"); + boost::filesystem::remove("test_class_mods.dat"); } } diff --git a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc index 4c1e43ec..07f37869 100644 --- a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc @@ -42,7 +42,15 @@ namespace std::transform(value_1.begin(), value_1.end(), value_2.begin(), _prop.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); std::transform(test_value_1.begin(), test_value_1.end(), test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); + + _task_keys = {"all"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "6", "7", "8", "9", "10"}; + _sample_ids_test = {"5", "11"}; } + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; + std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; std::vector<int> _task_sizes_test; @@ -70,7 +78,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); 
EXPECT_STREQ(model.toString().c_str(), "exp(c0) * (A)^a0 * (B)^a1"); EXPECT_LT(model.rmse(), 1e-10); @@ -156,7 +167,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "(A)^a0 * (B)^a1"); diff --git a/tests/googletest/descriptor_identification/model/test_model_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_regressor.cc index 031ddd4d..d4eb9012 100644 --- a/tests/googletest/descriptor_identification/model/test_model_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_regressor.cc @@ -45,7 +45,15 @@ namespace std::transform(test_value_1.begin(), test_value_1.begin() + 1, test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return 0.001 + v1 + v2;}); std::transform(test_value_1.begin() + 1, test_value_1.end(), test_value_2.begin() + 1, _prop_test.begin() + 1, [](double v1, double v2){return -6.5 + 1.25 * v1 - 0.4 * v2;}); + + _task_keys = {"task_1", "task_2"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "6", "7", "8", "9", "10"}; + _sample_ids_test = {"5", "11"}; } + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; + std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; std::vector<int> _task_sizes_test; @@ -73,7 +81,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "c0 + a0 * A + a1 * B"); @@ -169,7 +180,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "a0 * A + a1 * B"); diff --git a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc index c44e494c..431e22aa 100644 --- a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc +++ b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc @@ -96,6 +96,16 @@ namespace _allowed_ops = {"div", "add", "mult", "sub"}; _allowed_param_ops = {}; + _task_keys = {"all"}; + for(int ii = 10; ii < 100; ++ii) + { + _sample_ids_train.push_back(std::to_string(ii)); + } + + for(int ii = 0; ii < 10; ++ii) + { + _sample_ids_test.push_back(std::to_string(ii)); + } } std::vector<std::string> _sample_ids_train; std::vector<std::string> _sample_ids_test; diff --git a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc index 48a02d57..788b73d1 100644 --- a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc +++ b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc @@ -116,6 +116,17 @@ namespace _allowed_ops = {"div", "sq", "cb", "sub"}; _allowed_param_ops = {}; + + _task_keys = {"task_1", "task_2"}; + for(int ii = 10; ii < 100; ++ii) + { + _sample_ids_train.push_back(std::to_string(ii)); + } + + for(int ii = 0; ii < 10; ++ii) + { + _sample_ids_test.push_back(std::to_string(ii)); + } } std::vector<std::string> _sample_ids_train; std::vector<std::string> _sample_ids_test; -- GitLab
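Reviewer note: the backwards compatibility mentioned in the commit message comes from the column-count check added to Model::populate_model above. A data row is treated as carrying the new sample-ID column only when it has more than n_dim + 2 fields (property, estimated property, and one column per feature); otherwise the row index is used as the ID, so model files written before this patch still load. The snippet below is a minimal standalone sketch of that rule, not part of the patch: the split_fields helper and the hard-coded example rows are hypothetical stand-ins for str_utils::split_string_trim and real model-file lines.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for str_utils::split_string_trim: split on ',' and trim blanks.
std::vector<std::string> split_fields(const std::string& line)
{
    std::vector<std::string> fields;
    std::stringstream ss(line);
    std::string field;
    while(std::getline(ss, field, ','))
    {
        const auto start = field.find_first_not_of(" \t");
        const auto end = field.find_last_not_of(" \t");
        fields.push_back(start == std::string::npos ? "" : field.substr(start, end - start + 1));
    }
    return fields;
}

int main()
{
    const int n_dim = 2;  // number of features in the model

    // Old-format row: property, estimated property, one value per feature
    const std::string old_row = "1.25, 1.24, 0.5, 3.0";
    // New-format row: sample ID, property, estimated property, feature values
    const std::string new_row = "mat_042, 1.25, 1.24, 0.5, 3.0";

    for(const std::string& row : {old_row, new_row})
    {
        const std::vector<std::string> split_line = split_fields(row);

        // Same rule as the patched populate_model: an extra column means a sample ID is present
        const bool with_samp_id = static_cast<int>(split_line.size()) > n_dim + 2;

        const std::string sample_id = with_samp_id ? split_line[0] : "(row index used as ID)";
        const double prop = std::stod(split_line[with_samp_id]);
        const double feat_0 = std::stod(split_line[2 + 0 + with_samp_id]);

        std::cout << sample_id << ": prop = " << prop << ", feature 0 = " << feat_0 << "\n";
    }
    return 0;
}

Compiling and running this (e.g. g++ -std=c++11 check_row.cc && ./a.out, with a file name of your choosing) prints the parsed property and first feature value for both row layouts, which mirrors how the patched reader keeps files with and without the sample-ID column readable.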