diff --git a/src/descriptor_identifier/model/Model.cpp b/src/descriptor_identifier/model/Model.cpp index 7ebc8e61c234a177930aeaac144ff9177c2e17ef..a0ff61393e7f7988d389599f0ddcbc85075ff323 100644 --- a/src/descriptor_identifier/model/Model.cpp +++ b/src/descriptor_identifier/model/Model.cpp @@ -26,8 +26,14 @@ Model::Model( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : + _sample_ids_train(sample_ids_train), + _sample_ids_test(sample_ids_test), + _task_names(task_names), _n_samp_train(feats[0]->n_samp()), _n_samp_test(feats[0]->n_samp_test()), _n_dim(feats.size()), @@ -161,17 +167,7 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << "# Property Label: $" << str_utils::latexify(_prop_label) << "$; Unit of the Property: " << _prop_unit.toString() << std::endl; out_file_stream << error_summary_string(train); - out_file_stream << coefs_header(); - - for(int cc = 0; cc < _coefs.size(); ++cc) - { - out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", "; - for(auto& coeff : _coefs[cc]) - { - out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; - } - out_file_stream << "\n"; - } + out_file_stream << write_coefs(); out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl; for(int ff = 0; ff < _feats.size(); ++ff) @@ -185,13 +181,17 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << boost::algorithm::join(_feats[ff]->get_x_in_expr_list(), ",") << std::endl; } + int task_header_w = std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); out_file_stream << "# Number of Samples Per Task" << std::endl; if(train) { - out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_train" << std::endl; + out_file_stream << std::setw(task_header_w) << std::left << "# Task" << std::setw(2) << ", " << std::setw(24) << "n_mats_train" << std::endl; for(int tt = 0; tt < task_sizes_train_vec.size(); ++tt) { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(task_header_w) << "# " + _task_names[tt] << std::setw(2) << ", "; out_file_stream << std::left << std::setw(22) << task_sizes_train_vec[tt] << std::endl; } } @@ -200,7 +200,7 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << std::setw(10) << std::left << "# Task," << std::setw(24) << "n_mats_test" << std::endl; for(int tt = 0; tt < task_sizes_test_vec.size(); ++tt) { - out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", "; + out_file_stream << std::left << std::setw(10) << "# " + _task_names[tt] + ", "; out_file_stream << std::left << std::setw(22) << task_sizes_test_vec[tt] << std::endl; } @@ -212,7 +212,23 @@ void Model::to_file(const std::string filename, const bool train) const out_file_stream << " ]" << std::endl; } - out_file_stream << "\n" << std::setw(22) << std::left << "# Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; + int max_sample_id_len = 12; + if(train) + { + max_sample_id_len = std::max( + max_sample_id_len, + static_cast<int>(std::max_element(_sample_ids_train.begin(), _sample_ids_train.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + } + else + { + max_sample_id_len = std::max( + max_sample_id_len, + static_cast<int>(std::max_element(_sample_ids_test.begin(), _sample_ids_test.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + } + out_file_stream << "\n" << std::setw(max_sample_id_len) << std::left << "# Sample ID" << std::setw(2) << ", "; + out_file_stream << std::setw(22) << std::left << "Property Value" << std::setw(2) << ", " << std::setw(22) << " Property Value (EST)"; for(int ff = 0; ff < _feats.size(); ++ff) { out_file_stream << std::setw(2) << ", " << std::setw(22) << " Feature " + std::to_string(ff) + " Value"; @@ -223,6 +239,7 @@ void Model::to_file(const std::string filename, const bool train) const { for(int ss = 0; ss < _n_samp_train; ++ss) { + out_file_stream << std::left << std::setw(max_sample_id_len) << _sample_ids_train[ss] << std::setw(2) << ", "; out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << prop_train_vec[ss] << std::setw(2) << ", "; out_file_stream << std::setw(22) << prop_train_est_vec[ss]; for(int ff = 0; ff < _n_dim; ++ff) @@ -236,6 +253,7 @@ void Model::to_file(const std::string filename, const bool train) const { for(int ss = 0; ss < _n_samp_test; ++ss) { + out_file_stream << std::left << std::setw(max_sample_id_len) << _sample_ids_test[ss] << std::setw(2) << ", "; out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << prop_test_vec[ss] << std::setw(2) << ", "; out_file_stream << std::setw(22) << prop_test_est_vec[ss]; for(int ff = 0; ff < _n_dim; ++ff) @@ -435,11 +453,16 @@ void Model::populate_model(const std::string train_filename, const std::string t { ++n_task; split_line = str_utils::split_string_trim(line); + _task_names.push_back(split_line[0]); _n_samp_train += std::stoi(split_line[1]); task_sizes_train.push_back(std::stoi(split_line[1])); if(with_test) { split_line = str_utils::split_string_trim(test_line); + if(split_line[0].compare(_task_names.back()) != 0) + { + throw std::logic_error("The task names for the test and train files are not in the same order."); + } _n_samp_test += std::stoi(split_line[1]); task_sizes_test.push_back(std::stoi(split_line[1])); std::getline(test_file_stream, test_line); @@ -454,6 +477,9 @@ void Model::populate_model(const std::string train_filename, const std::string t std::vector<double> prop_train(_n_samp_train); std::vector<double> prop_test(_n_samp_test); + _sample_ids_train.resize(_n_samp_train); + _sample_ids_test.resize(_n_samp_test); + if(with_test) { split_line = str_utils::split_string_trim(test_line, "[]"); @@ -479,16 +505,28 @@ void Model::populate_model(const std::string train_filename, const std::string t std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(_n_samp_train, 0.0)); std::vector<std::vector<double>> feat_test_vals(n_dim, std::vector<double>(_n_samp_test, 0.0)); + + bool with_samp_id = false; for(int ns = 0; ns < _n_samp_train; ++ns) { std::getline(train_file_stream, line); split_line = str_utils::split_string_trim(line); + if((split_line.size() > _n_dim + 2)) + { + with_samp_id = true; + _sample_ids_train[ns] = split_line[0]; + } + else + { + with_samp_id = false; + _sample_ids_train[ns] = std::to_string(ns); + } - prop_train[ns] = std::stod(split_line[0]); + prop_train[ns] = std::stod(split_line[with_samp_id]); for(int nf = 0; nf < n_dim; ++nf) { - feat_vals[nf][ns] = std::stod(split_line[2 + nf]); + feat_vals[nf][ns] = std::stod(split_line[2 + nf + with_samp_id]); } } for(int ns = 0; ns < _n_samp_test; ++ns) @@ -496,11 +534,22 @@ void Model::populate_model(const std::string train_filename, const std::string t std::getline(test_file_stream, test_line); split_line = str_utils::split_string_trim(test_line); - prop_test[ns] = std::stod(split_line[0]); + if((split_line.size() > _n_dim + 2)) + { + with_samp_id = true; + _sample_ids_test[ns] = split_line[0]; + } + else + { + with_samp_id = false; + _sample_ids_test[ns] = std::to_string(ns); + } + + prop_test[ns] = std::stod(split_line[with_samp_id]); for(int nf = 0; nf < n_dim; ++nf) { - feat_test_vals[nf][ns] = std::stod(split_line[2 + nf]); + feat_test_vals[nf][ns] = std::stod(split_line[2 + nf + with_samp_id]); } } train_file_stream.close(); diff --git a/src/descriptor_identifier/model/Model.hpp b/src/descriptor_identifier/model/Model.hpp index 61a1e0c1d4898f1285a1c769d507c97fc15ece4a..3940ba766a3b2fd499ae2b459974431b78949bc3 100644 --- a/src/descriptor_identifier/model/Model.hpp +++ b/src/descriptor_identifier/model/Model.hpp @@ -42,6 +42,10 @@ class Model { protected: + std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples + std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples + std::vector<std::string> _task_names; //!< Vector storing the ID of the task names + int _n_samp_train; //!< The number of samples per feature int _n_samp_test; //!< The number of test samples per feature int _n_dim; //!< The number of dimensions of the model @@ -72,13 +76,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ Model( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); /** @@ -289,7 +299,7 @@ public: /** * @brief Get the coefficients list header for output file */ - virtual std::string coefs_header() const = 0; + virtual std::string write_coefs() const = 0; // DocString: model_fix_intercept /** diff --git a/src/descriptor_identifier/model/ModelClassifier.cpp b/src/descriptor_identifier/model/ModelClassifier.cpp index 480c04fcb52aa22570e93daaa09ddcb496f4ea4a..74e75873eb0e830032fd5630d7d9d7bc3c371014 100644 --- a/src/descriptor_identifier/model/ModelClassifier.cpp +++ b/src/descriptor_identifier/model/ModelClassifier.cpp @@ -26,9 +26,12 @@ ModelClassifier::ModelClassifier( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : - Model(prop_label, prop_unit, loss, feats, leave_out_inds), + Model(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names), _train_n_convex_overlap(0), _test_n_convex_overlap(0), _n_class(loss->n_class()) @@ -209,11 +212,16 @@ std::string ModelClassifier::error_summary_string(bool train) const return error_stream.str(); } -std::string ModelClassifier::coefs_header() const +std::string ModelClassifier::write_coefs() const { std::stringstream coef_head_stream; - coef_head_stream << "# Plane Divider" << std::endl; - coef_head_stream << std::setw(10) << std::left << "# Task"; + coef_head_stream << "# Decision Boundaries" << std::endl; + int n_db = _n_class * (_n_class - 1) / 2; + int task_header_w = 1 + static_cast<int>(std::floor(std::log10(n_db))) + std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + coef_head_stream << std::setw(task_header_w + 2) << std::left << "# Task"; for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) { @@ -221,5 +229,17 @@ std::string ModelClassifier::coefs_header() const } coef_head_stream << " b" << std::endl; + for(int tt = 0; tt < _task_names.size(); ++tt) + { + for(int db = 0; db < n_db; ++db) + { + coef_head_stream << std::setw(task_header_w) << std::left << "# " + _task_names[tt] + "_" + std::to_string(db) << std::setw(2) << ", "; + for(auto& coeff : _coefs[tt * n_db + db]) + { + coef_head_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + } + coef_head_stream << "\n"; + } + } return coef_head_stream.str(); } diff --git a/src/descriptor_identifier/model/ModelClassifier.hpp b/src/descriptor_identifier/model/ModelClassifier.hpp index fe29e171c52f5929fe3177b5c85effa8ca3d5b31..28ef2d60ffc6e4eaae3e9473f11fab4a1d670317 100644 --- a/src/descriptor_identifier/model/ModelClassifier.hpp +++ b/src/descriptor_identifier/model/ModelClassifier.hpp @@ -53,13 +53,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelClassifier( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_class_init_train @@ -195,7 +201,7 @@ public: /** * @brief Get the coefficients list header for output file */ - std::string coefs_header() const; + std::string write_coefs() const; /** * @brief Copy the training error into a different vector diff --git a/src/descriptor_identifier/model/ModelLogRegressor.cpp b/src/descriptor_identifier/model/ModelLogRegressor.cpp index 6bec56f6c4afbbccfafbdf7b1ee7f6d0483f1ee5..a7519adca0b6d4e67f16854e9cddab98641f618d 100644 --- a/src/descriptor_identifier/model/ModelLogRegressor.cpp +++ b/src/descriptor_identifier/model/ModelLogRegressor.cpp @@ -29,9 +29,12 @@ ModelLogRegressor::ModelLogRegressor( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : - ModelRegressor(prop_label, prop_unit, loss, feats, leave_out_inds) + ModelRegressor(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names) {} ModelLogRegressor::ModelLogRegressor(const std::string train_file) diff --git a/src/descriptor_identifier/model/ModelLogRegressor.hpp b/src/descriptor_identifier/model/ModelLogRegressor.hpp index 1f6bce56ae9d7ade66a0d5c8792bfcdf18679494..da7f5e097681e11adf2358a3824065f91e7d492d 100644 --- a/src/descriptor_identifier/model/ModelLogRegressor.hpp +++ b/src/descriptor_identifier/model/ModelLogRegressor.hpp @@ -51,13 +51,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelLogRegressor( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_log_reg_init_train diff --git a/src/descriptor_identifier/model/ModelRegressor.cpp b/src/descriptor_identifier/model/ModelRegressor.cpp index fd7de70aa599b7a4d44321460bae1403aa49d87d..4d809f64dbb905c2cdbdab69e542f8d987d4d6d9 100644 --- a/src/descriptor_identifier/model/ModelRegressor.cpp +++ b/src/descriptor_identifier/model/ModelRegressor.cpp @@ -29,9 +29,12 @@ ModelRegressor::ModelRegressor( const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ) : - Model(prop_label, prop_unit, loss, feats, leave_out_inds) + Model(prop_label, prop_unit, loss, feats, leave_out_inds, sample_ids_train, sample_ids_test, task_names) { double rmse = (*_loss)(_feats); } @@ -168,15 +171,22 @@ std::string ModelRegressor::error_summary_string(bool train) const return error_stream.str(); } -std::string ModelRegressor::coefs_header() const +std::string ModelRegressor::write_coefs() const { std::stringstream coef_head_stream; + int task_header_w = std::max( + 6, + static_cast<int>(std::max_element(_task_names.begin(), _task_names.end(), [](std::string s1, std::string s2){return s1.size() <= s2.size();})->size()) + ); + coef_head_stream << "# Coefficients" << std::endl; - coef_head_stream << std::setw(10) << std::left << "# Task"; + coef_head_stream << std::setw(task_header_w + 2) << std::left << "# Task"; for(int cc = 0; cc < _coefs[0].size() - (!_fix_intercept); ++cc) + { coef_head_stream << std::setw(24) << " a" + std::to_string(cc); + } if(!_fix_intercept) { @@ -187,6 +197,16 @@ std::string ModelRegressor::coefs_header() const coef_head_stream << std::endl; } + for(int cc = 0; cc < _coefs.size(); ++cc) + { + coef_head_stream << std::setw(task_header_w) << std::left << "# " + _task_names[cc] << std::setw(2) << ", "; + for(auto& coeff : _coefs[cc]) + { + coef_head_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", "; + } + coef_head_stream << "\n"; + } + return coef_head_stream.str(); } diff --git a/src/descriptor_identifier/model/ModelRegressor.hpp b/src/descriptor_identifier/model/ModelRegressor.hpp index af3863566bb93f56be37139d6b86f46ce7870e43..8048428075d16fecfff9ab3cdb76f9d955b77043 100644 --- a/src/descriptor_identifier/model/ModelRegressor.hpp +++ b/src/descriptor_identifier/model/ModelRegressor.hpp @@ -50,13 +50,19 @@ public: * @param loss The LossFunction used to calculate the model * @param feats The features of the model * @param leave_out_inds The indexes of the samples for the test set + * @param sample_ids_train A vector storing all sample ids for the training samples + * @param sample_ids_test A vector storing all sample ids for the test samples + * @param task_names A vector storing the ID of the task names */ ModelRegressor( const std::string prop_label, const Unit prop_unit, const std::shared_ptr<LossFunction> loss, const std::vector<model_node_ptr> feats, - const std::vector<int> leave_out_inds + const std::vector<int> leave_out_inds, + const std::vector<std::string> sample_ids_train, + const std::vector<std::string> sample_ids_test, + const std::vector<std::string> task_names ); // DocString: model_reg_init_train @@ -195,7 +201,7 @@ public: /** * @brief Get the coefficients list header for output file */ - std::string coefs_header() const; + std::string write_coefs() const; /** * @brief Copy the training error into a different vector diff --git a/src/descriptor_identifier/solver/SISSOClassifier.cpp b/src/descriptor_identifier/solver/SISSOClassifier.cpp index 91f7dc923d964c7d75198676bcd2faea8c4d377f..2872659935b56823a59cddafb4e595c9a6d38a7f 100644 --- a/src/descriptor_identifier/solver/SISSOClassifier.cpp +++ b/src/descriptor_identifier/solver/SISSOClassifier.cpp @@ -35,7 +35,7 @@ SISSOClassifier::SISSOClassifier( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys + const std::vector<std::string> task_names ): SISSOSolver( "classification", @@ -52,7 +52,7 @@ SISSOClassifier::SISSOClassifier( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, false ), _c(1000.0), @@ -310,7 +310,10 @@ void SISSOClassifier::l0_regularization(const int n_dim) _prop_unit, loss_function_util::copy(_loss), min_nodes[rr], - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ) ); } diff --git a/src/descriptor_identifier/solver/SISSOClassifier.hpp b/src/descriptor_identifier/solver/SISSOClassifier.hpp index bd85455361a8b7ca145844e4cb326d01972846eb..16ab12d0022799f759f197ddd5a77c9654f5cddc 100644 --- a/src/descriptor_identifier/solver/SISSOClassifier.hpp +++ b/src/descriptor_identifier/solver/SISSOClassifier.hpp @@ -67,7 +67,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( const std::shared_ptr<FeatureSpace> feat_space, @@ -82,7 +82,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys + const std::vector<std::string> task_names ); /** @@ -153,7 +153,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( std::shared_ptr<FeatureSpace> feat_space, @@ -169,7 +169,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ); // DocString: sisso_class_init_list @@ -189,7 +189,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names */ SISSOClassifier( std::shared_ptr<FeatureSpace> feat_space, @@ -205,7 +205,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ); // DocString: sisso_class_models_py diff --git a/src/descriptor_identifier/solver/SISSOLogRegressor.cpp b/src/descriptor_identifier/solver/SISSOLogRegressor.cpp index ab36f0edc9411a0dcfd35f59cd4d925ab88ac15f..9d5029662ee0b1403c0258e9ae87e2d3c7bede5c 100644 --- a/src/descriptor_identifier/solver/SISSOLogRegressor.cpp +++ b/src/descriptor_identifier/solver/SISSOLogRegressor.cpp @@ -35,7 +35,7 @@ SISSOLogRegressor::SISSOLogRegressor( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): SISSORegressor( @@ -52,7 +52,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { @@ -106,7 +106,10 @@ void SISSOLogRegressor::add_models(const std::vector<std::vector<int>> indexes) _prop_unit, loss_function_util::copy(_loss), min_nodes.back(), - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ); _models.back().push_back(model); } diff --git a/src/descriptor_identifier/solver/SISSOLogRegressor.hpp b/src/descriptor_identifier/solver/SISSOLogRegressor.hpp index 2f8ce7d5e646984dcb97bb750fa4be5d66e29308..34c63511f94cf4fab4e89f1aef933e567e356c26 100644 --- a/src/descriptor_identifier/solver/SISSOLogRegressor.hpp +++ b/src/descriptor_identifier/solver/SISSOLogRegressor.hpp @@ -57,7 +57,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -74,7 +74,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -121,7 +121,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -138,7 +138,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -159,7 +159,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSOLogRegressor( @@ -176,7 +176,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/descriptor_identifier/solver/SISSORegressor.cpp b/src/descriptor_identifier/solver/SISSORegressor.cpp index 7705db94dfb2af2c52e351de4f8ad23231787b54..e6853bdc93cfc67a99e00894c57b7d70b86b5c12 100644 --- a/src/descriptor_identifier/solver/SISSORegressor.cpp +++ b/src/descriptor_identifier/solver/SISSORegressor.cpp @@ -35,7 +35,7 @@ SISSORegressor::SISSORegressor( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): SISSOSolver( @@ -53,7 +53,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} @@ -76,7 +76,10 @@ void SISSORegressor::add_models(const std::vector<std::vector<int>> indexes) _prop_unit, loss_function_util::copy(_loss), min_nodes.back(), - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_names ); _models.back().push_back(model); } diff --git a/src/descriptor_identifier/solver/SISSORegressor.hpp b/src/descriptor_identifier/solver/SISSORegressor.hpp index 897298ef6abeaacf92ae5ff63c118d1a9349928f..c7f8ff71040e82a153075963564be1e687e665d9 100644 --- a/src/descriptor_identifier/solver/SISSORegressor.hpp +++ b/src/descriptor_identifier/solver/SISSORegressor.hpp @@ -57,7 +57,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSORegressor( @@ -74,7 +74,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -128,7 +128,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSORegressor( @@ -145,7 +145,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -166,7 +166,7 @@ public: * @param n_models_store (int) The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept (bool) If true the bias term is fixed at 0 */ SISSORegressor( @@ -183,7 +183,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/descriptor_identifier/solver/SISSOSolver.cpp b/src/descriptor_identifier/solver/SISSOSolver.cpp index 9039b0aa11ffabe4756db1334b67b9682fe88e6e..33019c72e0fd554c0cf54d1864cb90b533778017 100644 --- a/src/descriptor_identifier/solver/SISSOSolver.cpp +++ b/src/descriptor_identifier/solver/SISSOSolver.cpp @@ -36,12 +36,12 @@ SISSOSolver::SISSOSolver( const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept ): _sample_ids_train(sample_ids_train), _sample_ids_test(sample_ids_test), - _task_keys(task_keys), + _task_names(task_names), _task_sizes_train(task_sizes_train), _task_sizes_test(task_sizes_test), _leave_out_inds(leave_out_inds), diff --git a/src/descriptor_identifier/solver/SISSOSolver.hpp b/src/descriptor_identifier/solver/SISSOSolver.hpp index a7b279d68e51554be0302be9677c784c81b8f5e2..939b8285c53d5688eb05f3c0c02f0cfd8f26cc9d 100644 --- a/src/descriptor_identifier/solver/SISSOSolver.hpp +++ b/src/descriptor_identifier/solver/SISSOSolver.hpp @@ -40,7 +40,7 @@ class SISSOSolver protected: const std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples const std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples - const std::vector<std::string> _task_keys; //!< Vector storing the ID of the task names + const std::vector<std::string> _task_names; //!< Vector storing the ID of the task names const std::vector<int> _task_sizes_train; //!< Number of training samples per task const std::vector<int> _task_sizes_test; //!< Number of testing samples per task @@ -77,7 +77,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A vector storing all sample ids for the training samples * @param sample_ids_test A vector storing all sample ids for the test samples - * @param task_keys A vector storing the ID of the task names + * @param task_names A vector storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -95,7 +95,7 @@ public: const int n_models_store, const std::vector<std::string> sample_ids_train, const std::vector<std::string> sample_ids_test, - const std::vector<std::string> task_keys, + const std::vector<std::string> task_names, const bool fix_intercept=false ); @@ -175,7 +175,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -193,7 +193,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); @@ -214,7 +214,7 @@ public: * @param n_models_store The number of models to output to files * @param sample_ids_train A list storing all sample ids for the training samples * @param sample_ids_test A list storing all sample ids for the test samples - * @param task_keys A list storing the ID of the task names + * @param task_names A list storing the ID of the task names * @param fix_intrecept If true the bias term is fixed at 0 */ SISSOSolver( @@ -232,7 +232,7 @@ public: int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept=false ); diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index 36565188b1123334653ed623cd45b72533d7ee6a..13f0183a00ad4f58694e35ee4eba711dcd5694ab 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -119,8 +119,8 @@ InputParser::InputParser(pt::ptree ip, std::string fn, std::shared_ptr<MPI_Inter { ++_n_samp; } - tasks["none"] = std::vector<int>(_n_samp); - std::iota(tasks["none"].begin(), tasks["none"].end(), 0); + tasks["all"] = std::vector<int>(_n_samp); + std::iota(tasks["all"].begin(), tasks["all"].end(), 0); } else { @@ -151,7 +151,6 @@ InputParser::InputParser(pt::ptree ip, std::string fn, std::shared_ptr<MPI_Inter int start = 0; for(auto& el : tasks) { - _task_keys.push_back(el.first); _task_sizes_test.push_back(static_cast<int>(std::round(leave_out_frac * el.second.size()))); _task_sizes_train.push_back(el.second.size() - _task_sizes_test.back()); @@ -269,6 +268,11 @@ void InputParser::generate_feature_space( int n_train_samp = 0; int n_samp_test = 0; + for(auto& task : tasks) + { + _task_names.push_back(task.first); + } + while (std::getline(data_stream, line)) { std::vector<std::string> split_line; @@ -289,6 +293,7 @@ void InputParser::generate_feature_space( n_samp_test = 0; for(auto& task : tasks) { + int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin(); for(int ii = 0; ii < task_ind; ++ii) { diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 6c72773a0aff272e450c4aa2cf4cc3d3c1b2458c..21df1c8fdafa7a1add0e820d5fdd3fd754e08cab 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ -50,7 +50,7 @@ class InputParser public: std::vector<std::string> _sample_ids_train; //!< Vector storing all sample ids for the training samples std::vector<std::string> _sample_ids_test; //!< Vector storing all sample ids for the test samples - std::vector<std::string> _task_keys; //!< Vector storing the ID of the task names + std::vector<std::string> _task_names; //!< Vector storing the ID of the task names std::vector<std::string> _param_opset; //!< Vector containing all allowed operators strings for operators with free parameters std::vector<std::string> _opset; //!< Vector containing all allowed operators strings diff --git a/src/main.cpp b/src/main.cpp index bd1959f21f6e95d1661dbdc0d4a26937bfbef01b..2f85c88412ca47ed5dbea7c0e9f5a2b7db173a47 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -86,7 +86,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys, + ip._task_names, ip._fix_intercept ); sisso.fit(); @@ -124,7 +124,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys, + ip._task_names, ip._fix_intercept ); sisso.fit(); @@ -162,7 +162,7 @@ int main(int argc, char const *argv[]) ip._n_models_store, ip._sample_ids_train, ip._sample_ids_test, - ip._task_keys + ip._task_names ); sisso.fit(); diff --git a/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp b/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp index 62907e4aff5ee75d24efc2acbd9b9cc17c670864..cbb996bc0f41cdd1205505fa21c49b45a24cbd1f 100644 --- a/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp +++ b/src/python/py_binding_cpp_def/bindings_docstring_keyed.cpp @@ -1414,7 +1414,7 @@ void sisso::descriptor_identifier::registerSISSORegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_reg_init_arr@" ) ) @@ -1436,7 +1436,7 @@ void sisso::descriptor_identifier::registerSISSORegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_reg_init_list@" ) ) @@ -1467,7 +1467,7 @@ void sisso::descriptor_identifier::registerSISSOLogRegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_log_reg_init_arr@" ) ) @@ -1489,7 +1489,7 @@ void sisso::descriptor_identifier::registerSISSOLogRegressor() py::list, optional<bool> >( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys"), arg("fix_intercept")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names"), arg("fix_intercept")), "@DocString_sisso_log_reg_init_list@" ) ) @@ -1503,13 +1503,13 @@ void sisso::descriptor_identifier::registerSISSOClassifier() "SISSOClassifier", "@DocString_cls_sisso_class@", init<std::shared_ptr<FeatureSpace>, std::string, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, py::list, py::list, py::list>( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names")), "@DocString_sisso_class_init_arr@" ) ) .def( init<std::shared_ptr<FeatureSpace>, std::string, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, py::list, py::list, py::list>( - (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_keys")), + (arg("self"), arg("feat_space"), arg("prop_label"), arg("prop_unit"), arg("prop"), arg("prop_test"), arg("task_sizes_train"), arg("task_sizes_test"), arg("leave_out_inds"), arg("n_dim"), arg("n_residual"), arg("n_models_store"), arg("sample_ids_train"), arg("sample_ids_test"), arg("task_names")), "@DocString_sisso_class_init_list@" ) ) diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp index 220a520a12868a075f874d86f009a70718f8c428..4ae3b8babdb20a0b4d503480cbf4fb9e06fd07a9 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOClassifier.cpp @@ -35,7 +35,7 @@ SISSOClassifier::SISSOClassifier( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ) : SISSOSolver( "classification", @@ -52,7 +52,7 @@ SISSOClassifier::SISSOClassifier( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, false ), _c(100.0), @@ -76,7 +76,7 @@ SISSOClassifier::SISSOClassifier( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys + py::list task_names ) : SISSOSolver( "classification", @@ -93,7 +93,7 @@ SISSOClassifier::SISSOClassifier( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, false ), _c(100.0), diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp index f910cde57c8ab1576247baf6f72074d7cc4dc00a..53ccf1ac805c537c84631e687e8db40251f9ce29 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOLogRegressor.cpp @@ -35,7 +35,7 @@ SISSOLogRegressor::SISSOLogRegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSORegressor( @@ -52,7 +52,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { @@ -86,7 +86,7 @@ SISSOLogRegressor::SISSOLogRegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSORegressor( @@ -103,7 +103,7 @@ SISSOLogRegressor::SISSOLogRegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) { diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp index 6f9f8b03e48ccb5595b5f7997fd34e9f43589fc4..4b023f7501358f1d32897d648a9c2d8e06196532 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSORegressor.cpp @@ -35,7 +35,7 @@ SISSORegressor::SISSORegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSOSolver( @@ -53,7 +53,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} @@ -72,7 +72,7 @@ SISSORegressor::SISSORegressor( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : SISSOSolver( @@ -90,7 +90,7 @@ SISSORegressor::SISSORegressor( n_models_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, fix_intercept ) {} diff --git a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp index 4c5cb8e4a4ddbd9a44bbc9bee20718e3b0d0066d..3d6ef29a217936f2e0d8855528927da2873e8f99 100644 --- a/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp +++ b/src/python/py_binding_cpp_def/descriptor_identifier/SISSOSolver.cpp @@ -36,12 +36,12 @@ SISSOSolver::SISSOSolver( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : _sample_ids_train(python_conv_utils::from_list<std::string>(sample_ids_train)), _sample_ids_test(python_conv_utils::from_list<std::string>(sample_ids_test)), - _task_keys(python_conv_utils::from_list<std::string>(task_keys)), + _task_names(python_conv_utils::from_list<std::string>(task_names)), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), @@ -83,12 +83,12 @@ SISSOSolver::SISSOSolver( int n_models_store, py::list sample_ids_train, py::list sample_ids_test, - py::list task_keys, + py::list task_names, bool fix_intercept ) : _sample_ids_train(python_conv_utils::from_list<std::string>(sample_ids_train)), _sample_ids_test(python_conv_utils::from_list<std::string>(sample_ids_test)), - _task_keys(python_conv_utils::from_list<std::string>(task_keys)), + _task_names(python_conv_utils::from_list<std::string>(task_names)), _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), diff --git a/src/python/py_interface/get_solver.py b/src/python/py_interface/get_solver.py index 8e61c0a6326eb9bb2bb2796867c08b0b49e25d9b..b5dea564f6268feff7d6f660b9304b91320594eb 100644 --- a/src/python/py_interface/get_solver.py +++ b/src/python/py_interface/get_solver.py @@ -153,7 +153,7 @@ def get_fs_solver( leave_out_inds, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) = read_csv( df, prop_key, @@ -178,7 +178,7 @@ def get_fs_solver( if loss_type.lower() == "regression": print(sample_ids_train) print(sample_ids_test) - print(task_keys) + print(task_names) solver = SISSORegressor( fs, prop_label, @@ -193,7 +193,7 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) elif loss_type.lower() == "log_regression": solver = SISSOLogRegressor( @@ -210,7 +210,7 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) else: solver = SISSOClassifier( @@ -227,6 +227,6 @@ def get_fs_solver( n_model_store, sample_ids_train, sample_ids_test, - task_keys, + task_names, ) return fs, solver diff --git a/src/python/py_interface/import_dataframe.py b/src/python/py_interface/import_dataframe.py index d4dbdd914d9d0a1c92325f90f9f82152942e10b5..a80a1a71a87453719df60e168704b0965f8adcba 100644 --- a/src/python/py_interface/import_dataframe.py +++ b/src/python/py_interface/import_dataframe.py @@ -138,7 +138,7 @@ def read_csv( - leave_out_inds (list): Indices to use as the test set - sample_ids_train (list): List of sample id's for the training data - sample_ids_test (list): List of sample id's for the test data - - task_keys (list): List of all task id names + - task_names (list): List of all task id names """ if not max_rung: raise ValueError("Maximum rung for the calculation is not defined.") @@ -152,14 +152,14 @@ def read_csv( if task_key: task, _, _ = extract_col(df, task_key) else: - task = np.zeros(prop.shape, dtype=np.int64).astype(str) + task = np.array(["all"] * len(prop)) # Map out which index belongs to which task and get the size of each task task_map = {} - task_keys, task_sizes = np.unique(task, return_counts=True) + task_names, task_sizes = np.unique(task, return_counts=True) task_sizes = task_sizes.astype(np.int32) - for kk, key in enumerate(task_keys): + for kk, key in enumerate(task_names): task_map[key] = np.where(task == key)[0].astype(np.int32) assert task_sizes[kk] == len(task_map[key]) @@ -172,21 +172,21 @@ def read_csv( if leave_out_frac > 0.0: task_sizes_test = [int(math.ceil(ts * leave_out_frac)) for ts in task_sizes] - for kk, key in enumerate(task_keys): + for kk, key in enumerate(task_names): leave_out_inds += list( np.random.choice(task_map[key], task_sizes_test[kk], False).astype( np.int32 ) ) else: - task_sizes_test = list(np.zeros(len(task_keys), dtype=np.int32)) + task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32)) else: assert (leave_out_frac == 0.0) or ( int(round(len(df) * leave_out_frac)) == len(leave_out_inds) ) - task_sizes_test = list(np.zeros(len(task_keys), dtype=np.int32)) - for kk, key in enumerate(task_keys): + task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32)) + for kk, key in enumerate(task_names): left_out = [ind for ind in leave_out_inds if ind in task_map[key]] task_sizes_test[kk] = len(left_out) @@ -231,5 +231,5 @@ def read_csv( leave_out_inds, list(df.index[train_inds].to_numpy().astype(str)), list(df.index[leave_out_inds].to_numpy().astype(str)), - list(task_keys), + list(task_names), ) diff --git a/tests/googletest/descriptor_identification/model/test_model_classifier.cc b/tests/googletest/descriptor_identification/model/test_model_classifier.cc index 405b7bdf73730a57d0a6fe9a621389ace85303f4..71086ca7170b05590f0ac0b5007d124d4a5b9721 100644 --- a/tests/googletest/descriptor_identification/model/test_model_classifier.cc +++ b/tests/googletest/descriptor_identification/model/test_model_classifier.cc @@ -42,16 +42,24 @@ namespace _task_sizes_test, 1 ); + + _task_keys = {"all"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"}; + _sample_ids_test = {"20", "21"}; } - std::vector<int> _leave_out_inds; - std::vector<int> _task_sizes_train; - std::vector<int> _task_sizes_test; + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; std::vector<double> _prop; std::vector<double> _prop_test; std::vector<model_node_ptr> _features; std::shared_ptr<LossFunction> _loss; + + std::vector<int> _task_sizes_train; + std::vector<int> _task_sizes_test; + std::vector<int> _leave_out_inds; }; TEST_F(ModelClassifierTests, NodesTest) @@ -61,7 +69,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "[A]"); EXPECT_EQ(model.n_convex_overlap_train(), 0); @@ -96,7 +107,7 @@ namespace EXPECT_EQ(model.n_dim(), 1); EXPECT_EQ(model.prop_unit(), Unit("m")); - // boost::filesystem::remove("train_class_mods.dat"); - // boost::filesystem::remove("test_class_mods.dat"); + boost::filesystem::remove("train_class_mods.dat"); + boost::filesystem::remove("test_class_mods.dat"); } } diff --git a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc index 4c1e43ece3bbfc890e4878d37a8e0a7a00535e40..07f37869ccb8d74325e58bfcb260b3060fd57a69 100644 --- a/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_log_regressor.cc @@ -42,7 +42,15 @@ namespace std::transform(value_1.begin(), value_1.end(), value_2.begin(), _prop.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); std::transform(test_value_1.begin(), test_value_1.end(), test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return std::log(0.001 * std::pow(v1, 0.1) * std::pow(v2, -2.1));}); + + _task_keys = {"all"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "6", "7", "8", "9", "10"}; + _sample_ids_test = {"5", "11"}; } + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; + std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; std::vector<int> _task_sizes_test; @@ -70,7 +78,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "exp(c0) * (A)^a0 * (B)^a1"); EXPECT_LT(model.rmse(), 1e-10); @@ -156,7 +167,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "(A)^a0 * (B)^a1"); diff --git a/tests/googletest/descriptor_identification/model/test_model_regressor.cc b/tests/googletest/descriptor_identification/model/test_model_regressor.cc index 031ddd4dd2a3b9d789d59c700f341392aa701d16..d4eb901217760f76075f1ac745bbc6fed3affd66 100644 --- a/tests/googletest/descriptor_identification/model/test_model_regressor.cc +++ b/tests/googletest/descriptor_identification/model/test_model_regressor.cc @@ -45,7 +45,15 @@ namespace std::transform(test_value_1.begin(), test_value_1.begin() + 1, test_value_2.begin(), _prop_test.begin(), [](double v1, double v2){return 0.001 + v1 + v2;}); std::transform(test_value_1.begin() + 1, test_value_1.end(), test_value_2.begin() + 1, _prop_test.begin() + 1, [](double v1, double v2){return -6.5 + 1.25 * v1 - 0.4 * v2;}); + + _task_keys = {"task_1", "task_2"}; + _sample_ids_train = {"0", "1", "2", "3", "4", "6", "7", "8", "9", "10"}; + _sample_ids_test = {"5", "11"}; } + std::vector<std::string> _sample_ids_train; + std::vector<std::string> _sample_ids_test; + std::vector<std::string> _task_keys; + std::vector<int> _leave_out_inds; std::vector<int> _task_sizes_train; std::vector<int> _task_sizes_test; @@ -73,7 +81,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "c0 + a0 * A + a1 * B"); @@ -169,7 +180,10 @@ namespace Unit("m"), _loss, _features, - _leave_out_inds + _leave_out_inds, + _sample_ids_train, + _sample_ids_test, + _task_keys ); EXPECT_STREQ(model.toString().c_str(), "a0 * A + a1 * B"); diff --git a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc index c44e494cfef731902c5b818da174bc1eb782c3e5..431e22aaeb2aad3991ca218506c8fde22de51b16 100644 --- a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc +++ b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_log_regressor.cc @@ -96,6 +96,16 @@ namespace _allowed_ops = {"div", "add", "mult", "sub"}; _allowed_param_ops = {}; + _task_keys = {"all"}; + for(int ii = 10; ii < 100; ++ii) + { + _sample_ids_train.push_back(std::to_string(ii)); + } + + for(int ii = 0; ii < 10; ++ii) + { + _sample_ids_test.push_back(std::to_string(ii)); + } } std::vector<std::string> _sample_ids_train; std::vector<std::string> _sample_ids_test; diff --git a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc index 48a02d57e7004059f82142a66d4cfb348060d5e8..788b73d1496196d59e3851884e391cf4cb1e194d 100644 --- a/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc +++ b/tests/googletest/descriptor_identification/sisso_regressor/test_sisso_regressor.cc @@ -116,6 +116,17 @@ namespace _allowed_ops = {"div", "sq", "cb", "sub"}; _allowed_param_ops = {}; + + _task_keys = {"task_1", "task_2"}; + for(int ii = 10; ii < 100; ++ii) + { + _sample_ids_train.push_back(std::to_string(ii)); + } + + for(int ii = 0; ii < 10; ++ii) + { + _sample_ids_test.push_back(std::to_string(ii)); + } } std::vector<std::string> _sample_ids_train; std::vector<std::string> _sample_ids_test;