From de2ee8faf4f6a9396f3141eada50beac4fdf66f6 Mon Sep 17 00:00:00 2001 From: Thomas <purcell@fhi-berlin.mpg.de> Date: Thu, 9 Sep 2021 07:50:48 +0200 Subject: [PATCH] Add functionality to output the feature space _phi as a text file --- docs/tutorial/2_python.md | 35 +++++++++++++++++++ .../feature_space/FeatureSpace.cpp | 28 +++++++++++++-- .../feature_space/FeatureSpace.hpp | 7 ++++ src/inputs/InputParser.cpp | 2 ++ src/inputs/InputParser.hpp | 13 +++++++ tests/exec_test/default/sisso.json | 3 +- 6 files changed, 85 insertions(+), 3 deletions(-) diff --git a/docs/tutorial/2_python.md b/docs/tutorial/2_python.md index c013299f..9f0b6ce6 100644 --- a/docs/tutorial/2_python.md +++ b/docs/tutorial/2_python.md @@ -173,3 +173,38 @@ After the calculations are finished we can then run the same analysis as we did >>> plot_validation_rmse(models, "cv_50_error.pdf").show() ``` It is important to note here that, while it is not necessary to setup separate directories when using the python bindings, if you don't all output files will be overwritten reducing the reproducibility of the code. + +## Using the Python Interface to Reproduce Previous Calculations +The final goal of this tutorial will be to discuss how to use the python interface to reproduce previous calculations easily. +As previous examples illustrated how the Model output files can be used to interact and extract information from the models generated by `SISSO++`, but these alone can not be used to recreate a calculation. +To increase the reproducibility of the code `SISSO++` can construct a `FeatureSpace` using a text file, which can be generated using . +Depending on what we want to study there are two ways of constructing a feature space form either the `phi.out` file or `selected_features.txt` As show below for using `selected_features.txt` + +```python +from sissopp import phi_selected_from_file, read_csv, FeatureSpace + +inputs = read_csv("data.csv", prop_key="E_RS - E_ZB", max_rung=0, leave_out_frac=0.0) +phi_sel = phi_selected_from_file("feature_space/selected_features.txt", inputs.phi_0) +inputs.phi_0 = phi_sel +feature_space = FeatureSpace(inputs) +``` + +and for using `phi.out` + +```python +from sissopp import phi_selected_from_file, read_csv, FeatureSpace + +inputs = read_csv("data.csv", prop_key="E_RS - E_ZB", max_rung=0, leave_out_frac=0.0) + +feature_space = FeatureSpace( + "phi.txt", + inputs.phi_0, + inputs.prop_train, + [82], + project_type='regression', + cross_corr_max=1.0, + n_sis_select=100 +) +``` + +From here calculations can continue as was done in the earlier examples. diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index a7aa799b..08270c9d 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -71,6 +71,7 @@ FeatureSpace::FeatureSpace(InputParser inputs): _project_type(inputs.calc_type()), _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), + _phi_out_file(inputs.phi_out_file()), _mpi_comm(inputs.mpi_comm()), _cross_cor_max(inputs.cross_cor_max()), _l_bound(inputs.l_bound()), @@ -782,6 +783,11 @@ void FeatureSpace::generate_feature_space( { _n_feat = feat_set.size(); } + + if(_phi_out_file.size() > 0) + { + output_phi(); + } } void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std::vector<node_ptr>& phi_sel, std::vector<double>& scores_sel) @@ -1061,11 +1067,11 @@ void FeatureSpace::sis(std::shared_ptr<LossFunction> loss) // Generate features on the fly and get the best features for this rank if(_n_rung_generate > 0) { -#ifdef PARAMETERIZE + #ifdef PARAMETERIZE _phi.resize(_n_feat); _phi.insert(_phi.end(), _phi_reparam.begin() + _start_rung_reparam.back(), _phi_reparam.end()); _scores.resize(_phi.size()); -#endif + #endif start_time = omp_get_wtime(); phi_sel.resize(cur_feat_local); scores_sel.resize(cur_feat_local); @@ -1208,3 +1214,21 @@ void FeatureSpace::remove_feature(const int ind) _phi.erase(_phi.begin() + ind); } + +void FeatureSpace::output_phi() +{ + boost::filesystem::path p(_phi_out_file.c_str()); + boost::filesystem::create_directories(p.remove_filename()); + + std::ofstream out_file_stream = std::ofstream(); + out_file_stream.open(_phi_out_file); + + out_file_stream << "# Number of Features: " << _n_feat << std::endl; + out_file_stream << "# Maximum Rung of the Calculation: " << _max_rung << std::endl; + + for(auto& feat : _phi) + { + out_file_stream << feat->postfix_expr() << std::endl; + } + out_file_stream.close(); +} diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 4f3833f0..ca5622c2 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -76,6 +76,7 @@ class FeatureSpace const std::string _project_type; //!< The type of LossFunction to use when projecting the features onto a property const std::string _feature_space_file; //!< File to output the computer readable representation of the selected features to const std::string _feature_space_summary_file; //!< File to output the human readable representation of the selected features to + const std::string _phi_out_file; //!< Filename of the file to output the feature set to std::function<bool(const double*, const int, const double, const std::vector<double>&, const double, const int, const int)> _is_valid; //!< Function used to determine of a feature is too correlated to previously selected features std::function<bool(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)> _is_valid_feat_list; //!< Function used to determine of a feature is too correlated to previously selected features within a given list @@ -238,6 +239,12 @@ public: const double u_bound=1e50 ); + // DocString: feat_space_output_phi + /** + * @brief Output the feature set to a file of a passed filename + */ + void output_phi(); + #ifdef PARAMETERIZE /** * @brief Generate a new set of parameterized features from a single feature diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index 399704f6..fdc17092 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -29,6 +29,7 @@ InputParser::InputParser() : _prop_label("prop"), _task_key("Task"), _calc_type("regression"), + _phi_out_file(""), _mpi_comm(mpi_setup::comm), _cross_cor_max(1.0), _l_bound(1e-50), @@ -65,6 +66,7 @@ InputParser::InputParser(pt::ptree ip, std::string fn, std::shared_ptr<MPI_Inter _prop_key(ip.get<std::string>("property_key", "prop")), _task_key(ip.get<std::string>("task_key", "task")), _calc_type(ip.get<std::string>("calc_type", "regression")), + _phi_out_file(ip.get<std::string>("phi_out_file", "")), _leave_out_inds(as_vector<int>(ip, "leave_out_inds")), _mpi_comm(comm), _cross_cor_max(ip.get<double>("max_feat_cross_correlation", 1.0)), diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 6cc3f1c0..66f78204 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ -83,6 +83,7 @@ private: std::string _prop_label; //!< The label of the property std::string _task_key; //!< Key used to find the task column in the data file std::string _calc_type; //!< The type of LossFunction to use when projecting the features onto a property + std::string _phi_out_file; //!< Filename of the file to output the feature set to std::shared_ptr<MPI_Interface> _mpi_comm; //!< The MPI communicator for the calculation @@ -509,6 +510,18 @@ public: */ inline void set_filename(const std::string filename) {_filename = filename;} + // DocString: inputs_get_phi_out_file + /** + * @brief Filename of the file to output the feature set to + */ + inline std::string phi_out_file() const {return _phi_out_file;} + + // DocString: inputs_set_phi_out_file + /** + * @brief Set tilename of the file to output the feature set to + */ + inline void set_phi_out_file(const std::string phi_out_file) {_phi_out_file = phi_out_file;} + // DocString: inputs_get_data_file /** * @brief Name of the data file diff --git a/tests/exec_test/default/sisso.json b/tests/exec_test/default/sisso.json index 7c12e68e..9fe148ca 100644 --- a/tests/exec_test/default/sisso.json +++ b/tests/exec_test/default/sisso.json @@ -10,5 +10,6 @@ "leave_out_frac": 0.05, "n_models_store": 1, "leave_out_inds": [0, 1, 2, 60, 61], - "fix_intercept": false + "fix_intercept": false, + "phi_out_file": "feature_space/phi.out" } -- GitLab