From adeb3b06f6dd19750f65322643412b44ee291420 Mon Sep 17 00:00:00 2001 From: Thomas Purcell <purcell@fhi-berlin.mpg.de> Date: Wed, 5 Aug 2020 11:28:58 +0200 Subject: [PATCH] Cross-correlation now working can set a maximum cross correlation for selected features --- .../feature_space/FeatureSpace.cpp | 39 ++++++++++++------- .../feature_space/FeatureSpace.hpp | 17 ++++++++ src/inputs/InputParser.cpp | 3 +- src/inputs/InputParser.hpp | 1 + src/python/bindings_docstring_keyed.cpp | 4 +- src/python/feature_creation/FeatureSpace.cpp | 4 ++ 6 files changed, 51 insertions(+), 17 deletions(-) diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 7796e1a8..17b74f23 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -29,6 +29,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -41,6 +42,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi), @@ -434,13 +436,13 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt { double cur_score = scores[inds[ii]]; - bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp); + // bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp); - // Check the feature against those selected from previous SIS iterations - if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid) - is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp); + // // Check the feature against those selected from previous SIS iterations + // if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid) + // is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp); - if(is_valid) + if(valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), node_value_arrs::N_SELECTED - _n_sis_select + scores_sel.size())) { if(scores_sel.size() == _n_sis_select) { @@ -472,6 +474,15 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt } } +bool FeatureSpace::valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel) +{ + double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp); + for(int dd = start_sel; dd < end_sel; ++dd) + if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10) + return false; + return true; +} + bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp) { std::transform(scores_past.begin(), scores_past.end(), scores_comp.begin(), [&cur_score](double score){return std::abs(cur_score - score);}); @@ -480,7 +491,7 @@ bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, s if(*std::min_element(scores_comp.begin(), scores_comp.end()) < 1e-10) { int dd = std::min_element(scores_comp.begin(), scores_comp.end()) - scores_comp.begin(); - if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1e-10) + if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10) return false; } return true; @@ -493,7 +504,7 @@ bool FeatureSpace::valid_score_against_current(int end_check, double* val_ptr, d if(*std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) < 1e-10) { int dd = std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) - scores_comp.begin(); - if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + dd), val_ptr, _n_samp))) < 1e-10) + if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10) return false; } return true; @@ -530,7 +541,7 @@ void FeatureSpace::sis(std::vector<double>& prop) int cur_feat_local = 0; double cur_score = 0.0; - while(_scores[inds[ii]] < -1.0) + while(_scores[inds[ii]] < -1.0000000001) ++ii; std::vector<double> scores_prev_sel; @@ -542,12 +553,12 @@ void FeatureSpace::sis(std::vector<double>& prop) while((cur_feat_local != _n_sis_select) && (ii < _scores.size())) { - bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp); - // Check the feature against those selected from previous SIS iterations - if(cur_feat > 0 && is_valid) - is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp); + // bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp); + // // Check the feature against those selected from previous SIS iterations + // if(cur_feat > 0 && is_valid) + // is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp); - if(is_valid) + if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), cur_feat + cur_feat_local)) { scores_sel[cur_feat_local] = _scores[inds[ii]]; phi_sel.push_back(_phi[inds[ii]]); @@ -678,7 +689,7 @@ void FeatureSpace::sis(std::vector<double>& prop) // Get the n_sis_select best features (compare against features sent from other processes) while((cur_feat != node_value_arrs::N_SELECTED) && (ii < sent_scores.size())) { - if(valid_score_against_current(cur_feat_local, sent_phi[inds[ii]]->value().data(), sent_scores[inds[ii]], scores_sel, scores_comp)) + if(valid_feature_against_selected(sent_phi[inds[ii]]->value().data(), cur_feat + cur_feat_local, cur_feat)) { out_file_stream << std::setw(14) <<std::left << cur_feat << sent_phi[inds[ii]]->postfix_expr() << std::endl; sum_file_stream << std::setw(14) <<std::left << cur_feat << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl; diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 6e3e383f..45cddcf6 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -55,6 +55,7 @@ class FeatureSpace std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator + double _cross_cor_max; //!< Maximum cross-correlation used for selecting features double _l_bound; //!< lower bound for absolute value of the features double _u_bound; //!< upper bound for absolute value of the features @@ -78,6 +79,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -91,6 +93,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double max_abs_feat_val=1e50 ); @@ -216,6 +219,16 @@ public: */ void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp); + /** + * @brief Checks the feature to see if it is still valid against previously selected features + * + * @param val_ptr pointer to value array of the current feature + * @param end_sel index of the feature to stop checking + * + * @return True if the feature is still valid + */ + bool valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel = 0); + /** * @brief Check if a feature overlaps with a feature previously selected in earlier SIS iterations * @details Compares the projection score of the current candidate feature with all those of previously selected features (using the current prop) and @@ -273,6 +286,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -285,6 +299,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double max_abs_feat_val=1e50 ); @@ -301,6 +316,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -313,6 +329,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double max_abs_feat_val=1e50 ); diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index 65d8cfd5..f10e5ee1 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -7,6 +7,7 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh _prop_key(IP.get<std::string>("property_key", "prop")), _task_key(IP.get<std::string>("task_key", "Task")), _leave_out_inds(as_vector<int>(IP, "leave_out_inds")), + _cross_cor_max(IP.get<double>("max_feat_cross_correlation", 1.0)), _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)), _u_bound(IP.get<double>("max_abs_feat_val", 1e50)), _n_dim(IP.get<int>("desc_dim")), @@ -270,7 +271,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st for(int ff = 0; ff < headers.size(); ++ff) phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff])); - _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound); + _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound); } void stripComments(std::string& filename) diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 08ba841e..f55e4c18 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ -49,6 +49,7 @@ public: std::shared_ptr<FeatureSpace> _feat_space; //!< shared_ptr to the FeatureSpace generated from the data file and the input file + double _cross_cor_max; //!< Maximum cross-correlation used for selecting features double _l_bound; //!< Minimum absolute value allowed for the feature. double _u_bound; //!< Maximum absolute value allowed for the feature. diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index 17dfd0c9..38ac74e7 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -39,8 +39,8 @@ void sisso::feature_creation::registerFeatureSpace() void (FeatureSpace::*sis_list)(list) = &FeatureSpace::sis; void (FeatureSpace::*sis_ndarray)(np::ndarray) = &FeatureSpace::sis; - class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double>>()) - .def(init<list, list, list, list, optional<int, int, int, int, double>>()) + class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>()) + .def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>()) .def("sis", sis_list, "@DocString_feat_space_sis_list@") .def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@") .def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@") diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp index 16d72b1f..66502c12 100644 --- a/src/python/feature_creation/FeatureSpace.cpp +++ b/src/python/feature_creation/FeatureSpace.cpp @@ -9,6 +9,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -21,6 +22,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_setup::comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi), @@ -42,6 +44,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -54,6 +57,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_setup::comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi), -- GitLab