diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 7796e1a85714037344b10e534b4023eb5b49aa93..17b74f23b1d09901dfad11184640727f70bac6b8 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -29,6 +29,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -41,6 +42,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi), @@ -434,13 +436,13 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt { double cur_score = scores[inds[ii]]; - bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp); + // bool is_valid = valid_score_against_current(scores_sel.size(), generated_phi[inds[ii]]->value_ptr(), cur_score, scores_sel, scores_comp); - // Check the feature against those selected from previous SIS iterations - if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid) - is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp); + // // Check the feature against those selected from previous SIS iterations + // if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid) + // is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp); - if(is_valid) + if(valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), node_value_arrs::N_SELECTED - _n_sis_select + scores_sel.size())) { if(scores_sel.size() == _n_sis_select) { @@ -472,6 +474,15 @@ void 
FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt } } +bool FeatureSpace::valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel) +{ + double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp); + for(int dd = start_sel; dd < end_sel; ++dd) + if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10) + return false; + return true; +} + bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp) { std::transform(scores_past.begin(), scores_past.end(), scores_comp.begin(), [&cur_score](double score){return std::abs(cur_score - score);}); @@ -480,7 +491,7 @@ bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, s if(*std::min_element(scores_comp.begin(), scores_comp.end()) < 1e-10) { int dd = std::min_element(scores_comp.begin(), scores_comp.end()) - scores_comp.begin(); - if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1e-10) + if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10) return false; } return true; @@ -493,7 +504,7 @@ bool FeatureSpace::valid_score_against_current(int end_check, double* val_ptr, d if(*std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) < 1e-10) { int dd = std::min_element(scores_comp.begin(), scores_comp.begin() + end_check) - scores_comp.begin(); - if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + dd), val_ptr, _n_samp))) < 1e-10) + if(std::abs(util_funcs::r(val_ptr, val_ptr, _n_samp) - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(node_value_arrs::N_SELECTED - _n_sis_select + 
dd), val_ptr, _n_samp))) < 1.0 - _cross_cor_max + 1e-10) return false; } return true; @@ -530,7 +541,7 @@ void FeatureSpace::sis(std::vector<double>& prop) int cur_feat_local = 0; double cur_score = 0.0; - while(_scores[inds[ii]] < -1.0) + while(_scores[inds[ii]] < -1.0000000001) ++ii; std::vector<double> scores_prev_sel; @@ -542,12 +553,12 @@ void FeatureSpace::sis(std::vector<double>& prop) while((cur_feat_local != _n_sis_select) && (ii < _scores.size())) { - bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp); - // Check the feature against those selected from previous SIS iterations - if(cur_feat > 0 && is_valid) - is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp); + // bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp); + // // Check the feature against those selected from previous SIS iterations + // if(cur_feat > 0 && is_valid) + // is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp); - if(is_valid) + if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), cur_feat + cur_feat_local)) { scores_sel[cur_feat_local] = _scores[inds[ii]]; phi_sel.push_back(_phi[inds[ii]]); @@ -678,7 +689,7 @@ void FeatureSpace::sis(std::vector<double>& prop) // Get the n_sis_select best features (compare against features sent from other processes) while((cur_feat != node_value_arrs::N_SELECTED) && (ii < sent_scores.size())) { - if(valid_score_against_current(cur_feat_local, sent_phi[inds[ii]]->value().data(), sent_scores[inds[ii]], scores_sel, scores_comp)) + if(valid_feature_against_selected(sent_phi[inds[ii]]->value().data(), cur_feat + cur_feat_local, cur_feat)) { out_file_stream << std::setw(14) <<std::left << cur_feat << sent_phi[inds[ii]]->postfix_expr() << std::endl; sum_file_stream 
<< std::setw(14) <<std::left << cur_feat << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl; diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 6e3e383f5620a31287272dfd21eab2d216ee3df2..45cddcf6514ac91edfd7e1d2a71beb9d9751f624 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -55,6 +55,7 @@ class FeatureSpace std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator + double _cross_cor_max; //!< Maximum cross-correlation used for selecting features double _l_bound; //!< lower bound for absolute value of the features double _u_bound; //!< upper bound for absolute value of the features @@ -78,6 +79,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -91,6 +93,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double max_abs_feat_val=1e50 ); @@ -216,6 +219,16 @@ public: */ void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp); + /** + * @brief Checks the feature to see if it is still valid against previously selected features 
+ * + * @param val_ptr pointer to value array of the current feature + * @param end_sel index of the feature to stop checking (exclusive) + * @param start_sel index of the first selected feature to check against (defaults to 0) + * @return True if the feature is still valid + */ + bool valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel = 0); + /** * @brief Check if a feature overlaps with a feature previously selected in earlier SIS iterations * @details Compares the projection score of the current candidate feature with all those of previously selected features (using the current prop) and @@ -273,6 +286,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -285,6 +299,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double max_abs_feat_val=1e50 ); @@ -301,6 +316,7 @@ public: * @param n_sis_select number of features to select during each SIS step * @param max_store_rung number of rungs to calculate and store the value of the features for all samples * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm) + * @param cross_corr_max Maximum cross-correlation used for selecting features * @param min_abs_feat_val minimum absolute feature value * @param max_abs_feat_val maximum absolute feature value */ @@ -313,6 +329,7 @@ public: int n_sis_select=1, int max_store_rung=-1, int n_rung_generate=0, + double cross_corr_max=1.0, double min_abs_feat_val=1e-50, double
max_abs_feat_val=1e50 ); diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index 65d8cfd5c7d1e7492c47b0d1173aaea88be448e7..f10e5ee1be911c6c91e2e4f9e0a3b805dd26a060 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -7,6 +7,7 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh _prop_key(IP.get<std::string>("property_key", "prop")), _task_key(IP.get<std::string>("task_key", "Task")), _leave_out_inds(as_vector<int>(IP, "leave_out_inds")), + _cross_cor_max(IP.get<double>("max_feat_cross_correlation", 1.0)), _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)), _u_bound(IP.get<double>("max_abs_feat_val", 1e50)), _n_dim(IP.get<int>("desc_dim")), @@ -270,7 +271,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st for(int ff = 0; ff < headers.size(); ++ff) phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff])); - _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound); + _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound); } void stripComments(std::string& filename) diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 08ba841e8a00f7c495f0e82cec0eefbe94de5940..f55e4c18ec5f75d8dd85f6405475b9a3fbc025e0 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ -49,6 +49,7 @@ public: std::shared_ptr<FeatureSpace> _feat_space; //!< shared_ptr to the FeatureSpace generated from the data file and the input file + double _cross_cor_max; //!< Maximum cross-correlation used for selecting features double _l_bound; //!< Minimum absolute value allowed for the feature. double _u_bound; //!< Maximum absolute value allowed for the feature. 
diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index 17dfd0c9a4ee043c4eb7132c0392e5e2621dd6cf..38ac74e71159829730dbc36cee7df58679ff1188 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -39,8 +39,8 @@ void sisso::feature_creation::registerFeatureSpace() void (FeatureSpace::*sis_list)(list) = &FeatureSpace::sis; void (FeatureSpace::*sis_ndarray)(np::ndarray) = &FeatureSpace::sis; - class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double>>()) - .def(init<list, list, list, list, optional<int, int, int, int, double>>()) + class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>()) + .def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>()) .def("sis", sis_list, "@DocString_feat_space_sis_list@") .def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@") .def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@") diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp index 16d72b1f45abbd1a82fcbc8965add3e8b9fae870..66502c120a1e03862aeed391aee23d56bf989b82 100644 --- a/src/python/feature_creation/FeatureSpace.cpp +++ b/src/python/feature_creation/FeatureSpace.cpp @@ -9,6 +9,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -21,6 +22,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_setup::comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi), @@ -42,6 +44,7 @@ FeatureSpace::FeatureSpace( int n_sis_select, int max_store_rung, int 
n_rung_generate, + double cross_corr_max, double min_abs_feat_val, double max_abs_feat_val ): @@ -54,6 +57,7 @@ FeatureSpace::FeatureSpace( _feature_space_file("feature_space/selected_features.txt"), _feature_space_summary_file("feature_space/SIS_summary.txt"), _mpi_comm(mpi_setup::comm), + _cross_cor_max(cross_corr_max), _l_bound(min_abs_feat_val), _u_bound(max_abs_feat_val), _max_phi(max_phi),