From 8307c08a2e66950d6016c9a092b4b01892d67bc8 Mon Sep 17 00:00:00 2001 From: Thomas Purcell <purcell@fhi-berlin.mpg.de> Date: Tue, 27 Oct 2020 22:20:33 +0100 Subject: [PATCH] Split the feature_is_valid functions into two functions: 1) one that checks if the score difference is < 1e-5 first if max correlation is 1.0; 2) one that only checks the Pearson correlation if max correlation < 1.0. This should allow for a speed-up in most use cases. --- .../feature_space/FeatureSpace.cpp | 46 ++++++------------- .../feature_space/FeatureSpace.hpp | 17 ++----- 2 files changed, 20 insertions(+), 43 deletions(-) diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index f6765bff..c4d56aaf 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -99,6 +99,17 @@ void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_t else throw std::logic_error("Wrong projection type passed to FeatureSpace constructor."); + if(_cross_cor_max < 0.99999) + { + _is_valid = comp_feats::valid_feature_against_selected; + _is_valid_private = comp_feats::valid_feature_against_selected_no_omp; + } + else + { + _is_valid = comp_feats::valid_feature_against_selected_max_corr_1; + _is_valid_private = comp_feats::valid_feature_against_selected_max_corr_1_no_omp; + } + for(auto & op : _allowed_ops) { if((op.compare("add") == 0) || (op.compare("mult") == 0) || (op.compare("abs_diff") == 0) || (op.compare("sub") == 0)) @@ -466,7 +477,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt while((ii < inds.size()) && ((scores[inds[ii]] < worst_score) || (phi_sel_private.size() < _n_sis_select))) { double cur_score = scores[inds[ii]]; - if((valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select)) && 
valid_feature_against_private_selected(generated_phi[inds[ii]]->value_ptr(), phi_sel_private, scores_sel_private, cur_score)) + if((_is_valid(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select, 0)) && _is_valid_private(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, phi_sel_private, scores_sel_private, cur_score)) { if(scores_sel_private.size() == _n_sis_select) { @@ -494,7 +505,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin(); for(int sc = 0; sc < scores_sel_private.size(); ++sc) { - if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && valid_feature_against_private_selected(phi_sel_private[sc]->value_ptr(), phi_sel, scores_sel, scores_sel_private[sc])) + if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && _is_valid_private(phi_sel_private[sc]->value_ptr(), _n_samp, _cross_cor_max, phi_sel, scores_sel, scores_sel_private[sc])) { if(phi_sel.size() == _n_sis_select) { @@ -513,33 +524,6 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt } } -bool FeatureSpace::valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel) -{ - double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp); - - for(int dd = start_sel; dd < end_sel; ++dd) - { - if(abs(cur_score - scores_sel[dd]) > 1e-5) - continue; - - if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10) - return false; - } - return true; -} - -bool FeatureSpace::valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& scores_sel, double cur_score) -{ - double base_val = 
util_funcs::r(val_ptr, val_ptr, _n_samp); - - for(int ff = 0; ff < selected.size(); ++ff) - { - if((abs(scores_sel[ff] - cur_score) < 1e-5) && (base_val - std::abs(util_funcs::r(selected[ff]->value_ptr(1), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10)) - return false; - } - return true; -} - void FeatureSpace::sis(std::vector<double>& prop) { boost::filesystem::path p(_feature_space_file.c_str()); @@ -585,7 +569,7 @@ void FeatureSpace::sis(std::vector<double>& prop) start = omp_get_wtime(); while((cur_feat_local != _n_sis_select) && (ii < _scores.size())) { - if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local)) + if(_is_valid(_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local, 0)) { scores_sel[cur_feat_local] = _scores[inds[ii]]; scores_sel_all[cur_feat + cur_feat_local] = _scores[inds[ii]]; @@ -649,7 +633,7 @@ void FeatureSpace::sis(std::vector<double>& prop) // Get the n_sis_select best features (compare against features sent from other processes) while((cur_feat + cur_feat_local != node_value_arrs::N_SELECTED) && (ii < sent_scores.size())) { - if(valid_feature_against_selected(sent_phi[inds[ii]]->value_ptr(), scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat)) + if(_is_valid(sent_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat)) { out_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << sent_phi[inds[ii]]->postfix_expr() << std::endl; sum_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl; diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 53f1b0ca..3a0c7b78 100644 --- 
a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -16,6 +16,7 @@ #include <feature_creation/node/operator_nodes/allowed_ops.hpp> #include <feature_creation/node/utils.hpp> #include <feature_creation/node/value_storage/nodes_value_containers.hpp> +#include <utils/compare_features.hpp> #include <utils/project.hpp> #include <boost/serialization/shared_ptr.hpp> @@ -55,6 +56,10 @@ class FeatureSpace std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project_no_omp; //!< Function used to calculate the scores for SIS without changing omp environment + std::function<bool(double*, int, double, std::vector<double>&, double, int, int)> _is_valid; //!< Function used to check if a feature is valid against the previously selected features + std::function<bool(double*, int, double, std::vector<node_ptr>&, std::vector<double>&, double)> _is_valid_private; //!< Function used to check if a feature is valid against a supplied list of selected features (bound to the _no_omp variants) + + std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator double _cross_cor_max; //!< Maximum cross-correlation used for selecting features @@ -224,18 +229,6 @@ public: */ void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected); - /** - * @brief Checks the feature to see if it is still valid against previously selected features - * - * @param val_ptr pointer to value array of the current feature - * @param end_sel index of the feature to stop checking - * - * @return True if the feature is still valid - */ - bool valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel = 0); - - bool valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& 
scores_sel, double cur_score); - /** * @brief Perform SIS on a feature set with a specified property * @details Perform sure-independence screening with either the correct property or the error -- GitLab