From 8307c08a2e66950d6016c9a092b4b01892d67bc8 Mon Sep 17 00:00:00 2001
From: Thomas Purcell <purcell@fhi-berlin.mpg.de>
Date: Tue, 27 Oct 2020 22:20:33 +0100
Subject: [PATCH] Split the feature_is_valid functions into two functions

1) one that first checks whether the score difference is < 1e-5, used when the max correlation is 1.0
2) one that only checks the Pearson correlation, used when the max correlation is < 1.0

This should allow for a speed-up in most use cases
---
 .../feature_space/FeatureSpace.cpp            | 46 ++++++-------------
 .../feature_space/FeatureSpace.hpp            | 17 ++-----
 2 files changed, 20 insertions(+), 43 deletions(-)

diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp
index f6765bff..c4d56aaf 100644
--- a/src/feature_creation/feature_space/FeatureSpace.cpp
+++ b/src/feature_creation/feature_space/FeatureSpace.cpp
@@ -99,6 +99,17 @@ void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_t
     else
         throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
 
+    if(_cross_cor_max < 0.99999)
+    {
+        _is_valid = comp_feats::valid_feature_against_selected;
+        _is_valid_private = comp_feats::valid_feature_against_selected_no_omp;
+    }
+    else
+    {
+        _is_valid = comp_feats::valid_feature_against_selected_max_corr_1;
+        _is_valid_private = comp_feats::valid_feature_against_selected_max_corr_1_no_omp;
+    }
+
     for(auto & op : _allowed_ops)
     {
         if((op.compare("add") == 0) || (op.compare("mult") == 0) || (op.compare("abs_diff") == 0) || (op.compare("sub") == 0))
@@ -466,7 +477,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
             while((ii < inds.size()) && ((scores[inds[ii]] < worst_score) || (phi_sel_private.size() < _n_sis_select)))
             {
                 double cur_score = scores[inds[ii]];
-                if((valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select)) && valid_feature_against_private_selected(generated_phi[inds[ii]]->value_ptr(), phi_sel_private, scores_sel_private, cur_score))
+                if((_is_valid(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select, 0)) && _is_valid_private(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, phi_sel_private, scores_sel_private, cur_score))
                 {
                     if(scores_sel_private.size() == _n_sis_select)
                     {
@@ -494,7 +505,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
             worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
             for(int sc = 0; sc < scores_sel_private.size(); ++sc)
             {
-                if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && valid_feature_against_private_selected(phi_sel_private[sc]->value_ptr(), phi_sel, scores_sel, scores_sel_private[sc]))
+                if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && _is_valid_private(phi_sel_private[sc]->value_ptr(), _n_samp, _cross_cor_max, phi_sel, scores_sel, scores_sel_private[sc]))
                 {
                     if(phi_sel.size() == _n_sis_select)
                     {
@@ -513,33 +524,6 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
     }
 }
 
-bool FeatureSpace::valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel)
-{
-    double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp);
-
-    for(int dd = start_sel; dd < end_sel; ++dd)
-    {
-        if(abs(cur_score - scores_sel[dd]) > 1e-5)
-            continue;
-
-        if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10)
-            return false;
-    }
-    return true;
-}
-
-bool FeatureSpace::valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& scores_sel, double cur_score)
-{
-    double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp);
-
-    for(int ff = 0; ff < selected.size(); ++ff)
-    {
-        if((abs(scores_sel[ff] - cur_score) < 1e-5) && (base_val - std::abs(util_funcs::r(selected[ff]->value_ptr(1), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10))
-            return false;
-    }
-    return true;
-}
-
 void FeatureSpace::sis(std::vector<double>& prop)
 {
     boost::filesystem::path p(_feature_space_file.c_str());
@@ -585,7 +569,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
     start = omp_get_wtime();
     while((cur_feat_local != _n_sis_select) && (ii < _scores.size()))
     {
-        if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local))
+        if(_is_valid(_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local, 0))
         {
             scores_sel[cur_feat_local] = _scores[inds[ii]];
             scores_sel_all[cur_feat + cur_feat_local] = _scores[inds[ii]];
@@ -649,7 +633,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
             // Get the n_sis_select best features (compare against features sent from other processes)
             while((cur_feat + cur_feat_local != node_value_arrs::N_SELECTED) && (ii < sent_scores.size()))
             {
-                if(valid_feature_against_selected(sent_phi[inds[ii]]->value_ptr(), scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat))
+                if(_is_valid(sent_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat))
                 {
                     out_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << sent_phi[inds[ii]]->postfix_expr() << std::endl;
                     sum_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl;
diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp
index 53f1b0ca..3a0c7b78 100644
--- a/src/feature_creation/feature_space/FeatureSpace.hpp
+++ b/src/feature_creation/feature_space/FeatureSpace.hpp
@@ -16,6 +16,7 @@
 #include <feature_creation/node/operator_nodes/allowed_ops.hpp>
 #include <feature_creation/node/utils.hpp>
 #include <feature_creation/node/value_storage/nodes_value_containers.hpp>
+#include <utils/compare_features.hpp>
 #include <utils/project.hpp>
 
 #include <boost/serialization/shared_ptr.hpp>
@@ -55,6 +56,10 @@ class FeatureSpace
 
     std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
     std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project_no_omp; //!< Function used to calculate the scores for SIS without changing omp environment
+    std::function<bool(double*, int, double, std::vector<double>&, double, int, int)> _is_valid; //!< Function used to check whether a feature is valid against the previously selected features (bound in initialize_fs depending on _cross_cor_max)
+    std::function<bool(double*, int, double, std::vector<node_ptr>&, std::vector<double>&, double)> _is_valid_private; //!< Function used to check whether a feature is valid against an explicit list of selected features and their scores (bound to the *_no_omp variants in initialize_fs)
+
+
     std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
 
     double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
@@ -224,18 +229,6 @@ public:
      */
     void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected);
 
-    /**
-     * @brief Checks the feature to see if it is still valid against previously selected features
-     *
-     * @param val_ptr pointer to value array of the current feature
-     * @param end_sel index of the feature to stop checking
-     *
-     * @return True if the feature is still valid
-     */
-    bool valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel = 0);
-
-    bool valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& scores_sel, double cur_score);
-
     /**
      * @brief Perform SIS on a feature set with a specified property
      * @details Perform sure-independence screening with either the correct property or the error
-- 
GitLab