From 28c1da4226bee99f88b2a0e9a0c16a05568d5eab Mon Sep 17 00:00:00 2001
From: Thomas Purcell <purcell@fhi-berlin.mpg.de>
Date: Fri, 12 Jun 2020 20:36:30 +0200
Subject: [PATCH] Update past feature checks to use scores

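Limit the number of operations done: a candidate feature is only correlated
against a previously selected feature when their projection scores match, so
the descriptor matrix no longer has to be standardized and unstandardized
during each SIS step.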
---
 src/descriptor_identifier/Model/Model.cpp     |  2 +-
 src/descriptor_identifier/Model/Model.hpp     |  4 +-
 src/descriptor_identifier/SISSORegressor.cpp  |  2 +-
 .../feature_space/FeatureSpace.cpp            | 65 +++++++++----------
 .../feature_space/FeatureSpace.hpp            |  8 +--
 5 files changed, 38 insertions(+), 43 deletions(-)

diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp
index 10eb8175..b8968f25 100644
--- a/src/descriptor_identifier/Model/Model.cpp
+++ b/src/descriptor_identifier/Model/Model.cpp
@@ -1,6 +1,6 @@
 #include <descriptor_identifier/Model/Model.hpp>
 
-Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test) :
+Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test) :
     _n_samp_train(feats[0]->n_samp()),
     _n_samp_test(feats[0]->n_test_samp()),
     _n_dim(feats.size() + 1),
diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp
index d660b6da..f52463d4 100644
--- a/src/descriptor_identifier/Model/Model.hpp
+++ b/src/descriptor_identifier/Model/Model.hpp
@@ -19,7 +19,7 @@ class Model
     int _n_samp_test; //!< The number of test samples per feature
     int _n_dim; //!< Dimension of the model
 
-    std::vector<std::shared_ptr<FeatureNode>> _feats; //!< List of features in the model
+    std::vector<node_ptr> _feats; //!< List of features in the model
 
     std::vector<std::vector<double>> _coefs; //!< Coefficients for teh features
     std::vector<double> _prop_train; //!< The property to be modeled
@@ -41,7 +41,7 @@ public:
      * @param prop The property
      * @param feats The features for the model
      */
-    Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test);
+    Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test);
 
 
     /**
diff --git a/src/descriptor_identifier/SISSORegressor.cpp b/src/descriptor_identifier/SISSORegressor.cpp
index 77b35c2a..0799aaed 100644
--- a/src/descriptor_identifier/SISSORegressor.cpp
+++ b/src/descriptor_identifier/SISSORegressor.cpp
@@ -176,7 +176,7 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim)
 
     inds = util_funcs::argsort(all_min_error);
 
-    std::vector<std::shared_ptr<FeatureNode>> min_nodes(n_dim);
+    std::vector<node_ptr> min_nodes(n_dim);
     std::vector<Model> models;
 
     for(int rr = 0; rr < _n_residual; ++rr)
diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp
index c740c4f6..e5e64bb7 100644
--- a/src/feature_creation/feature_space/FeatureSpace.cpp
+++ b/src/feature_creation/feature_space/FeatureSpace.cpp
@@ -295,8 +295,15 @@ void FeatureSpace::generate_feature_space()
     _n_feat = _phi.size();
 }
 
-void FeatureSpace::project_generated(double* prop, int size, std::vector<std::shared_ptr<FeatureNode>>& phi_sel, std::vector<double>& scores_sel, std::vector<double>& scores_comp)
+void FeatureSpace::project_generated(double* prop, int size, std::vector<node_ptr>& phi_sel, std::vector<double>& scores_sel, std::vector<double>& scores_comp)
 {
+    std::vector<double> scores_prev_sel;
+    if(node_value_arrs::N_SELECTED > _n_sis_select)
+    {
+        scores_prev_sel.resize(_phi_selected.size());
+        _project(prop, scores_prev_sel.data(), _phi_selected, _task_sizes, size / _n_samp);
+    }
+
     for(auto feat = _phi.begin() + _start_gen.back() + _mpi_comm->rank(); feat < _phi.end(); feat += _mpi_comm->size())
     {
         std::fill_n(node_value_arrs::TEMP_STORAGE_REG.data(), node_value_arrs::TEMP_STORAGE_REG.size(), -1);
@@ -322,11 +329,11 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<std::sh
             bool is_valid = valid_score_against_current(end_check, generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_sel, scores_comp);
             // Check the feature against those selected from previous SIS iterations
             if((node_value_arrs::N_SELECTED > _n_sis_select) && is_valid)
-                is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores_comp);
+                is_valid = valid_score_against_past(generated_phi[inds[ii]]->value_ptr(), scores[inds[ii]], scores_prev_sel, scores_comp);
 
             if(is_valid)
             {
-                std::shared_ptr<FeatureNode> new_feat = std::make_shared<FeatureNode>(node_value_arrs::N_SELECTED - _n_sis_select + end_check, generated_phi[inds[ii]]->expr(), generated_phi[inds[ii]]->value(), generated_phi[inds[ii]]->test_value(), generated_phi[inds[ii]]->unit(), true);
+                node_ptr new_feat = std::make_shared<FeatureNode>(node_value_arrs::N_SELECTED - _n_sis_select + end_check, generated_phi[inds[ii]]->expr(), generated_phi[inds[ii]]->value(), generated_phi[inds[ii]]->test_value(), generated_phi[inds[ii]]->unit(), true);
                 phi_sel.insert(phi_sel.begin() + end_check, new_feat);
                 scores_sel.insert(scores_sel.begin() + end_check, cur_score);
                 for(int jj = end_check + 1; jj < _n_sis_select; ++jj)
@@ -345,19 +352,17 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<std::sh
     }
 }
 
-bool FeatureSpace::valid_score_against_past(double* val_ptr, std::vector<double>& scores_comp)
+bool FeatureSpace::valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp)
 {
-    double cur_feat_mean = util_funcs::mean(val_ptr, _n_samp);
-    double cur_feat_std = util_funcs::stand_dev(val_ptr, _n_samp);
-
-    std::transform(val_ptr, val_ptr + _n_samp, val_ptr, [&cur_feat_mean, &cur_feat_std](double val){return (val - cur_feat_mean) / cur_feat_std;});
-
-    dgemv_('T', _n_samp, scores_comp.size(), 1.0 / static_cast<double>(_n_samp), node_value_arrs::D_MATRIX.data(), _n_samp, val_ptr, 1, 0.0, scores_comp.data(), 1);
+    // Absolute score difference to every previously selected feature (scores_comp is only scratch space, entries past scores_past.size() are stale)
+    std::transform(scores_past.begin(), scores_past.end(), scores_comp.begin(), [&cur_score](double score){return std::abs(cur_score - score);});
 
-    if(1.0 - util_funcs::max_abs_val<double>(scores_comp.data(), scores_comp.size()) < 1e-13)
-        return false;
-
-    std::transform(val_ptr, val_ptr + _n_samp, val_ptr, [&cur_feat_mean, &cur_feat_std](double val){return val * cur_feat_std + cur_feat_mean;});
+    // Features can only be identical if their scores match, so the correlation is only calculated for past features with a matching score
+    for(int dd = 0; dd < static_cast<int>(scores_past.size()); ++dd)
+    {
+        if((scores_comp[dd] < 1e-10) && (1.0 - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1e-13))
+            return false;
+    }
     return true;
 }
 
@@ -377,26 +382,14 @@ bool FeatureSpace::valid_score_against_current(int end_check, double* val_ptr, d
 
 void FeatureSpace::sis(std::vector<double>& prop)
 {
-    std::vector<double> means(node_value_arrs::N_SELECTED);
-    std::vector<double> stand_devs(node_value_arrs::N_SELECTED);
     std::vector<double> scores_comp(std::max(node_value_arrs::N_SELECTED, _n_sis_select), 1.0);
-
     std::vector<double> scores_sel(_n_sis_select, 0.0);
-    std::vector<std::shared_ptr<FeatureNode>> phi_sel;
+
+    std::vector<node_ptr> phi_sel;
     phi_sel.reserve(_n_sis_select);
 
     int cur_feat = node_value_arrs::N_SELECTED;
 
-    // Standardize the description matrix
-    if(cur_feat > 0)
-    {
-        for(int dd = 0; dd < cur_feat; ++dd)
-        {
-            means[dd] = util_funcs::mean(node_value_arrs::get_d_matrix_ptr(dd), _n_samp);
-            stand_devs[dd] = util_funcs::stand_dev(node_value_arrs::get_d_matrix_ptr(dd), _n_samp);
-            std::transform(node_value_arrs::get_d_matrix_ptr(dd), node_value_arrs::get_d_matrix_ptr(dd) + _n_samp, node_value_arrs::get_d_matrix_ptr(dd), [&means, &stand_devs, &dd](double val){return (val - means[dd]) / stand_devs[dd];});
-        }
-    }
     node_value_arrs::resize_d_matrix_arr(_n_sis_select);
     _phi_selected.reserve(_phi_selected.size() + _n_sis_select);
 
@@ -408,12 +401,19 @@ void FeatureSpace::sis(std::vector<double>& prop)
     int cur_feat_local = 0;
     double cur_score = 0.0;
 
+    std::vector<double> scores_prev_sel;
+    if(node_value_arrs::N_SELECTED > _n_sis_select)
+    {
+        scores_prev_sel.resize(_phi_selected.size());
+        _project(prop.data(), scores_prev_sel.data(), _phi_selected, _task_sizes, prop.size() / _n_samp);
+    }
+
     while((cur_feat_local != _n_sis_select) && (ii < _scores.size()))
     {
         bool is_valid = valid_score_against_current(cur_feat_local, _phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_sel, scores_comp);
         // Check the feature against those selected from previous SIS iterations
         if(cur_feat > 0 && is_valid)
-            is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), scores_comp);
+            is_valid = valid_score_against_past(_phi[inds[ii]]->value_ptr(), _scores[inds[ii]], scores_prev_sel, scores_comp);
 
         if(is_valid)
         {
@@ -431,11 +431,6 @@ void FeatureSpace::sis(std::vector<double>& prop)
         project_generated(prop.data(), prop.size(), phi_sel, scores_sel, scores_comp);
     }
 
-    // Unstandardize the description matrix
-    if(cur_feat > 0)
-        for(int dd = 0; dd < cur_feat; ++dd)
-            std::transform(node_value_arrs::get_d_matrix_ptr(dd), node_value_arrs::get_d_matrix_ptr(dd) + _n_samp, node_value_arrs::get_d_matrix_ptr(dd), [&means, &stand_devs, &dd](double val){return val * stand_devs[dd] + means[dd];});
-
     phi_sel.resize(_n_sis_select);
     scores_sel.resize(_n_sis_select);
 
@@ -489,7 +484,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
         if(_mpi_comm->rank() == 0)
         {
             std::vector<double> sent_scores(_n_sis_select * _mpi_comm->size(), 0.0);
-            std::vector<std::shared_ptr<FeatureNode>> sent_phi(_n_sis_select * _mpi_comm->size());
+            std::vector<node_ptr> sent_phi(_n_sis_select * _mpi_comm->size());
 
             std::copy_n(scores_sel.begin(), _n_sis_select, sent_scores.begin());
             std::copy_n(phi_sel.begin(), _n_sis_select, sent_phi.begin());
diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp
index 50a4d1e9..1299afcf 100644
--- a/src/feature_creation/feature_space/FeatureSpace.hpp
+++ b/src/feature_creation/feature_space/FeatureSpace.hpp
@@ -20,7 +20,7 @@
  */
 class FeatureSpace
 {
-    std::vector<std::shared_ptr<FeatureNode>> _phi_selected; //!< selected features
+    std::vector<node_ptr> _phi_selected; //!< selected features
     std::vector<node_ptr> _phi; //!< all features
     std::vector<node_ptr> _phi_0; //!< initial feature space
 
@@ -82,7 +82,7 @@ public:
     /**
      * @brief Accessor function for _phi_selected
      */
-    inline std::vector<std::shared_ptr<FeatureNode>> phi_selected(){return _phi_selected;};
+    inline std::vector<node_ptr> phi_selected(){return _phi_selected;};
 
     /**
      * @brief Accessor function for _phi
@@ -108,9 +108,9 @@ public:
 
     void generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound=1e-50, double u_bound=1e50);
 
-    void project_generated(double* prop, int size, std::vector<std::shared_ptr<FeatureNode>>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
+    void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
 
-    bool valid_score_against_past(double* val_ptr, std::vector<double>& scores_comp);
+    bool valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp);
 
     bool valid_score_against_current(int end_check, double* val_ptr, double cur_score, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
     /**
-- 
GitLab