Commit 8307c08a authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Split the feature_is_valid functions into two functions

1) that checks if the score difference is < 1e-5 first if max correlation is 1.0
2) only checks pearson correlation if max correlation < 1.0

This should allow for a speed-up in most use cases
parent b08ad18f
......@@ -99,6 +99,17 @@ void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_t
else
throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
if(_cross_cor_max < 0.99999)
{
_is_valid = comp_feats::valid_feature_against_selected;
_is_valid_private = comp_feats::valid_feature_against_selected_no_omp;
}
else
{
_is_valid = comp_feats::valid_feature_against_selected_max_corr_1;
_is_valid_private = comp_feats::valid_feature_against_selected_max_corr_1_no_omp;
}
for(auto & op : _allowed_ops)
{
if((op.compare("add") == 0) || (op.compare("mult") == 0) || (op.compare("abs_diff") == 0) || (op.compare("sub") == 0))
......@@ -466,7 +477,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
while((ii < inds.size()) && ((scores[inds[ii]] < worst_score) || (phi_sel_private.size() < _n_sis_select)))
{
double cur_score = scores[inds[ii]];
if((valid_feature_against_selected(generated_phi[inds[ii]]->value_ptr(), scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select)) && valid_feature_against_private_selected(generated_phi[inds[ii]]->value_ptr(), phi_sel_private, scores_sel_private, cur_score))
if((_is_valid(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, cur_score, node_value_arrs::N_SELECTED - _n_sis_select, 0)) && _is_valid_private(generated_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, phi_sel_private, scores_sel_private, cur_score))
{
if(scores_sel_private.size() == _n_sis_select)
{
......@@ -494,7 +505,7 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
for(int sc = 0; sc < scores_sel_private.size(); ++sc)
{
if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && valid_feature_against_private_selected(phi_sel_private[sc]->value_ptr(), phi_sel, scores_sel, scores_sel_private[sc]))
if(((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && _is_valid_private(phi_sel_private[sc]->value_ptr(), _n_samp, _cross_cor_max, phi_sel, scores_sel, scores_sel_private[sc]))
{
if(phi_sel.size() == _n_sis_select)
{
......@@ -513,33 +524,6 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<node_pt
}
}
/**
 * @brief Checks a candidate feature against the selected features in the index range [start_sel, end_sel)
 * @details A selected feature is only compared when its score is within 1e-5 of cur_score. The candidate
 *          is rejected as soon as base_val - |r(selected, candidate)| drops below 1.0 - _cross_cor_max + 1e-10,
 *          where base_val is util_funcs::r of the candidate with itself.
 *          NOTE(review): util_funcs::r is presumably the Pearson correlation -- confirm against its definition.
 *
 * @param val_ptr pointer to the value array of the candidate feature (read for _n_samp entries)
 * @param scores_sel scores of the selected features, indexed the same way as the d-matrix pointers
 * @param cur_score score of the candidate feature
 * @param end_sel index of the selected feature to stop checking at (exclusive)
 * @param start_sel index of the selected feature to start checking from (inclusive)
 *
 * @return true if no selected feature in the range invalidates the candidate
 */
bool FeatureSpace::valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel)
{
    const double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp);
    for(int dd = start_sel; dd < end_sel; ++dd)
    {
        // Use std::abs explicitly: an unqualified abs may resolve to the C int overload and
        // truncate the (double) score difference, which would defeat this cheap pre-filter.
        if(std::abs(cur_score - scores_sel[dd]) > 1e-5)
            continue;

        // Scores are (nearly) identical, so perform the more expensive correlation check
        if(base_val - std::abs(util_funcs::r(node_value_arrs::get_d_matrix_ptr(dd), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10)
            return false;
    }
    return true;
}
bool FeatureSpace::valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& scores_sel, double cur_score)
{
double base_val = util_funcs::r(val_ptr, val_ptr, _n_samp);
for(int ff = 0; ff < selected.size(); ++ff)
{
if((abs(scores_sel[ff] - cur_score) < 1e-5) && (base_val - std::abs(util_funcs::r(selected[ff]->value_ptr(1), val_ptr, _n_samp)) < 1.0 - _cross_cor_max + 1e-10))
return false;
}
return true;
}
void FeatureSpace::sis(std::vector<double>& prop)
{
boost::filesystem::path p(_feature_space_file.c_str());
......@@ -585,7 +569,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
start = omp_get_wtime();
while((cur_feat_local != _n_sis_select) && (ii < _scores.size()))
{
if(valid_feature_against_selected(_phi[inds[ii]]->value_ptr(), scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local))
if(_is_valid(_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, _scores[inds[ii]], cur_feat + cur_feat_local, 0))
{
scores_sel[cur_feat_local] = _scores[inds[ii]];
scores_sel_all[cur_feat + cur_feat_local] = _scores[inds[ii]];
......@@ -649,7 +633,7 @@ void FeatureSpace::sis(std::vector<double>& prop)
// Get the n_sis_select best features (compare against features sent from other processes)
while((cur_feat + cur_feat_local != node_value_arrs::N_SELECTED) && (ii < sent_scores.size()))
{
if(valid_feature_against_selected(sent_phi[inds[ii]]->value_ptr(), scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat))
if(_is_valid(sent_phi[inds[ii]]->value_ptr(), _n_samp, _cross_cor_max, scores_sel_all, sent_scores[inds[ii]], cur_feat + cur_feat_local, cur_feat))
{
out_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << sent_phi[inds[ii]]->postfix_expr() << std::endl;
sum_file_stream << std::setw(14) <<std::left << cur_feat + cur_feat_local << std::setw(24) << std::setprecision(18) << std::left << -1 * sent_scores[inds[ii]] << sent_phi[inds[ii]]->expr() << std::endl;
......
......@@ -16,6 +16,7 @@
#include <feature_creation/node/operator_nodes/allowed_ops.hpp>
#include <feature_creation/node/utils.hpp>
#include <feature_creation/node/value_storage/nodes_value_containers.hpp>
#include <utils/compare_features.hpp>
#include <utils/project.hpp>
#include <boost/serialization/shared_ptr.hpp>
......@@ -55,6 +56,10 @@ class FeatureSpace
std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project_no_omp; //!< Function used to calculate the scores for SIS without changing omp environment
std::function<bool(double*, int, double, std::vector<double>&, double, int, int)> _is_valid; //!< Function used to check if a feature is valid against the previously selected features (bound in initialize_fs depending on _cross_cor_max)
std::function<bool(double*, int, double, std::vector<node_ptr>&, std::vector<double>&, double)> _is_valid_private; //!< Function used to check if a feature is valid against an explicit list of selected features (bound to the *_no_omp variants in initialize_fs)
std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
......@@ -224,18 +229,6 @@ public:
*/
void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected);
/**
* @brief Checks the feature to see if it is still valid against previously selected features
*
* @param val_ptr pointer to value array of the current feature
* @param scores_sel scores of the previously selected features (compared against cur_score)
* @param cur_score score of the current feature
* @param end_sel index of the feature to stop checking
* @param start_sel index of the feature to start checking from (defaults to 0)
*
* @return True if the feature is still valid
*/
bool valid_feature_against_selected(double* val_ptr, std::vector<double>& scores_sel, double cur_score, int end_sel, int start_sel = 0);
/**
* @brief Checks the feature to see if it is still valid against an explicit list of selected features
*
* @param val_ptr pointer to value array of the current feature
* @param selected the selected features to check against
* @param scores_sel scores of the features in selected (compared against cur_score)
* @param cur_score score of the current feature
*
* @return True if the feature is still valid
*/
bool valid_feature_against_private_selected(double* val_ptr, std::vector<node_ptr>& selected, std::vector<double>& scores_sel, double cur_score);
/**
* @brief Perform SIS on a feature set with a specified property
* @details Perform sure-independence screening with either the correct property or the error
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment