Commit 7786a5a3 authored by Thomas Purcell

Merge branch 'data_overwrite_error' into 'joss'

Use inner_product of Standardized Values instead of r for calculating overlap

See merge request tpurcell/cpp_sisso!37
parents 34492b28 6f8b8658
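The core idea behind this merge: once two feature vectors are standardized to zero mean and unit variance, their Pearson correlation reduces to a scaled inner product, so the overlap check no longer needs util_funcs::r to recompute means and standard deviations for every pair of features. A minimal, self-contained sketch of that equivalence (the helper names below are illustrative and not part of the code base):

#include <cmath>
#include <numeric>
#include <vector>

// Illustrative helper: standardize x in place (the project uses util_funcs::standardize for this).
void standardize(std::vector<double>& x)
{
    const double n = static_cast<double>(x.size());
    const double mean = std::accumulate(x.begin(), x.end(), 0.0) / n;
    double var = 0.0;
    for (double v : x)
    {
        var += (v - mean) * (v - mean);
    }
    const double stand_dev = std::sqrt(var / n);
    for (double& v : x)
    {
        v = (v - mean) / stand_dev;
    }
}

// For standardized x and y, the Pearson correlation is simply (1 / n) * <x, y>.
double overlap(const std::vector<double>& x, const std::vector<double>& y)
{
    return std::inner_product(x.begin(), x.end(), y.begin(), 0.0) / static_cast<double>(x.size());
}

This is also why comp_value in the hunks below becomes 1.0 / n_samp * (base_val - |inner_product(val_ptr, stand_d_matrix_ptr)|): for standardized values base_val / n_samp is approximately 1, so comp_value is approximately 1 - |r|, and a value near zero flags an effectively duplicate feature.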
......@@ -63,14 +63,15 @@ if(EXTERNAL_BOOST)
message(STATUS "Using external boost")
set(EXTERNAL_BOOST TRUE)
else(EXTERNAL_BOOST)
if(NOT DEFINED EXTERNAL_BUILD_N_PROCS)
set(EXTERNAL_BUILD_N_PROCS 1 CACHE STRING "Number of processes to use when building Boost")
endif()
message(STATUS "Building boost wth ${EXTERNAL_BUILD_N_PROCS} process(es)")
include( ExternalProject )
set(EXTERNAL_BOOST FALSE)
endif()
if(NOT DEFINED EXTERNAL_BUILD_N_PROCS)
set(EXTERNAL_BUILD_N_PROCS 1 CACHE STRING "Number of processes to use when building Boost")
endif()
# Check for FindOpenMP
find_package(OpenMP REQUIRED)
if (OPENMP_FOUND)
......
......@@ -79,7 +79,7 @@ class FeatureSpace
const std::string _phi_out_file; //!< Filename of the file to output the feature set to
std::function<bool(const double*, const int, const double, const std::vector<double>&, const double, const int, const int)> _is_valid; //!< Function used to determine if a feature is too correlated to previously selected features
std::function<bool(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)> _is_valid_feat_list; //!< Function used to determine if a feature is too correlated to previously selected features within a given list
std::function<int(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)> _is_valid_feat_list; //!< Function used to determine if a feature is too correlated to previously selected features within a given list
std::shared_ptr<MPI_Interface> _mpi_comm; //!< the MPI communicator for the calculation
......@@ -146,6 +146,24 @@ public:
*/
void initialize_fs_output_files() const;
/**
* @brief Remove duplicate features from the feature space
*
* @param feat_set Feature space to remove the duplicates from
* @param start The index to start the removal from
*/
void remove_duplicate_features(std::vector<node_ptr>& feat_set, int start);
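A hypothetical sketch of how such duplicate removal can piggyback on the standardized values introduced in this merge; this is not the actual FeatureSpace implementation, and the comparison rule (standardized inner product numerically equal to 1) is an assumption:

// Hypothetical sketch only -- not the actual FeatureSpace::remove_duplicate_features.
// node_ptr is the project's shared pointer to a Node; a feature is treated as a duplicate
// when the inner product of its standardized values with an earlier feature is ~1.
#include <algorithm>
#include <cmath>
#include <numeric>
#include <vector>

void remove_duplicate_features_sketch(std::vector<node_ptr>& feat_set, int start)
{
    auto is_duplicate = [&](const node_ptr& feat)
    {
        const int n_samp = feat->n_samp();
        const double* cand = feat->stand_value_ptr();
        for (int ff = 0; ff < start; ++ff)
        {
            // for_comp = true keeps the earlier feature in a separate temporary slot
            const double r = std::inner_product(
                cand, cand + n_samp, feat_set[ff]->stand_value_ptr(true), 0.0
            ) / static_cast<double>(n_samp);
            if (std::abs(r) > 1.0 - 1.0e-10)
            {
                return true;
            }
        }
        return false;
    };
    feat_set.erase(
        std::remove_if(feat_set.begin() + start, feat_set.end(), is_duplicate),
        feat_set.end()
    );
}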
#ifdef PARAMETERIZE
/**
* @brief Reorder features based on the number of parameters they have (smallest to largest)
*
* @param feat_set Feature space to reorder
* @param start The index to start the reordering from
*/
int reorder_by_n_params(std::vector<node_ptr>& feat_set, int start);
#endif
/**
* @brief Populate _phi using _phi_0 and the allowed operators up to (_max_rung - _n_rung_generate)^th rung
*/
......
......@@ -48,3 +48,49 @@ std::map<std::string, int> Node::primary_feature_decomp() const
}
BOOST_SERIALIZATION_ASSUME_ABSTRACT(Node)
void Node::set_standardized_value(const bool for_comp) const
{
double* stand_val_ptr;
if(_selected)
{
stand_val_ptr = node_value_arrs::get_stand_d_matrix_ptr(_d_mat_ind);
}
else
{
stand_val_ptr = node_value_arrs::access_temp_stand_storage(_arr_ind, for_comp);
}
util_funcs::standardize(value_ptr(-1, for_comp), _n_samp, stand_val_ptr);
}
void Node::set_standardized_test_value(const bool for_comp) const
{
double* val_ptr = value_ptr(-1, for_comp);
double* test_val_ptr = test_value_ptr(-1, for_comp);
double* stand_val_ptr = node_value_arrs::access_temp_stand_storage_test(_arr_ind, for_comp);
double mean = util_funcs::mean(val_ptr, _n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, _n_samp, mean);
std::transform(
test_val_ptr,
test_val_ptr + _n_samp_test,
stand_val_ptr,
[&](double val){return (val - mean) / stand_dev;}
);
}
double* Node::stand_value_ptr(const bool for_comp) const
{
if(_selected)
{
return node_value_arrs::get_stand_d_matrix_ptr(_d_mat_ind);
}
set_standardized_value(for_comp);
return node_value_arrs::access_temp_stand_storage(_arr_ind, for_comp);
}
double* Node::stand_test_value_ptr(const bool for_comp) const
{
set_standardized_test_value(for_comp);
return node_value_arrs::access_temp_stand_storage_test(_arr_ind, for_comp);
}
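A minimal usage sketch of the accessors added above (feat_a, feat_b, and feature_overlap are hypothetical names, not part of the patch): once both features expose standardized values, their Pearson correlation is a single inner product.

// Hypothetical usage of the new accessors; feat_a and feat_b are assumed to be valid
// node_ptr features with the same number of training samples.
#include <numeric>

double feature_overlap(const node_ptr& feat_a, const node_ptr& feat_b)
{
    const int n_samp = feat_a->n_samp();
    const double* a = feat_a->stand_value_ptr();      // for_comp = false slot
    const double* b = feat_b->stand_value_ptr(true);  // for_comp = true slot, so a is not overwritten
    return std::inner_product(a, a + n_samp, b, 0.0) / static_cast<double>(n_samp);
}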
......@@ -279,6 +279,15 @@ public:
*/
virtual void set_value(int offset=-1, const bool for_comp=false) const = 0;
// DocString: node_set_stand_value
/**
* @brief Set the value of all training samples to the standardized values for the feature inside the central data storage arrays
*
* @param for_comp (bool) If true then the evaluation is used for comparing features
*/
void set_standardized_value(const bool for_comp=false) const;
/**
* @brief The pointer to where the feature's training data is stored
*
......@@ -289,6 +298,16 @@ public:
*/
virtual double* value_ptr(int offset=-1, const bool for_comp=false) const = 0;
/**
* @brief The pointer to where the feature's standardized training data is stored
*
* @param for_comp (bool) If true then the evaluation is used for comparing features
*
* @return pointer to the feature's standardized training values
*/
double* stand_value_ptr(const bool for_comp=false) const;
// DocString: node_set_test_value
/**
* @brief Set the value of all test samples for the feature inside the central data storage array
......@@ -298,6 +317,15 @@ public:
*/
virtual void set_test_value(int offset=-1, const bool for_comp=false) const = 0;
// DocString: node_set_stand_test_value
/**
* @brief Set the value of all test samples to the standardized values for the feature inside the central data storage array
*
* @param for_comp (bool) If true then the evaluation is used for comparing features
*/
void set_standardized_test_value(const bool for_comp=false) const;
/**
* @brief The pointer to where the feature's test data is stored
*
......@@ -308,6 +336,16 @@ public:
*/
virtual double* test_value_ptr(int offset=-1, const bool for_comp=false) const = 0;
/**
* @brief The pointer to where the feature's standardized test data is stored
*
* @param for_comp (bool) If true then the evaluation is used for comparing features
*
* @return pointer to the feature's standardized test values
*/
double* stand_test_value_ptr(const bool for_comp=false) const;
// DocString: node_is_nan
/**
* @brief Check if the feature has a NaN value in it
......
......@@ -47,6 +47,10 @@ std::vector<double> node_value_arrs::TEST_VALUES_ARR;
std::vector<double> node_value_arrs::TEMP_STORAGE_ARR;
std::vector<double> node_value_arrs::TEMP_STORAGE_TEST_ARR;
std::vector<double> node_value_arrs::STANDARDIZED_D_MATRIX;
std::vector<double> node_value_arrs::STANDARDIZED_STORAGE_ARR;
std::vector<double> node_value_arrs::STANDARDIZED_TEST_STORAGE_ARR;
void node_value_arrs::initialize_values_arr(
const int n_samples,
const int n_samples_test,
......@@ -61,6 +65,8 @@ void node_value_arrs::initialize_values_arr(
VALUES_ARR = std::vector<double>(N_STORE_FEATURES * N_SAMPLES);
TEST_VALUES_ARR = std::vector<double>(N_STORE_FEATURES * N_SAMPLES_TEST);
STANDARDIZED_STORAGE_ARR = std::vector<double>(2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES * MAX_N_THREADS);
STANDARDIZED_TEST_STORAGE_ARR = std::vector<double>(2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES_TEST * MAX_N_THREADS);
}
void node_value_arrs::initialize_values_arr(
......@@ -174,6 +180,9 @@ void node_value_arrs::resize_values_arr(const int n_dims, const int n_feat)
{
N_PRIMARY_FEATURES = N_STORE_FEATURES;
STANDARDIZED_STORAGE_ARR = std::vector<double>(2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES * MAX_N_THREADS);
STANDARDIZED_TEST_STORAGE_ARR = std::vector<double>(2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES_TEST * MAX_N_THREADS);
TEMP_STORAGE_ARR.resize(MAX_N_THREADS * (N_OP_SLOTS * N_PRIMARY_FEATURES + 1) * N_SAMPLES);
TEMP_STORAGE_ARR.shrink_to_fit();
......@@ -236,6 +245,7 @@ void node_value_arrs::initialize_d_matrix_arr()
{
N_SELECTED = 0;
D_MATRIX = std::vector<double>(0);
STANDARDIZED_D_MATRIX = std::vector<double>(0);
}
void node_value_arrs::resize_d_matrix_arr(const int n_select)
......@@ -243,6 +253,9 @@ void node_value_arrs::resize_d_matrix_arr(const int n_select)
N_SELECTED += n_select;
D_MATRIX.resize(N_SELECTED * N_SAMPLES, 0.0);
D_MATRIX.shrink_to_fit();
STANDARDIZED_D_MATRIX.resize(N_SELECTED * N_SAMPLES, 0.0);
STANDARDIZED_D_MATRIX.shrink_to_fit();
}
void node_value_arrs::finalize_values_arr()
......@@ -265,11 +278,18 @@ void node_value_arrs::finalize_values_arr()
TASK_START_TRAIN.resize(0);
TASK_SZ_TEST.resize(0);
PARAM_STORAGE_ARR.resize(0);
PARAM_STORAGE_TEST_ARR.resize(0);
D_MATRIX.resize(0);
VALUES_ARR.resize(0);
TEST_VALUES_ARR.resize(0);
TEMP_STORAGE_ARR.resize(0);
TEMP_STORAGE_TEST_ARR.resize(0);
PARAM_STORAGE_ARR.resize(0);
PARAM_STORAGE_TEST_ARR.resize(0);
STANDARDIZED_D_MATRIX.resize(0);
STANDARDIZED_STORAGE_ARR.resize(0);
STANDARDIZED_TEST_STORAGE_ARR.resize(0);
}
......@@ -58,6 +58,10 @@ namespace node_value_arrs
extern std::vector<int> TASK_START_TRAIN; //!< The starting point for each task in the training data
extern std::vector<int> TASK_SZ_TEST; //!< Number of test sample per task
extern std::vector<double> STANDARDIZED_D_MATRIX; //!< The descriptor matrix filled with standardized feature values (Central storage for the selected feature space)
extern std::vector<double> STANDARDIZED_STORAGE_ARR; //!< Temporary storage for the standardized training values of the features
extern std::vector<double> STANDARDIZED_TEST_STORAGE_ARR; //!< Temporary storage for the standardized test values of the features
extern int N_SELECTED; //!< Number of selected features
extern int N_SAMPLES; //!< Number of training samples for each feature (Sum of all elements in TASK_SZ_TRAIN)
......@@ -290,6 +294,38 @@ namespace node_value_arrs
*/
inline double* access_temp_storage_test(const unsigned long int slot){return &TEMP_STORAGE_TEST_ARR[slot*N_SAMPLES_TEST];}
/**
* @brief Access element of temporary standardized storage array for the training data
*
* @param arr_ind The array index of the feature
* @param for_comp True if used for a comparison
*
* @return pointer to the data stored in the specified slot
*/
inline double* access_temp_stand_storage(const unsigned long int arr_ind, const bool for_comp)
{
return &STANDARDIZED_STORAGE_ARR[
((arr_ind % N_PRIMARY_FEATURES) + for_comp * N_PRIMARY_FEATURES) * N_SAMPLES +
omp_get_thread_num() * 2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES
];
}
/**
* @brief Access element of temporary standardized storage array for the test data
*
* @param arr_ind The array index of the feature
* @param for_comp True if used for a comparison
*
* @return pointer to the data stored in the specified slot
*/
inline double* access_temp_stand_storage_test(const unsigned long int arr_ind, const bool for_comp)
{
return &STANDARDIZED_TEST_STORAGE_ARR[
((arr_ind % N_PRIMARY_FEATURES) + for_comp * N_PRIMARY_FEATURES) * N_SAMPLES_TEST +
omp_get_thread_num() * 2 * (N_PRIMARY_FEATURES + 1) * N_SAMPLES_TEST
];
}
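A small worked example of the slot arithmetic above, with made-up sizes: each thread owns a contiguous block of 2 * (N_PRIMARY_FEATURES + 1) sample-length slots, and the for_comp flag selects the second half of that block so a feature and its comparison partner never share a slot.

// Worked example of the access_temp_stand_storage offset, with illustrative sizes only.
#include <cstdio>

int main()
{
    const unsigned long n_primary_features = 10;  // N_PRIMARY_FEATURES
    const unsigned long n_samples = 100;          // N_SAMPLES
    const unsigned long thread_num = 2;           // omp_get_thread_num()
    const unsigned long arr_ind = 14;             // feature's array index
    const bool for_comp = true;

    const unsigned long offset =
        ((arr_ind % n_primary_features) + for_comp * n_primary_features) * n_samples +
        thread_num * 2 * (n_primary_features + 1) * n_samples;

    // ((14 % 10) + 10) * 100 + 2 * 2 * 11 * 100 = 1400 + 4400 = 5800
    std::printf("offset = %lu\n", offset);
    return 0;
}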
/**
* @brief Access the param storage array
*
......@@ -367,7 +403,7 @@ namespace node_value_arrs
);
/**
* @brief Get the pointer to a particular selected Node from sis
* @brief Get the pointer to a particular selected Node's data from sis
*
* @param ind Index of the data in the descriptor matrix
* @return The pointer to the descriptor matrix's data
......@@ -375,7 +411,7 @@ namespace node_value_arrs
inline double* get_d_matrix_ptr(const int ind){return &D_MATRIX[ind * N_SAMPLES];}
/**
* @brief Get the pointer to a particular selected Node from sis
* @brief Get the pointer to a particular selected Node's data from sis
*
* @param ind Index of the data in the descriptor matrix
* @param taskind The index for the given task
......@@ -383,6 +419,23 @@ namespace node_value_arrs
*/
inline double* get_d_matrix_ptr(const int ind, const int taskind){return &D_MATRIX[ind * N_SAMPLES + TASK_START_TRAIN[taskind]];}
/**
* @brief Get the pointer to a particular selected Node's standardized data from sis
*
* @param ind Index of the data in the descriptor matrix
* @return The pointer to the descriptor matrix's standardized data
*/
inline double* get_stand_d_matrix_ptr(const int ind){return &STANDARDIZED_D_MATRIX[ind * N_SAMPLES];}
/**
* @brief Get the pointer to a particular selected Node's standardized data from sis
*
* @param ind Index of the data in the descriptor matrix
* @param taskind The index for the given task
* @return The pointer to the descriptor matrix's standardized data
*/
inline double* get_stand_d_matrix_ptr(const int ind, const int taskind){return &STANDARDIZED_D_MATRIX[ind * N_SAMPLES + TASK_START_TRAIN[taskind]];}
/**
* @brief Flush the temporary storage register (training data)
* @details Reset all slots in the register to -1
......
......@@ -65,20 +65,20 @@ std::vector<node_sc_pair> mpi_reduce_op::select_top_feats(std::vector<node_sc_pa
// Merge input vectors and sort
in_vec_2.insert(in_vec_2.end(), in_vec_1.begin(), in_vec_1.end());
std::sort(in_vec_2.begin(), in_vec_2.end(), my_sorter);
std::sort(in_vec_2.begin(), in_vec_2.end());
// Populate the output vector
int ff = 0;
int out_ind = 0;
while((out_ind < N_SIS_SELECT) && (ff < in_vec_2.size()))
{
const node_ptr cur_node = std::get<0>(in_vec_2[ff]);
if(cur_node && IS_VALID(cur_node->value_ptr(), cur_node->n_samp(), CROSS_COR_MAX, out_vec, std::get<1>(in_vec_2[ff])))
if(in_vec_2[ff]._feat && IS_VALID(in_vec_2[ff]._feat->stand_value_ptr(), in_vec_2[ff]._feat->n_samp(), CROSS_COR_MAX, out_vec, in_vec_2[ff]._score))
{
out_vec.push_back(in_vec_2[ff]);
++out_ind;
}
++ff;
}
return out_vec;
}
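The call sites above suggest node_sc_pair is no longer a std::tuple but a small struct carrying the feature and its score and ordered by score, so the plain std::sort call works without my_sorter. A sketch of such a type, inferred from these call sites only (the real definition lives elsewhere in the repository):

// Illustrative sketch only; inferred from the _feat / _score accesses above.
struct node_sc_pair
{
    node_ptr _feat;  // the candidate feature
    double _score;   // its projection score

    // Orders pairs by score so std::sort needs no custom comparator.
    bool operator<(const node_sc_pair& rhs) const { return _score < rhs._score; }
};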
......@@ -35,26 +35,6 @@ namespace mpi_reduce_op
extern double CROSS_COR_MAX; //!< The maximum cross correlation between features
extern int N_SIS_SELECT; //!< The number of features to select
/**
* @brief Create a node_sc pair from a node_ptr and a score value
*
* @param feat the node_ptr for the pair
* @param sc the score for the pair
*
* @return The resulting pair
*/
inline node_sc_pair make_node_sc_pair(node_ptr feat, double sc){return std::make_tuple(feat, sc);}
/**
* @brief The function for sorting different node_sc pointers
*
* @param node_1 first node to compare
* @param node_2 second node to compare
*
* @return True if the score of node_1 is less than the score of node_2
*/
inline bool my_sorter(node_sc_pair node_1, node_sc_pair node_2){ return (std::get<1>(node_1) < std::get<1>(node_2)); }
/**
* @brief Get the top features of the combined input vectors
*
......
......@@ -21,6 +21,7 @@
#include "utils/compare_features.hpp"
#include <iomanip>
std::vector<double> comp_feats::DGEMV_OUT;
std::vector<double> comp_feats::RANK;
std::vector<int> comp_feats::INDEX;
......@@ -29,13 +30,14 @@ void comp_feats::set_is_valid_fxn(
const double max_corr,
const int n_samp,
std::function<bool(const double*, const int, const double, const std::vector<double>&, const double, const int, const int)>& is_valid,
std::function<bool(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)>& is_valid_feat_list
std::function<int(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)>& is_valid_feat_list
)
{
if(project_type.compare("classification") != 0)
{
if(max_corr < 0.99999)
{
DGEMV_OUT.resize(n_samp);
is_valid = valid_feature_against_selected_pearson;
is_valid_feat_list = valid_feature_against_selected_pearson_feat_list;
}
......@@ -79,9 +81,7 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1(
const int start_sel
)
{
double mean = util_funcs::mean<double>(val_ptr, n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, n_samp, mean);
double base_val = util_funcs::r(val_ptr, val_ptr, n_samp, mean, stand_dev, mean, stand_dev);
double base_val = std::inner_product(val_ptr, val_ptr + n_samp, val_ptr, 0.0);
for(int dd = start_sel; dd < end_sel; ++dd)
{
......@@ -90,9 +90,18 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1(
continue;
}
double comp_value = (
base_val - std::abs(util_funcs::r(val_ptr, node_value_arrs::get_d_matrix_ptr(dd), n_samp, mean, stand_dev))
double comp_value = 1.0 / static_cast<double>(n_samp) * (
base_val -
std::abs(
std::inner_product(
val_ptr,
val_ptr + n_samp,
node_value_arrs::get_stand_d_matrix_ptr(dd),
0.0
)
)
);
if(std::abs(comp_value) < 5.0e-9)
{
return false;
......@@ -101,7 +110,7 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1(
return true;
}
bool comp_feats::valid_feature_against_selected_pearson_max_corr_1_feat_list(
int comp_feats::valid_feature_against_selected_pearson_max_corr_1_feat_list(
const double* val_ptr,
const int n_samp,
const double cross_cor_max,
......@@ -110,9 +119,7 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1_feat_list(
const double cur_score
)
{
double mean = util_funcs::mean<double>(val_ptr, n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, n_samp, mean);
double base_val = util_funcs::r(val_ptr, val_ptr, n_samp, mean, stand_dev, mean, stand_dev);
double base_val = std::inner_product(val_ptr, val_ptr + n_samp, val_ptr, 0.0);
for(int ff = 0; ff < selected.size(); ++ff)
{
......@@ -121,15 +128,24 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1_feat_list(
continue;
}
double comp_value = (
base_val - std::abs(util_funcs::r(val_ptr, selected[ff]->value_ptr(-1, true), n_samp, mean, stand_dev))
double comp_value = 1.0 / static_cast<double>(n_samp) * (
base_val -
std::abs(
std::inner_product(
val_ptr,
val_ptr + n_samp,
selected[ff]->stand_value_ptr(true),
0.0
)
)
);
if(std::abs(comp_value) < 5.0e-9)
{
return false;
return 0;
}
}
return true;
return 1;
}
bool comp_feats::valid_feature_against_selected_pearson_max_corr_1_mpi_op(
......@@ -140,20 +156,27 @@ bool comp_feats::valid_feature_against_selected_pearson_max_corr_1_mpi_op(
const double cur_score
)
{
double mean = util_funcs::mean<double>(val_ptr, n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, n_samp, mean);
double base_val = util_funcs::r(val_ptr, val_ptr, n_samp, mean, stand_dev, mean, stand_dev);
double base_val = std::inner_product(val_ptr, val_ptr + n_samp, val_ptr, 0.0);
for(auto& feat_sc : out_vec)
{
if(abs(cur_score - std::get<1>(feat_sc)) > 1e-5)
if(std::abs(cur_score - feat_sc._score) > 1e-5)
{
continue;
}
double comp_value = (
base_val - std::abs(util_funcs::r(val_ptr, std::get<0>(feat_sc)->value_ptr(-1, true), n_samp, mean, stand_dev))
double comp_value = 1.0 / static_cast<double>(n_samp) * (
base_val -
std::abs(
std::inner_product(
val_ptr,
val_ptr + n_samp,
feat_sc._feat->stand_value_ptr(true),
0.0
)
)
);
if(std::abs(comp_value) < 5.0e-9)
{
return false;
......@@ -173,32 +196,30 @@ bool comp_feats::valid_feature_against_selected_pearson(
const int start_sel
)
{
double mean = util_funcs::mean<double>(val_ptr, n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, n_samp, mean);
double base_val = util_funcs::r(val_ptr, val_ptr, n_samp, mean, stand_dev, mean, stand_dev);
volatile bool is_valid = true;
#pragma omp parallel for schedule(dynamic)
for(int dd = start_sel; dd < end_sel; ++dd)
if(end_sel <= start_sel)
{
if(!is_valid)
{
continue;
}
double comp_value = (
base_val - std::abs(util_funcs::r(val_ptr, node_value_arrs::get_d_matrix_ptr(dd), n_samp, mean, stand_dev))
);
if(std::abs(comp_value) < (1.0 - cross_cor_max + 5.0e-9))
{
is_valid = false;
}
return true;
}
return is_valid;
DGEMV_OUT.resize(end_sel - start_sel);
dgemv_(
'N',
DGEMV_OUT.size(),
n_samp,
1.0 / static_cast<double>(n_samp),
node_value_arrs::get_stand_d_matrix_ptr(start_sel),
DGEMV_OUT.size(),
val_ptr,
1,
0.0,
DGEMV_OUT.data(),
1
);
return std::abs(DGEMV_OUT[idamax_(DGEMV_OUT.size(), DGEMV_OUT.data(), 1) - 1]) <= cross_cor_max;
}
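For readability, a loop-based equivalent of the dgemv_/idamax_ check above (illustrative only; the patch keeps the BLAS calls for speed): compute the scaled inner product of val_ptr with each standardized column of the descriptor matrix and reject the candidate when the largest absolute correlation exceeds cross_cor_max.

// Illustrative, loop-based equivalent of the BLAS-based check above.
#include <algorithm>
#include <cmath>
#include <numeric>

bool valid_against_selected_loop(
    const double* val_ptr,
    const int n_samp,
    const double cross_cor_max,
    const int start_sel,
    const int end_sel
)
{
    double max_abs_corr = 0.0;
    for (int dd = start_sel; dd < end_sel; ++dd)
    {
        // Both vectors are standardized, so this scaled inner product is the Pearson correlation
        const double corr = std::inner_product(
            val_ptr, val_ptr + n_samp, node_value_arrs::get_stand_d_matrix_ptr(dd), 0.0
        ) / static_cast<double>(n_samp);
        max_abs_corr = std::max(max_abs_corr, std::abs(corr));
    }
    return max_abs_corr <= cross_cor_max;
}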
bool comp_feats::valid_feature_against_selected_pearson_feat_list(
int comp_feats::valid_feature_against_selected_pearson_feat_list(
const double* val_ptr,
const int n_samp,
const double cross_cor_max,
......@@ -207,21 +228,21 @@ bool comp_feats::valid_feature_against_selected_pearson_feat_list(
const double cur_score
)
{
double mean = util_funcs::mean<double>(val_ptr, n_samp);
double stand_dev = util_funcs::stand_dev(val_ptr, n_samp, mean);
double base_val = util_funcs::r(val_ptr, val_ptr, n_samp, mean, stand_dev, mean, stand_dev);
for(auto& feat : selected)
int is_valid = 1;
double comp_value = 1.0;
for(int ff = 0; ff < selected.size(); ++ff)
{
double comp_value = (
base_val - std::abs(util_funcs::r(val_ptr, feat->value_ptr(-1, true), n_samp, mean, stand_dev))
comp_value = 1.0 / static_cast<double>(n_samp) * std::abs(
std::inner_product(val_ptr, val_ptr + n_samp, selected[ff]->stand_value_ptr(true), 0.0)
);
if(std::abs(comp_value) < (1.0 - cross_cor_max + 5.0e-9))
if((comp_value > cross_cor_max) && (cur_score > scores_sel[ff]))
{
return false;
return 0;
}
is_valid -= 2 * (comp_value > cross_cor_max);
}
return true;
return is_valid / std::abs