Commit 4259aefb authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Add projection for classification

Projection score added for classification problems, now onto the di part
parent a2ee6e60
#include <convex_hull/utils.hpp>
// Definitions of the globals declared in convex_hull/utils.hpp; they are
// (re)populated by initialize_projection and read by overlap_1d.
std::vector<double> convex_hull::SORTED_VALUE;
std::vector<int> convex_hull::SORTED_PROP_INDS;
std::vector<std::vector<double>> convex_hull::CLS_MAX;
std::vector<std::vector<double>> convex_hull::CLS_MIN;
std::vector<std::vector<int>> convex_hull::CLS_SZ;
std::vector<double> convex_hull::TASK_SCORES;
int convex_hull::N_CLASS = 0;
int convex_hull::N_TASK = 0;
/**
 * @brief Set up the class/task bookkeeping used by overlap_1d
 * @details Sorts the samples of each task by property value so that each class
 *          (run of equal property values) is contiguous, and records the class
 *          sizes in CLS_SZ. Must be called before overlap_1d whenever the
 *          property changes.
 *
 * @param sizes The number of samples in each task
 * @param prop The pointer to the property (class-label) vector
 */
void convex_hull::initialize_projection(std::vector<int>& sizes, double* prop)
{
int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
SORTED_PROP_INDS = std::vector<int>(n_samp);
std::iota(SORTED_PROP_INDS.begin(), SORTED_PROP_INDS.end(), 0);
SORTED_VALUE = std::vector<double>(SORTED_PROP_INDS.size(), 0.0);
CLS_SZ = std::vector<std::vector<int>>(sizes.size());
int start = 0;
for(int tt = 0; tt < sizes.size(); ++tt)
{
int start_original = start;
// SORTED_PROP_INDS holds GLOBAL sample indices (from the iota above) and is
// later dereferenced as prop[SORTED_PROP_INDS[pp]], so the comparator must
// index from the base of prop. Passing prop + start here would add the task
// offset twice and read past the end of prop for every task after the first.
util_funcs::argsort(SORTED_PROP_INDS.data() + start, SORTED_PROP_INDS.data() + start + sizes[tt], prop);
// Walk the sorted range; every change in property value closes a class
for(int pp = start + 1; pp < start_original + sizes[tt]; ++pp)
{
if(prop[SORTED_PROP_INDS[pp]] != prop[SORTED_PROP_INDS[pp - 1]])
{
CLS_SZ[tt].push_back(pp - start);
start = pp;
}
}
// The final class runs to the end of the task
CLS_SZ[tt].push_back(sizes[tt] + start_original - start);
start = start_original + sizes[tt];
}
CLS_MAX = std::vector<std::vector<double>>(CLS_SZ.size());
CLS_MIN = std::vector<std::vector<double>>(CLS_SZ.size());
N_TASK = sizes.size();
TASK_SCORES = std::vector<double>(N_TASK, 0.0);
// Reset N_CLASS: this function runs once per property, and a stale maximum
// from a previous property would let loops bounded by N_CLASS overrun the
// per-task CLS_SZ vectors.
N_CLASS = 0;
for(int tt = 0; tt < N_TASK; ++tt)
{
CLS_MAX[tt] = std::vector<double>(CLS_SZ[tt].size(), 0.0);
CLS_MIN[tt] = std::vector<double>(CLS_SZ[tt].size(), 0.0);
N_CLASS = std::max(N_CLASS, static_cast<int>(CLS_SZ[tt].size()));
}
}
/**
 * @brief Calculate the classification projection score of a feature
 * @details Builds a 1D convex hull (min/max interval widened by width) for
 *          every class of every task and accumulates, per task, (a) the number
 *          of rival-class samples falling inside each class's hull and (b) a
 *          normalized overlap of the hull intervals. Lower scores mean better
 *          class separation.
 *
 * @param value The pointer to the feature's value array (global sample order)
 * @param width Half-width added to each side of a class's interval
 * @returns The projection score averaged over all tasks
 */
double convex_hull::overlap_1d(double* value, double width)
{
std::fill_n(TASK_SCORES.data(), TASK_SCORES.size(), 0.0);
// Reorder the feature values to match the property-sorted sample order
for(int ii = 0; ii < SORTED_PROP_INDS.size(); ++ii)
SORTED_VALUE[ii] = value[SORTED_PROP_INDS[ii]];
int start = 0;
for(int tt = 0; tt < N_TASK; ++tt)
{
// Bound by this task's own class count; tasks can have fewer classes than
// N_CLASS (the maximum over tasks), and using N_CLASS here would read past
// the end of CLS_SZ[tt]
for(int cc = 0; cc < CLS_SZ[tt].size(); ++cc)
{
CLS_MAX[tt][cc] = *std::max_element(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][cc]) + width;
CLS_MIN[tt][cc] = *std::min_element(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][cc]) - width;
start += CLS_SZ[tt][cc];
}
}
int task_start = 0;
for(int tt = 0; tt < N_TASK; ++tt)
{
int n_cls = static_cast<int>(CLS_SZ[tt].size());
// Pair normalization 2 / (n * (n - 1)); a degenerate single-class task has
// no class pairs and contributes nothing (avoids a division by zero)
double cls_norm = (n_cls > 1) ? 2.0 / (static_cast<double>(n_cls) * static_cast<double>(n_cls - 1)) : 0.0;
for(int c1 = 0; c1 < n_cls; ++c1)
{
// Start scanning at THIS task's offset in SORTED_VALUE; resetting to 0
// here (as before) made every task after the first scan task 0's samples
start = task_start;
double min = CLS_MIN[tt][c1];
double max = CLS_MAX[tt][c1];
for(int c2 = 0; c2 < n_cls; ++c2)
{
if(c1 == c2)
{
start += CLS_SZ[tt][c2];
continue;
}
// Rival-class samples that fall inside class c1's hull
TASK_SCORES[tt] += std::count_if(SORTED_VALUE.begin() + start, SORTED_VALUE.begin() + start + CLS_SZ[tt][c2], [min, max](double val){return (val >= min) && (val <= max);});
// Normalized length-overlap of the two hull intervals
TASK_SCORES[tt] += cls_norm * (1.0 - std::min(CLS_MAX[tt][c1] - CLS_MIN[tt][c2], CLS_MAX[tt][c2] - CLS_MIN[tt][c1]) / std::max(CLS_MAX[tt][c1] - CLS_MIN[tt][c2], CLS_MAX[tt][c2] - CLS_MIN[tt][c1]));
start += CLS_SZ[tt][c2];
}
}
task_start += std::accumulate(CLS_SZ[tt].begin(), CLS_SZ[tt].end(), 0);
}
return util_funcs::mean(TASK_SCORES);
}
\ No newline at end of file
/** @file convex_hull/utils.hpp
 *  @brief A set of functions to create convex hulls, and compare overlap/distance
 *
 *  @author Thomas A. R. Purcell (tpurcell)
 *  @bug No known bugs.
 */
#ifndef CONVEX_UTILS
#define CONVEX_UTILS
#include <utils/math_funcs.hpp>
#include <iomanip>
namespace convex_hull
{
extern std::vector<double> SORTED_VALUE; //!< The feature values reordered to the property-sorted sample order
extern std::vector<int> SORTED_PROP_INDS; //!< Sample indices sorted by property value (classes contiguous within each task)
extern std::vector<std::vector<double>> CLS_MAX; //!< The maximum value of the property in each class
extern std::vector<std::vector<double>> CLS_MIN; //!< The minimum value of the property in each class
extern std::vector<std::vector<int>> CLS_SZ; //!< The number of values in each class
extern std::vector<double> TASK_SCORES; //!< The scores for each task
extern int N_CLASS; //!< Number of classes (maximum over all tasks)
extern int N_TASK; //!< Number of tasks
/**
 * @brief Set up the class/task bookkeeping used by overlap_1d
 * @details Sorts the samples of each task by property value and records the
 *          size of each class (run of equal property values). Must be called
 *          before overlap_1d whenever the property changes.
 *
 * @param sizes The number of samples in each task
 * @param prop The pointer to the property (class-label) vector
 */
void initialize_projection(std::vector<int>& sizes, double* prop);
/**
 * @brief Calculate the classification projection score of a feature
 * @details Builds a 1D convex hull (min/max interval widened by width) for each
 *          class and scores how strongly the hulls of different classes overlap;
 *          lower scores mean better class separation.
 *
 * @param value The pointer to the feature's value array
 * @param width Half-width added to each side of a class's interval
 * @returns The projection score for the particular feature (averaged over tasks)
 */
double overlap_1d(double* value, double width = 1e-10);
}
#endif
\ No newline at end of file
......@@ -25,6 +25,7 @@ FeatureSpace::FeatureSpace(
std::vector<std::string> allowed_ops,
std::vector<double> prop,
std::vector<int> task_sizes,
std::string project_type,
int max_phi,
int n_sis_select,
int max_store_rung,
......@@ -53,10 +54,10 @@ FeatureSpace::FeatureSpace(
_n_rung_generate(n_rung_generate)
{
initialize_fs(prop);
initialize_fs(prop, project_type);
}
void FeatureSpace::initialize_fs(std::vector<double> prop)
void FeatureSpace::initialize_fs(std::vector<double> prop, std::string project_type)
{
if(_n_rung_store == -1)
_n_rung_store = _max_phi - 1;
......@@ -76,7 +77,13 @@ void FeatureSpace::initialize_fs(std::vector<double> prop)
out_file_stream.close();
sum_file_stream.close();
}
_project = project_funcs::project_r;
if(project_type.compare("regression") == 0)
_project = project_funcs::project_r2;
else if(project_type.compare("classification") == 0)
_project = project_funcs::project_classify;
else
throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
for(auto & op : _allowed_ops)
{
......
......@@ -76,6 +76,8 @@ public:
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
* @param task_sizes The number of samples per task
* @param project_type The projection operator to use
* @param max_phi highest rung value for the calculation
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
......@@ -90,6 +92,7 @@ public:
std::vector<std::string> allowed_ops,
std::vector<double> prop,
std::vector<int> task_sizes,
std::string project_type="pearson",
int max_phi=1,
int n_sis_select=1,
int max_store_rung=-1,
......@@ -104,7 +107,7 @@ public:
*
* @param prop The property trying to be learned
*/
void initialize_fs(std::vector<double> prop);
void initialize_fs(std::vector<double> prop, std::string project_type);
/**
* @brief Generate the full feature set from the allowed operators and initial feature set
......@@ -264,6 +267,8 @@ public:
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
* @param task_sizes The number of samples per task
* @param project_type The projection operator to use
* @param max_phi highest rung value for the calculation
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
......@@ -277,6 +282,7 @@ public:
py::list allowed_ops,
py::list prop,
py::list task_sizes,
std::string project_type="pearson",
int max_phi=1,
int n_sis_select=1,
int max_store_rung=-1,
......@@ -293,6 +299,8 @@ public:
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
* @param task_sizes The number of samples per task
* @param project_type The projection operator to use
* @param max_phi highest rung value for the calculation
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
......@@ -306,6 +314,7 @@ public:
py::list allowed_ops,
np::ndarray prop,
py::list task_sizes,
std::string project_type="pearson",
int max_phi=1,
int n_sis_select=1,
int max_store_rung=-1,
......@@ -321,6 +330,8 @@ public:
*
* @param feature_file The file with the postfix expressions for the feature space
* @param phi_0 The initial set of features to combine
* @param task_sizes The number of samples per task
* @param project_type The projection operator to use
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
* @param cross_corr_max Maximum cross-correlation used for selecting features
......@@ -329,6 +340,7 @@ public:
std::string feature_file,
py::list phi_0,
py::list task_sizes,
std::string project_type="pearson",
int n_sis_select=1,
double cross_corr_max=1.0
);
......
......@@ -6,6 +6,7 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh
_data_file(IP.get<std::string>("data_file", "data.csv")),
_prop_key(IP.get<std::string>("property_key", "prop")),
_task_key(IP.get<std::string>("task_key", "Task")),
_calc_type(IP.get<std::string>("calc_type", "regression")),
_leave_out_inds(as_vector<int>(IP, "leave_out_inds")),
_cross_cor_max(IP.get<double>("max_feat_cross_correlation", 1.0)),
_l_bound(IP.get<double>("min_abs_feat_val", 1e-50)),
......@@ -273,7 +274,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
for(int ff = 0; ff < headers.size(); ++ff)
phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff]));
_feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
_feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _calc_type, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
}
void stripComments(std::string& filename)
......
......@@ -48,6 +48,7 @@ public:
std::string _data_file; //!< Name of the data file
std::string _prop_key; //!< Key used to find the property column in the data file
std::string _task_key; //!< Key used to find the task column in the data file
std::string _calc_type; //!< Type of projection operator to use
std::shared_ptr<FeatureSpace> _feat_space; //!< shared_ptr to the FeatureSpace generated from the data file and the input file
......
......@@ -77,6 +77,7 @@ def generate_phi_0_from_csv(
# Load csv file
if isinstance(df, str):
df = pd.read_csv(df, index_col=0)
df = df.astype(float)
df_cols = df.columns.to_list()
col_exprs = [c.split(" (")[0] for c in df_cols]
prop_ind = np.where([c_ex == prop_key.split(" (")[0] for c_ex in col_exprs])[0][0]
......@@ -191,7 +192,9 @@ def generate_phi_0_from_csv(
)
def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select):
def generate_fs(
phi_0, prop, task_sizes_train, allowed_ops, calc_type, max_phi, n_sis_select
):
"""Generate a FeatureSet for the calculation
Args:
......@@ -200,9 +203,8 @@ def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_selec
task_sizes_train (list): The number of samples in the training data for each task
allowed_ops (list): List of operations used to combine the features
max_phi (int): Maximum rung for the calculation
calc_type (str): type of calculation regression or classification
n_sis_select (int): number of features to select in each round of SIS
task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
leave_out_frac (list): List of indices to pull from the training data to act as a test set
Returns:
fs (FeatureSpace): The FeatureSpace for the calculation
......@@ -230,7 +232,13 @@ def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_selec
]
return FeatureSpace(
phi_0, allowed_ops, list(prop), task_sizes_train, max_phi, n_sis_select
phi_0,
allowed_ops,
list(prop),
task_sizes_train,
calc_type,
max_phi,
n_sis_select,
)
......@@ -242,6 +250,7 @@ def generate_fs_sr_from_csv(
max_phi,
n_sis_select,
max_dim,
calc_type="regression",
n_residuals=1,
n_model_store=1,
task_key=None,
......@@ -258,6 +267,7 @@ def generate_fs_sr_from_csv(
max_phi (int): Maximum rung for the calculation
n_sis_select (int): number of features to select in each round of SIS
max_dim (int): Maximum dimension of the models to learn
calc_type (str): type of calculation regression or classification
n_residuals (int): number of residuals to use for the next SIS step when learning higher dimensional models
task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
leave_out_frac (float): Fraction of samples to leave out
......@@ -276,7 +286,9 @@ def generate_fs_sr_from_csv(
leave_out_inds=leave_out_inds,
)
fs = generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select)
fs = generate_fs(
phi_0, prop, task_sizes_train, allowed_ops, calc_type, max_phi, n_sis_select
)
sr = SISSORegressor(
fs,
......
......@@ -40,9 +40,9 @@ void sisso::feature_creation::registerFeatureSpace()
void (FeatureSpace::*sis_list)(list) = &FeatureSpace::sis;
void (FeatureSpace::*sis_ndarray)(np::ndarray) = &FeatureSpace::sis;
class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>())
.def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>())
.def(init<std::string, list, list, optional<int, double>>())
class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<std::string, int, int, int, int, double, double, double>>())
.def(init<list, list, list, list, optional<std::string, int, int, int, int, double, double, double>>())
.def(init<std::string, list, list, optional<std::string, int, double>>())
.def("sis", sis_list, "@DocString_feat_space_sis_list@")
.def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@")
.def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@")
......
......@@ -5,6 +5,7 @@ FeatureSpace::FeatureSpace(
py::list allowed_ops,
py::list prop,
py::list task_sizes,
std::string project_type,
int max_phi,
int n_sis_select,
int max_store_rung,
......@@ -32,7 +33,7 @@ FeatureSpace::FeatureSpace(
_n_rung_generate(n_rung_generate)
{
_n_samp = _phi_0[0]->n_samp();
initialize_fs(python_conv_utils::from_list<double>(prop));
initialize_fs(python_conv_utils::from_list<double>(prop), project_type);
}
FeatureSpace::FeatureSpace(
......@@ -40,6 +41,7 @@ FeatureSpace::FeatureSpace(
py::list allowed_ops,
np::ndarray prop,
py::list task_sizes,
std::string project_type,
int max_phi,
int n_sis_select,
int max_store_rung,
......@@ -67,13 +69,14 @@ FeatureSpace::FeatureSpace(
_n_rung_generate(n_rung_generate)
{
_n_samp = _phi_0[0]->n_samp();
initialize_fs(python_conv_utils::from_ndarray<double>(prop));
initialize_fs(python_conv_utils::from_ndarray<double>(prop), project_type);
}
FeatureSpace::FeatureSpace(
std::string feature_file,
py::list phi_0,
py::list task_sizes,
std::string project_type,
int n_sis_select,
double cross_corr_max
):
......@@ -92,7 +95,12 @@ FeatureSpace::FeatureSpace(
_n_rung_generate(0)
{
_n_samp = _phi_0[0]->n_samp();
_project = project_funcs::project_r;
if(project_type.compare("regression") == 0)
_project = project_funcs::project_r2;
else if(project_type.compare("classification") == 0)
_project = project_funcs::project_classify;
else
throw std::logic_error("Wrong projection type passed to FeatureSpace constructor.");
std::vector<node_ptr> phi_temp = str2node::phi_from_file(feature_file, _phi_0);
_n_feat = phi_temp.size();
......
......@@ -207,9 +207,9 @@ namespace util_funcs
/**
* @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
*
* @param vec vector to sort
* @param begin the starting point for the sorting
* @param end the end point for the sorting
* @param vec vector to sort
* @return The indexes of the sorted array
*/
inline void argsort(int* begin, int* end, std::vector<double>& vec)
......@@ -217,6 +217,18 @@ namespace util_funcs
std::sort(begin, end, [&vec](int i1, int i2){return vec[i1] < vec[i2];});
}
/**
 * @brief Fill [begin, end) with the ordering of indices that sorts the underlying data
 *
 * @param begin the starting point for the sorting
 * @param end the end point for the sorting
 * @param vec_begin start pointer to the vector to sort
 */
inline void argsort(int* begin, int* end, double* vec_begin)
{
    auto by_value = [vec_begin](int lhs, int rhs)
    {
        return vec_begin[lhs] < vec_begin[rhs];
    };
    std::sort(begin, end, by_value);
}
/**
* @brief Sort a vector and return the indexes of the unsorted array that corresponds to the sorted one
*
......@@ -245,6 +257,18 @@ namespace util_funcs
std::sort(begin, end, [&vec](int i1, int i2){return vec[i1] < vec[i2];});
}
/**
 * @brief Fill [begin, end) with the ordering of indices that sorts the underlying data
 *
 * @param begin the starting point for the sorting
 * @param end the end point for the sorting
 * @param vec_begin start pointer to the vector to sort
 */
inline void argsort(int* begin, int* end, int* vec_begin)
{
    auto by_value = [vec_begin](int lhs, int rhs)
    {
        return vec_begin[lhs] < vec_begin[rhs];
    };
    std::sort(begin, end, by_value);
}
/**
* @brief The maximum absolute value of the vector
*
......
......@@ -25,4 +25,20 @@ void project_funcs::project_r2(double* prop, double* scores, std::vector<node_pt
std::transform(phi.begin(), phi.end(), scores, scores, [&prop, &sizes](node_ptr feat, double sc){return std::min(-1 * util_funcs::r2(prop, feat->value_ptr(), sizes.data(), sizes.size()), sc);});
}
std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 0.0 : score;});
}
\ No newline at end of file
}
/**
 * @brief Calculate projection scores for classification
 * @details Create a 1D convex hull for each class and score how strongly the
 *          class hulls overlap for every feature; lower scores mean better
 *          separation. With multiple properties the best (lowest) score over
 *          all properties is kept, mirroring project_r2's min-combine.
 *
 * @param prop The classes to separate the property (n_prop blocks of n_samp values)
 * @param scores The vector to store the projection scores
 * @param phi The features to score
 * @param sizes List of the sizes for each task
 * @param n_prop Number of properties
 */
void project_funcs::project_classify(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop)
{
int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
convex_hull::initialize_projection(sizes, prop);
std::transform(phi.begin(), phi.end(), scores, [](node_ptr feat){return convex_hull::overlap_1d(feat->value_ptr());});
for(int pp = 1; pp < n_prop; ++pp)
{
prop += n_samp;
convex_hull::initialize_projection(sizes, prop);
// Min-combine with the scores accumulated so far; the previous plain
// transform overwrote them, keeping only the last property's result
std::transform(phi.begin(), phi.end(), scores, scores, [](node_ptr feat, double sc){return std::min(convex_hull::overlap_1d(feat->value_ptr()), sc);});
}
// A NaN score means the feature is unusable; map it to +inf so it is never selected
std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;});
}
......@@ -4,7 +4,13 @@
* @author Thomas A. R. Purcell (tpurcell)
* @bug No known bugs.
*/
#ifndef UTILS_PROJECT
#define UTILS_PROJECT
#include <limits>
#include <feature_creation/node/Node.hpp>
#include <convex_hull/utils.hpp>
namespace project_funcs
{
......@@ -29,4 +35,17 @@ namespace project_funcs
* @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
*/
void project_r2(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
}
\ No newline at end of file
/**
* @brief Calculate projection scores for classification
* @details Create a 1D-convex hull for each class and calculate the projection score
*
* @param prop The classes to separate the property
* @param scores the vector to store the projection scores
* @param size list of the sizes for each task
* @param n_prop number of properties
*/
void project_classify(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
}
#endif
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment