Commit 8c344b22 authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Create a feature space constructor to build from a file for python

For the role descriptor
parent af14f591
......@@ -765,3 +765,11 @@ void FeatureSpace::sis(std::vector<double>& prop)
sum_file_stream.close();
}
}
void FeatureSpace::remove_feature(int ind)
{
if(_phi[ind]->selected())
throw std::logic_error("Can't remove a selected feature");
_phi.erase(_phi.begin() + ind);
}
......@@ -14,6 +14,7 @@
#include <feature_creation/node/FeatureNode.hpp>
#include <feature_creation/node/ModelNode.hpp>
#include <feature_creation/node/operator_nodes/allowed_ops.hpp>
#include <feature_creation/node/utils.hpp>
#include <feature_creation/node/value_storage/nodes_value_containers.hpp>
#include <utils/project.hpp>
......@@ -272,13 +273,20 @@ public:
*/
inline bool feat_in_phi(int ind){return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());}
// DocString: feat_space_remove_feature
/**
* @brief Remove a feature from phi
*
* @param ind index of feature to remove
*/
void remove_feature(int ind);
// Python Interface Functions
#ifdef PY_BINDINGS
/**
* @brief Constructor for the feature space that takes in python objects
* @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
*
* @param mpi_comm MPI communicator for the calculations
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
......@@ -308,7 +316,6 @@ public:
* @brief Constructor for the feature space that takes in python and numpy objects
* @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
*
* @param mpi_comm MPI communicator for the calculations
* @param phi_0 The initial set of features to combine
* @param allowed_ops list of allowed operators
* @param prop The property to be learned (training data)
......@@ -334,6 +341,24 @@ public:
double max_abs_feat_val=1e50
);
/**
* @brief Constructor for the feature space that takes in python and numpy objects
* @details constructs the feature space from an initial set of features and a file containing postfix expressions for the features (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
*
* @param feature_file The file with the postfix expressions for the feature space
* @param phi_0 The initial set of features to combine
* @param n_sis_select number of features to select during each SIS step
* @param max_store_rung number of rungs to calculate and store the value of the features for all samples
* @param cross_corr_max Maximum cross-correlation used for selecting features
*/
FeatureSpace(
std::string feature_file,
py::list phi_0,
py::list task_sizes,
int n_sis_select=1,
double cross_corr_max=1.0
);
// DocString: feat_space_sis_arr
/**
* @brief Wrapper function for SIS using a numpy array
......@@ -372,6 +397,13 @@ public:
*/
py::list phi0_py();
// DocString: feat_space_phi_py
/**
* @brief The feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
* @return _phi as a python list
*/
py::list phi_py();
// DocString: feat_space_scores_py
/**
* @brief The vector of projection scores for SIS
......@@ -400,6 +432,14 @@ public:
*/
inline py::list start_gen_py(){return python_conv_utils::to_list<int>(_start_gen);}
// DocString: feat_space_get_feature
/**
* @brief Return a feature at a specified index
*
* @param ind index of the feature to get
* @return A ModelNode of the feature at index ind
*/
inline ModelNode get_feature(int ind){return ModelNode(_phi[ind]->d_mat_ind(), _phi[ind]->rung(), _phi[ind]->expr(), _phi[ind]->postfix_expr(), _phi[ind]->value(), _phi[ind]->test_value(), _phi[ind]->unit());}
#endif
};
......
......@@ -201,7 +201,6 @@ public:
*/
virtual std::vector<double> value() = 0;
/**
* @brief Get the test data for the feature
*/
......
#include <feature_creation/node/utils.hpp>
node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int feat_ind)
node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int& feat_ind)
{
std::vector<node_ptr> stack;
std::vector<std::string> postfix_split = str_utils::split_string_trim(postfix_expr, "|");
feat_ind += postfix_split.size() - 1;
if(postfix_split.size() == 1)
return phi_0[std::stoi(postfix_split[0])];
for(int ff = 0; ff < postfix_split.size(); ++ff)
{
std::string term = postfix_split[ff];
try
{
stack.push_back(phi_0[std::stoi(term)]);
--feat_ind;
++feat_ind;
}
catch(const std::invalid_argument e)
{
......@@ -66,7 +69,7 @@ node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node
stack[stack.size() - 1] = std::make_shared<SixPowNode>(stack[stack.size() - 1], feat_ind, 1e-50, 1e50);
else
throw std::logic_error("Term in postfix expression does not represent a node");
--feat_ind;
++feat_ind;
}
}
if(stack.size() != 1)
......@@ -110,3 +113,82 @@ std::vector<node_ptr> str2node::phi_selected_from_file(std::string filename, std
file_stream.close();
return phi_selected;
}
std::vector<node_ptr> str2node::phi_from_file(std::string filename, std::vector<node_ptr> phi_0)
{
node_value_arrs::resize_values_arr(0, phi_0.size(), true);
node_value_arrs::initialize_d_matrix_arr();
std::ifstream file_stream;
file_stream.open(filename, std::ios::in);
std::string line;
std::vector<node_ptr> phi;
int feat_ind = phi_0.size();
std::getline(file_stream, line);
while(std::getline(file_stream, line))
{
if(line[0] == '#')
continue;
try
{
node_ptr feat = postfix2node(line, phi_0, feat_ind);
if(!feat)
if(feat->type() == NODE_TYPE::FEAT)
continue;
phi.push_back(feat);
phi.back()->set_selected(false);
++feat_ind;
}
catch(const InvalidFeatureException& e)
{
// Do Nothing
}
}
file_stream.close();
return phi;
}
std::string node_identifier::feature_type_to_string(NODE_TYPE nt)
{
if(nt == NODE_TYPE::FEAT)
return "feature";
if(nt == NODE_TYPE::MODEL_FEATURE)
return "model";
if(nt == NODE_TYPE::ADD)
return "add";
if(nt == NODE_TYPE::SUB)
return "sub";
if(nt == NODE_TYPE::ABS_DIFF)
return "abs_diff";
if(nt == NODE_TYPE::MULT)
return "mult";
if(nt == NODE_TYPE::DIV)
return "div";
if(nt == NODE_TYPE::EXP)
return "exp";
if(nt == NODE_TYPE::NEG_EXP)
return "neg_exp";
if(nt == NODE_TYPE::INV)
return "inv";
if(nt == NODE_TYPE::SQ)
return "sq";
if(nt == NODE_TYPE::CB)
return "cb";
if(nt == NODE_TYPE::SIX_POW)
return "six_pow";
if(nt == NODE_TYPE::SQRT)
return "sqrt";
if(nt == NODE_TYPE::CBRT)
return "cbrt";
if(nt == NODE_TYPE::LOG)
return "log";
if(nt == NODE_TYPE::ABS)
return "abs";
if(nt == NODE_TYPE::SIN)
return "sin";
if(nt == NODE_TYPE::COS)
return "cos";
}
......@@ -26,7 +26,7 @@ namespace str2node
* @param feat_ind The desired feature index
* @return The feature node described by the postfix expression
*/
node_ptr postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int feat_ind);
node_ptr postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int& feat_ind);
/**
* @brief Convert a feature_space/selected_features.txt into a phi_selected;
......@@ -38,6 +38,27 @@ namespace str2node
* @return The selected feature set from the file
*/
std::vector<node_ptr> phi_selected_from_file(std::string filename, std::vector<node_ptr> phi_0);
}
/**
* @brief Convert a text file into the feature space
* @details Read in the file to get the postfix expressions and regenerate the features using phi_0
*
* @param filename The name of the file storing all the features
* @param phi_0 The initial feature space
*
* @return The feature set defined from the file
*/
std::vector<node_ptr> phi_from_file(std::string filename, std::vector<node_ptr> phi_0);
}
namespace node_identifier
{
/**
* @brief Convert a node type into the corresponding string identifier
*
* @param nt node type
* @return string representation of the node type
*/
std::string feature_type_to_string(NODE_TYPE nt);
}
#endif
\ No newline at end of file
......@@ -33,7 +33,7 @@ Unit::Unit(std::string unit_str) :
all_ops.insert(all_ops.end(), div_ops.begin(), div_ops.end());
std::sort(all_ops.begin(), all_ops.end());
std::vector<double> ops(1 + all_ops.size(), 1.0);
std::vector<double> ops((unit_str != "") + all_ops.size(), 1.0);
for(int oo = 1; oo < ops.size(); ++oo)
{
if(std::any_of(div_ops.begin(), div_ops.end(), [&all_ops, &oo](int i){return i == all_ops[oo-1];}))
......
......@@ -21,12 +21,12 @@ def get_unit(header):
def generate_phi_0_from_csv(
df_csv, prop_key, cols="all", task_key=None, leave_out_frac=0.0
df, prop_key, cols="all", task_key=None, leave_out_frac=0.0
):
"""Create initial feature set from csv file
Args:
df_csv (str): The csv file containing all of the data for the calculation
df (str or pandas.DataFrame): The DataFrame of csv file of the initial feature set
prop_key (str): The key corresponding to which column in the csv file the property is stored in
cols (list or str): The columns to include in the initial feature set
task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
......@@ -42,9 +42,10 @@ def generate_phi_0_from_csv(
"""
# Load csv file
df = pd.read_csv(df_csv, index_col=0)
cols = df.columns.to_list()
col_exprs = [c.split(" (")[0] for c in cols]
if isinstance(df, str):
df = pd.read_csv(df, index_col=0)
df_cols = df.columns.to_list()
col_exprs = [c.split(" (")[0] for c in df_cols]
prop_ind = np.where([c_ex == prop_key.split(" (")[0] for c_ex in col_exprs])[0][0]
# Get the task information
......@@ -65,7 +66,7 @@ def generate_phi_0_from_csv(
task_sizes = [len(df)]
# Get prop
prop_key = cols[prop_ind]
prop_key = df_cols[prop_ind]
prop = df[prop_key].to_numpy()
df = df.drop([prop_key], axis=1)
......@@ -102,7 +103,7 @@ def generate_phi_0_from_csv(
if cols != "all":
for col in df.columns.tolist():
if col not in cols:
if col.split(" (")[0] not in cols:
df = df.drop([col], axis=1)
phi_0 = []
......@@ -118,6 +119,7 @@ def generate_phi_0_from_csv(
unit = ""
else:
unit = unit.replace(" ", "")
phi_0.append(
FeatureNode(feat_ind, expr.replace(" ", ""), value, test_value, Unit(unit))
)
......@@ -133,63 +135,7 @@ def generate_phi_0_from_csv(
)
def generate_fs_prop(
df_csv,
prop_key,
allowed_ops,
cols,
max_phi,
n_sis_select,
task_key=None,
leave_out_frac=0.0,
):
"""Generate a FeatureSet for the calculation
Args:
df_csv (str): The csv file containing all of the data for the calculation
prop_key (str): The key corresponding to which column in the csv file the property is stored in
allowed_ops (list): List of operations used to combine the features
cols (list or str): The columns to include in the initial feature set
max_phi (int): Maximum rung for the calculation
n_sis_select (int): number of features to select in each round of SIS
task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
leave_out_frac (list): List of indices to pull from the training data to act as a test set
Returns:
fs (FeatureSpace): The FeatureSpace for the calculation
"""
phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac
)
if allowed_ops == "all":
allowed_ops = [
"add",
"sub",
"mult",
"div",
"abs_diff",
"inv",
"abs",
"cos",
"sin",
"exp",
"neg_exp",
"log",
"sq",
"sqrt",
"cb",
"cbrt",
"six_pow",
]
return FeatureSpace(
phi_0, allowed_ops, list(prop), task_sizes_train, max_phi, n_sis_select
)
def generate_fs_prop(
phi_0, prop, task_sizes_train, allowed_ops, cols, max_phi, n_sis_select
):
def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select):
"""Generate a FeatureSet for the calculation
Args:
......@@ -197,7 +143,6 @@ def generate_fs_prop(
prop (np.ndarray): The property values for the training data
task_sizes_train (list): The number of samples in the training data for each task
allowed_ops (list): List of operations used to combine the features
cols (list or str): The columns to include in the initial feature set
max_phi (int): Maximum rung for the calculation
n_sis_select (int): number of features to select in each round of SIS
task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
......@@ -234,7 +179,7 @@ def generate_fs_prop(
def generate_fs_sr_from_csv(
df_csv,
df,
prop_key,
allowed_ops,
cols,
......@@ -248,7 +193,7 @@ def generate_fs_sr_from_csv(
"""Generate a FeatureSet and SISSORegressor for the calculation
Args:
df_csv (str): The csv file containing all of the data for the calculation
df (str): The csv file containing all of the data for the calculation
prop_key (str): The key corresponding to which column in the csv file the property is stored in
allowed_ops (list): List of operations used to combine the features
cols (list or str): The columns to include in the initial feature set
......@@ -264,12 +209,10 @@ def generate_fs_sr_from_csv(
sr (SISSORegressor): The SISSORegressor for the calculation
"""
phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac
df, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac
)
fs = generate_fs_prop(
phi_0, prop, task_sizes_train, allowed_ops, cols, max_phi, n_sis_select
)
fs = generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select)
sr = SISSORegressor(
fs,
......
......@@ -41,11 +41,15 @@ void sisso::feature_creation::registerFeatureSpace()
class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>())
.def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>())
.def(init<std::string, list, list, optional<int, double>>())
.def("sis", sis_list, "@DocString_feat_space_sis_list@")
.def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@")
.def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@")
.def("remove_feature", &FeatureSpace::remove_feature, "@DocString_feat_space_remove_feature@")
.def("get_feature", &FeatureSpace::get_feature, "@DocString_feat_space_get_feature@")
.add_property("phi_selected", &FeatureSpace::phi_selected_py, "@DocString_feat_space_phi_selected_py@")
.add_property("phi0", &FeatureSpace::phi0_py, "@DocString_feat_space_phi0_py@")
.add_property("phi", &FeatureSpace::phi_py, "@DocString_feat_space_phi_py@")
.add_property("scores", &FeatureSpace::scores_py, "@DocString_feat_space_scores_py@")
.add_property("task_sizes", &FeatureSpace::task_sizes_py, "@DocString_feat_space_task_sizes_py@")
.add_property("allowed_ops", &FeatureSpace::allowed_ops_py, "@DocString_feat_space_allowed_ops_py@")
......
......@@ -70,6 +70,68 @@ FeatureSpace::FeatureSpace(
initialize_fs(python_conv_utils::from_ndarray<double>(prop));
}
FeatureSpace::FeatureSpace(
std::string feature_file,
py::list phi_0,
py::list task_sizes,
int n_sis_select,
double cross_corr_max
):
_phi_0(python_conv_utils::shared_ptr_vec_from_list<Node, FeatureNode>(phi_0)),
_scores(py::len(phi_0), 0.0),
_task_sizes(python_conv_utils::from_list<int>(task_sizes)),
_feature_space_file("feature_space/selected_features.txt"),
_feature_space_summary_file("feature_space/SIS_summary.txt"),
_mpi_comm(mpi_setup::comm),
_cross_cor_max(cross_corr_max),
_l_bound(1e-50),
_u_bound(1e50),
_n_sis_select(n_sis_select),
_n_feat(py::len(phi_0)),
_n_rung_store(0),
_n_rung_generate(0)
{
_n_samp = _phi_0[0]->n_samp();
_project = project_funcs::project_r;
std::vector<node_ptr> phi_temp = str2node::phi_from_file(feature_file, _phi_0);
_n_feat = phi_temp.size();
_phi.resize(_n_feat);
std::vector<int> rungs(_n_feat, 0);
for(int ff = 0; ff < _n_feat; ++ff)
{
rungs[ff] = phi_temp[ff]->rung();
if(phi_temp[ff]->type() == NODE_TYPE::FEAT)
continue;
std::string nt = node_identifier::feature_type_to_string(phi_temp[ff]->type());
if(std::find(_allowed_ops.begin(), _allowed_ops.end(), nt) == _allowed_ops.end())
_allowed_ops.push_back(nt);
}
for(auto & op : _allowed_ops)
{
if((op.compare("add") == 0) || (op.compare("sub") == 0) || (op.compare("mult") == 0) || (op.compare("abs_diff") == 0))
_com_bin_operators.push_back(allowed_op_maps::binary_operator_map[op]);
else if((op.compare("div") == 0))
_bin_operators.push_back(allowed_op_maps::binary_operator_map[op]);
else
_un_operators.push_back(allowed_op_maps::unary_operator_map[op]);
}
std::vector<int> rung_inds = util_funcs::argsort(rungs);
_max_phi = *std::max_element(rungs.begin(), rungs.end());
_phi[0] = phi_temp[rung_inds[0]];
for(int ff = 1; ff < _n_feat; ++ff)
{
_phi[ff] = phi_temp[rung_inds[ff]];
if(_phi[ff]->rung() != _phi[ff - 1]->rung())
_start_gen.push_back(ff);
}
_scores.resize(_n_feat);
}
py::list FeatureSpace::phi0_py()
{
py::list feat_lst;
......@@ -78,6 +140,14 @@ py::list FeatureSpace::phi0_py()
return feat_lst;
}
py::list FeatureSpace::phi_py()
{
py::list feat_lst;
for(auto& feat : _phi)
feat_lst.append<ModelNode>(ModelNode(feat->d_mat_ind(), feat->rung(), feat->expr(), feat->postfix_expr(), feat->value(), feat->test_value(), feat->unit()));
return feat_lst;
}
py::list FeatureSpace::phi_selected_py()
{
py::list feat_lst;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment