diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 17b74f23b1d09901dfad11184640727f70bac6b8..6c4c41ca358eb231b7368e977755d15935d9d2f8 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -765,3 +765,11 @@ void FeatureSpace::sis(std::vector<double>& prop) sum_file_stream.close(); } } + +void FeatureSpace::remove_feature(int ind) +{ + if(_phi[ind]->selected()) + throw std::logic_error("Can't remove a selected feature"); + + _phi.erase(_phi.begin() + ind); +} diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 45cddcf6514ac91edfd7e1d2a71beb9d9751f624..d328c28906423005ba7e4a22f0e508707f594c86 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -14,6 +14,7 @@ #include <feature_creation/node/FeatureNode.hpp> #include <feature_creation/node/ModelNode.hpp> #include <feature_creation/node/operator_nodes/allowed_ops.hpp> +#include <feature_creation/node/utils.hpp> #include <feature_creation/node/value_storage/nodes_value_containers.hpp> #include <utils/project.hpp> @@ -272,13 +273,20 @@ public: */ inline bool feat_in_phi(int ind){return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());} + // DocString: feat_space_remove_feature + /** + * @brief Remove a feature from phi + * + * @param ind index of feature to remove + */ + void remove_feature(int ind); + // Python Interface Functions #ifdef PY_BINDINGS /** * @brief Constructor for the feature space that takes in python objects * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>) * - * @param mpi_comm MPI communicator for the calculations * @param phi_0 The initial set of features to combine * @param allowed_ops list of allowed operators * @param prop The property to be learned (training data) @@ -308,7 +316,6 @@ public: * @brief Constructor for the feature space that takes in python and numpy objects * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>) * - * @param mpi_comm MPI communicator for the calculations * @param phi_0 The initial set of features to combine * @param allowed_ops list of allowed operators * @param prop The property to be learned (training data) @@ -334,6 +341,24 @@ public: double max_abs_feat_val=1e50 ); + /** + * @brief Constructor for the feature space that takes in python and numpy objects + * @details constructs the feature space from an initial set of features and a file containing postfix expressions for the features (cpp definition in <python/feature_creation/FeatureSpace.cpp>) + * + * @param feature_file The file with the postfix expressions for the feature space + * @param phi_0 The initial set of features to combine + * @param n_sis_select number of features to select during each SIS step + * @param max_store_rung number of rungs to calculate and store the value of the features for all samples + * @param cross_corr_max Maximum cross-correlation used for selecting features + */ + FeatureSpace( + std::string feature_file, + py::list phi_0, + py::list task_sizes, + int n_sis_select=1, + double cross_corr_max=1.0 + ); + // DocString: feat_space_sis_arr /** * @brief Wrapper function for SIS using a numpy array @@ -372,6 +397,13 @@ public: */ py::list phi0_py(); + // DocString: feat_space_phi_py + /** + * @brief The feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>) + * @return _phi as a python list + */ + py::list phi_py(); + // DocString: feat_space_scores_py /** * @brief The vector of projection scores for SIS @@ -400,6 +432,14 @@ public: */ inline py::list start_gen_py(){return python_conv_utils::to_list<int>(_start_gen);} + // DocString: feat_space_get_feature + /** + * @brief Return a feature at a specified index + * + * @param ind index of the feature to get + * @return A ModelNode of the feature at index ind + */ + inline ModelNode get_feature(int ind){return ModelNode(_phi[ind]->d_mat_ind(), _phi[ind]->rung(), _phi[ind]->expr(), _phi[ind]->postfix_expr(), _phi[ind]->value(), _phi[ind]->test_value(), _phi[ind]->unit());} #endif }; diff --git a/src/feature_creation/node/Node.hpp b/src/feature_creation/node/Node.hpp index b258b5e728116203fe58e4dc43325a16f6a79555..ef7e7b5101dbad58c020424def4046cb992b8f97 100644 --- a/src/feature_creation/node/Node.hpp +++ b/src/feature_creation/node/Node.hpp @@ -201,7 +201,6 @@ public: */ virtual std::vector<double> value() = 0; - /** * @brief Get the test data for the feature */ diff --git a/src/feature_creation/node/utils.cpp b/src/feature_creation/node/utils.cpp index 63a66a6b9bd9e61105bc7156b4281942fbd15261..7c5f77f4e8ee73edb5920269951d2276449e79d3 100644 --- a/src/feature_creation/node/utils.cpp +++ b/src/feature_creation/node/utils.cpp @@ -1,17 +1,20 @@ #include <feature_creation/node/utils.hpp> -node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int feat_ind) +node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int& feat_ind) { std::vector<node_ptr> stack; std::vector<std::string> postfix_split = str_utils::split_string_trim(postfix_expr, "|"); - feat_ind += postfix_split.size() - 1; + + if(postfix_split.size() == 1) + return phi_0[std::stoi(postfix_split[0])]; + for(int ff = 0; ff < postfix_split.size(); ++ff) { std::string term = postfix_split[ff]; try { stack.push_back(phi_0[std::stoi(term)]); - --feat_ind; + ++feat_ind; } catch(const std::invalid_argument e) { @@ -66,7 +69,7 @@ node_ptr str2node::postfix2node(std::string postfix_expr, const std::vector<node stack[stack.size() - 1] = std::make_shared<SixPowNode>(stack[stack.size() - 1], feat_ind, 1e-50, 1e50); else throw std::logic_error("Term in postfix expression does not represent a node"); - --feat_ind; + ++feat_ind; } } if(stack.size() != 1) @@ -110,3 +113,82 @@ std::vector<node_ptr> str2node::phi_selected_from_file(std::string filename, std file_stream.close(); return phi_selected; } + +std::vector<node_ptr> str2node::phi_from_file(std::string filename, std::vector<node_ptr> phi_0) +{ + node_value_arrs::resize_values_arr(0, phi_0.size(), true); + node_value_arrs::initialize_d_matrix_arr(); + + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + + std::string line; + + std::vector<node_ptr> phi; + int feat_ind = phi_0.size(); + std::getline(file_stream, line); + + while(std::getline(file_stream, line)) + { + if(line[0] == '#') + continue; + try + { + node_ptr feat = postfix2node(line, phi_0, feat_ind); + if(!feat) + if(feat->type() == NODE_TYPE::FEAT) + continue; + phi.push_back(feat); + phi.back()->set_selected(false); + ++feat_ind; + } + catch(const InvalidFeatureException& e) + { + // Do Nothing + } + } + file_stream.close(); + return phi; +} + +std::string node_identifier::feature_type_to_string(NODE_TYPE nt) +{ + if(nt == NODE_TYPE::FEAT) + return "feature"; + if(nt == NODE_TYPE::MODEL_FEATURE) + return "model"; + if(nt == NODE_TYPE::ADD) + return "add"; + if(nt == NODE_TYPE::SUB) + return "sub"; + if(nt == NODE_TYPE::ABS_DIFF) + return "abs_diff"; + if(nt == NODE_TYPE::MULT) + return "mult"; + if(nt == NODE_TYPE::DIV) + return "div"; + if(nt == NODE_TYPE::EXP) + return "exp"; + if(nt == NODE_TYPE::NEG_EXP) + return "neg_exp"; + if(nt == NODE_TYPE::INV) + return "inv"; + if(nt == NODE_TYPE::SQ) + return "sq"; + if(nt == NODE_TYPE::CB) + return "cb"; + if(nt == NODE_TYPE::SIX_POW) + return "six_pow"; + if(nt == NODE_TYPE::SQRT) + return "sqrt"; + if(nt == NODE_TYPE::CBRT) + return "cbrt"; + if(nt == NODE_TYPE::LOG) + return "log"; + if(nt == NODE_TYPE::ABS) + return "abs"; + if(nt == NODE_TYPE::SIN) + return "sin"; + if(nt == NODE_TYPE::COS) + return "cos"; +} diff --git a/src/feature_creation/node/utils.hpp b/src/feature_creation/node/utils.hpp index e8d7d4a39a0a0efa23834b87c4564e46aefc8f90..2b728fe105d3263380c09bd3755445da5c5dc879 100644 --- a/src/feature_creation/node/utils.hpp +++ b/src/feature_creation/node/utils.hpp @@ -26,7 +26,7 @@ namespace str2node * @param feat_ind The desired feature index * @return The feature node described by the postfix expression */ - node_ptr postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int feat_ind); + node_ptr postfix2node(std::string postfix_expr, const std::vector<node_ptr>& phi_0, int& feat_ind); /** * @brief Convert a feature_space/selected_features.txt into a phi_selected; @@ -38,6 +38,27 @@ namespace str2node * @return The selected feature set from the file */ std::vector<node_ptr> phi_selected_from_file(std::string filename, std::vector<node_ptr> phi_0); -} + /** + * @brief Convert a text file into the feature space + * @details Read in the file to get the postfix expressions and regenerate the features using phi_0 + * + * @param filename The name of the file storing all the features + * @param phi_0 The initial feature space + * + * @return The feature set defined from the file + */ + std::vector<node_ptr> phi_from_file(std::string filename, std::vector<node_ptr> phi_0); + +} +namespace node_identifier +{ + /** + * @brief Convert a node type into the corresponding string identifier + * + * @param nt node type + * @return string representation of the node type + */ + std::string feature_type_to_string(NODE_TYPE nt); +} #endif \ No newline at end of file diff --git a/src/feature_creation/units/Unit.cpp b/src/feature_creation/units/Unit.cpp index ce4520ddda7dbe4e351803fd97b6ddb76c5db7a6..75e4bec0285a1ec6093ad41c5fa021f7944b8a34 100644 --- a/src/feature_creation/units/Unit.cpp +++ b/src/feature_creation/units/Unit.cpp @@ -33,7 +33,7 @@ Unit::Unit(std::string unit_str) : all_ops.insert(all_ops.end(), div_ops.begin(), div_ops.end()); std::sort(all_ops.begin(), all_ops.end()); - std::vector<double> ops(1 + all_ops.size(), 1.0); + std::vector<double> ops((unit_str != "") + all_ops.size(), 1.0); for(int oo = 1; oo < ops.size(); ++oo) { if(std::any_of(div_ops.begin(), div_ops.end(), [&all_ops, &oo](int i){return i == all_ops[oo-1];})) diff --git a/src/python/__init__.py b/src/python/__init__.py index f7da78d5a3df8b0355957503e0a1df3300468204..52884c2ac27b859950b29a9d56c1df83bbf86adc 100644 --- a/src/python/__init__.py +++ b/src/python/__init__.py @@ -21,12 +21,12 @@ def get_unit(header): def generate_phi_0_from_csv( - df_csv, prop_key, cols="all", task_key=None, leave_out_frac=0.0 + df, prop_key, cols="all", task_key=None, leave_out_frac=0.0 ): """Create initial feature set from csv file Args: - df_csv (str): The csv file containing all of the data for the calculation + df (str or pandas.DataFrame): The DataFrame of csv file of the initial feature set prop_key (str): The key corresponding to which column in the csv file the property is stored in cols (list or str): The columns to include in the initial feature set task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in @@ -42,9 +42,10 @@ def generate_phi_0_from_csv( """ # Load csv file - df = pd.read_csv(df_csv, index_col=0) - cols = df.columns.to_list() - col_exprs = [c.split(" (")[0] for c in cols] + if isinstance(df, str): + df = pd.read_csv(df, index_col=0) + df_cols = df.columns.to_list() + col_exprs = [c.split(" (")[0] for c in df_cols] prop_ind = np.where([c_ex == prop_key.split(" (")[0] for c_ex in col_exprs])[0][0] # Get the task information @@ -65,7 +66,7 @@ def generate_phi_0_from_csv( task_sizes = [len(df)] # Get prop - prop_key = cols[prop_ind] + prop_key = df_cols[prop_ind] prop = df[prop_key].to_numpy() df = df.drop([prop_key], axis=1) @@ -102,7 +103,7 @@ def generate_phi_0_from_csv( if cols != "all": for col in df.columns.tolist(): - if col not in cols: + if col.split(" (")[0] not in cols: df = df.drop([col], axis=1) phi_0 = [] @@ -118,6 +119,7 @@ def generate_phi_0_from_csv( unit = "" else: unit = unit.replace(" ", "") + phi_0.append( FeatureNode(feat_ind, expr.replace(" ", ""), value, test_value, Unit(unit)) ) @@ -133,63 +135,7 @@ def generate_phi_0_from_csv( ) -def generate_fs_prop( - df_csv, - prop_key, - allowed_ops, - cols, - max_phi, - n_sis_select, - task_key=None, - leave_out_frac=0.0, -): - """Generate a FeatureSet for the calculation - - Args: - df_csv (str): The csv file containing all of the data for the calculation - prop_key (str): The key corresponding to which column in the csv file the property is stored in - allowed_ops (list): List of operations used to combine the features - cols (list or str): The columns to include in the initial feature set - max_phi (int): Maximum rung for the calculation - n_sis_select (int): number of features to select in each round of SIS - task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in - leave_out_frac (list): List of indices to pull from the training data to act as a test set - - Returns: - fs (FeatureSpace): The FeatureSpace for the calculation - """ - phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv( - df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac - ) - if allowed_ops == "all": - allowed_ops = [ - "add", - "sub", - "mult", - "div", - "abs_diff", - "inv", - "abs", - "cos", - "sin", - "exp", - "neg_exp", - "log", - "sq", - "sqrt", - "cb", - "cbrt", - "six_pow", - ] - - return FeatureSpace( - phi_0, allowed_ops, list(prop), task_sizes_train, max_phi, n_sis_select - ) - - -def generate_fs_prop( - phi_0, prop, task_sizes_train, allowed_ops, cols, max_phi, n_sis_select -): +def generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select): """Generate a FeatureSet for the calculation Args: @@ -197,7 +143,6 @@ def generate_fs_prop( prop (np.ndarray): The property values for the training data task_sizes_train (list): The number of samples in the training data for each task allowed_ops (list): List of operations used to combine the features - cols (list or str): The columns to include in the initial feature set max_phi (int): Maximum rung for the calculation n_sis_select (int): number of features to select in each round of SIS task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in @@ -234,7 +179,7 @@ def generate_fs_prop( def generate_fs_sr_from_csv( - df_csv, + df, prop_key, allowed_ops, cols, @@ -248,7 +193,7 @@ def generate_fs_sr_from_csv( """Generate a FeatureSet and SISSORegressor for the calculation Args: - df_csv (str): The csv file containing all of the data for the calculation + df (str): The csv file containing all of the data for the calculation prop_key (str): The key corresponding to which column in the csv file the property is stored in allowed_ops (list): List of operations used to combine the features cols (list or str): The columns to include in the initial feature set @@ -264,12 +209,10 @@ def generate_fs_sr_from_csv( sr (SISSORegressor): The SISSORegressor for the calculation """ phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv( - df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac + df, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac ) - fs = generate_fs_prop( - phi_0, prop, task_sizes_train, allowed_ops, cols, max_phi, n_sis_select - ) + fs = generate_fs(phi_0, prop, task_sizes_train, allowed_ops, max_phi, n_sis_select) sr = SISSORegressor( fs, diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index 38ac74e71159829730dbc36cee7df58679ff1188..4b7e00ded1408176c4459a50d316dce26cdc234e 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -41,11 +41,15 @@ void sisso::feature_creation::registerFeatureSpace() class_<FeatureSpace>("FeatureSpace", init<list, list, np::ndarray, list, optional<int, int, int, int, double, double, double>>()) .def(init<list, list, list, list, optional<int, int, int, int, double, double, double>>()) + .def(init<std::string, list, list, optional<int, double>>()) .def("sis", sis_list, "@DocString_feat_space_sis_list@") .def("sis", sis_ndarray, "@DocString_feat_space_sis_arr@") .def("feat_in_phi", &FeatureSpace::feat_in_phi, "@DocString_feat_space_feat_in_phi@") + .def("remove_feature", &FeatureSpace::remove_feature, "@DocString_feat_space_remove_feature@") + .def("get_feature", &FeatureSpace::get_feature, "@DocString_feat_space_get_feature@") .add_property("phi_selected", &FeatureSpace::phi_selected_py, "@DocString_feat_space_phi_selected_py@") .add_property("phi0", &FeatureSpace::phi0_py, "@DocString_feat_space_phi0_py@") + .add_property("phi", &FeatureSpace::phi_py, "@DocString_feat_space_phi_py@") .add_property("scores", &FeatureSpace::scores_py, "@DocString_feat_space_scores_py@") .add_property("task_sizes", &FeatureSpace::task_sizes_py, "@DocString_feat_space_task_sizes_py@") .add_property("allowed_ops", &FeatureSpace::allowed_ops_py, "@DocString_feat_space_allowed_ops_py@") diff --git a/src/python/feature_creation/FeatureSpace.cpp b/src/python/feature_creation/FeatureSpace.cpp index 66502c120a1e03862aeed391aee23d56bf989b82..9f38bbfb82edaeda6881cf057867a2f32659cc17 100644 --- a/src/python/feature_creation/FeatureSpace.cpp +++ b/src/python/feature_creation/FeatureSpace.cpp @@ -70,6 +70,68 @@ FeatureSpace::FeatureSpace( initialize_fs(python_conv_utils::from_ndarray<double>(prop)); } +FeatureSpace::FeatureSpace( + std::string feature_file, + py::list phi_0, + py::list task_sizes, + int n_sis_select, + double cross_corr_max +): + _phi_0(python_conv_utils::shared_ptr_vec_from_list<Node, FeatureNode>(phi_0)), + _scores(py::len(phi_0), 0.0), + _task_sizes(python_conv_utils::from_list<int>(task_sizes)), + _feature_space_file("feature_space/selected_features.txt"), + _feature_space_summary_file("feature_space/SIS_summary.txt"), + _mpi_comm(mpi_setup::comm), + _cross_cor_max(cross_corr_max), + _l_bound(1e-50), + _u_bound(1e50), + _n_sis_select(n_sis_select), + _n_feat(py::len(phi_0)), + _n_rung_store(0), + _n_rung_generate(0) +{ + _n_samp = _phi_0[0]->n_samp(); + _project = project_funcs::project_r; + + std::vector<node_ptr> phi_temp = str2node::phi_from_file(feature_file, _phi_0); + _n_feat = phi_temp.size(); + _phi.resize(_n_feat); + std::vector<int> rungs(_n_feat, 0); + + for(int ff = 0; ff < _n_feat; ++ff) + { + rungs[ff] = phi_temp[ff]->rung(); + if(phi_temp[ff]->type() == NODE_TYPE::FEAT) + continue; + + std::string nt = node_identifier::feature_type_to_string(phi_temp[ff]->type()); + if(std::find(_allowed_ops.begin(), _allowed_ops.end(), nt) == _allowed_ops.end()) + _allowed_ops.push_back(nt); + } + + for(auto & op : _allowed_ops) + { + if((op.compare("add") == 0) || (op.compare("sub") == 0) || (op.compare("mult") == 0) || (op.compare("abs_diff") == 0)) + _com_bin_operators.push_back(allowed_op_maps::binary_operator_map[op]); + else if((op.compare("div") == 0)) + _bin_operators.push_back(allowed_op_maps::binary_operator_map[op]); + else + _un_operators.push_back(allowed_op_maps::unary_operator_map[op]); + } + + std::vector<int> rung_inds = util_funcs::argsort(rungs); + _max_phi = *std::max_element(rungs.begin(), rungs.end()); + _phi[0] = phi_temp[rung_inds[0]]; + for(int ff = 1; ff < _n_feat; ++ff) + { + _phi[ff] = phi_temp[rung_inds[ff]]; + if(_phi[ff]->rung() != _phi[ff - 1]->rung()) + _start_gen.push_back(ff); + } + _scores.resize(_n_feat); +} + py::list FeatureSpace::phi0_py() { py::list feat_lst; @@ -78,6 +140,14 @@ py::list FeatureSpace::phi0_py() return feat_lst; } +py::list FeatureSpace::phi_py() +{ + py::list feat_lst; + for(auto& feat : _phi) + feat_lst.append<ModelNode>(ModelNode(feat->d_mat_ind(), feat->rung(), feat->expr(), feat->postfix_expr(), feat->value(), feat->test_value(), feat->unit())); + return feat_lst; +} + py::list FeatureSpace::phi_selected_py() { py::list feat_lst;