// Copyright 2021 Thomas A. R. Purcell // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. /** @file feature_creation/feature_space/FeatureSpace.hpp * @brief Defines the class for creating/operating on a feature space in SISSO * * @author Thomas A. R. Purcell (tpurcell90) * @bug No known bugs. */ #ifndef FEATURE_SPACE #define FEATURE_SPACE #include #include #include "feature_creation/node/utils.hpp" #include "inputs/InputParser.hpp" #include "mpi_interface/MPI_Interface.hpp" #include "mpi_interface/MPI_Ops.hpp" #include "mpi_interface/serialize_tuple.h" #include "utils/project.hpp" #include "mpi_interface/MPI_Interface.hpp" #ifdef PY_BINDINGS namespace np = boost::python::numpy; namespace py = boost::python; #endif // DocString: cls_feat_space /** * @brief Feature Space for SISSO calculations. It stores and performs all actions on the feature space for SISSO. * */ class FeatureSpace { std::vector _phi_selected; //!< A vector containing all of the selected features std::vector _phi; //!< A vector containing all features generated (Not including those created on the Fly during SIS) std::vector _phi_0; //!< A vector containing all of the Primary features #ifdef PARAMETERIZE std::vector _phi_reparam; //!< A vector containing the features created when reparameterizating using the residuals std::vector _end_no_params; //!< A vector containing the indexes of each rung where parameterized nodes start std::vector _start_rung_reparam; //!< A vector containing the indexes of each rung where parameterized nodes start std::vector _un_param_operators; //!< Vector containing all parameterized unary operators with free parameters std::vector _com_bin_param_operators; //!< Vector containing all parameterized commutable binary operators with free parameters std::vector _bin_param_operators; //!< Vector containing all parameterized binary operators with free parameters std::vector _allowed_param_ops; //!< Vector containing all allowed operators strings for operators with free parameters #endif std::vector _allowed_ops; //!< Vector containing all allowed operators strings std::vector _un_operators; //!< Vector containing all unary operators std::vector _com_bin_operators; //!< Vector containing all commutable binary operators std::vector _bin_operators; //!< Vector containing all binary operators std::vector _prop_train; //!< The value of the property vector for each training sample std::vector _scores; //!< The projection scores for each feature const std::vector _task_sizes_train; //!< Number of training samples per task std::vector _start_rung; //!< Vector containing the indexes where each rung starts in _phi const std::string _project_type; //!< The type of LossFunction to use when projecting the features onto a property const std::string _feature_space_file; //!< File to output the computer readable representation of the selected features to const std::string _feature_space_summary_file; //!< File to output the human readable representation of the selected features to const std::string _phi_out_file; //!< Filename of the file to output the feature set to std::function&, const double, const int, const int)> _is_valid; //!< Function used to determine of a feature is too correlated to previously selected features std::function&, const std::vector&, const double)> _is_valid_feat_list; //!< Function used to determine of a feature is too correlated to previously selected features within a given list std::shared_ptr _mpi_comm; //!< the MPI communicator for the calculation const double _cross_cor_max; //!< Maximum cross-correlation used for selecting features const double _l_bound; //!< The lower bound for the maximum absolute value of the features const double _u_bound; //!< The upper bound for the maximum absolute value of the features int _n_rung_store; //!< The number of rungs to calculate and store the value of the features for all samples int _n_feat; //!< Total number of features in the feature space int _max_rung; //!< Maximum rung for the feature creation const int _n_sis_select; //!< Number of features to select during each SIS iteration const int _n_samp_train; //!< Number of samples in the training set const int _n_rung_generate; //!< Either 0 or 1, and is the number of rungs to generate on the fly during SIS #ifdef PARAMETERIZE int _max_param_depth; //!< The maximum depth in the binary expression tree to set non-linear optimization bool _reparam_residual; //!< If True then reparameterize features using the residuals of each model #endif public: // DocString: feat_space_init /** * @brief Construct a FeatureSpace using an InputParser object * * @param inputs InputParser object used to build the FeatureSpace */ FeatureSpace(InputParser inputs); /** * @brief FeatureSpace constructor that uses a file containing postfix feature expressions to describe all features in Phi, and a primary feature setn ) * * @param feature_file The file containing the postfix expressions of all features in the FeatureSpace * @param phi_0 The set of primary features * @param prop List containing the property vector (training data only) * @param task_sizes_train The number of samples in the training data per task * @param project_type The type of loss function/projection operator to use * @param n_sis_select The number of features to select during each SIS step * @param cross_corr_max The maximum allowed cross-correlation value between selected features */ FeatureSpace( std::string feature_file, std::vector phi_0, std::vector prop, std::vector task_sizes_train, std::string project_type="regression", int n_sis_select=1, double cross_corr_max=1.0, std::vector excluded_inds = std::vector() ); /** * @brief Destructor */ ~FeatureSpace(); /** * @brief Populate the operator lists using _allowed_ops and _allowed_param_ops */ void set_op_lists(); /** * @brief Create SIS output files and write their headers */ void initialize_fs_output_files() const; /** * @brief Populate _phi using _phi_0 and the allowed operators up to (_max_rung - _n_rung_generate)^th rung */ void generate_feature_space( std::vector& feat_set, std::vector& start_rung, const std::vector& prop, bool reparam = false ); /** * @brief A vector containing all of the selected features */ inline std::vector phi_selected() const {return _phi_selected;}; /** * @brief A vector containing all features generated (Not including those created on the Fly during SIS) */ inline std::vector phi() const {return _phi;}; /** * @brief A vector containing all of the Primary features */ inline std::vector phi0() const {return _phi_0;}; /** * @brief The projection scores for each feature in _phi */ inline std::vector scores() const {return _scores;} /** * @brief The MPI Communicator */ inline std::shared_ptr mpi_comm() const {return _mpi_comm;} /** * @brief Number of training samples per task */ inline std::vector task_sizes_train() const {return _task_sizes_train;} // DocString: feat_space_feature_space_file /** * @brief Filename of the file to output the computer readable representation of the selected features to */ inline std::string feature_space_file() const {return _feature_space_file;} // DocString: feat_space_feature_space_file /** * @brief Filename of the file to output the human readable representation of the selected features to */ inline std::string feature_space_summary_file() const {return _feature_space_summary_file;} // DocString: feat_space_l_bound /** * @brief The mlower bound for the maximum absolute value of the features */ inline double l_bound() const {return _l_bound;} // DocString: feat_space_u_bound /** * @brief The upper bound for the maximum absolute value of the features */ inline double u_bound() const {return _u_bound;} // DocString: feat_space_max_rung /** * @brief The maximum rung for the feature creation */ inline int max_rung() const {return _max_rung;} // DocString: feat_space_n_sis_select /** * @brief The number of features to select during each SIS iteration */ inline int n_sis_select() const {return _n_sis_select;} // DocString: feat_space_n_samp_train /** * @brief The nuumber of samples in the training set */ inline int n_samp_train() const {return _n_samp_train;} // DocString: feat_space_n_feat /** * @brief The total number of features in the feature space */ inline int n_feat() const {return _n_feat;} // DocString: feat_space_n_rung_store /** * @brief The number of rungs to calculate and store the value of the features for all samples */ inline int n_rung_store() const {return _n_rung_store;} // DocString: feat_space_n_rung_generate /** * @brief Either 0 or 1, and is the number of rungs to generate on the fly during SIS */ inline int n_rung_generate() const {return _n_rung_generate;} /** * @brief Generate a new set of non-parameterized features from a single feature * @details Perform all valid algebraic operations on the passed feature and all features that appear before it in _phi. * * @param feat The feature to spawn new features from * @param feat_set The feature set to pull features from for binary operations * @param start The point in feat_set to begin pulling features from for binary operations * @param feat_ind starting index for the next feature generated * @param l_bound lower bound for the maximum absolute value of the feature * @param u_bound upper bound for the maximum abosulte value of the feature */ void generate_non_param_feats( std::vector::iterator& feat, std::vector& feat_set, const std::vector::iterator& start, unsigned long int& feat_ind, const double l_bound=1e-50, const double u_bound=1e50 ); // DocString: feat_space_output_phi /** * @brief Output the feature set to a file of a passed filename */ void output_phi(); #ifdef PARAMETERIZE /** * @brief Generate a new set of parameterized features from a single feature * @details Perform all valid algebraic operations on the passed feature and all features that appear before it in _phi. * * @param feat The feature to spawn new features from * @param feat_set The feature set to pull features from for binary operations * @param start The point in feat_set to begin pulling features from for binary operations * @param feat_ind starting index for the next feature generated * @param optimizer The object used to optimize the parameterized features * @param l_bound lower bound for the maximum absolute value of the feature * @param u_bound upper bound for the maximum abosulte value of the feature */ void generate_param_feats( std::vector::iterator& feat, std::vector& feat_set, const std::vector::iterator& start, unsigned long int& feat_ind, std::shared_ptr optimizer, const double l_bound=1e-50, const double u_bound=1e50 ); /** * @brief Generate a new set of parameterized features for the residuals * * @param feat The feature to spawn new features from * @param feat_set The feature set to pull features from for binary operations * @param feat_ind starting index for the next feature generated * @param optimizer The object used to optimize the parameterized features * @param l_bound lower bound for the maximum absolute value of the feature * @param u_bound upper bound for the maximum abosulte value of the feature */ void generate_reparam_feats( std::vector::iterator& feat, std::vector& feat_set, unsigned long int& feat_ind, std::shared_ptr optimizer, const double l_bound=1e-50, const double u_bound=1e50 ); #endif /** * @brief Generate the final rung of features on the fly and calculate their projection scores for SISat can be selected by SIS. * * @param loss The LossFunction used to project over all of the features * @param phi_selected The set of features that would be selected excluding the final rung * @param scores_selected The projection scores of all features in phi_selected */ void generate_and_project(std::shared_ptr loss, std::vector& phi_selected, std::vector& scores_selected); /** * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector * * @param prop Vector containing the property vector (training data only) */ void sis(const std::vector& prop); /** * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator defined in loss * * @param loss The LossFunction used to project over all of the features */ void sis(std::shared_ptr loss); // DocString: feat_space_feat_in_phi /** * @brief Is a feature in this process' _phi? * * @param ind (int) The index of the feature * * @return True if feature is in this rank's _phi */ inline bool feat_in_phi(int ind) const {return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());} // DocString: feat_space_remove_feature /** * @brief Remove a feature from phi * * @param ind (int) index of feature to remove */ void remove_feature(const int ind); #ifdef PARAMETERIZE // DocString: feat_space_param_feats_allowed /** * @brief True if built with -DBUILD_PARAMS (used for python tests) */ bool parameterized_feats_allowed() const {return true;} #else // DocString: feat_space_param_feats_allowed /** * @brief True if built with -DBUILD_PARAMS (used for python tests) */ bool parameterized_feats_allowed() const {return false;} #endif // Python Interface Functions #ifdef PY_BINDINGS // DocString: feat_space_init_file_np_array /** * @brief FeatureSpace constructor that uses a file containing postfix feature expressions to describe all features in Phi, and a primary feature setn ) * * @param feature_file (str) The file containing the postfix expressions of all features in the FeatureSpace * @param phi_0 (list) The set of primary features * @param prop (np.ndarray) List containing the property vector (training data only) * @param task_sizes_train (list) The number of samples in the training data per task * @param project_type (str) The type of loss function/projection operator to use * @param n_sis_select (int) The number of features to select during each SIS step * @param cross_corr_max (double) The maximum allowed cross-correlation value between selected features * @param excluded_inds (list) The list of primary feature indexes to not include in any features */ FeatureSpace( std::string feature_file, py::list phi_0, np::ndarray prop, py::list task_sizes_train, std::string project_type="regression", int n_sis_select=1, double cross_corr_max=1.0, py::list excluded_inds = py::list() ); // DocString: feat_space_init_file_py_list /** * @brief FeatureSpace constructor that uses a file containing postfix feature expressions to describe all features in Phi, and a primary feature setn ) * * @param feature_file (str) The file containing the postfix expressions of all features in the FeatureSpace * @param phi_0 (list) The set of primary features * @param prop (list) List containing the property vector (training data only) * @param task_sizes_train (list) The number of samples in the training data per task * @param project_type (str) The type of loss function/projection operator to use * @param n_sis_select (int) The number of features to select during each SIS step * @param cross_corr_max (double) The maximum allowed cross-correlation value between selected features * @param excluded_inds (list) The list of primary feature indexes to not include in any features */ FeatureSpace( std::string feature_file, py::list phi_0, py::list prop, py::list task_sizes_train, std::string project_type="regression", int n_sis_select=1, double cross_corr_max=1.0, py::list excluded_inds = py::list() ); // DocString: feat_space_sis_arr /** * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector * * @param prop (np.ndarray) Array containing the property vector (training data only) */ inline void sis(np::ndarray prop) { std::vector prop_vec = python_conv_utils::from_ndarray(prop); sis(prop_vec); } // DocString: feat_space_sis_list /** * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector * * @param prop (list) List containing the property vector (training data only) */ inline void sis(py::list prop) { std::vector prop_vec = python_conv_utils::from_list(prop); sis(prop_vec); } // DocString: feat_space_phi_selected_py /** * @brief A list containing all of the selected features */ py::list phi_selected_py(); // DocString: feat_space_phi0_py /** * @brief A list containing all features generated (Not including those created on the Fly during SIS) */ py::list phi_py(); // DocString: feat_space_phi_py /** * @brief A list containing all of the Primary features */ py::list phi0_py(); // DocString: feat_space_scores_py /** * @brief An array of all stored projection scores from SIS */ inline np::ndarray scores_py(){return python_conv_utils::to_ndarray(_scores);}; // DocString: feat_space_task_sizes_train_py /** * @brief A list of the number of samples in each task for the training data */ inline py::list task_sizes_train_py(){return python_conv_utils::to_list(_task_sizes_train);}; // DocString: feat_space_allowed_ops_py /** * @brief The list of allowed operators */ inline py::list allowed_ops_py(){return python_conv_utils::to_list(_allowed_ops);} #ifdef PARAMETERIZE // DocString: feat_space_allowed_ops_py /** * @brief The list of allowed operators */ inline py::list allowed_param_ops_py(){return python_conv_utils::to_list(_allowed_param_ops);} #else // DocString: feat_space_allowed_ops_py /** * @brief The list of allowed operators */ inline py::list allowed_param_ops_py(){return python_conv_utils::to_list({});} #endif // DocString: feat_space_start_rung_py /** * @brief A list containing the index of the first feature of each rung in the feature space. */ inline py::list start_rung_py(){return python_conv_utils::to_list(_start_rung);} // DocString: feat_space_get_feature /** * @brief Access the feature in _phi with an index ind * * @param ind (int) The index of the feature to get * @return A ModelNode of the feature at index ind */ inline ModelNode get_feature(const int ind) const {return ModelNode(_phi[ind]);} #endif }; #endif