/** @file feature_creation/feature_space/FeatureSpace.hpp
 *  @brief Create a feature space from an initial set of features and algebraic operators
 *
 *  Use an initial set of features and combine them to generate more complicated algebraic features. SIS is also performed here.
 *
 *  @author Thomas A. R. Purcell (tpurcell)
 *  @bug No known bugs.
 */

Thomas Purcell's avatar
Thomas Purcell committed
10
11
12
#ifndef FEATURE_SPACE
#define FEATURE_SPACE

Thomas Purcell's avatar
Thomas Purcell committed
13
#include <mpi_interface/MPI_Interface.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
14
#include <feature_creation/node/FeatureNode.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
15
#include <feature_creation/node/ModelNode.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
16
#include <feature_creation/node/operator_nodes/allowed_ops.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
17
#include <feature_creation/node/value_storage/nodes_value_containers.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
18
#include <utils/project.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
19

Thomas Purcell's avatar
Thomas Purcell committed
20
#include <boost/serialization/shared_ptr.hpp>
21
#include <boost/filesystem.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
22

Thomas Purcell's avatar
Thomas Purcell committed
23
#include <iostream>
Thomas Purcell's avatar
Thomas Purcell committed
24
#include <iomanip>
Thomas Purcell's avatar
Thomas Purcell committed
25

26
27
28
29
#ifdef PY_BINDINGS
    // Short aliases for the Boost.Python namespaces; only declared when the Python bindings are built.
    // The Python interface functions of FeatureSpace below refer to them as py:: and np::.
    namespace np = boost::python::numpy;
    namespace py = boost::python;
#endif
30

31
32
33
34
35
/**
 * @brief Feature Space for SISSO calculations
 * @details Stores and performs all feature calculations for SIS
 *
 */
Thomas Purcell's avatar
Thomas Purcell committed
36
37
class FeatureSpace
{
38
    std::vector<node_ptr> _phi_selected; //!< selected features
39
40
41
42
43
44
45
46
47
48
    std::vector<node_ptr> _phi; //!< all features
    std::vector<node_ptr> _phi_0; //!< initial feature space

    std::vector<std::string> _allowed_ops; //!< list of all allowed operators strings
    std::vector<un_op_node_gen> _un_operators; //!< list of all unary operators
    std::vector<bin_op_node_gen> _com_bin_operators; //!< list of all commutable binary operators
    std::vector<bin_op_node_gen> _bin_operators; //!< list of all binary operators

    std::vector<double> _scores; //!< projection scores for each feature

49
50
51
    std::vector<int> _task_sizes; //!< The number of elements in each task (training data)
    std::vector<int> _start_gen; //!< list of the indexes where each generation starts in _phi
    std::string _feature_space_file; //!< File to store information about the selected features
52

53
54
    std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
    std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
55
56
57
58

    double _l_bound; //!< lower bound for absolute value of the features
    double _u_bound; //!< upper bound for absolute value of the features

59
60
    int _max_phi; //!< Maximum rung for the feature creation
    int _n_sis_select; //!< Number of features to select for each dimensions
61
    int _n_samp; //!< Number of samples (training data)
62
    int _n_feat; //!< Total number of features
Thomas Purcell's avatar
Thomas Purcell committed
63
    int _n_rung_store; //!< Total rungs stored
64
    int _n_rung_generate; //!< Total number of rungs to generate on the fly
Thomas Purcell's avatar
Thomas Purcell committed
65
public:
Thomas Purcell's avatar
Thomas Purcell committed
66

67
68
    /**
     * @brief Constructor for the feature space
69
     * @details constructs the feature space from an initial set of features and a list of allowed operators
70
71
     *
     * @param mpi_comm MPI communicator for the calculations
72
     * @param phi_0 The initial set of features to combine
73
     * @param allowed_ops list of allowed operators
74
     * @param prop The property to be learned (training data)
75
76
     * @param max_phi highest rung value for the calculation
     * @param n_sis_select number of features to select during each SIS step
77
78
79
     * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
     * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
     * @param min_abs_feat_val minimum absolute feature value
80
81
     * @param max_abs_feat_val maximum absolute feature value
     */
Thomas Purcell's avatar
Thomas Purcell committed
82
    FeatureSpace(
Thomas Purcell's avatar
Thomas Purcell committed
83
        std::shared_ptr<MPI_Interface> mpi_comm,
Thomas Purcell's avatar
Thomas Purcell committed
84
85
        std::vector<node_ptr> phi_0,
        std::vector<std::string> allowed_ops,
86
        std::vector<double> prop,
Thomas Purcell's avatar
Thomas Purcell committed
87
        std::vector<int> task_sizes,
Thomas Purcell's avatar
Thomas Purcell committed
88
89
        int max_phi=1,
        int n_sis_select=1,
90
91
92
93
94
95
96
97
        int max_store_rung=-1,
        int n_rung_generate=0,
        double min_abs_feat_val=1e-50,
        double max_abs_feat_val=1e50
    );

    void initialize_fs(std::vector<double> prop);

98
99
100
101
    /**
     * @brief Generate the full feature set from the allowed operators and initial feature set
     * @details populates phi with all features from an initial set and the allowed operators
     */
102
    void generate_feature_space(std::vector<double>& prop);
Thomas Purcell's avatar
Thomas Purcell committed
103

104
105
106
    /**
     * @brief Accessor function for _phi_selected
     */
107
    inline std::vector<node_ptr> phi_selected(){return _phi_selected;};
108
109
110
111

    /**
     * @brief Accessor function for _phi
     */
Thomas Purcell's avatar
Thomas Purcell committed
112
    inline std::vector<node_ptr> phi(){return _phi;};
113
114
115
116

    /**
     * @brief Accessor function for _phi_0
     */
Thomas Purcell's avatar
Thomas Purcell committed
117
    inline std::vector<node_ptr> phi0(){return _phi_0;};
118
119
120
121

    /**
     * @brief Accessor function for _scores
     */
122
123
    inline std::vector<double> scores(){return _scores;}

124
125
126
    /**
     * @brief Accessor function for _mpi_comm
     */
Thomas Purcell's avatar
Thomas Purcell committed
127
    inline std::shared_ptr<MPI_Interface> mpi_comm(){return _mpi_comm;}
128

129
130
131
    /**
     * @brief Accessor function for _mpi_comm
     */
Thomas Purcell's avatar
Thomas Purcell committed
132
    inline std::vector<int> task_sizes(){return _task_sizes;}
133

134
135
136
    /**
     * @brief Accessor function for _feature_space_file
     */
137
    inline std::string feature_space_file(){return _feature_space_file;}
138
139
140
141

    /**
     * @brief Accessor function for _l_bound
     */
142
    inline double l_bound(){return _l_bound;}
143
144
145
146

    /**
     * @brief Accessor function for _u_bound
     */
147
    inline double u_bound(){return _u_bound;}
148
149
150
151

    /**
     * @brief Accessor function for _max_phi
     */
152
    inline int max_phi(){return _max_phi;}
153
154
155
156

    /**
     * @brief Accessor function for _n_sis_select
     */
157
    inline int n_sis_select(){return _n_sis_select;}
158
159
160
161

    /**
     * @brief Accessor function for _n_samp
     */
162
    inline int n_samp(){return _n_samp;}
163
164
165
166

    /**
     * @brief Accessor function for _n_feat
     */
167
    inline int n_feat(){return _n_feat;}
168
169
170
171

    /**
     * @brief Accessor function for _n_rung_store
     */
172
    inline int n_rung_store(){return _n_rung_store;}
173
174
175
176

    /**
     * @brief Accessor function for _n_rung_generate
     */
177
    inline int n_rung_generate(){return _n_rung_generate;}
178

179
180
181
182
183
184
185
186
187
188
    /**
     * @brief Generate a new set of features from a single feature
     * @details Take in the feature and perform all valid algebraic operations on it.
     *
     * @param feat The feature to spawn new features from
     * @param feat_set The feature set to pull features from for combinations
     * @param feat_ind starting index for the next feature generated
     * @param l_bound lower bound for the absolute value of the feature
     * @param u_bound upper bound for the abosulte value of the feature
     */
189
190
    void generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound=1e-50, double u_bound=1e50);

191
192
193
194
195
196
197
198
199
200
    /**
     * @brief Calculate the SIS Scores for feature generated on the fly
     * @details Create the next rung of features and calculate their projection scores. Only keep those that can be selected by SIS.
     *
     * @param prop Pointer to the start of the vector storing the data to project the features onto
     * @param size The size of the data to project over
     * @param phi_selected The features that would be selected from the previous rungs
     * @param scores_selected The projection scores of the features that would be selected from the previous rungs
     * @param scores_comp vector to store temporary score comparisons
     */
201
    void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
202

203
204
205
206
207
208
209
210
211
212
213
    /**
     * @brief Check if a feature overlaps with a feature previously selected in earlier SIS iterations
     * @details Compares the projection score of the current candidate feature with all those of previously selected features (using the current prop) and
     *          if they are within 1e-10, then check the correlation between the features themselves
     *
     * @param val_ptr pointer to the candidate feature's data
     * @param cur_score the projection score of the candidate feature
     * @param scores_past The projection scores of the previous features
     * @param scores_comp vector to temporarily store the comparison of projection scores
     * @return True if the feature does not overlap with any previously selected
     */
214
    bool valid_score_against_past(double* val_ptr, double cur_score, std::vector<double> scores_past, std::vector<double>& scores_comp);
215

216
217
218
219
220
221
222
223
224
225
226
227
    /**
     * @brief Check if a feature overlaps with a feature previously selected in this SIS iterations
     * @details CCompares the projection score of the current candidate feature with all those of previously selected features in this iteration and
     *          if they are within 1e-10, then check the correlation between the features themselves
     *
     * @param end_check the end point to stop the comparison (the same as the current number of selected features)
     * @param val_ptr pointer to the candidate feature's data
     * @param cur_score the projection score of the candidate feature
     * @param scores_selected The projection scores of the previous features
     * @param scores_comp vector to temporarily store the comparison of projection scores
     * @return True if the feature does not overlap with any previously selected
     */
228
    bool valid_score_against_current(int end_check, double* val_ptr, double cur_score, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
229
230
    /**
     * @brief Perform SIS on a feature set with a specified property
231
     * @details Perform sure-independence screening with either the correct property or the error
232
     *
233
     * @param prop The property to perform SIS over
234
     */
Thomas Purcell's avatar
Thomas Purcell committed
235
    void sis(std::vector<double>& prop);
236
237
238
239

    /**
     * @brief Is a feature in this process' _phi?
     *
240
241
     * @param ind The index of the feature
     * @return True if feature is in this rank's _phi
242
243
244
     */
    inline bool feat_in_phi(int ind){return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());}

245
246
247
    // Python Interface Functions
    #ifdef PY_BINDINGS
        /**
248
249
         * @brief Constructor for the feature space that takes in python objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
250
251
         *
         * @param mpi_comm MPI communicator for the calculations
252
         * @param phi_0 The initial set of features to combine
253
         * @param allowed_ops list of allowed operators
254
         * @param prop The property to be learned (training data)
255
256
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
257
258
259
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
         * @param min_abs_feat_val minimum absolute feature value
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
         * @param max_abs_feat_val maximum absolute feature value
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
            py::list prop,
            py::list task_sizes,
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
            double min_abs_feat_val=1e-50,
            double max_abs_feat_val=1e50
        );

        /**
276
277
         * @brief Constructor for the feature space that takes in python and numpy objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
278
279
         *
         * @param mpi_comm MPI communicator for the calculations
280
         * @param phi_0 The initial set of features to combine
281
         * @param allowed_ops list of allowed operators
282
         * @param prop The property to be learned (training data)
283
284
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
285
286
287
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
         * @param min_abs_feat_val minimum absolute feature value
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
         * @param max_abs_feat_val maximum absolute feature value
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
            np::ndarray prop,
            py::list task_sizes,
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
            double min_abs_feat_val=1e-50,
            double max_abs_feat_val=1e50
        );

303
304
305
306
307
        /**
         * @brief Wrapper function for SIS using a numpy array
         *
         * @param prop The property to perform SIS over as a numpy array
         */
308
309
310
311
312
        inline void sis(np::ndarray prop)
        {
            std::vector<double> prop_vec = python_conv_utils::from_ndarray<double>(prop);
            sis(prop_vec);
        }
313
314
315
316
317
        /**
         * @brief Wrapper function for SIS using a python list
         *
         * @param prop The property to perform SIS over as a python list
         */
318
319
320
321
322
323
        inline void sis(py::list prop)
        {
            std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop);
            sis(prop_vec);
        }

324
325
326
327
        /**
         * @brief Python Accesor function to _phi_selected (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         * @return _phi_selected as a python list
         */
328
        py::list phi_selected_py();
329
330
331
332
333

        /**
         * @brief Python Accesor function to _phi0 (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         * @return _phi0 as a python list
         */
334
        py::list phi0_py();
335
336
337
338
339

        /**
         * @brief Python Accesor function to _scores
         * @return _scores as a numpy array
         */
340
        inline np::ndarray scores_py(){return python_conv_utils::to_ndarray<double>(_scores);};
341
342
343
344
345

        /**
         * @brief Python Accesor function to _task_sizes
         * @return _task_sizes as a python list
         */
346
        inline py::list task_sizes_py(){return python_conv_utils::to_list<int>(_task_sizes);};
347
348
349
350
351

        /**
         * @brief Python Accesor function to _allowed_ops
         * @return _allowed_ops as a python list
         */
352
        inline py::list allowed_ops_py(){return python_conv_utils::to_list<std::string>(_allowed_ops);}
353
354
355
356
357

        /**
         * @brief Python Accesor function to _start_gen
         * @return _start_gen as a python list
         */
358
        inline py::list start_gen_py(){return python_conv_utils::to_list<int>(_start_gen);}
359

360
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
361
362
363
};

#endif