FeatureSpace.hpp 18.6 KB
Newer Older
1
2
3
4
5
6
7
8
9
/** @file feature_creation/feature_space/FeatureSpace.hpp
 *  @brief Create a feature space from an initial set of features and algebraic operators
 *
 *  Use an initial set of features and combine them to generate more complicated algebraical features. SIS is also performed here
 *
 *  @author Thomas A. R. Purcell (tpurcell)
 *  @bug No known bugs.
 */

Thomas Purcell's avatar
Thomas Purcell committed
10
11
12
#ifndef FEATURE_SPACE
#define FEATURE_SPACE

Thomas Purcell's avatar
Thomas Purcell committed
13
#include <mpi_interface/MPI_Interface.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
14
#include <feature_creation/node/ModelNode.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
15
#include <feature_creation/node/operator_nodes/allowed_ops.hpp>
16
#include <feature_creation/node/utils.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
17
#include <feature_creation/node/value_storage/nodes_value_containers.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
18
#include <utils/project.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
19

Thomas Purcell's avatar
Thomas Purcell committed
20
#include <boost/serialization/shared_ptr.hpp>
21
#include <boost/filesystem.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
22

Thomas Purcell's avatar
Thomas Purcell committed
23
#include <iostream>
Thomas Purcell's avatar
Thomas Purcell committed
24
#include <iomanip>
25
#include <utility>
Thomas Purcell's avatar
Thomas Purcell committed
26

27
28
29
30
#ifdef PY_BINDINGS
    namespace np = boost::python::numpy;
    namespace py = boost::python;
#endif
31

32
// DocString: cls_feat_space
33
34
35
36
37
/**
 * @brief Feature Space for SISSO calculations
 * @details Stores and performs all feature calculations for SIS
 *
 */
Thomas Purcell's avatar
Thomas Purcell committed
38
39
class FeatureSpace
{
40
    std::vector<node_ptr> _phi_selected; //!< selected features
41
42
43
    std::vector<node_ptr> _phi; //!< all features
    std::vector<node_ptr> _phi_0; //!< initial feature space

44
45
46
47
48
    #ifdef PARAMETERIZE
        std::vector<std::pair<un_param_op_node_gen, std::vector<std::string>>> _un_param_operators; //!< list of all parameterized unary operators with free parameters
        std::vector<std::pair<bin_param_op_node_gen, std::vector<std::string>>> _com_bin_param_operators; //!< list of all parameterized commutable binary operators with free parameters
        std::vector<std::pair<bin_param_op_node_gen, std::vector<std::string>>> _bin_param_operators; //!< list of all parameterized binary operators with free parameters
    #endif
49
50

    std::map<std::string, std::vector<std::string>> _allowed_param_ops; //!< Map of parameterization operator set (set of operators and non-linear parameters used for a non-linear least squares fit to property)
51
52
53
54
55
    std::vector<std::string> _allowed_ops; //!< list of all allowed operators strings
    std::vector<un_op_node_gen> _un_operators; //!< list of all unary operators
    std::vector<bin_op_node_gen> _com_bin_operators; //!< list of all commutable binary operators
    std::vector<bin_op_node_gen> _bin_operators; //!< list of all binary operators

56
    std::vector<double> _prop; //!< The property to fit
57
58
    std::vector<double> _scores; //!< projection scores for each feature

59
60
61
    std::vector<int> _task_sizes; //!< The number of elements in each task (training data)
    std::vector<int> _start_gen; //!< list of the indexes where each generation starts in _phi
    std::string _feature_space_file; //!< File to store information about the selected features
Thomas Purcell's avatar
Thomas Purcell committed
62
    std::string _feature_space_summary_file; //!< File to store information about the selected features
63

64
65
    std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
    std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
66

Thomas Purcell's avatar
Thomas Purcell committed
67
    double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
68
69
70
    double _l_bound; //!< lower bound for absolute value of the features
    double _u_bound; //!< upper bound for absolute value of the features

71
72
    int _max_phi; //!< Maximum rung for the feature creation
    int _n_sis_select; //!< Number of features to select for each dimensions
73
    int _n_samp; //!< Number of samples (training data)
74
    int _n_feat; //!< Total number of features
Thomas Purcell's avatar
Thomas Purcell committed
75
    int _n_rung_store; //!< Total rungs stored
76
    int _n_rung_generate; //!< Total number of rungs to generate on the fly
77

Thomas Purcell's avatar
Thomas Purcell committed
78
public:
Thomas Purcell's avatar
Thomas Purcell committed
79

80
81
    /**
     * @brief Constructor for the feature space
82
     * @details constructs the feature space from an initial set of features and a list of allowed operators
83
84
     *
     * @param mpi_comm MPI communicator for the calculations
85
     * @param phi_0 The initial set of features to combine
86
     * @param allowed_ops list of allowed operators
87
     * @param allowed_param_ops dictionary of the parameterizable operators and their associated free parameters
88
     * @param prop The property to be learned (training data)
89
90
     * @param task_sizes The number of samples per task
     * @param project_type The projection operator to use
91
92
     * @param max_phi highest rung value for the calculation
     * @param n_sis_select number of features to select during each SIS step
93
94
     * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
     * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
95
     * @param cross_corr_max Maximum cross-correlation used for selecting features
96
     * @param min_abs_feat_val minimum absolute feature value
97
98
     * @param max_abs_feat_val maximum absolute feature value
     */
Thomas Purcell's avatar
Thomas Purcell committed
99
    FeatureSpace(
Thomas Purcell's avatar
Thomas Purcell committed
100
        std::shared_ptr<MPI_Interface> mpi_comm,
Thomas Purcell's avatar
Thomas Purcell committed
101
102
        std::vector<node_ptr> phi_0,
        std::vector<std::string> allowed_ops,
103
        std::map<std::string, std::vector<std::string>> allowed_param_ops,
104
        std::vector<double> prop,
Thomas Purcell's avatar
Thomas Purcell committed
105
        std::vector<int> task_sizes,
106
        std::string project_type="regression",
Thomas Purcell's avatar
Thomas Purcell committed
107
108
        int max_phi=1,
        int n_sis_select=1,
109
110
        int max_store_rung=-1,
        int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
111
        double cross_corr_max=1.0,
112
113
114
115
        double min_abs_feat_val=1e-50,
        double max_abs_feat_val=1e50
    );

116
117
118
119
120
    /**
     * @brief Initialize the feature set given a property vector
     *
     * @param prop The property trying to be learned
     */
121
    void initialize_fs(std::vector<double> prop, std::string project_type);
122

123
124
125
126
    /**
     * @brief Generate the full feature set from the allowed operators and initial feature set
     * @details populates phi with all features from an initial set and the allowed operators
     */
127
    void generate_feature_space();
Thomas Purcell's avatar
Thomas Purcell committed
128

129
    /**
130
     * @brief The selected feature space
131
     */
132
    inline std::vector<node_ptr> phi_selected(){return _phi_selected;};
133
134

    /**
135
     * @brief The full feature space
136
     */
Thomas Purcell's avatar
Thomas Purcell committed
137
    inline std::vector<node_ptr> phi(){return _phi;};
138
139

    /**
140
     * @brief The initial feature space
141
     */
Thomas Purcell's avatar
Thomas Purcell committed
142
    inline std::vector<node_ptr> phi0(){return _phi_0;};
143
144

    /**
145
     * @brief The vector of projection scores for SIS
146
     */
147
148
    inline std::vector<double> scores(){return _scores;}

149
    /**
150
     * @brief The MPI Communicator
151
     */
Thomas Purcell's avatar
Thomas Purcell committed
152
    inline std::shared_ptr<MPI_Interface> mpi_comm(){return _mpi_comm;}
153

154
    /**
155
     * @brief The vector storing the number of samples in each task
156
     */
Thomas Purcell's avatar
Thomas Purcell committed
157
    inline std::vector<int> task_sizes(){return _task_sizes;}
158

159
    // DocString: feat_space_feature_space_file
160
    /**
161
     * @brief The feature space filename
162
     */
163
    inline std::string feature_space_file(){return _feature_space_file;}
164

165
    // DocString: feat_space_l_bound
166
    /**
167
     * @brief The minimum absolute value of the feature
168
     */
169
    inline double l_bound(){return _l_bound;}
170

171
    // DocString: feat_space_u_bound
172
    /**
173
     * @brief The maximum absolute value of the feature
174
     */
175
    inline double u_bound(){return _u_bound;}
176

177
    // DocString: feat_space_max_phi
178
    /**
179
     * @brief The maximum rung of the feature space
180
     */
181
    inline int max_phi(){return _max_phi;}
182

183
    // DocString: feat_space_n_sis_select
184
    /**
185
     * @brief The number of features selected in each SIS step
186
     */
187
    inline int n_sis_select(){return _n_sis_select;}
188

189
    // DocString: feat_space_n_samp
190
    /**
191
     * @brief The number of samples per feature
192
     */
193
    inline int n_samp(){return _n_samp;}
194

195
    // DocString: feat_space_n_feat
196
    /**
197
     * @brief The number of features in the feature space
198
     */
199
    inline int n_feat(){return _n_feat;}
200

201
    // DocString: feat_space_n_rung_store
202
    /**
203
     * @brief The number of rungs whose feature training data is stored in memory
204
     */
205
    inline int n_rung_store(){return _n_rung_store;}
206

207
    // DocString: feat_space_n_rung_generate
208
    /**
209
     * @brief The number of rungs to be generated on the fly during SIS
210
     */
211
    inline int n_rung_generate(){return _n_rung_generate;}
212

213
214
215
216
217
218
219
220
221
222
    /**
     * @brief Generate a new set of features from a single feature
     * @details Take in the feature and perform all valid algebraic operations on it.
     *
     * @param feat The feature to spawn new features from
     * @param feat_set The feature set to pull features from for combinations
     * @param feat_ind starting index for the next feature generated
     * @param l_bound lower bound for the absolute value of the feature
     * @param u_bound upper bound for the abosulte value of the feature
     */
223
224
    void generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound=1e-50, double u_bound=1e50);

225
226
227
228
229
230
231
232
233
234
    /**
     * @brief Calculate the SIS Scores for feature generated on the fly
     * @details Create the next rung of features and calculate their projection scores. Only keep those that can be selected by SIS.
     *
     * @param prop Pointer to the start of the vector storing the data to project the features onto
     * @param size The size of the data to project over
     * @param phi_selected The features that would be selected from the previous rungs
     * @param scores_selected The projection scores of the features that would be selected from the previous rungs
     * @param scores_comp vector to store temporary score comparisons
     */
235
    void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected, std::vector<double>& scores_comp);
236

Thomas Purcell's avatar
Thomas Purcell committed
237
238
239
240
241
242
243
244
245
246
    /**
     * @brief Checks the feature to see if it is still valid against previously selected features
     *
     * @param val_ptr pointer to value array of the current feature
     * @param end_sel index of the feature to stop checking
     *
     * @return True if the feature is still valid
     */
    bool valid_feature_against_selected(double* val_ptr, int end_sel, int start_sel = 0);

247
248
    /**
     * @brief Perform SIS on a feature set with a specified property
249
     * @details Perform sure-independence screening with either the correct property or the error
250
     *
251
     * @param prop The property to perform SIS over
252
     */
Thomas Purcell's avatar
Thomas Purcell committed
253
    void sis(std::vector<double>& prop);
254

255
    // DocString: feat_space_feat_in_phi
256
257
258
    /**
     * @brief Is a feature in this process' _phi?
     *
259
260
     * @param ind The index of the feature
     * @return True if feature is in this rank's _phi
261
262
263
     */
    inline bool feat_in_phi(int ind){return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());}

264
265
266
267
268
269
270
271
    // DocString: feat_space_remove_feature
    /**
     * @brief Remove a feature from phi
     *
     * @param ind index of feature to remove
     */
    void remove_feature(int ind);

272
273
274
    // Python Interface Functions
    #ifdef PY_BINDINGS
        /**
275
276
         * @brief Constructor for the feature space that takes in python objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
277
         *
278
         * @param phi_0 The initial set of features to combine
279
         * @param allowed_ops list of allowed operators
280
         * @param allowed_param_ops dictionary of the parameterizable operators and their associated free parameters
281
         * @param prop The property to be learned (training data)
282
283
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
284
285
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
286
287
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
288
         * @param cross_corr_max Maximum cross-correlation used for selecting features
289
         * @param min_abs_feat_val minimum absolute feature value
290
291
292
293
294
         * @param max_abs_feat_val maximum absolute feature value
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
295
            py::dict allowed_param_ops,
296
297
            py::list prop,
            py::list task_sizes,
298
            std::string project_type="pearson",
299
300
301
302
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
303
            double cross_corr_max=1.0,
304
305
306
307
308
            double min_abs_feat_val=1e-50,
            double max_abs_feat_val=1e50
        );

        /**
309
310
         * @brief Constructor for the feature space that takes in python and numpy objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
311
         *
312
         * @param phi_0 The initial set of features to combine
313
         * @param allowed_ops list of allowed operators
314
         * @param allowed_param_ops dictionary of the parameterizable operators and their associated free parameters
315
         * @param prop The property to be learned (training data)
316
317
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
318
319
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
320
321
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
322
         * @param cross_corr_max Maximum cross-correlation used for selecting features
323
         * @param min_abs_feat_val minimum absolute feature value
324
325
326
327
328
         * @param max_abs_feat_val maximum absolute feature value
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
329
            py::dict allowed_param_ops,
330
331
            np::ndarray prop,
            py::list task_sizes,
332
            std::string project_type="pearson",
333
334
335
336
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
337
            double cross_corr_max=1.0,
338
339
340
341
            double min_abs_feat_val=1e-50,
            double max_abs_feat_val=1e50
        );

342
343
344
345
346
347
        /**
         * @brief Constructor for the feature space that takes in python and numpy objects
         * @details constructs the feature space from an initial set of features and a file containing postfix expressions for the features (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         *
         * @param feature_file The file with the postfix expressions for the feature space
         * @param phi_0 The initial set of features to combine
348
349
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
350
351
352
353
354
355
356
357
         * @param n_sis_select number of features to select during each SIS step
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param cross_corr_max Maximum cross-correlation used for selecting features
         */
        FeatureSpace(
            std::string feature_file,
            py::list phi_0,
            py::list task_sizes,
358
            std::string project_type="pearson",
359
360
361
362
            int n_sis_select=1,
            double cross_corr_max=1.0
        );

363
        // DocString: feat_space_sis_arr
364
365
366
        /**
         * @brief Wrapper function for SIS using a numpy array
         *
367
         * @param prop(np.ndarray) The property to perform SIS over as a numpy array
368
         */
369
370
371
372
373
        inline void sis(np::ndarray prop)
        {
            std::vector<double> prop_vec = python_conv_utils::from_ndarray<double>(prop);
            sis(prop_vec);
        }
374
375

        // DocString: feat_space_sis_list
376
377
378
        /**
         * @brief Wrapper function for SIS using a python list
         *
379
         * @param prop(list) The property to perform SIS over as a python list
380
         */
381
382
383
384
385
386
        inline void sis(py::list prop)
        {
            std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop);
            sis(prop_vec);
        }

387
        // DocString: feat_space_phi_selected_py
388
        /**
389
         * @brief The selected feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
390
391
         * @return _phi_selected as a python list
         */
392
        py::list phi_selected_py();
393

394
        // DocString: feat_space_phi0_py
395
        /**
396
         * @brief The initial feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
397
398
         * @return _phi0 as a python list
         */
399
        py::list phi0_py();
400

401
402
403
404
405
406
407
        // DocString: feat_space_phi_py
        /**
         * @brief The feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         * @return _phi as a python list
         */
        py::list phi_py();

408
        // DocString: feat_space_scores_py
409
        /**
410
         * @brief The vector of projection scores for SIS
411
412
         * @return _scores as a numpy array
         */
413
        inline np::ndarray scores_py(){return python_conv_utils::to_ndarray<double>(_scores);};
414

415
        // DocString: feat_space_task_sizes_py
416
        /**
417
         * @brief The vector storing the number of samples in each task
418
419
         * @return _task_sizes as a python list
         */
420
        inline py::list task_sizes_py(){return python_conv_utils::to_list<int>(_task_sizes);};
421

422
        // DocString: feat_space_allowed_ops_py
423
        /**
424
         * @brief The list of allowed operator nodes
425
426
         * @return _allowed_ops as a python list
         */
427
        inline py::list allowed_ops_py(){return python_conv_utils::to_list<std::string>(_allowed_ops);}
428

429
        // DocString: feat_space_start_gen_py
430
        /**
431
         * @brief The index in _phi where each generation starts
432
433
         * @return _start_gen as a python list
         */
434
        inline py::list start_gen_py(){return python_conv_utils::to_list<int>(_start_gen);}
435

436
437
438
439
440
441
442
        // DocString: feat_space_get_feature
        /**
         * @brief Return a feature at a specified index
         *
         * @param ind index of the feature to get
         * @return A ModelNode of the feature at index ind
         */
Thomas Purcell's avatar
Thomas Purcell committed
443
        inline ModelNode get_feature(int ind){return ModelNode(_phi[ind]->d_mat_ind(), _phi[ind]->rung(), _phi[ind]->expr(), _phi[ind]->postfix_expr(), _phi[ind]->value(), _phi[ind]->test_value(), _phi[ind]->domain(), _phi[ind]->unit());}
444
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
445
446
447
};

#endif