FeatureSpace.hpp 21.2 KB
Newer Older
1
2
3
4
5
6
7
8
9
/** @file feature_creation/feature_space/FeatureSpace.hpp
 *  @brief Create a feature space from an initial set of features and algebraic operators
 *
 *  Use an initial set of features and combine them to generate more complicated algebraical features. SIS is also performed here
 *
 *  @author Thomas A. R. Purcell (tpurcell)
 *  @bug No known bugs.
 */

Thomas Purcell's avatar
Thomas Purcell committed
10
11
12
#ifndef FEATURE_SPACE
#define FEATURE_SPACE

Thomas Purcell's avatar
Thomas Purcell committed
13
#include <mpi_interface/MPI_Interface.hpp>
14
15
#include <mpi_interface/MPI_ops.hpp>
#include <mpi_interface/serialize_tuple.h>
Thomas Purcell's avatar
Thomas Purcell committed
16
#include <feature_creation/node/ModelNode.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
17
#include <feature_creation/node/operator_nodes/allowed_ops.hpp>
18
#include <feature_creation/node/utils.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
19
#include <feature_creation/node/value_storage/nodes_value_containers.hpp>
20
#include <utils/compare_features.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
21
#include <utils/project.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
22

Thomas Purcell's avatar
Thomas Purcell committed
23
#include <boost/serialization/shared_ptr.hpp>
24
#include <boost/filesystem.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
25

Thomas Purcell's avatar
Thomas Purcell committed
26
#include <iostream>
Thomas Purcell's avatar
Thomas Purcell committed
27
#include <iomanip>
28
#include <utility>
Thomas Purcell's avatar
Thomas Purcell committed
29

30
31
32
33
#ifdef PY_BINDINGS
    namespace np = boost::python::numpy;
    namespace py = boost::python;
#endif
34

35
// DocString: cls_feat_space
36
37
38
39
40
/**
 * @brief Feature Space for SISSO calculations
 * @details Stores and performs all feature calculations for SIS
 *
 */
Thomas Purcell's avatar
Thomas Purcell committed
41
42
class FeatureSpace
{
43
    std::vector<node_ptr> _phi_selected; //!< selected features
44
    std::vector<node_ptr> _phi; //!< all features
45
    const std::vector<node_ptr> _phi_0; //!< initial feature space
46

47
    #ifdef PARAMETERIZE
Thomas Purcell's avatar
Thomas Purcell committed
48
49
50
        std::vector<un_param_op_node_gen> _un_param_operators; //!< list of all parameterized unary operators with free parameters
        std::vector<bin_param_op_node_gen> _com_bin_param_operators; //!< list of all parameterized commutable binary operators with free parameters
        std::vector<bin_param_op_node_gen> _bin_param_operators; //!< list of all parameterized binary operators with free parameters
51
    #endif
52

Thomas Purcell's avatar
Thomas Purcell committed
53
    std::vector<std::string> _allowed_param_ops; //!< list of the allowed parameterized operator strings (operators with non-linear parameters used for a non-linear least squares fit to property)
54
55
56
57
58
    std::vector<std::string> _allowed_ops; //!< list of all allowed operators strings
    std::vector<un_op_node_gen> _un_operators; //!< list of all unary operators
    std::vector<bin_op_node_gen> _com_bin_operators; //!< list of all commutable binary operators
    std::vector<bin_op_node_gen> _bin_operators; //!< list of all binary operators

59
    std::vector<double> _prop; //!< The property to fit
60
61
    std::vector<double> _scores; //!< projection scores for each feature

62
    const std::vector<int> _task_sizes; //!< The number of elements in each task (training data)
63
    std::vector<int> _start_gen; //!< list of the indexes where each generation starts in _phi
Thomas Purcell's avatar
Thomas Purcell committed
64
    const std::string _project_type; //!< The type of projection that should be done during SIS
65
66
    const std::string _feature_space_file; //!< File to store information about the selected features
    const std::string _feature_space_summary_file; //!< File to store a summary of the selected features (NOTE(review): exact contents are not visible in this header -- confirm against the .cpp)
67

68
69
    std::function<void(double*, double*, std::vector<node_ptr>&, const std::vector<int>&, int)> _project; //!< Function used to calculate the scores for SIS
    std::function<void(double*, double*, std::vector<node_ptr>&, const std::vector<int>&, int)> _project_no_omp; //!< Function used to calculate the scores for SIS without changing omp environment
70
    std::function<bool(double*, int, double, std::vector<double>&, double, int, int)> _is_valid; //!< Function returning whether a candidate feature passes the validity check used during SIS (NOTE(review): presumably the cross-correlation test from utils/compare_features.hpp -- confirm)
Thomas Purcell's avatar
Bug fix    
Thomas Purcell committed
71
    std::function<bool(double*, int, double, std::vector<node_ptr>&, std::vector<double>&, double)> _is_valid_feat_list; //!< Function returning whether a candidate feature passes the validity check against a list of features and their scores (NOTE(review): presumably the cross-correlation test from utils/compare_features.hpp -- confirm)
72

73
    std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI communicator
74

75
76
77
    const double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
    const double _l_bound; //!< lower bound for absolute value of the features
    const double _u_bound; //!< upper bound for absolute value of the features
78

Thomas Purcell's avatar
Thomas Purcell committed
79
    int _n_rung_store; //!< Total rungs stored
80
81
    int _n_feat; //!< Total number of features
    int _max_phi; //!< Maximum rung for the feature creation
82

83
84
85
    const int _n_sis_select; //!< Number of features to select for each dimension
    const int _n_samp; //!< Number of samples (training data)
    const int _n_rung_generate; //!< Total number of rungs to generate on the fly
86

87
88
    int _max_param_depth; //!< Max depth to parameterize a feature (NOTE(review): the previous comment gave the default as _max_rung, which is not a member of this class; presumably _max_phi is meant -- confirm in the .cpp)

Thomas Purcell's avatar
Thomas Purcell committed
89
public:
Thomas Purcell's avatar
Thomas Purcell committed
90

91
92
    /**
     * @brief Constructor for the feature space
93
     * @details constructs the feature space from an initial set of features and a list of allowed operators
94
95
     *
     * @param mpi_comm MPI communicator for the calculations
96
     * @param phi_0 The initial set of features to combine
97
     * @param allowed_ops list of allowed operators
98
     * @param allowed_param_ops list of the allowed parameterizable operators
99
     * @param prop The property to be learned (training data)
100
101
     * @param task_sizes The number of samples per task
     * @param project_type The projection operator to use
102
103
     * @param max_phi highest rung value for the calculation
     * @param n_sis_select number of features to select during each SIS step
104
105
     * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
     * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
106
     * @param cross_corr_max Maximum cross-correlation used for selecting features
107
     * @param min_abs_feat_val minimum absolute feature value
108
109
     * @param max_abs_feat_val maximum absolute feature value
     * @param max_param_depth maximum depth to parameterize a feature (defaults to -1)
     */
Thomas Purcell's avatar
Thomas Purcell committed
110
    FeatureSpace(
Thomas Purcell's avatar
Thomas Purcell committed
111
        std::shared_ptr<MPI_Interface> mpi_comm,
Thomas Purcell's avatar
Thomas Purcell committed
112
113
        std::vector<node_ptr> phi_0,
        std::vector<std::string> allowed_ops,
Thomas Purcell's avatar
Thomas Purcell committed
114
        std::vector<std::string> allowed_param_ops,
115
        std::vector<double> prop,
Thomas Purcell's avatar
Thomas Purcell committed
116
        std::vector<int> task_sizes,
117
        std::string project_type="regression",
Thomas Purcell's avatar
Thomas Purcell committed
118
119
        int max_phi=1,
        int n_sis_select=1,
120
121
        int max_store_rung=-1,
        int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
122
        double cross_corr_max=1.0,
123
        double min_abs_feat_val=1e-50,
124
        double max_abs_feat_val=1e50,
Thomas Purcell's avatar
Thomas Purcell committed
125
        int max_param_depth = -1
126
127
    );

128
129
130
    /**
     * @brief Initialize the feature set given a property vector
     */
Thomas Purcell's avatar
Thomas Purcell committed
131
    void initialize_fs();
132

133
134
135
136
137
138
139
140
141
    /**
     * @brief Uses _allowed_ops to set the operator lists
     */
    void set_op_lists();

    /**
     * @brief Initializes the output files for SIS
     */
    void initialize_fs_output_files();
142
143
144
145
    /**
     * @brief Generate the full feature set from the allowed operators and initial feature set
     * @details populates phi with all features from an initial set and the allowed operators
     */
146
    void generate_feature_space();
Thomas Purcell's avatar
Thomas Purcell committed
147

148
    /**
149
     * @brief The selected feature space
150
     */
151
    inline std::vector<node_ptr> phi_selected()
    {
        // Returned by value: the caller gets its own copy of the selected-feature list
        return _phi_selected;
    }
152
153

    /**
154
     * @brief The full feature space
155
     */
Thomas Purcell's avatar
Thomas Purcell committed
156
    inline std::vector<node_ptr> phi()
    {
        // Returned by value: the caller gets its own copy of the full feature list
        return _phi;
    }
157
158

    /**
159
     * @brief The initial feature space
160
     */
Thomas Purcell's avatar
Thomas Purcell committed
161
    inline std::vector<node_ptr> phi0()
    {
        // Returned by value: the caller gets its own copy of the initial feature list
        return _phi_0;
    }
162
163

    /**
164
     * @brief The vector of projection scores for SIS
165
     */
166
167
    inline std::vector<double> scores()
    {
        // Returned by value: the caller gets its own copy of the projection scores
        return _scores;
    }

168
    /**
169
     * @brief The MPI Communicator
170
     */
Thomas Purcell's avatar
Thomas Purcell committed
171
    inline std::shared_ptr<MPI_Interface> mpi_comm()
    {
        // Returns a copy of the shared_ptr, so the caller shares ownership of the communicator
        return _mpi_comm;
    }
172

173
    /**
174
     * @brief The vector storing the number of samples in each task
175
     */
Thomas Purcell's avatar
Thomas Purcell committed
176
    inline std::vector<int> task_sizes()
    {
        // Returned by value: the caller gets its own copy of the per-task sample counts
        return _task_sizes;
    }
177

178
    // DocString: feat_space_feature_space_file
179
    /**
180
     * @brief The feature space filename
181
     */
182
    inline std::string feature_space_file()
    {
        // Returned by value: a copy of the stored filename
        return _feature_space_file;
    }
183

184
    // DocString: feat_space_l_bound
185
    /**
186
     * @brief The minimum absolute value of the feature
187
     */
188
    inline double l_bound()
    {
        // Read-only access to the stored lower bound
        return _l_bound;
    }
189

190
    // DocString: feat_space_u_bound
191
    /**
192
     * @brief The maximum absolute value of the feature
193
     */
194
    inline double u_bound()
    {
        // Read-only access to the stored upper bound
        return _u_bound;
    }
195

196
    // DocString: feat_space_max_phi
197
    /**
198
     * @brief The maximum rung of the feature space
199
     */
200
    inline int max_phi()
    {
        // Read-only access to the stored maximum rung
        return _max_phi;
    }
201

202
    // DocString: feat_space_n_sis_select
203
    /**
204
     * @brief The number of features selected in each SIS step
205
     */
206
    inline int n_sis_select()
    {
        // Read-only access to the number of features selected per SIS step
        return _n_sis_select;
    }
207

208
    // DocString: feat_space_n_samp
209
    /**
210
     * @brief The number of samples per feature
211
     */
212
    inline int n_samp()
    {
        // Read-only access to the stored sample count
        return _n_samp;
    }
213

214
    // DocString: feat_space_n_feat
215
    /**
216
     * @brief The number of features in the feature space
217
     */
218
    inline int n_feat()
    {
        // Read-only access to the stored feature count
        return _n_feat;
    }
219

220
    // DocString: feat_space_n_rung_store
221
    /**
222
     * @brief The number of rungs whose feature training data is stored in memory
223
     */
224
    inline int n_rung_store()
    {
        // Read-only access to the number of stored rungs
        return _n_rung_store;
    }
225

226
    // DocString: feat_space_n_rung_generate
227
    /**
228
     * @brief The number of rungs to be generated on the fly during SIS
229
     */
230
    inline int n_rung_generate()
    {
        // Read-only access to the number of rungs generated on the fly
        return _n_rung_generate;
    }
231

Thomas Purcell's avatar
Thomas Purcell committed
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
    #ifdef PARAMETERIZE
        /**
         * @brief Generate a new set of features from a single feature
         * @details Take in the feature and perform all valid algebraic operations on it.
         *
         * @param feat The feature to spawn new features from
         * @param feat_set The feature set to pull features from for combinations
         * @param feat_ind starting index for the next feature generated
         * @param optimizer The object used to optimize the parameterized features
         * @param l_bound lower bound for the absolute value of the feature
         * @param u_bound upper bound for the absolute value of the feature
         */
        void generate_new_feats(
            std::vector<node_ptr>::iterator& feat,
            std::vector<node_ptr>& feat_set,
247
            unsigned long int& feat_ind,
Thomas Purcell's avatar
Thomas Purcell committed
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
            std::shared_ptr<NLOptimizer> optimizer,
            double l_bound=1e-50,
            double u_bound=1e50
        );
    #else
        /**
         * @brief Generate a new set of features from a single feature
         * @details Take in the feature and perform all valid algebraic operations on it.
         *
         * @param feat The feature to spawn new features from
         * @param feat_set The feature set to pull features from for combinations
         * @param feat_ind starting index for the next feature generated
         * @param l_bound lower bound for the absolute value of the feature
         * @param u_bound upper bound for the absolute value of the feature
         */
        void generate_new_feats(
            std::vector<node_ptr>::iterator& feat,
            std::vector<node_ptr>& feat_set,
266
            unsigned long int& feat_ind,
Thomas Purcell's avatar
Thomas Purcell committed
267
268
269
270
            double l_bound=1e-50,
            double u_bound=1e50
        );
    #endif
271

272
273
274
275
276
277
278
279
280
281
    /**
     * @brief Calculate the SIS scores for features generated on the fly
     * @details Create the next rung of features and calculate their projection scores. Only keep those that can be selected by SIS.
     *
     * @param prop Pointer to the start of the vector storing the data to project the features onto
     * @param size The size of the data to project over
     * @param phi_selected The features that would be selected from the previous rungs
     * @param scores_selected The projection scores of the features that would be selected from the previous rungs
     */
282
    void project_generated(double* prop, int size, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected);
283

284
285
    /**
     * @brief Perform SIS on a feature set with a specified property
286
     * @details Perform sure-independence screening with either the correct property or the error
287
     *
288
     * @param prop The property to perform SIS over
289
     */
Thomas Purcell's avatar
Thomas Purcell committed
290
    void sis(std::vector<double>& prop);
291

292
    // DocString: feat_space_feat_in_phi
293
294
295
    /**
     * @brief Is a feature in this process' _phi?
     *
296
297
     * @param ind The index of the feature
     * @return True if feature is in this rank's _phi
298
299
300
     */
    inline bool feat_in_phi(int ind)
    {
        // An empty _phi holds no features; return early so the first/last elements
        // are never dereferenced on an empty vector (undefined behaviour).
        if(_phi.empty())
        {
            return false;
        }

        // Range test against the feat_ind() of the first and last stored features.
        // NOTE(review): this assumes _phi is ordered by feat_ind() -- confirm against the code that fills _phi.
        return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());
    }

301
302
303
304
305
306
307
308
    // DocString: feat_space_remove_feature
    /**
     * @brief Remove a feature from phi
     *
     * @param ind index of feature to remove
     */
    void remove_feature(int ind);

309
310
311
    // Python Interface Functions
    #ifdef PY_BINDINGS
        /**
312
313
         * @brief Constructor for the feature space that takes in python objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
314
         *
315
         * @param phi_0 The initial set of features to combine
316
         * @param allowed_ops list of allowed operators
317
         * @param allowed_param_ops list of the allowed parameterizable operators
318
         * @param prop The property to be learned (training data)
319
320
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
321
322
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
323
324
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
325
         * @param cross_corr_max Maximum cross-correlation used for selecting features
326
         * @param min_abs_feat_val minimum absolute feature value
327
328
329
330
331
         * @param max_abs_feat_val maximum absolute feature value
         * @param max_param_depth maximum depth to parameterize a feature (defaults to -1)
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
Thomas Purcell's avatar
Thomas Purcell committed
332
            py::list allowed_param_ops,
333
334
            py::list prop,
            py::list task_sizes,
Thomas Purcell's avatar
Thomas Purcell committed
335
            std::string project_type="regression",
336
337
338
339
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
340
            double cross_corr_max=1.0,
341
            double min_abs_feat_val=1e-50,
342
            double max_abs_feat_val=1e50,
Thomas Purcell's avatar
Thomas Purcell committed
343
            int max_param_depth = -1
344
345
346
        );

        /**
347
348
         * @brief Constructor for the feature space that takes in python and numpy objects
         * @details constructs the feature space from an initial set of features and a list of allowed operators (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
349
         *
350
         * @param phi_0 The initial set of features to combine
351
         * @param allowed_ops list of allowed operators
352
         * @param allowed_param_ops list of the allowed parameterizable operators
353
         * @param prop The property to be learned (training data)
354
355
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
356
357
         * @param max_phi highest rung value for the calculation
         * @param n_sis_select number of features to select during each SIS step
358
359
         * @param max_store_rung number of rungs to calculate and store the value of the features for all samples
         * @param n_rung_generate number of rungs to generate on the fly during SIS (this must be 1 or 0 right now, possible to be higher with recursive algorithm)
Thomas Purcell's avatar
Thomas Purcell committed
360
         * @param cross_corr_max Maximum cross-correlation used for selecting features
361
         * @param min_abs_feat_val minimum absolute feature value
362
363
364
365
366
         * @param max_abs_feat_val maximum absolute feature value
         * @param max_param_depth maximum depth to parameterize a feature (defaults to -1)
         */
        FeatureSpace(
            py::list phi_0,
            py::list allowed_ops,
Thomas Purcell's avatar
Thomas Purcell committed
367
            py::list allowed_param_ops,
368
369
            np::ndarray prop,
            py::list task_sizes,
Thomas Purcell's avatar
Thomas Purcell committed
370
            std::string project_type="regression",
371
372
373
374
            int max_phi=1,
            int n_sis_select=1,
            int max_store_rung=-1,
            int n_rung_generate=0,
Thomas Purcell's avatar
Thomas Purcell committed
375
            double cross_corr_max=1.0,
376
            double min_abs_feat_val=1e-50,
377
            double max_abs_feat_val=1e50,
Thomas Purcell's avatar
Thomas Purcell committed
378
            int max_param_depth = -1
379
380
        );

381
382
383
384
385
386
        /**
         * @brief Constructor for the feature space that takes in python and numpy objects
         * @details constructs the feature space from an initial set of features and a file containing postfix expressions for the features (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         *
         * @param feature_file The file with the postfix expressions for the feature space
         * @param phi_0 The initial set of features to combine
387
         * @param prop The property to be learned (training data)
388
389
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
390
391
392
393
394
395
         * @param n_sis_select number of features to select during each SIS step
         * @param cross_corr_max Maximum cross-correlation used for selecting features
         */
        FeatureSpace(
            std::string feature_file,
            py::list phi_0,
396
397
398
399
400
401
402
403
404
405
406
407
408
409
            np::ndarray prop,
            py::list task_sizes,
            std::string project_type="pearson",
            int n_sis_select=1,
            double cross_corr_max=1.0
        );

        /**
         * @brief Constructor for the feature space that takes in python objects
         * @details constructs the feature space from an initial set of features and a file containing postfix expressions for the features (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         *
         * @param feature_file The file with the postfix expressions for the feature space
         * @param phi_0 The initial set of features to combine
         * @param prop The property to be learned (training data)
410
411
         * @param task_sizes The number of samples per task
         * @param project_type The projection operator to use
412
413
414
415
416
417
         * @param n_sis_select number of features to select during each SIS step
         * @param cross_corr_max Maximum cross-correlation used for selecting features
         */
        FeatureSpace(
            std::string feature_file,
            py::list phi_0,
418
            py::list prop,
419
            py::list task_sizes,
420
            std::string project_type="pearson",
421
422
423
424
            int n_sis_select=1,
            double cross_corr_max=1.0
        );

425
        // DocString: feat_space_sis_arr
426
427
428
        /**
         * @brief Wrapper function for SIS using a numpy array
         *
429
         * @param prop(np.ndarray) The property to perform SIS over as a numpy array
430
         */
431
432
433
434
435
        inline void sis(np::ndarray prop)
        {
            // The std::vector overload of sis takes a non-const reference,
            // so the converted data has to live in a named local first.
            std::vector<double> prop_from_arr = python_conv_utils::from_ndarray<double>(prop);
            sis(prop_from_arr);
        }
436
437

        // DocString: feat_space_sis_list
438
439
440
        /**
         * @brief Wrapper function for SIS using a python list
         *
441
         * @param prop(list) The property to perform SIS over as a python list
442
         */
443
444
445
446
447
448
        inline void sis(py::list prop)
        {
            // The std::vector overload of sis takes a non-const reference,
            // so the converted data has to live in a named local first.
            std::vector<double> prop_from_list = python_conv_utils::from_list<double>(prop);
            sis(prop_from_list);
        }

449
        // DocString: feat_space_phi_selected_py
450
        /**
451
         * @brief The selected feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
452
453
         * @return _phi_selected as a python list
         */
454
        py::list phi_selected_py();
455

456
        // DocString: feat_space_phi0_py
457
        /**
458
         * @brief The initial feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
459
460
         * @return _phi_0 as a python list
         */
461
        py::list phi0_py();
462

463
464
465
466
467
468
469
        // DocString: feat_space_phi_py
        /**
         * @brief The feature space (cpp definition in <python/feature_creation/FeatureSpace.cpp>)
         * @return _phi as a python list
         */
        py::list phi_py();

470
        // DocString: feat_space_scores_py
471
        /**
472
         * @brief The vector of projection scores for SIS
473
474
         * @return _scores as a numpy array
         */
475
        inline np::ndarray scores_py()
        {
            // Convert _scores through the project's to_ndarray helper
            return python_conv_utils::to_ndarray<double>(_scores);
        }
476

477
        // DocString: feat_space_task_sizes_py
478
        /**
479
         * @brief The vector storing the number of samples in each task
480
481
         * @return _task_sizes as a python list
         */
482
        inline py::list task_sizes_py()
        {
            // Convert _task_sizes through the project's to_list helper
            return python_conv_utils::to_list<int>(_task_sizes);
        }
483

484
        // DocString: feat_space_allowed_ops_py
485
        /**
486
         * @brief The list of allowed operator nodes
487
488
         * @return _allowed_ops as a python list
         */
489
        inline py::list allowed_ops_py()
        {
            // Convert _allowed_ops through the project's to_list helper
            return python_conv_utils::to_list<std::string>(_allowed_ops);
        }
490

491
        // DocString: feat_space_start_gen_py
492
        /**
493
         * @brief The index in _phi where each generation starts
494
495
         * @return _start_gen as a python list
         */
496
        inline py::list start_gen_py()
        {
            // Convert _start_gen through the project's to_list helper
            return python_conv_utils::to_list<int>(_start_gen);
        }
497

498
499
500
501
502
503
504
        // DocString: feat_space_get_feature
        /**
         * @brief Return a feature at a specified index
         *
         * @param ind index of the feature to get
         * @return A ModelNode of the feature at index ind
         */
505
        inline ModelNode get_feature(int ind)
        {
            // The index comes from Python and may be negative or past the end of _phi.
            // at() bounds-checks it and throws std::out_of_range instead of reading
            // outside the vector (a negative int converts to a very large unsigned
            // index, which is also rejected).
            return ModelNode(_phi.at(ind));
        }
506
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
507
508
};

509
#endif