// Copyright 2021 Thomas A. R. Purcell
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/** @file feature_creation/feature_space/FeatureSpace.hpp
 *  @brief Defines the class for creating/operating on a feature space in SISSO
 *
 *  @author Thomas A. R. Purcell (tpurcell90)
 *  @bug No known bugs.
 */

#ifndef FEATURE_SPACE
#define FEATURE_SPACE

25
#include <boost/filesystem.hpp>
Thomas Purcell's avatar
Thomas Purcell committed
26

27
#include <utility>
Thomas Purcell's avatar
Thomas Purcell committed
28

Thomas Purcell's avatar
Thomas Purcell committed
29
#include "feature_creation/node/utils.hpp"
30
#include "inputs/InputParser.hpp"
Thomas Purcell's avatar
Thomas Purcell committed
31
32

#include "mpi_interface/MPI_Interface.hpp"
33
#include "mpi_interface/MPI_Ops.hpp"
Thomas Purcell's avatar
Thomas Purcell committed
34
35
#include "mpi_interface/serialize_tuple.h"

Thomas Purcell's avatar
Thomas Purcell committed
36
#include "utils/project.hpp"
37
#include "mpi_interface/MPI_Interface.hpp"
Thomas Purcell's avatar
Thomas Purcell committed
38

39
40
41
42
#ifdef PY_BINDINGS
    namespace np = boost::python::numpy;
    namespace py = boost::python;
#endif
43

44
// DocString: cls_feat_space
45
/**
46
 * @brief Feature Space for SISSO calculations. It stores and performs all actions on the feature space for SISSO.
47
48
 *
 */
Thomas Purcell's avatar
Thomas Purcell committed
49
50
class FeatureSpace
{
Thomas Purcell's avatar
Thomas Purcell committed
51
52
    std::vector<node_ptr> _phi_selected; //!< A vector containing all of the selected features
    std::vector<node_ptr> _phi; //!< A vector containing all features generated (Not including those created on the Fly during SIS)
53
    std::vector<node_ptr> _phi_0; //!< A vector containing all of the Primary features
54

55
    #ifdef PARAMETERIZE
Thomas Purcell's avatar
Thomas Purcell committed
56
57
58
59
60
61
62
63
    std::vector<node_ptr> _phi_reparam; //!< A vector containing the features created when reparameterizating using the residuals
    std::vector<int> _end_no_params; //!< A vector containing the indexes of each rung where parameterized nodes start
    std::vector<int> _start_rung_reparam; //!< A vector containing the indexes of each rung where parameterized nodes start

    std::vector<un_param_op_node_gen> _un_param_operators; //!< Vector containing all parameterized unary operators with free parameters
    std::vector<bin_param_op_node_gen> _com_bin_param_operators; //!< Vector containing all parameterized commutable binary operators with free parameters
    std::vector<bin_param_op_node_gen> _bin_param_operators; //!< Vector containing all parameterized binary operators with free parameters
    std::vector<std::string> _allowed_param_ops; //!< Vector containing all allowed operators strings for operators with free parameters
64
    #endif
65

Thomas Purcell's avatar
Thomas Purcell committed
66
67
68
69
    std::vector<std::string> _allowed_ops; //!< Vector containing all allowed operators strings
    std::vector<un_op_node_gen> _un_operators; //!< Vector containing all unary operators
    std::vector<bin_op_node_gen> _com_bin_operators; //!< Vector containing all commutable binary operators
    std::vector<bin_op_node_gen> _bin_operators; //!< Vector containing all binary operators
70

71
    std::vector<double> _prop_train; //!< The value of the property vector for each training sample
Thomas Purcell's avatar
Thomas Purcell committed
72
    std::vector<double> _scores; //!< The projection scores for each feature
73

74
    const std::vector<int> _task_sizes_train; //!< Number of training samples per task
Thomas Purcell's avatar
Thomas Purcell committed
75
76
77
78
    std::vector<int> _start_rung; //!< Vector containing the indexes where each rung starts in _phi
    const std::string _project_type; //!< The type of LossFunction to use when projecting the features onto a property
    const std::string _feature_space_file; //!< File to output the computer readable representation of the selected features to
    const std::string _feature_space_summary_file; //!< File to output the human readable representation of the selected features to
79

Thomas Purcell's avatar
Thomas Purcell committed
80
81
    std::function<bool(const double*, const int, const double, const std::vector<double>&, const double, const int, const int)> _is_valid; //!< Function used to determine of a feature is too correlated to previously selected features
    std::function<bool(const double*, const int, const double, const std::vector<node_ptr>&, const std::vector<double>&, const double)> _is_valid_feat_list; //!< Function used to determine of a feature is too correlated to previously selected features within a given list
82

Thomas Purcell's avatar
Thomas Purcell committed
83
    std::shared_ptr<MPI_Interface> _mpi_comm; //!< the MPI communicator for the calculation
84

85
    const double _cross_cor_max; //!< Maximum cross-correlation used for selecting features
Thomas Purcell's avatar
Thomas Purcell committed
86
87
    const double _l_bound; //!< The lower bound for the maximum absolute value of the features
    const double _u_bound; //!< The upper bound for the maximum absolute value of the features
88

Thomas Purcell's avatar
Thomas Purcell committed
89
90
91
    int _n_rung_store; //!< The number of rungs to calculate and store the value of the features for all samples
    int _n_feat; //!< Total number of features in the feature space
    int _max_rung; //!< Maximum rung for the feature creation
92

Thomas Purcell's avatar
Thomas Purcell committed
93
    const int _n_sis_select; //!< Number of features to select during each SIS iteration
94
    const int _n_samp_train; //!< Number of samples in the training set
Thomas Purcell's avatar
Thomas Purcell committed
95
    const int _n_rung_generate; //!< Either 0 or 1, and is the number of rungs to generate on the fly during SIS
96

97
    #ifdef PARAMETERIZE
Thomas Purcell's avatar
Thomas Purcell committed
98
    int _max_param_depth; //!< The maximum depth in the binary expression tree to set non-linear optimization
99
100
    bool _reparam_residual; //!< If True then reparameterize features using the residuals of each model
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
101
public:
Thomas Purcell's avatar
Thomas Purcell committed
102

103
    // DocString: feat_space_init
104
    /**
105
     * @brief Construct a FeatureSpace using an InputParser object
106
     *
107
     * @param inputs InputParser object used to build the FeatureSpace
108
     */
109
    FeatureSpace(InputParser inputs);
110

111
    /**
Thomas Purcell's avatar
Thomas Purcell committed
112
     * @brief Populate the operator lists using _allowed_ops and _allowed_param_ops
113
114
115
116
     */
    void set_op_lists();

    /**
Thomas Purcell's avatar
Thomas Purcell committed
117
     * @brief Create SIS output files and write their headers
118
     */
119
    void initialize_fs_output_files() const;
Thomas Purcell's avatar
Thomas Purcell committed
120

121
    /**
Thomas Purcell's avatar
Thomas Purcell committed
122
     * @brief Populate _phi using _phi_0 and the allowed operators up to (_max_rung - _n_rung_generate)^th rung
123
     */
124
    void generate_feature_space();
Thomas Purcell's avatar
Thomas Purcell committed
125

126
    /**
Thomas Purcell's avatar
Thomas Purcell committed
127
     * @brief A vector containing all of the selected features
128
     */
129
    inline std::vector<node_ptr> phi_selected() const {return _phi_selected;};
130
131

    /**
Thomas Purcell's avatar
Thomas Purcell committed
132
     * @brief A vector containing all features generated (Not including those created on the Fly during SIS)
133
     */
134
    inline std::vector<node_ptr> phi() const {return _phi;};
135
136

    /**
Thomas Purcell's avatar
Thomas Purcell committed
137
     * @brief A vector containing all of the Primary features
138
     */
139
    inline std::vector<node_ptr> phi0() const {return _phi_0;};
140
141

    /**
Thomas Purcell's avatar
Thomas Purcell committed
142
     * @brief The projection scores for each feature in _phi
143
     */
144
    inline std::vector<double> scores() const {return _scores;}
145

146
    /**
147
     * @brief The MPI Communicator
148
     */
149
    inline std::shared_ptr<MPI_Interface> mpi_comm() const {return _mpi_comm;}
150

151
    /**
Thomas Purcell's avatar
Thomas Purcell committed
152
     * @brief Number of training samples per task
153
     */
154
    inline std::vector<int> task_sizes_train() const {return _task_sizes_train;}
155

156
    // DocString: feat_space_feature_space_file
157
    /**
Thomas Purcell's avatar
Thomas Purcell committed
158
     * @brief Filename of the file to output the computer readable representation of the selected features to
159
     */
160
    inline std::string feature_space_file() const {return _feature_space_file;}
161

Thomas Purcell's avatar
Thomas Purcell committed
162
163
164
165
166
167
    // DocString: feat_space_feature_space_file
    /**
     * @brief Filename of the file to output the human readable representation of the selected features to
     */
    inline std::string feature_space_summary_file() const {return _feature_space_summary_file;}

168
    // DocString: feat_space_l_bound
169
    /**
Thomas Purcell's avatar
Thomas Purcell committed
170
     * @brief The mlower bound for the maximum absolute value of the features
171
     */
172
    inline double l_bound() const {return _l_bound;}
173

174
    // DocString: feat_space_u_bound
175
    /**
Thomas Purcell's avatar
Thomas Purcell committed
176
     * @brief The upper bound for the maximum absolute value of the features
177
     */
178
    inline double u_bound() const {return _u_bound;}
179

Thomas Purcell's avatar
Thomas Purcell committed
180
    // DocString: feat_space_max_rung
181
    /**
Thomas Purcell's avatar
Thomas Purcell committed
182
     * @brief The maximum rung for the feature creation
183
     */
Thomas Purcell's avatar
Thomas Purcell committed
184
    inline int max_rung() const {return _max_rung;}
185

186
    // DocString: feat_space_n_sis_select
187
    /**
Thomas Purcell's avatar
Thomas Purcell committed
188
     * @brief The number of features to select during each SIS iteration
189
     */
190
    inline int n_sis_select() const {return _n_sis_select;}
191

192
    // DocString: feat_space_n_samp_train
193
    /**
Thomas Purcell's avatar
Thomas Purcell committed
194
     * @brief The nuumber of samples in the training set
195
     */
196
    inline int n_samp_train() const {return _n_samp_train;}
197

198
    // DocString: feat_space_n_feat
199
    /**
Thomas Purcell's avatar
Thomas Purcell committed
200
     * @brief The total number of features in the feature space
201
     */
202
    inline int n_feat() const {return _n_feat;}
203

204
    // DocString: feat_space_n_rung_store
205
    /**
Thomas Purcell's avatar
Thomas Purcell committed
206
     * @brief The number of rungs to calculate and store the value of the features for all samples
207
     */
208
    inline int n_rung_store() const {return _n_rung_store;}
209

210
    // DocString: feat_space_n_rung_generate
211
    /**
212
     * @brief Either 0 or 1, and is the number of rungs to generate on the fly during SIS
213
     */
214
    inline int n_rung_generate() const {return _n_rung_generate;}
215

Thomas Purcell's avatar
Thomas Purcell committed
216
217
    /**
     * @brief Generate a new set of non-parameterized features from a single feature
Thomas Purcell's avatar
Thomas Purcell committed
218
     * @details Perform all valid algebraic operations on the passed feature and all features that appear before it in _phi.
Thomas Purcell's avatar
Thomas Purcell committed
219
220
     *
     * @param feat The feature to spawn new features from
Thomas Purcell's avatar
Thomas Purcell committed
221
222
     * @param feat_set The feature set to pull features from for binary operations
     * @param start The point in feat_set to begin pulling features from for binary operations
Thomas Purcell's avatar
Thomas Purcell committed
223
     * @param feat_ind starting index for the next feature generated
Thomas Purcell's avatar
Thomas Purcell committed
224
225
     * @param l_bound lower bound for the maximum absolute value of the feature
     * @param u_bound upper bound for the maximum abosulte value of the feature
Thomas Purcell's avatar
Thomas Purcell committed
226
227
228
229
     */
    void generate_non_param_feats(
        std::vector<node_ptr>::iterator& feat,
        std::vector<node_ptr>& feat_set,
230
        const std::vector<node_ptr>::iterator& start,
Thomas Purcell's avatar
Thomas Purcell committed
231
232
233
234
235
        unsigned long int& feat_ind,
        const double l_bound=1e-50,
        const double u_bound=1e50
    );

236
#ifdef PARAMETERIZE
237
    /**
Thomas Purcell's avatar
Thomas Purcell committed
238
     * @brief Generate a new set of parameterized features from a single feature
Thomas Purcell's avatar
Thomas Purcell committed
239
     * @details Perform all valid algebraic operations on the passed feature and all features that appear before it in _phi.
240
241
     *
     * @param feat The feature to spawn new features from
Thomas Purcell's avatar
Thomas Purcell committed
242
243
     * @param feat_set The feature set to pull features from for binary operations
     * @param start The point in feat_set to begin pulling features from for binary operations
244
245
     * @param feat_ind starting index for the next feature generated
     * @param optimizer The object used to optimize the parameterized features
Thomas Purcell's avatar
Thomas Purcell committed
246
247
     * @param l_bound lower bound for the maximum absolute value of the feature
     * @param u_bound upper bound for the maximum abosulte value of the feature
248
     */
Thomas Purcell's avatar
Thomas Purcell committed
249
    void generate_param_feats(
250
251
        std::vector<node_ptr>::iterator& feat,
        std::vector<node_ptr>& feat_set,
252
        const std::vector<node_ptr>::iterator& start,
253
254
        unsigned long int& feat_ind,
        std::shared_ptr<NLOptimizer> optimizer,
255
256
        const double l_bound=1e-50,
        const double u_bound=1e50
257
    );
Thomas Purcell's avatar
Thomas Purcell committed
258

259
    /**
Thomas Purcell's avatar
Thomas Purcell committed
260
     * @brief Generate a new set of parameterized features for the residuals
261
262
     *
     * @param feat The feature to spawn new features from
Thomas Purcell's avatar
Thomas Purcell committed
263
     * @param feat_set The feature set to pull features from for binary operations
264
     * @param feat_ind starting index for the next feature generated
Thomas Purcell's avatar
Thomas Purcell committed
265
     * @param optimizer The object used to optimize the parameterized features
Thomas Purcell's avatar
Thomas Purcell committed
266
267
     * @param l_bound lower bound for the maximum absolute value of the feature
     * @param u_bound upper bound for the maximum abosulte value of the feature
268
     */
Thomas Purcell's avatar
Thomas Purcell committed
269
    void generate_reparam_feats(
270
271
272
        std::vector<node_ptr>::iterator& feat,
        std::vector<node_ptr>& feat_set,
        unsigned long int& feat_ind,
Thomas Purcell's avatar
Thomas Purcell committed
273
        std::shared_ptr<NLOptimizer> optimizer,
274
275
        const double l_bound=1e-50,
        const double u_bound=1e50
276
    );
Thomas Purcell's avatar
Thomas Purcell committed
277
278
279
280
281
282
283

    /**
     * @brief Generate reparameterized feature set
     *
     * @param prop The property to optimize against
     */
    void generate_reparam_feature_set(const std::vector<double>& prop);
284
#endif
285

286
    /**
Thomas Purcell's avatar
Thomas Purcell committed
287
     * @brief Generate the final rung of features on the fly and calculate their projection scores for SISat can be selected by SIS.
288
     *
Thomas Purcell's avatar
Thomas Purcell committed
289
290
291
     * @param loss The LossFunction used to project over all of the features
     * @param phi_selected The set of features that would be selected excluding the final rung
     * @param scores_selected The projection scores of all features in phi_selected
292
     */
Thomas Purcell's avatar
Thomas Purcell committed
293
    void generate_and_project(std::shared_ptr<LossFunction> loss, std::vector<node_ptr>& phi_selected, std::vector<double>& scores_selected);
294

295
    /**
296
     * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector
297
     *
298
     * @param prop Vector containing the property vector (training data only)
299
     */
300
    void sis(const std::vector<double>& prop);
301

Thomas Purcell's avatar
Thomas Purcell committed
302
    /**
303
     * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator defined in loss
Thomas Purcell's avatar
Thomas Purcell committed
304
     *
305
     * @param loss The LossFunction used to project over all of the features
Thomas Purcell's avatar
Thomas Purcell committed
306
307
308
     */
    void sis(std::shared_ptr<LossFunction> loss);

309
    // DocString: feat_space_feat_in_phi
310
311
312
    /**
     * @brief Is a feature in this process' _phi?
     *
313
     * @param ind (int) The index of the feature
Thomas Purcell's avatar
Thomas Purcell committed
314
     *
315
     * @return True if feature is in this rank's _phi
316
     */
317
    inline bool feat_in_phi(int ind) const {return (ind >= _phi[0]->feat_ind()) && (ind <= _phi.back()->feat_ind());}
318

319
320
321
322
    // DocString: feat_space_remove_feature
    /**
     * @brief Remove a feature from phi
     *
Thomas Purcell's avatar
Thomas Purcell committed
323
     * @param ind (int) index of feature to remove
324
     */
325
    void remove_feature(const int ind);
326

327
328
    // Python Interface Functions
    #ifdef PY_BINDINGS
Thomas Purcell's avatar
Thomas Purcell committed
329

330
    // DocString: feat_space_init_file_np_array
331
    /**
332
     * @brief FeatureSpace constructor that uses a file containing postfix feature expressions to describe all features in Phi, and a primary feature setn <python/feature_creation/FeatureSpace.cpp>)
333
     *
334
335
336
     * @param feature_file (str) The file containing the postfix expressions of all features in the FeatureSpace
     * @param phi_0 (list) The set of primary features
     * @param prop (np.ndarray) List containing the property vector (training data only)
337
     * @param task_sizes_train (list) The number of samples in the training data per task
338
339
340
     * @param project_type (str) The type of loss function/projection operator to use
     * @param n_sis_select (int) The number of features to select during each SIS step
     * @param cross_corr_max (double) The maximum allowed cross-correlation value between selected features
341
342
343
344
345
     */
    FeatureSpace(
        std::string feature_file,
        py::list phi_0,
        np::ndarray prop,
346
        py::list task_sizes_train,
Thomas Purcell's avatar
Thomas Purcell committed
347
        std::string project_type="regression",
348
349
350
351
        int n_sis_select=1,
        double cross_corr_max=1.0
    );

352
    // DocString: feat_space_init_file_py_list
353
    /**
354
     * @brief FeatureSpace constructor that uses a file containing postfix feature expressions to describe all features in Phi, and a primary feature setn <python/feature_creation/FeatureSpace.cpp>)
355
     *
356
357
358
     * @param feature_file (str) The file containing the postfix expressions of all features in the FeatureSpace
     * @param phi_0 (list) The set of primary features
     * @param prop (list) List containing the property vector (training data only)
359
     * @param task_sizes_train (list) The number of samples in the training data per task
360
361
362
     * @param project_type (str) The type of loss function/projection operator to use
     * @param n_sis_select (int) The number of features to select during each SIS step
     * @param cross_corr_max (double) The maximum allowed cross-correlation value between selected features
363
364
365
366
367
     */
    FeatureSpace(
        std::string feature_file,
        py::list phi_0,
        py::list prop,
368
        py::list task_sizes_train,
Thomas Purcell's avatar
Thomas Purcell committed
369
        std::string project_type="regression",
370
371
372
373
374
375
        int n_sis_select=1,
        double cross_corr_max=1.0
    );

    // DocString: feat_space_sis_arr
    /**
376
     * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector
377
     *
378
     * @param prop (np.ndarray) Array containing the property vector (training data only)
379
380
381
382
383
384
385
386
387
     */
    inline void sis(np::ndarray prop)
    {
        std::vector<double> prop_vec = python_conv_utils::from_ndarray<double>(prop);
        sis(prop_vec);
    }

    // DocString: feat_space_sis_list
    /**
388
     * @brief Perform Sure-Independence Screening over the FeatureSpace. The features are ranked using a projection operator constructed using _project_type and the Property vector
389
     *
390
     * @param prop (list) List containing the property vector (training data only)
391
392
393
394
395
396
397
398
399
     */
    inline void sis(py::list prop)
    {
        std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop);
        sis(prop_vec);
    }

    // DocString: feat_space_phi_selected_py
    /**
Thomas Purcell's avatar
Thomas Purcell committed
400
     * @brief A list containing all of the selected features
401
402
403
404
405
     */
    py::list phi_selected_py();

    // DocString: feat_space_phi0_py
    /**
Thomas Purcell's avatar
Thomas Purcell committed
406
     * @brief A list containing all features generated (Not including those created on the Fly during SIS)
407
     */
Thomas Purcell's avatar
Thomas Purcell committed
408
    py::list phi_py();
409
410
411

    // DocString: feat_space_phi_py
    /**
Thomas Purcell's avatar
Thomas Purcell committed
412
     * @brief A list containing all of the Primary features
413
     */
Thomas Purcell's avatar
Thomas Purcell committed
414
    py::list phi0_py();
415
416
417

    // DocString: feat_space_scores_py
    /**
418
     * @brief An array of all stored projection scores from SIS
419
420
421
     */
    inline np::ndarray scores_py(){return python_conv_utils::to_ndarray<double>(_scores);};

422
    // DocString: feat_space_task_sizes_train_py
423
    /**
424
     * @brief A list of the number of samples in each task for the training data
425
     */
426
    inline py::list task_sizes_train_py(){return python_conv_utils::to_list<int>(_task_sizes_train);};
427
428
429

    // DocString: feat_space_allowed_ops_py
    /**
430
     * @brief The list of allowed operators
431
432
433
     */
    inline py::list allowed_ops_py(){return python_conv_utils::to_list<std::string>(_allowed_ops);}

434
435
436
437
438
439
440
441
442
443
444
445
446
    #ifdef PARAMETERIZE
    // DocString: feat_space_allowed_ops_py
    /**
     * @brief The list of allowed operators
     */
    inline py::list allowed_param_ops_py(){return python_conv_utils::to_list<std::string>(_allowed_param_ops);}
    #else
    // DocString: feat_space_allowed_ops_py
    /**
     * @brief The list of allowed operators
     */
    inline py::list allowed_param_ops_py(){return python_conv_utils::to_list<std::string>({});}
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
447
    // DocString: feat_space_start_rung_py
448
    /**
449
     * @brief A list containing the index of the first feature of each rung in the feature space.
450
     */
Thomas Purcell's avatar
Thomas Purcell committed
451
    inline py::list start_rung_py(){return python_conv_utils::to_list<int>(_start_rung);}
452
453
454

    // DocString: feat_space_get_feature
    /**
455
     * @brief Access the feature in _phi with an index ind
456
     *
457
     * @param ind (int) The index of the feature to get
458
459
     * @return A ModelNode of the feature at index ind
     */
460
    inline ModelNode get_feature(const int ind) const {return ModelNode(_phi[ind]);}
461
    #endif
Thomas Purcell's avatar
Thomas Purcell committed
462
463
};

464
#endif