diff --git a/src/Makefile.am b/src/Makefile.am index 377c4e50132928435738dba70e0205fb1aa0ece0..15983d9b76fd18982374989ad05a699f1c332bd0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -29,6 +29,7 @@ __top_builddir__sisso_cpp_SOURCES = \ feature_creation/node/operator_nodes/allowed_operator_nodes/sin.cpp \ feature_creation/node/operator_nodes/allowed_operator_nodes/cos.cpp \ feature_creation/node/operator_nodes/allowed_ops.cpp \ + utils/project.cpp \ feature_creation/feature_space/FeatureSpace.cpp \ inputs/InputParser.cpp \ descriptor_identifier/Model/Model.cpp \ diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp index 1085dcea0b6cec22dd0a7abb45181eee38c8b0f5..10eb81759d73d1c4dbc52347355abf0c63067f15 100644 --- a/src/descriptor_identifier/Model/Model.cpp +++ b/src/descriptor_identifier/Model/Model.cpp @@ -1,11 +1,10 @@ #include <descriptor_identifier/Model/Model.hpp> -Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats) : +Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test) : _n_samp_train(feats[0]->n_samp()), _n_samp_test(feats[0]->n_test_samp()), _n_dim(feats.size() + 1), _feats(feats), - _coefs(_n_dim), _prop_train(prop_train), _prop_test(prop_test), _train_error(_n_samp_train), @@ -13,47 +12,67 @@ Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std: _D_train(_n_samp_train * _n_dim), _D_test(_n_samp_test * _n_dim), _prop_train_est(_n_samp_train, 0.0), - _prop_test_est(_n_samp_test, 0.0) + _prop_test_est(_n_samp_test, 0.0), + _task_sizes_train(task_sizes_train), + _task_sizes_test(task_sizes_test) { _prop_train_est.reserve(_n_samp_train); _prop_test_est.reserve(_n_samp_test); std::vector<double> a(_n_samp_train * _n_dim, 1.0); - for(int ff = 0; ff < feats.size(); ++ff) - { 
- std::copy_n(feats[ff]->value_ptr(), _n_samp_train, _D_train.data() + ff * _n_samp_train); - std::copy_n(feats[ff]->value_ptr(), _n_samp_train, a.data() + ff * _n_samp_train); - - if(_n_samp_test > 0) - std::copy_n(feats[ff]->test_value().data(), _n_samp_test, _D_test.data() + ff * _n_samp_test); - } - std::copy_n(a.data() + feats.size() * _n_samp_train, _n_samp_train, _D_train.data() + feats.size() * _n_samp_train); - std::vector<double> s(_n_dim, 0.0); std::vector<double> work(_n_dim * _n_samp_train, 0.0); + int rank = 0; int info = 0; + int start = 0; - dgelss_(_n_samp_train, _n_dim, 1, a.data(), _n_samp_train, prop_train.data(), _n_samp_train, s.data(), 1e-13, &rank, work.data(), work.size(), &info); - std::copy_n(prop_train.begin(), _n_dim, _coefs.data()); - - dgemv_('N', _n_samp_train, _n_dim, 1.0, _D_train.data(), _n_samp_train, _coefs.data(), 1, 0.0, _prop_train_est.data(), 1); - std::transform(_prop_train_est.begin(), _prop_train_est.end(), _prop_train.data(), _train_error.data(), std::minus<double>()); - if(_n_samp_test > 0) + for(auto& sz : _task_sizes_train) { - std::copy_n(std::vector<double>(_n_samp_test, 1.0).data(), _n_samp_test, _D_test.data() + feats.size() * _n_samp_test); - dgemv_('N', _n_samp_test, _n_dim, 1.0, _D_test.data(), _n_samp_test, _coefs.data(), 1, 0.0, _prop_test_est.data(), 1); - std::transform(_prop_test_est.begin(), _prop_test_est.end(), _prop_test.data(), _test_error.data(), std::minus<double>()); + std::fill_n(a.data() + feats.size() * sz, sz, 1.0); + std::fill_n(_D_train.data() + feats.size() * sz, sz, 1.0); + + for(int ff = 0; ff < feats.size(); ++ff) + { + std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz); + std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz); + } + + dgelss_(sz, _n_dim, 1, a.data(), sz, prop_train.data() + start, sz, s.data(), 1e-13, &rank, work.data(), work.size(), &info); + + _coefs.push_back(std::vector<double>(_n_dim, 0.0)); + + std::copy_n(prop_train.begin() + 
start, _n_dim, _coefs.back().data()); + dgemv_('N', sz, _n_dim, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1); + std::transform(_prop_train_est.begin() + start, _prop_train_est.begin() + start + sz, _prop_train.data() + start, _train_error.data() + start, std::minus<double>()); + + start += sz; } + start = 0; + int ii = 0; + for(auto& sz : _task_sizes_test) + { + if(sz > 0) + { + for(int ff = 0; ff < feats.size(); ++ff) + std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz); + + std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0); + dgemv_('N', sz, _n_dim, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1); + std::transform(_prop_test_est.begin() + start, _prop_test_est.begin() + start + sz, _prop_test.data() + start, _test_error.data() + start, std::minus<double>()); + } + ++ii; + start += sz; + } } std::string Model::toString() const { std::stringstream unit_rep; - unit_rep << _coefs[_n_dim - 1]; + unit_rep << "c0"; for(int ff = 0; ff < _feats.size(); ++ff) - unit_rep << " + (" << _coefs[ff] << ") * " << _feats[ff]->expr(); + unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr(); return unit_rep.str(); } @@ -73,19 +92,31 @@ void Model::train_to_file(std::string filename) out_file_stream << "# " << toString() << std::endl; out_file_stream << "# RMSE: " << rmse() << "; Max AE: " << max_ae() << std::endl; - out_file_stream << "# coeffs:"; - for(auto& coef: _coefs) - out_file_stream << " " << std::setw(24) << std::setprecision(18) << coef << ";"; - out_file_stream << "\n# " << std::setw(23) << "Property Value," << std::setw(24) << "Property Value (EST),"; + + out_file_stream << "# Coefficients" << std::endl; + out_file_stream << std::setw(10) << std::left << "# Task,"; + for(int cc = 0; cc < _coefs[0].size() - 1; ++cc) + out_file_stream << std::setw(24) << "a" + std::to_string(cc); + out_file_stream << std::setw(24) << "c0" << std::endl; 
+ + for(int cc = 0; cc < _coefs.size(); ++cc) + { + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc); + for(auto& coeff : _coefs[cc]) + out_file_stream << std::setw(24) << std::setprecision(18) << coeff; + out_file_stream << "\n"; + } + + out_file_stream << "\n" << std::setw(24) << std::left << "# Property Value" << std::setw(24) << "Property Value (EST)"; for(int ff = 0; ff < _feats.size(); ++ff) - out_file_stream << " Feature " << ff << " Value,"; + out_file_stream << std::setw(24) << "Feature " + std::to_string(ff) + " Value"; out_file_stream << std::endl; for(int ss = 0; ss < _n_samp_train; ++ss) { out_file_stream << std::setw(24) << std::setprecision(18) << _prop_train[ss] << std::setw(24) << std::setprecision(18) << _prop_train_est[ss]; for(int ff = 0; ff < _n_dim - 1; ++ff) - out_file_stream << std::setw(24) << std::setprecision(18) << _D_train[ss + ff * _n_samp_train]; + out_file_stream << std::setw(24) << std::setprecision(18) << _feats[ff]->value()[ss]; out_file_stream << std::endl; } out_file_stream.close(); @@ -100,23 +131,36 @@ void Model::test_to_file(std::string filename, std::vector<int> test_inds) out_file_stream.open(filename); out_file_stream << "# " << toString() << std::endl; - out_file_stream << "# Testing Indexes: [" << test_inds[0]; - for(int ss = 1; ss < _n_samp_test; ++ss) - out_file_stream << ", " << test_inds[ss]; - out_file_stream << "]" << std::endl; - out_file_stream << "# RMSE: " << test_rmse() << "; Max AE: " << test_max_ae() << std::endl; - out_file_stream << "# coeffs:"; - for(auto& coef: _coefs) - out_file_stream << " " << std::setw(24) << std::setprecision(18) << coef << ";"; - out_file_stream << "\n# " << std::setw(23) << "Property Value," << std::setw(24) << "Property Value (EST),"; + out_file_stream << "# RMSE: " << rmse() << "; Max AE: " << max_ae() << std::endl; + + out_file_stream << "# Coefficients" << std::endl; + out_file_stream << std::setw(10) << std::left << "# Task"; + for(int cc = 0; cc < 
_coefs[0].size() - 1; ++cc) + out_file_stream << std::setw(24) << "a" + std::to_string(cc); + out_file_stream << std::setw(24) << "c0" << std::endl; + + for(int cc = 0; cc < _coefs.size(); ++cc) + { + out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc); + for(auto& coeff : _coefs[cc]) + out_file_stream << std::setw(24) << std::setprecision(18) << coeff; + out_file_stream << "\n"; + } + + out_file_stream << "#Test Indexes: [ " << test_inds[0]; + for(int ii = 1; ii < test_inds.size(); ++ii) + out_file_stream << ", " << test_inds[ii]; + out_file_stream << " ]" << std::endl; + + out_file_stream << "\n" << std::setw(24) << std::left << "# Property Value" << std::setw(24) << "Property Value (EST)"; for(int ff = 0; ff < _feats.size(); ++ff) - out_file_stream << " Feature " << ff << " Value,"; + out_file_stream << std::setw(24) << "Feature " + std::to_string(ff) + " Value"; out_file_stream << std::endl; for(int ss = 0; ss < _n_samp_test; ++ss) { out_file_stream << std::setw(24) << std::setprecision(18) << _prop_test[ss] << std::setw(24) << std::setprecision(18) << _prop_test_est[ss]; - for(int ff = 0; ff < _feats.size(); ++ff) + for(int ff = 0; ff < _n_dim - 1; ++ff) out_file_stream << std::setw(24) << std::setprecision(18) << _feats[ff]->test_value()[ss]; out_file_stream << std::endl; } diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index 261995eb95ab09f2425073a927b3389468172180..d660b6da6266dba210ceb7f96c58c65ffda52870 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -21,7 +21,7 @@ class Model std::vector<std::shared_ptr<FeatureNode>> _feats; //!< List of features in the model - std::vector<double> _coefs; //!< Coefficients for teh features + std::vector<std::vector<double>> _coefs; //!< Coefficients for teh features std::vector<double> _prop_train; //!< The property to be modeled std::vector<double> _prop_test; //!< The property to be 
modeled std::vector<double> _train_error; //!< The error of the model @@ -32,6 +32,8 @@ class Model std::vector<double> _prop_train_est; //!< The estimated Property std::vector<double> _prop_test_est; //!< The estimated Property + std::vector<int> _task_sizes_train; //!< Number of samples in each task + std::vector<int> _task_sizes_test; //!< Number of samples in each task public: /** * @brief Constructor for the model @@ -39,7 +41,7 @@ public: * @param prop The property * @param feats The features for the model */ - Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats); + Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<std::shared_ptr<FeatureNode>> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test); /** diff --git a/src/descriptor_identifier/SISSORegressor.cpp b/src/descriptor_identifier/SISSORegressor.cpp index 27b42e72048fbfeb42e3629989a52d78a0bf1b3a..77b35c2abdec0cda69261f211f1a5b9512efd473 100644 --- a/src/descriptor_identifier/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSORegressor.cpp @@ -1,6 +1,6 @@ #include <descriptor_identifier/SISSORegressor.hpp> -SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, int n_dim, int n_residual): +SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, int n_dim, int n_residual): _prop(prop), _prop_test(prop_test), _a(new double[(n_dim + 1) * prop.size()]), @@ -9,6 +9,8 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::ve _error(new double[prop.size()]), _work(nullptr), _s(new double[n_dim + 1]), + _task_sizes_train(task_sizes_train), + _task_sizes_test(task_sizes_test), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), 
_n_samp(prop.size()), @@ -33,22 +35,22 @@ SISSORegressor::SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::ve _work = std::unique_ptr<double[]>(new double[_lwork]); } -void SISSORegressor::set_a(std::vector<int>& inds) +void SISSORegressor::set_a(std::vector<int>& inds, int start, int n_samp) { for(int ii = 0; ii < inds.size(); ++ii) - std::copy_n(node_value_arrs::get_d_matrix_ptr(inds[ii]), _n_samp, _a.get() + ii * _n_samp); - std::copy_n(_ones.get(), _n_samp, _a.get() + inds.size() * _n_samp); + std::copy_n(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, n_samp, _a.get() + ii * n_samp); + std::copy_n(_ones.get(), n_samp, _a.get() + inds.size() * n_samp); } -void SISSORegressor::least_squares(std::vector<int>& inds, double* coeffs) +void SISSORegressor::least_squares(std::vector<int>& inds, double* coeffs, int start, int n_samp) { int info; int n_dim = inds.size() + 1; - set_a(inds); - std::copy_n(_prop.data(), _n_samp, _b.get()); + set_a(inds, start, n_samp); + std::copy_n(_prop.data() + start, n_samp, _b.get()); - dgelss_(_n_samp, n_dim, 1, _a.get(), _n_samp, _b.get(), _n_samp, _s.get(), 1e-13, &_rank, _work.get(), _lwork, &info); + dgelss_(n_samp, n_dim, 1, _a.get(), n_samp, _b.get(), n_samp, _s.get(), 1e-13, &_rank, _work.get(), _lwork, &info); if(info == 0) std::copy_n(_b.get(), n_dim, coeffs); @@ -69,11 +71,11 @@ int SISSORegressor::get_opt_lwork(int n_dim) throw std::logic_error("Failed to get lwork."); } -void SISSORegressor::set_error(std::vector<int>& inds, double* coeffs) +void SISSORegressor::set_error(std::vector<int>& inds, double* coeffs, int start, int n_samp) { - set_a(inds); - dgemv_('N', _n_samp, inds.size() + 1, 1.0, _a.get(), _n_samp, coeffs, 1, 1e-13, _b.get(), 1); - std::transform(_prop.begin(), _prop.end(), _b.get(), _error.get(), std::minus<double>()); + set_a(inds, start, n_samp); + dgemv_('N', n_samp, inds.size() + 1, 1.0, _a.get(), n_samp, coeffs, 1, 1e-13, _b.get(), 1); + std::transform(_prop.begin() + start, 
_prop.begin() + start + n_samp, _b.get(), _error.get() + start, std::minus<double>()); } void SISSORegressor::fit() @@ -95,7 +97,7 @@ void SISSORegressor::fit() std::vector<Model> models; for(int rr = 0; rr < _n_residual; ++rr) { - models.push_back(Model(_prop, _prop_test, {_feat_space->phi_selected()[rr]})); + models.push_back(Model(_prop, _prop_test, {_feat_space->phi_selected()[rr]}, _task_sizes_train, _task_sizes_test)); models.back().copy_error(&residual[rr * _n_samp]); } _models.push_back(models); @@ -142,9 +144,16 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) util_funcs::iterate(inds, inds.size(), _mpi_comm->rank()); do { - least_squares(inds, coefs.data()); - set_error(inds, coefs.data()); - double error = util_funcs::norm(_error.get(), _n_samp); + int start = 0; + double error = 0.0; + for(auto& sz : _task_sizes_train) + { + least_squares(inds, coefs.data(), start, sz); + set_error(inds, coefs.data(), start, sz); + error += std::pow(util_funcs::norm(_error.get() + start, sz), 2.0) / sz; + start += sz; + } + error = std::sqrt(error / _task_sizes_train.size()); if(error < min_errors.back()) { int rr = 0; @@ -174,7 +183,7 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) { for(int ii = 0; ii < n_dim; ++ii) min_nodes[ii] = _feat_space->phi_selected()[all_inds_min[inds[rr] * n_dim + ii]]; - models.push_back(Model(_prop, _prop_test, min_nodes)); + models.push_back(Model(_prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test)); } _models.push_back(models); diff --git a/src/descriptor_identifier/SISSORegressor.hpp b/src/descriptor_identifier/SISSORegressor.hpp index 02d1af20551c511541bbcfa3214301de903aa691..1c87f3e467aa83b9fc460a951ed5e5f60091f865 100644 --- a/src/descriptor_identifier/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSORegressor.hpp @@ -24,6 +24,8 @@ protected: std::unique_ptr<double[]> _work; //!< The work array for least squares problems std::unique_ptr<double[]> _s; //!< The S array 
for least squares problems + std::vector<int> _task_sizes_train; + std::vector<int> _task_sizes_test; std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator @@ -42,7 +44,7 @@ public: * @param prop Property to model * @param n_dim Maximum dimension of the model */ - SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, int n_dim, int n_residual); + SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, int n_dim, int n_residual); /** * @brief Get the optimal size of the working array @@ -58,7 +60,7 @@ public: * @param inds Feature indexes to get the model of * @param coeffs Coefficients for the model */ - void least_squares(std::vector<int>& inds, double* coeffs); + void least_squares(std::vector<int>& inds, double* coeffs, int start, int n_samp); /** * @brief Set the residual for the next step @@ -66,14 +68,14 @@ public: * @param inds indexes of the selected features * @param coeffs Coefficients of the model */ - void set_error(std::vector<int>& inds, double* coeffs); + void set_error(std::vector<int>& inds, double* coeffs, int start, int n_samp); /** * @brief Set the A matrix for the least squares problem * * @param inds indexes of the selected features */ - void set_a(std::vector<int>& inds); + void set_a(std::vector<int>& inds, int start, int n_samp); /** * @brief Fit the models diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 12f0b93a2f91941c64dfc888c1f673943385ba1a..c740c4f6ff4a8d8ca6692c7f6016d8ea63c01603 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -22,6 +22,7 @@ FeatureSpace::FeatureSpace( std::shared_ptr<MPI_Interface> mpi_comm, 
std::vector<node_ptr> phi_0, std::vector<std::string> allowed_ops, + std::vector<int> task_sizes, int max_phi, int n_sis_select, int max_store_rung, @@ -33,6 +34,7 @@ FeatureSpace::FeatureSpace( _phi_0(phi_0), _allowed_ops(allowed_ops), _scores(phi_0.size(), 0.0), + _task_sizes(task_sizes), _start_gen(1, 0), _mpi_comm(mpi_comm), _l_bound(min_abs_feat_val), @@ -44,6 +46,7 @@ FeatureSpace::FeatureSpace( _n_rung_store(max_store_rung), _n_rung_generate(n_rung_generate) { + _project = project_funcs::project_r; if(_n_rung_generate > 1) throw std::logic_error("A maximum of one rung can be generated on the fly."); else if(_max_phi - _n_rung_generate < _n_rung_store) @@ -289,48 +292,9 @@ void FeatureSpace::generate_feature_space() } } } - for(int ii = 0; ii < _mpi_comm->size(); ++ii) - { - _mpi_comm->barrier(); - if(_mpi_comm->rank() == ii) - for(auto& feat : _phi) - std::cout << feat->expr() << std::endl; - _mpi_comm->barrier(); - } _n_feat = _phi.size(); } -void FeatureSpace::project_r(double* prop, int size) -{ - std::vector<double> scores(_phi.size(), 0.0); - for(int ff = 0; ff < _phi.size(); ++ff) - _scores[ff] = -1.0 * std::abs(util_funcs::r(&prop[0], _phi[ff]->value_ptr(), _n_samp)); - - for(int pp = 1; pp < size / _n_samp; ++pp) - { - for(int ff = 0; ff < _phi.size(); ++ff) - scores[ff] = -1.0 * std::abs(util_funcs::r(&prop[_n_samp*pp], _phi[ff]->value_ptr(), _n_samp)); - std::transform(scores.begin(), scores.end(), _scores.begin(), _scores.begin(), [](double s1, double s2){return std::min(s1, s2);}); - } -} - -std::vector<double> FeatureSpace::project_r(double* prop, int size, std::vector<node_ptr>& phi) -{ - std::vector<double> scores(phi.size(), 0.0); - std::vector<double> scores_temp(phi.size(), 0.0); - - for(int ff = 0; ff < phi.size(); ++ff) - scores[ff] = -1.0 * std::abs(util_funcs::r(&prop[0], phi[ff]->value_ptr(), _n_samp)); - - for(int pp = 1; pp < size / _n_samp; ++pp) - { - for(int ff = 0; ff < phi.size(); ++ff) - scores_temp[ff] = -1.0 * 
std::abs(util_funcs::r(&prop[_n_samp*pp], phi[ff]->value_ptr(), _n_samp)); - std::transform(scores_temp.begin(), scores_temp.end(), scores.begin(), scores.begin(), [](double s1, double s2){return std::min(s1, s2);}); - } - return scores; -} - void FeatureSpace::project_generated(double* prop, int size, std::vector<std::shared_ptr<FeatureNode>>& phi_sel, std::vector<double>& scores_sel, std::vector<double>& scores_comp) { for(auto feat = _phi.begin() + _start_gen.back() + _mpi_comm->rank(); feat < _phi.end(); feat += _mpi_comm->size()) @@ -342,7 +306,9 @@ void FeatureSpace::project_generated(double* prop, int size, std::vector<std::sh std::vector<node_ptr> generated_phi; generate_new_feats(feat, generated_phi, feat_ind, _l_bound, _u_bound); - std::vector<double> scores = project_r(prop, size, generated_phi); + std::vector<double> scores(generated_phi.size(), 0.0); + _project(prop, scores.data(), generated_phi, _task_sizes, size / _n_samp); + std::vector<int> inds = util_funcs::argsort(scores); int ii = 0; @@ -434,7 +400,8 @@ void FeatureSpace::sis(std::vector<double>& prop) node_value_arrs::resize_d_matrix_arr(_n_sis_select); _phi_selected.reserve(_phi_selected.size() + _n_sis_select); - project_r(prop.data(), prop.size()); + _project(prop.data(), _scores.data(), _phi, _task_sizes, prop.size() / _n_samp); + std::vector<int> inds = util_funcs::argsort(_scores); int ii = 0; @@ -459,7 +426,6 @@ void FeatureSpace::sis(std::vector<double>& prop) if(_n_rung_generate > 0) { - for(auto& feat : phi_sel) phi_sel.resize(cur_feat_local); scores_sel.resize(cur_feat_local); project_generated(prop.data(), prop.size(), phi_sel, scores_sel, scores_comp); @@ -586,20 +552,20 @@ void FeatureSpace::sis(std::vector<double>& prop) for(int ii = _phi_selected.size() - _n_sis_select; ii < _phi_selected.size(); ++ii) { _phi_selected[ii]->set_value(); - _phi_selected[ii]->set_test_value(); ++cur_feat; } } else { - // cur_feat += cur_feat_local; + cur_feat_local = 0; for(auto& feat : phi_sel) { 
+ std::cout << scores_sel[cur_feat_local] << '\t' << phi_sel[cur_feat_local]->expr() << std::endl; _phi_selected.push_back(feat); _phi_selected.back()->reindex(cur_feat); _phi_selected.back()->set_value(); - _phi_selected.back()->set_test_value(); ++cur_feat; + ++cur_feat_local; } } if(cur_feat != node_value_arrs::N_SELECTED) diff --git a/src/feature_creation/feature_space/FeatureSpace.hpp b/src/feature_creation/feature_space/FeatureSpace.hpp index 4cf22bc2e938c426250caf8f82e1d4587575bdfa..50a4d1e953eb7c389e830cc9efd665c4c805433d 100644 --- a/src/feature_creation/feature_space/FeatureSpace.hpp +++ b/src/feature_creation/feature_space/FeatureSpace.hpp @@ -5,6 +5,7 @@ #include <feature_creation/node/FeatureNode.hpp> #include <feature_creation/node/operator_nodes/allowed_ops.hpp> #include <feature_creation/node/value_storage/nodes_value_containers.hpp> +#include <utils/project.hpp> #include <boost/serialization/shared_ptr.hpp> @@ -29,10 +30,12 @@ class FeatureSpace std::vector<bin_op_node_gen> _bin_operators; //!< list of all binary operators std::vector<double> _scores; //!< projection scores for each feature - std::vector<double> _prop; //!< property to learn + std::vector<int> _task_sizes; //!< The number of elements in each task std::vector<int> _start_gen; //!< list of starting index for each generation + std::function<void(double*, double*, std::vector<node_ptr>&, std::vector<int>&, int)> _project; //!< Function used for projection onto SIS + std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPi communicator double _l_bound; //!< lower bound for absolute value of the features @@ -46,6 +49,7 @@ class FeatureSpace int _n_rung_generate; //!< Total number of rungs to generate on the fly public: + /** * @brief Constructor for the feature space * @details constructs the feature space from an initial set of features and a list of allowed operatiors @@ -60,6 +64,7 @@ public: std::shared_ptr<MPI_Interface> mpi_comm, std::vector<node_ptr> phi_0, std::vector<std::string> 
allowed_ops, + std::vector<int> task_sizes, int max_phi=1, int n_sis_select=1, int max_store_rung=2, @@ -99,15 +104,7 @@ public: */ inline std::shared_ptr<MPI_Interface> mpi_comm(){return _mpi_comm;} - /** - * @brief calculate the projection scores for all features for a given property - * @details Calculate the projection score based on the Pearson correlation - * - * @param prop [description] - */ - void project_r(double* prop, int size); - - std::vector<double> project_r(double* prop, int size, std::vector<node_ptr>& phi); + inline std::vector<int> task_sizes(){return _task_sizes;} void generate_new_feats(std::vector<node_ptr>::iterator& feat, std::vector<node_ptr>& feat_set, int& feat_ind, double l_bound=1e-50, double u_bound=1e50); diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp index c20264c13d0107ef10c54de977a7d82885716e6a..7308002ae004f43b3f62e695eadc3fe20be8c1e0 100644 --- a/src/inputs/InputParser.cpp +++ b/src/inputs/InputParser.cpp @@ -2,10 +2,10 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::shared_ptr<MPI_Interface> comm) : _opset(as_vector<std::string>(IP, "opset")), - _leave_out_inds(as_vector<int>(IP, "leave_out_inds")), _filename(fn), _data_file(IP.get<std::string>("data_file", "data.csv")), _prop_key(IP.get<std::string>("property_key", "prop")), + _leave_out_inds(as_vector<int>(IP, "leave_out_inds")), _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)), _u_bound(IP.get<double>("max_abs_feat_val", 1e50)), _n_dim(IP.get<int>("desc_dim")), @@ -13,38 +13,118 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh _max_rung(IP.get<int>("max_rung")), _max_store_rung(IP.get<int>("n_rung_store", _max_rung - 1)), _n_rung_generate(IP.get<int>("n_rung_generate", 0)), - _n_samp(-1), + _n_samp(0), _n_residuals(IP.get<int>("n_residual", 1)) { - std::ifstream data_stream; std::string line; data_stream.open(_data_file, std::ios::in); - while (std::getline(data_stream, line)) - 
++_n_samp; - _n_leave_out = IP.get<int>("n_leave_out", static_cast<int>(std::round(_n_samp * IP.get<double>("leave_out_frac", 0.0)))); - if((_leave_out_inds.size() == 0) && _n_leave_out > 0) + + std::getline(data_stream, line); + std::vector<std::string> split_line; + + boost::algorithm::split(split_line, line, boost::algorithm::is_any_of(",")); + if(split_line.size() == 1) + throw std::logic_error("data_file is not a comma separated file"); + + std::vector<std::string> headers; + std::vector<Unit> units; + for(int dd = 1; dd < split_line.size(); ++dd) { - if(comm->rank() == 0) + std::vector<std::string> name_unit_split; + boost::algorithm::split(name_unit_split, split_line[dd], boost::algorithm::is_any_of("()")); + if(name_unit_split.size() == 1) { - _leave_out_inds = std::vector<int>(_n_leave_out); + boost::algorithm::trim(split_line[dd]); + headers.push_back(split_line[dd]); + units.push_back(Unit()); + } + else if(name_unit_split.size() == 3) + { + boost::algorithm::trim(name_unit_split[0]); + boost::algorithm::trim(name_unit_split[1]); + headers.push_back(name_unit_split[0]); + units.push_back(name_unit_split[1]); + } + else + { + throw std::logic_error("Invalid feature name \"" + split_line[dd] + "\" in header of file"); + } + } + + int taskind = 0; + while((taskind < headers.size()) && (headers[taskind] != "Task")) + ++taskind; + + std::map<std::string, std::vector<int>> tasks; + if(taskind >= headers.size()) + { + while (std::getline(data_stream, line)) + ++_n_samp; + tasks["none"] = std::vector<int>(_n_samp); + std::iota(tasks["none"].begin(), tasks["none"].end(), 0); + } + else + { + while (std::getline(data_stream, line)) + { + boost::algorithm::split(split_line, line, [](char c) {return c==',';}); + if(tasks.count(split_line[taskind+1]) > 0) + tasks[split_line[taskind+1]].push_back(_n_samp); + else + tasks[split_line[taskind+1]] = {_n_samp}; + ++_n_samp; + } + } + + double leave_out_frac = IP.get<double>("leave_out_frac", 
static_cast<double>(IP.get<int>("n_leave_out", 0)) / static_cast<double>(_n_samp) ); + if((_leave_out_inds.size() == 0) && leave_out_frac > 0.0) + { + if(comm->rank() == 0) + { unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::vector<int> indexes(_n_samp, 0); - for(int ii = 0; ii < _n_samp; ++ii) - indexes[ii] = ii; - std::shuffle (indexes.begin(), indexes.end(), std::default_random_engine(seed)); + int start = 0; + for(auto& el : tasks) + { + _task_sizes_test.push_back(static_cast<int>(std::round(leave_out_frac * el.second.size()))); + _task_sizes_train.push_back(el.second.size() - _task_sizes_test.back()); + + _leave_out_inds.resize(_leave_out_inds.size() + _task_sizes_test.back()); + + std::vector<int> inds(el.second); + std::shuffle(inds.begin(), inds.end(), std::default_random_engine(seed)); - std::copy_n(indexes.begin(), _n_leave_out, _leave_out_inds.begin()); - std::sort(_leave_out_inds.begin(), _leave_out_inds.end()); + std::copy_n(inds.begin(), _task_sizes_test.back(), _leave_out_inds.begin() + start); + std::sort(_leave_out_inds.begin() + start, _leave_out_inds.end()); + + start += _task_sizes_test.back(); + } } mpi::broadcast(*comm, _leave_out_inds, 0); + mpi::broadcast(*comm, _task_sizes_test, 0); + mpi::broadcast(*comm, _task_sizes_train, 0); + } + else if(_leave_out_inds.size() > 0) + { + for(auto& el : tasks) + { + int n_test = 0; + for(auto& ind: el.second) + if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [ind](int i1){return i1 == ind;})) + ++n_test; + _task_sizes_test.push_back(n_test); + _task_sizes_train.push_back(el.second.size() - n_test); + } + } + else + { + for(auto& el : tasks) + { + _task_sizes_test.push_back(0); + _task_sizes_train.push_back(el.second.size()); + } } - else if((_n_leave_out == 0) && (_leave_out_inds.size() > 0)) - _n_leave_out = _leave_out_inds.size(); - else if(_leave_out_inds.size() != _n_leave_out) - throw std::logic_error("n_leave_out is not the same size as 
leave_out_inds"); - if(_opset.size() == 0) { @@ -69,59 +149,33 @@ InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::sh }; } - generate_feature_space(comm); + generate_feature_space(comm, headers, units, tasks, taskind); } -void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm) +void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, std::vector<std::string> headers, std::vector<Unit> units, std::map<std::string, std::vector<int>> tasks, int taskind) { std::string line; std::ifstream data_stream; data_stream.open(_data_file, std::ios::in); + std::getline(data_stream, line); + int number_of_lines = 0; - std::vector<std::string> headers; - std::vector<Unit> units; std::vector<std::string> samples; + std::vector<int> task_size; - std::getline(data_stream, line); - std::vector<std::string> split_line; - - boost::algorithm::split(split_line, line, boost::algorithm::is_any_of(",")); - if(split_line.size() == 1) - throw std::logic_error("data_file is not a comma separated file"); - - for(int dd = 1; dd < split_line.size(); ++dd) - { - std::vector<std::string> name_unit_split; - boost::algorithm::split(name_unit_split, split_line[dd], boost::algorithm::is_any_of("()")); - if(name_unit_split.size() == 1) - { - boost::algorithm::trim(split_line[dd]); - headers.push_back(split_line[dd]); - units.push_back(Unit()); - } - else if(name_unit_split.size() == 3) - { - boost::algorithm::trim(name_unit_split[0]); - boost::algorithm::trim(name_unit_split[1]); + std::vector<std::vector<double>> data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0))); + std::vector<std::vector<double>> test_data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0))); - headers.push_back(name_unit_split[0]); - units.push_back(name_unit_split[1]); - } - else - { - throw std::logic_error("Invalid feature name \"" + 
split_line[dd] + "\" in header of file"); - } - } - std::vector<std::vector<double>> data(headers.size(), std::vector<double>(_n_samp - _n_leave_out)); - std::vector<std::vector<double>> test_data(headers.size(), std::vector<double>(_n_leave_out)); std::vector<std::string> bad_samples; int cur_line = 0; int n_train_samp = 0; int n_test_samp = 0; + while (std::getline(data_stream, line)) { + std::vector<std::string> split_line; boost::algorithm::split(split_line, line, [](char c) {return c==',';}); samples.push_back(split_line[0]); @@ -130,17 +184,41 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm) if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&cur_line](int ind){return ind == cur_line;})) { + n_test_samp = 0; + for(auto& task : tasks) + { + int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin(); + + for(int ii = 0; ii < task_ind; ++ii) + if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];})) + ++n_test_samp; + + if(task_ind < task.second.size()) + break; + } for(int ii = 1; ii < split_line.size(); ++ii) - test_data[ii-1][n_test_samp] = std::stod(split_line[ii]); - ++n_test_samp; + if(ii-1 != taskind) + test_data[ii-1][n_test_samp] = std::stod(split_line[ii]); } else { + n_train_samp = 0; + for(auto& task : tasks) + { + + int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin(); + + for(int ii = 0; ii < task_ind; ++ii) + if(std::none_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];})) + ++n_train_samp; + + if(task_ind < task.second.size()) + break; + } for(int ii = 1; ii < split_line.size(); ++ii) - data[ii-1][n_train_samp] = std::stod(split_line[ii]); - ++n_train_samp; + if(ii-1 != taskind) + data[ii-1][n_train_samp] = std::stod(split_line[ii]); } - ++cur_line; } @@ -152,26 +230,37 @@ void 
InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm)
         throw std::logic_error(msg.substr(0, msg.size() - 2));
     }
 
-    int _propind = 0;
-    while((headers[_propind] != _prop_key) && (_propind < headers.size()))
-        ++_propind;
+    int propind = 0;
+    // Bounds check must precede the element access: && short-circuits, so
+    // headers[propind] is never evaluated once propind reaches headers.size().
+    while((propind < headers.size()) && (headers[propind] != _prop_key))
+        ++propind;
 
-    if(_propind >= headers.size())
+    if(propind >= headers.size())
     {
         throw std::logic_error("_propkey not found in data_file");
     }
     else
     {
-        _prop_train = std::vector<double>(data[_propind].size(), 0.0);
-        _prop_test = std::vector<double>(test_data[_propind].size(), 0.0);
+        _prop_train = std::vector<double>(data[propind].size(), 0.0);
+        _prop_test = std::vector<double>(test_data[propind].size(), 0.0);
+
+        std::copy_n(data[propind].begin(), data[propind].size(), _prop_train.begin());
+        std::copy_n(test_data[propind].begin(), test_data[propind].size(), _prop_test.begin());
 
-        std::copy_n(data[_propind].begin(), data[_propind].size(), _prop_train.begin());
-        std::copy_n(test_data[_propind].begin(), test_data[_propind].size(), _prop_test.begin());
+        data.erase(data.begin() + propind);
+        test_data.erase(test_data.begin() + propind);
+        headers.erase(headers.begin() + propind);
+        units.erase(units.begin() + propind);
+    }
+
+    if(taskind > propind)
+        --taskind;
 
-        data.erase(data.begin() + _propind);
-        test_data.erase(test_data.begin() + _propind);
-        headers.erase(headers.begin() + _propind);
-        units.erase(units.begin() + _propind);
+    if(taskind < headers.size())
+    {
+        data.erase(data.begin() + taskind);
+        test_data.erase(test_data.begin() + taskind);
+        headers.erase(headers.begin() + taskind);
+        units.erase(units.begin() + taskind);
     }
 
     node_value_arrs::initialize_values_arr(_prop_train.size(), _prop_test.size(), headers.size());
@@ -179,7 +268,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm)
     for(int ff = 0; ff < headers.size(); ++ff)
         phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff],
test_data[ff], units[ff])); - _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound); + _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound); } void stripComments(std::string& filename) diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp index 8cab6a31110ac5da267d3198157d003b2aa7b95a..32c4e3b2254dae84554b72af5610cf01d5f471d3 100644 --- a/src/inputs/InputParser.hpp +++ b/src/inputs/InputParser.hpp @@ -25,10 +25,14 @@ class InputParser { public: + std::vector<std::string> _opset; std::vector<double> _prop_train; std::vector<double> _prop_test; + std::vector<int> _leave_out_inds; + std::vector<int> _task_sizes_train; + std::vector<int> _task_sizes_test; std::string _filename; std::string _data_file; @@ -46,11 +50,10 @@ public: int _n_sis_select; int _n_samp; int _n_residuals; - int _n_leave_out; InputParser(boost::property_tree::ptree IP, std::string fn, std::shared_ptr<MPI_Interface> comm); inline std::shared_ptr<FeatureSpace> feat_space(){return _feat_space;} - void generate_feature_space(std::shared_ptr<MPI_Interface> comm); + void generate_feature_space(std::shared_ptr<MPI_Interface> comm,std::vector<std::string> headers, std::vector<Unit> units, std::map<std::string, std::vector<int>> tasks, int taskind); }; /** * @brief strips comments from the input file diff --git a/src/main.cpp b/src/main.cpp index a23cff94396521dfb48fd79cd555b43ba23b89bb..8170f78992c6503560f4e3d2dce716219907d42f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -37,7 +37,7 @@ int main(int argc, char const *argv[]) std::cout<< "time input_parsing/Feature space generation: "<< duration << std::endl; node_value_arrs::initialize_d_matrix_arr(); - SISSORegressor sisso(IP._feat_space, IP._prop_train, IP._prop_test, IP._n_dim, IP._n_residuals); + SISSORegressor sisso(IP._feat_space, 
IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._n_dim, IP._n_residuals); sisso.fit(); if(comm->rank() == 0) diff --git a/src/utils/math_funcs.cpp b/src/utils/math_funcs.cpp index f37f5c1ecaee1020f28a7dc9f5307e2d20092c91..90bd03bc074182fdadee45f40687f03dce26cd5c 100644 --- a/src/utils/math_funcs.cpp +++ b/src/utils/math_funcs.cpp @@ -17,3 +17,29 @@ bool util_funcs::iterate(std::vector<int>& inds, int size, int incriment) } return cont; } + +double util_funcs::r(double* a, double* b, std::vector<int>& sizes) +{ + double result = 0.0; + int pos = 0; + for(auto sz : sizes) + { + result += r2(a + pos, b + pos, sz); + pos += sz; + } + + return std::sqrt(result / sizes.size()); +} + +double util_funcs::r2(double* a, double* b, std::vector<int>& sizes) +{ + double result = 0.0; + int pos = 0; + for(auto sz : sizes) + { + result += r2(a + pos, b + pos, sz); + pos += sz; + } + + return result / sizes.size(); +} diff --git a/src/utils/math_funcs.hpp b/src/utils/math_funcs.hpp index c4a5577fc0bb6946ace1224bfd4d8d67d14019da..789275d5f2c7e18d289a6c9b739375919d000925 100644 --- a/src/utils/math_funcs.hpp +++ b/src/utils/math_funcs.hpp @@ -8,6 +8,8 @@ #include <math.h> #include <utils/mkl_interface.hpp> +#include<iostream> + namespace util_funcs { @@ -56,6 +58,15 @@ namespace util_funcs return (ddot_(size, a, 1, b, 1) - static_cast<double>(size) * mean(a, size) * mean(b, size)) / (static_cast<double>(size) * stand_dev(a, size) * stand_dev(b, size)); } + double r(double* a, double* b, std::vector<int>& sizes); + + inline double r2(double* a, double* b, int size) + { + return std::pow((ddot_(size, a, 1, b, 1) - static_cast<double>(size) * mean(a, size) * mean(b, size)) / (static_cast<double>(size) * stand_dev(a, size) * stand_dev(b, size)), 2.0); + } + + double r2(double* a, double* b, std::vector<int>& sizes); + inline std::vector<int> argsort(std::vector<double>& vec) { std::vector<int> index(vec.size()); diff --git a/src/utils/project.cpp 
b/src/utils/project.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..926485501375a7ce88a232daa1e95fd8435496bb
--- /dev/null
+++ b/src/utils/project.cpp
@@ -0,0 +1,29 @@
+#include <utils/project.hpp>
+
+// Score each feature in phi by the (task-wise) Pearson r against prop; for
+// multiple properties keep the best (most negative) score. project_r uses
+// util_funcs::r — previously it duplicated project_r2 and wrongly used r2.
+void project_funcs::project_r(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop)
+{
+    int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
+    for(int ff = 0; ff < phi.size(); ++ff)
+        *(scores + ff) = -1.0 * util_funcs::r(prop, phi[ff]->value_ptr(), sizes);
+
+    for(int pp = 1; pp < n_prop; ++pp)
+    {
+        prop += n_samp;
+        for(int ff = 0; ff < phi.size(); ++ff)
+            *(scores + ff) = std::min(-1.0 * util_funcs::r(prop, phi[ff]->value_ptr(), sizes), *(scores + ff));
+    }
+}
+
+// Same projection but scored with the (task-wise) coefficient of determination r^2.
+void project_funcs::project_r2(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& sizes, int n_prop)
+{
+    int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0);
+    for(int ff = 0; ff < phi.size(); ++ff)
+        *(scores + ff) = -1.0 * util_funcs::r2(prop, phi[ff]->value_ptr(), sizes);
+
+    for(int pp = 1; pp < n_prop; ++pp)
+    {
+        prop += n_samp;
+        for(int ff = 0; ff < phi.size(); ++ff)
+            *(scores + ff) = std::min(-1.0 * util_funcs::r2(prop, phi[ff]->value_ptr(), sizes), *(scores + ff));
+    }
+}
\ No newline at end of file
diff --git a/src/utils/project.hpp b/src/utils/project.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8489f759e34fea8e712e57fcadd3aa6d8248c108
--- /dev/null
+++ b/src/utils/project.hpp
@@ -0,0 +1,8 @@
+#pragma once
+#include <feature_creation/node/Node.hpp>
+
+namespace project_funcs
+{
+    void project_r(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
+    void project_r2(double* prop, double* scores, std::vector<node_ptr>& phi, std::vector<int>& size, int n_prop);
+}
\ No newline at end of file