diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp index 3ad3c65b7c2e47a6b6d02dc0d708c5021e809b01..bd6afc583d3d537ffee70995d16933b40c0369cb 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp @@ -13,7 +13,21 @@ SISSOClassifier::SISSOClassifier( const int n_residual, const int n_models_store ): - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(1000.0), _width(1.0e-5), _n_class(1) diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp index 44b3d33371ef47617999bdfc66237e2fb1574c3a..fda71b6a02a54216ae09a796c29c5f50d1c918be 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.cpp @@ -18,41 +18,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); -} - -void SISSOLogRegressor::set_a(const std::vector<int>& inds, const int start, const int n_samp, double* a) -{ - for(int ii = 0; ii < inds.size(); ++ii) - { - std::transform( - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, - a + ii * n_samp, - [](double v){return std::log(v);} - ); - } - - if(!_fix_intercept) - { - std::fill_n(a + inds.size() * n_samp, n_samp, 1.0); - } -} - -void SISSOLogRegressor::set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const -{ - std::fill_n(error + start, n_samp, -1 * (!_fix_intercept) * coefs[inds.size()]); - for(int ii = 0; ii < inds.size(); ++ii) - { - std::transform( - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, - node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_samp, - error + start, - error + start, - [&coefs, &ii](double fv, double e){return e - coefs[ii] * std::log(fv);} - ); - } - - daxpy_(n_samp, 1.0, _prop.data() + start, 1, error + start, 1); + _loss = loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } void SISSOLogRegressor::output_models(std::vector<double>& residual) @@ -119,13 +92,6 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) std::vector<int> min_inds_private(min_inds); std::vector<double> min_errors_private(min_errors); - std::vector<double> error(_prop.size()); - std::vector<double> a((n_dim + (1 - _fix_intercept)) * _prop.size()); - std::vector<double> b(_prop.size()); - - _lwork = get_opt_lwork(n_dim + (1 - _fix_intercept), a.data(), b.data()); - std::vector<double> work(_lwork, 0.0); - unsigned long long int ii_prev = 0; #ifdef OMP45 @@ -136,13 +102,28 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size())) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - least_squares(inds, coefs.data(), 0, _n_samp, a.data(), b.data(), work.data()); - set_error(inds, coefs.data(), 
0, _n_samp, error.data()); - - double rmse = std::sqrt(ddot_(_n_samp, error.data(), 1, error.data(), 1) / _n_samp); - if(rmse < min_errors_private[max_error_ind]) + double loss = (*_loss)(inds); + if(loss < -1.0) + { + std::string err_msg = "A parameter passed to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); + } + else if(loss < 0.0) + { + std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl; + } + else if(loss < min_errors_private[max_error_ind]) { - min_errors_private[max_error_ind] = rmse; + min_errors_private[max_error_ind] = loss; std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim); max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); } @@ -181,4 +162,13 @@ void SISSOLogRegressor::l0_norm(std::vector<double>& prop, int n_dim) } add_model(indexes); } + std::vector<std::shared_ptr<Model>> model_ptrs(_models.back().size()); + std::transform( + _models.back().begin(), + _models.back().end(), + model_ptrs.begin(), + [](ModelLogRegressor model){return std::make_shared<ModelLogRegressor>(model);} + ); + + _loss->set_model_list(model_ptrs); } diff --git a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp index 21af9913f67a76653f74531527a8c80ea72df883..9f4f67d7f4c3c184e386bbc57f3280860d23e6f2 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOLogRegressor.hpp @@ -60,26 +60,6 @@ public: const bool fix_intercept=false ); - /** - * @brief Set the A matrix for the least squares problem - * - * @param inds indexes of the selected features - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - */ - void set_a(const std::vector<int>& inds, const int start, const int n_samp, double* a); - - /** - * @brief Set the residual for the next step - * - * @param inds indexes of the selected features - * @param coeffs Coefficients of the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param error pointer to the error vector - */ - virtual void set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const; - /** * @brief Adds a model to the model list * diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp index bd3f8c19274c0ca7fea3c85b7bb511d4d57b47a6..e24e66a3d828fb5e2d859f00f4b4604c5df48f67 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp @@ -15,6 +15,7 @@ SISSORegressor::SISSORegressor( const bool fix_intercept ): SISSO_DI( + "regression", feat_space, prop_label, prop_unit, @@ -30,83 +31,6 @@ SISSORegressor::SISSORegressor( ) {} -void 
SISSORegressor::set_a(const std::vector<int>& inds, int start, int n_samp, double* a) -{ - for(int ii = 0; ii < inds.size(); ++ii) - { - std::copy_n(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, n_samp, a + ii * n_samp); - } - - if(!_fix_intercept) - { - std::fill_n(a + inds.size() * n_samp, n_samp, 1.0); - } -} - -void SISSORegressor::least_squares(const std::vector<int>& inds, double* coeffs, const int start, const int n_samp, double* a, double*b, double* work) -{ - int info; - int n_dim = inds.size() + (1 - _fix_intercept); - - set_a(inds, start, n_samp, a); - std::copy_n(_prop.data() + start, n_samp, b); - - dgels_('N', n_samp, n_dim, 1, a, n_samp, b, n_samp, work, _lwork, &info); - - if(info == 0) - { - std::copy_n(b, n_dim, coeffs); - } - else if(info > 0) - { - std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; - for(auto& ind : inds) - { - err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; - } - std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl; - std::fill_n(coeffs, n_dim, std::numeric_limits<double>::infinity()); - } - else - { - std::string err_msg = "The " + std::to_string(-1*info) + - " parameter to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: "; - for(auto& ind : inds) - { - err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; - } - throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); - } -} - -int SISSORegressor::get_opt_lwork(const int n_dim, double* a, double* b) -{ - std::vector<double> work(1, 0.0); - int info = 0, rank = 0; - - dgels_('N', _n_samp, n_dim, 1, a, _n_samp, b, _n_samp, work.data(), -1, &info); - - if(info == 0) - { - return static_cast<int>(std::round(work[0])); - } - else - { - throw std::logic_error("Failed to get lwork."); - } -} - -void SISSORegressor::set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const -{ - std::fill_n(error + start, n_samp, -1 * (!_fix_intercept) * coefs[inds.size()]); - for(int ii = 0; ii < inds.size(); ++ii) - { - daxpy_(n_samp, -1.0 * coefs[ii], node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, 1, error + start, 1); - } - daxpy_(n_samp, 1.0, _prop.data() + start, 1, error + start, 1); - -} - void SISSORegressor::add_model(const std::vector<int> indexes) { if(_models.size() < indexes.size()) @@ -172,17 +96,8 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) std::vector<int> min_inds_private(min_inds); std::vector<double> min_errors_private(min_errors); - - std::vector<double> error(_prop.size()); - std::vector<double> a((n_dim + (1 - _fix_intercept)) * _prop.size()); - std::vector<double> b(_prop.size()); - - _lwork = get_opt_lwork(n_dim + (1 - _fix_intercept), a.data(), b.data()); - std::vector<double> work(_lwork, 0.0); - unsigned long long int ii_prev = 0; - #ifdef OMP45 #pragma omp for schedule(monotonic: dynamic) #else @@ -191,24 +106,34 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size())) { util_funcs::iterate(inds, inds.size(), ii - ii_prev); - int start = 0; - for(auto& sz : _task_sizes_train) + double loss = (*_loss)(inds); + if(loss < -1.0) + { + std::string err_msg = "A parameter passed to dgels was invalid. This is likely from a NaN in the descriptor matrix. 
The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + "."); + } + else if(loss < 0.0) { - least_squares(inds, coefs.data(), start, sz, a.data(), b.data(), work.data()); - set_error(inds, coefs.data(), start, sz, error.data()); - start += sz; + std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: "; + for(auto& ind : inds) + { + err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", "; + } + std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl; } - double rmse = std::sqrt(ddot_(_n_samp, error.data(), 1, error.data(), 1) / _n_samp); - if(rmse < min_errors_private[max_error_ind]) + else if(loss < min_errors_private[max_error_ind]) { - min_errors_private[max_error_ind] = rmse; + min_errors_private[max_error_ind] = loss; std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim); max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin(); } ii_prev = ii; } - #pragma omp critical { max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin(); @@ -243,6 +168,15 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim) } add_model(indexes); } + std::vector<std::shared_ptr<Model>> model_ptrs(_models.back().size()); + std::transform( + _models.back().begin(), + _models.back().end(), + model_ptrs.begin(), + [](ModelRegressor model){return std::make_shared<ModelRegressor>(model);} + ); + + _loss->set_model_list(model_ptrs); } void SISSORegressor::fit() diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp index 8216394db2691b23f12f96e2d394debc06086c1e..d758d9d8a2ce0629b1fcb8eb3070004802a5919d 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp @@ -30,9 +30,6 @@ class SISSORegressor: public SISSO_DI private: std::vector<std::vector<ModelRegressor>> _models; //!< List of models -protected: - int _lwork; //!< size of the work array (for dgels_) - public: /** * @brief Constructor for the Regressor @@ -65,50 +62,6 @@ public: const bool fix_intercept=false ); - /** - * @brief Set the residual for the next step - * - * @param inds indexes of the selected features - * @param coeffs Coefficients of the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param error pointer to the error vector - */ - virtual void set_error(const std::vector<int>& inds, const double* coefs, const int start, const int n_samp, double* error) const; - - /** - * @brief Get the optimal size of the working array for dgels (solve Ax = b) - * - * @param n_dim Dimension of the least squares matrix - * @param a the pointer to the A matrix - * @param b the pointer to the b vector - * @return Optimal size of the working array - */ - int get_opt_lwork(const int n_dim, double* a, double* b); - - /** - * @brief Preform Least squares optimization using dgels (solve Ax = b) - * - * @param inds Feature indexes to get the model of - * @param coefs Coefficients for the model - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of 
samples to perform least squares optimization on - * @param a the pointer to the A matrix - * @param b the pointer to the b vector - * @param work pointer to the working space for dgels - */ - void least_squares(const std::vector<int>& inds, double* coefs, const int start, const int n_samp, double* a, double*b, double* work); - - /** - * @brief Set the A matrix for the least squares problem - * - * @param inds indexes of the selected features - * @param start The index in the property and feature vectors start copying into _b and _a - * @param n_samp number of samples to perform least squares optimization on - * @param a the pointer to the A matrix - */ - virtual void set_a(const std::vector<int>& inds, int start, int n_samp, double* a); - /** * @brief Adds a model to the model list * diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp index 7c60edf7cf1359904f5cf165daf6cedb14d8149b..d8e861806c4262206531e960944928a1bb0f55e9 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp @@ -1,6 +1,7 @@ #include "descriptor_identifier/SISSO_DI/SISSO_DI.hpp" SISSO_DI::SISSO_DI( + const std::string loss_type, const std::shared_ptr<FeatureSpace> feat_space, const std::string prop_label, const Unit prop_unit, @@ -30,4 +31,13 @@ SISSO_DI::SISSO_DI( _n_residual(n_residual), _n_models_store(n_models_store), _fix_intercept(fix_intercept) -{} +{ + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); +} diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp index 0b232ae92d81061de0727b45c347df0d20a5c9e3..ba99cb13c29b679cb92115ab0d33ecc075dc57a6 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp @@ -41,6 +41,7 @@ protected: const std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator + std::shared_ptr<LossFunction> _loss; //!< The loss function for the calculation const int _n_task; //!< The number of tasks const int _n_samp; //!< the number of samples per feature @@ -67,6 +68,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + const std::string loss_type, const std::shared_ptr<FeatureSpace> feat_space, const std::string prop_label, const Unit prop_unit, @@ -158,6 +160,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -190,6 +193,7 @@ public: * @param fix_intrecept If true fix intercept to 0 */ SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, diff --git a/src/feature_creation/feature_space/FeatureSpace.cpp b/src/feature_creation/feature_space/FeatureSpace.cpp index 491c58850d5cf4eacf0d458c2d44943a40b96650..08d113e85a70ea9976d9964fb20f6cb6fb735454 100644 --- a/src/feature_creation/feature_space/FeatureSpace.cpp +++ b/src/feature_creation/feature_space/FeatureSpace.cpp @@ -976,7 +976,7 @@ void FeatureSpace::project_generated(std::shared_ptr<LossFunction> loss, std::ve if(node_value_arrs::N_SELECTED > _n_sis_select) { scores_sel_all.resize(_phi_selected.size()); - loss->project(_phi_selected, scores_sel_all.data()); + project_funcs::project_loss(loss, _phi_selected, 
scores_sel_all.data()); } std::copy_n(scores_sel.data(), scores_sel.size(), &scores_sel_all[node_value_arrs::N_SELECTED - _n_sis_select]); @@ -1037,7 +1037,7 @@ void FeatureSpace::project_generated(std::shared_ptr<LossFunction> loss, std::ve node_value_arrs::clear_temp_reg_thread(); std::vector<double> scores(generated_phi.size()); - loss->project_no_omp(generated_phi, scores.data()); + project_funcs::project_loss_no_omp(loss, generated_phi, scores.data()); std::vector<int> inds = util_funcs::argsort<double>(scores); @@ -1177,7 +1177,7 @@ void FeatureSpace::sis(const std::shared_ptr<LossFunction> loss) // Get projection scores double start_time = omp_get_wtime(); - loss->project(_phi, _scores.data()); + project_funcs::project_loss(loss, _phi, _scores.data()); // _project(prop.data(), _scores.data(), _phi, _task_sizes, prop.size() / _n_samp); _mpi_comm->barrier(); @@ -1202,7 +1202,7 @@ void FeatureSpace::sis(const std::shared_ptr<LossFunction> loss) std::vector<double> scores_sel_all(node_value_arrs::N_SELECTED, 0.0); if(node_value_arrs::N_SELECTED > _n_sis_select) { - loss->project(_phi_selected, scores_sel_all.data()); + project_funcs::project_loss(loss, _phi_selected, scores_sel_all.data()); // _project(prop.data(), scores_sel_all.data(), _phi_selected, _task_sizes, prop.size() / _n_samp); } diff --git a/src/loss_function/LossFunction.cpp b/src/loss_function/LossFunction.cpp index 977e3783651692966cce76719bdb9bf6e7cb21b5..4bf6fbf9669e7347f1b9805652eaf600bc8ede01 100644 --- a/src/loss_function/LossFunction.cpp +++ b/src/loss_function/LossFunction.cpp @@ -17,64 +17,26 @@ LossFunction::LossFunction( _n_samp(std::accumulate(task_sizes_train.begin(), task_sizes_train.end(), 0)), _n_samp_test(std::accumulate(task_sizes_test.begin(), task_sizes_test.end(), 0)), _n_task(task_sizes_train.size()), - _n_project_prop(prop_train.size() / _n_samp), + _n_project_prop(_projection_prop.size() / _n_samp), _n_feat(1), _n_dim(1 + (!fix_intercept)), _fix_intercept(fix_intercept) {} LossFunction::LossFunction(std::shared_ptr<LossFunction> o) : - _projection_prop(o->prop()), - _prop_train(o->prop_test()), - _prop_test(o->prop_project()), + _model_last_gen(o->model_last_gen()), + _projection_prop(o->prop_project()), + _prop_train(o->prop_train()), + _prop_test(o->prop_test()), _error_train(_prop_train.size(), 0.0), - _error_test(_prop_train.size(), 0.0), + _error_test(_prop_test.size(), 0.0), _task_sizes_train(o->task_sizes_train()), _task_sizes_test(o->task_sizes_test()), _n_samp(std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0)), _n_samp_test(std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0)), _n_task(_task_sizes_train.size()), - _n_project_prop(_prop_train.size() / _n_samp), + _n_project_prop(_projection_prop.size() / _n_samp), _n_feat(o->n_feat()), _n_dim(o->n_dim()), _fix_intercept(o->fix_intercept()) -{ - _model_last_gen = o->model_last_gen(); -} - -void LossFunction::project(const std::vector<node_ptr>& feats, double* scores) -{ - prepare_project(); - std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); - #pragma omp parallel - { - int start = ( - omp_get_thread_num() * (feats.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(feats.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (feats.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(feats.size()) % omp_get_num_threads()) - ); - std::transform( - feats.begin() + start, - feats.begin() + end, 
- scores + start, - [this](node_ptr feat){return project(feat);} - ); - - #pragma omp barrier - } -} - -void LossFunction::project_no_omp(const std::vector<node_ptr>& feats, double* scores) -{ - prepare_project(); - std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); - std::transform( - feats.begin(), - feats.end(), - scores, - [this](node_ptr feat){return project(feat);} - ); -} +{} diff --git a/src/loss_function/LossFunction.hpp b/src/loss_function/LossFunction.hpp index f24c5542180d3e02fde229422507a7337c22852e..ca2ad1e99c5ae7f7aa82fdf74b605aa545b5420b 100644 --- a/src/loss_function/LossFunction.hpp +++ b/src/loss_function/LossFunction.hpp @@ -21,16 +21,16 @@ protected: std::vector<double> _error_train; //!< The error vector for the training set std::vector<double> _error_test; //!< The error vector for the test set - std::vector<int> _task_sizes_train; //!< Number of training samples per task - std::vector<int> _task_sizes_test; //!< Number of testing samples per task + const std::vector<int> _task_sizes_train; //!< Number of training samples per task + const std::vector<int> _task_sizes_test; //!< Number of testing samples per task - int _n_samp; //!< Number of samples in the training set - int _n_samp_test; //!< Number of samples in the test set - int _n_task; //!< Number of tasks + const int _n_samp; //!< Number of samples in the training set + const int _n_samp_test; //!< Number of samples in the test set int _n_project_prop; //!< Number of properties to project over int _n_feat; int _n_dim; - bool _fix_intercept; + const int _n_task; //!< Number of tasks + const bool _fix_intercept; public: /** @@ -92,22 +92,6 @@ public: */ virtual double project(const node_ptr& feat) = 0; - /** - * @brief Calculate the projection score of a set of features - * - * @param feats The set of features to calculate - * @param scores vector to output the scores to - */ - void project(const std::vector<node_ptr>& feats, double* scores); - - /** - * @brief Calculate the projection score of a set of features without using OpenMP - * - * @param feats The set of features to calculate - * @param scores vector to output the scores to - */ - void project_no_omp(const std::vector<node_ptr>& feats, double* scores); - /** * @brief Evaluate the loss function for a set of features whose data is stored in a central location * @@ -140,7 +124,7 @@ public: inline std::vector<int> task_sizes_train() const {return _task_sizes_train;} inline std::vector<int> task_sizes_test() const {return _task_sizes_test;} - inline const std::vector<double>& prop() const {return _prop_train;} + inline const std::vector<double>& prop_train() const {return _prop_train;} inline const std::vector<double>& prop_test() const {return _prop_test;} inline const std::vector<double>& prop_project() const {return _projection_prop;} diff --git a/src/loss_function/LossFunctionLogPearsonRMSE.cpp b/src/loss_function/LossFunctionLogPearsonRMSE.cpp index 5c69ad94767e26d9ad138e81d56d91dd8e162d59..fbd7138d0075470fbbe17809bd2b0b512fb85b16 100644 --- a/src/loss_function/LossFunctionLogPearsonRMSE.cpp +++ b/src/loss_function/LossFunctionLogPearsonRMSE.cpp @@ -16,7 +16,9 @@ LossFunctionLogPearsonRMSE::LossFunctionLogPearsonRMSE( LossFunctionLogPearsonRMSE::LossFunctionLogPearsonRMSE(std::shared_ptr<LossFunction> o) : LossFunctionPearsonRMSE(o), _log_feat(_n_samp, 0.0) -{} +{ + std::copy_n(_prop_train.begin(), _prop_train.size(), _projection_prop.begin()); +} void LossFunctionLogPearsonRMSE::set_model_list(const 
std::vector<std::shared_ptr<Model>>& model_list) { @@ -56,10 +58,7 @@ double LossFunctionLogPearsonRMSE::project(const node_ptr& feat) ); double r2 = calc_max_pearson(_log_feat.data()); - // std::cout << r2 << '\t' << std::isfinite(r2) << '\t' << *std::max_element(val_ptr, val_ptr + _n_samp, [](double v1, double v2){return std::abs(v1) < std::abs(v1);}) << std::endl; - // std::cout << r2 << '\t' << std::isfinite(r2) << '\t' << *std::min_element(val_ptr, val_ptr + _n_samp) << std::endl; - // std::cout << r2 << '\t' << std::isfinite(r2) << '\t' << *std::min_element(_projection_prop.begin(), _projection_prop.end()) << std::endl; - return r2 * (std::isfinite(r2)); + return std::isfinite(r2) ? r2 : 0.0; } void LossFunctionLogPearsonRMSE::set_a(const std::vector<int>& inds, int taskind, int start) @@ -70,7 +69,7 @@ void LossFunctionLogPearsonRMSE::set_a(const std::vector<int>& inds, int taskind std::transform( val_ptr, val_ptr + _task_sizes_train[taskind], - &_a[ff * start + taskind * (_n_feat + (!_fix_intercept))], + &_a[ff * _task_sizes_train[taskind] + start * _n_dim], [](double val){return std::log10(val);} ); } @@ -84,8 +83,64 @@ void LossFunctionLogPearsonRMSE::set_a(const std::vector<node_ptr>& feats, int t std::transform( val_ptr, val_ptr + _task_sizes_train[taskind], - &_a[ff * start + start * (_n_feat + (!_fix_intercept))], + &_a[ff * _task_sizes_train[taskind] + start * _n_dim], + [](double val){return std::log10(val);} + ); + } +} + +void LossFunctionLogPearsonRMSE::set_error(const std::vector<int>& inds, int taskind, int start) +{ + std::fill_n( + _error_train.begin() + start, + _task_sizes_train[taskind], + -1 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1] + ); + + for(int ff = 0; ff < inds.size(); ++ff) + { + std::transform( + node_value_arrs::get_d_matrix_ptr(inds[ff]) + start, + node_value_arrs::get_d_matrix_ptr(inds[ff]) + start + _task_sizes_train[taskind], + &_log_feat[start], [](double val){return std::log10(val);} ); + daxpy_( + _task_sizes_train[taskind], + -1.0 * _coefs[taskind * _n_dim + ff], + &_log_feat[start], + 1, + &_error_train[start], + 1 + ); + } + daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1); +} + +void LossFunctionLogPearsonRMSE::set_error(const std::vector<node_ptr>& feats, int taskind, int start) +{ + std::fill_n( + _error_train.begin() + start, + _task_sizes_train[taskind], + -1 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_feat - 1] + ); + + for(int ff = 0; ff < feats.size(); ++ff) + { + std::transform( + feats[ff]->value_ptr() + start, + feats[ff]->value_ptr() + start + _task_sizes_train[taskind], + &_log_feat[start], + [](double val){return std::log10(val);} + ); + daxpy_( + _task_sizes_train[taskind], + -1.0 * _coefs[taskind * _n_dim + ff], + &_log_feat[start], + 1, + &_error_train[start], + 1 + ); } + daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1); } diff --git a/src/loss_function/LossFunctionLogPearsonRMSE.hpp b/src/loss_function/LossFunctionLogPearsonRMSE.hpp index 4ae036d83058cd11179b66c8eccc4dc7bdca6887..1fdaa9c9030ca35d8588faf7dbc57777a48b6801 100644 --- a/src/loss_function/LossFunctionLogPearsonRMSE.hpp +++ b/src/loss_function/LossFunctionLogPearsonRMSE.hpp @@ -92,6 +92,22 @@ public: */ void set_a(const std::vector<node_ptr>& feats, int taskind, int start); + /** + * @brief Set the error and return the RMSE + * + * @param inds Indexes to access the central storage area + * @return The RMSE of the model + */ + void set_error(const std::vector<int>& 
inds, int taskind, int start); + + /** + * @brief Set the error and return the RMSE + * + * @param feats The features used to evaluate the loss function + * @return The RMSE of the model + */ + void set_error(const std::vector<node_ptr>& feats, int taskind, int start); + /** * @brief Set the model list from the previous generation * @details [long description] diff --git a/src/loss_function/LossFunctionPearsonRMSE.cpp b/src/loss_function/LossFunctionPearsonRMSE.cpp index a1647e31ff227017890f492716c05b9cebca8606..f7bf5282c0077600d6ce5b40f7d2a6aebb50e359 100644 --- a/src/loss_function/LossFunctionPearsonRMSE.cpp +++ b/src/loss_function/LossFunctionPearsonRMSE.cpp @@ -13,7 +13,9 @@ LossFunctionPearsonRMSE::LossFunctionPearsonRMSE( _c(_n_task, 0.0), _coefs(_n_task * _n_dim, 0.0), _lwork(0) -{} +{ + set_opt_lwork(); +} LossFunctionPearsonRMSE::LossFunctionPearsonRMSE(std::shared_ptr<LossFunction> o) : LossFunction(o), @@ -22,7 +24,9 @@ LossFunctionPearsonRMSE::LossFunctionPearsonRMSE(std::shared_ptr<LossFunction> o _c(_n_task, 0.0), _coefs(_n_task * _n_dim, 0.0), _lwork(0) -{} +{ + set_opt_lwork(); +} void LossFunctionPearsonRMSE::set_model_list(const std::vector<std::shared_ptr<Model>>& model_list) { @@ -165,10 +169,9 @@ double LossFunctionPearsonRMSE::operator()(const std::vector<int>& inds) if(info != 0) { - return -1.0 + info / std::abs(info); + return -1.0 + info / std::abs(info) * 0.5; } - - return 1.0 / static_cast<double>(_n_samp) * std::sqrt( + return std::sqrt( 1.0 / static_cast<double>(_n_samp) * std::accumulate( _error_train.begin(), _error_train.end(), @@ -197,10 +200,10 @@ double LossFunctionPearsonRMSE::operator()(const std::vector<node_ptr>& feats) if(info != 0) { - return -1.0 + info / std::abs(info); + return -1.0 + info / std::abs(info) * 0.5; } - return 1.0 / static_cast<double>(_n_samp) * std::sqrt( + return std::sqrt(1.0 / static_cast<double>(_n_samp) * std::accumulate( _error_train.begin(), _error_train.end(), @@ -217,7 +220,7 @@ void LossFunctionPearsonRMSE::set_a(const std::vector<int>& inds, int taskind, i std::copy_n( node_value_arrs::get_d_matrix_ptr(inds[ff]) + start, _task_sizes_train[taskind], - &_a[ff * start + taskind * (_n_feat + (!_fix_intercept))] + &_a[ff * _task_sizes_train[taskind] + start * _n_dim] ); } } @@ -229,7 +232,7 @@ void LossFunctionPearsonRMSE::set_a(const std::vector<node_ptr>& feats, int task std::copy_n( feats[ff]->value_ptr() + start, _task_sizes_train[start], - &_a[ff * start + start * (_n_feat + (!_fix_intercept))] + &_a[ff * _task_sizes_train[taskind] + start * _n_dim] ); } } @@ -242,7 +245,7 @@ int LossFunctionPearsonRMSE::least_squares(int taskind, int start) _task_sizes_train[taskind], _n_dim, 1, - &_a[start], + &_a[start * _n_dim], _task_sizes_train[taskind], &_b[start], _task_sizes_train[taskind], @@ -250,7 +253,6 @@ int LossFunctionPearsonRMSE::least_squares(int taskind, int start) _lwork, &info ); - std::copy_n(&_b[start], _n_dim, &_coefs[taskind * _n_dim]); return info; } @@ -260,7 +262,7 @@ void LossFunctionPearsonRMSE::set_error(const std::vector<int>& inds, int taskin std::fill_n( _error_train.begin() + start, _task_sizes_train[taskind], - -1 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_feat - 1] + -1 * (!_fix_intercept) * _coefs[(taskind + 1) * _n_dim - 1] ); for(int ff = 0; ff < inds.size(); ++ff) @@ -274,6 +276,7 @@ void LossFunctionPearsonRMSE::set_error(const std::vector<int>& inds, int taskin 1 ); } + daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1); } void 
LossFunctionPearsonRMSE::set_error(const std::vector<node_ptr>& feats, int taskind, int start) @@ -295,4 +298,5 @@ void LossFunctionPearsonRMSE::set_error(const std::vector<node_ptr>& feats, int 1 ); } + daxpy_(_task_sizes_train[taskind], 1.0, &_prop_train[start], 1, &_error_train[start], 1); } diff --git a/src/loss_function/LossFunctionPearsonRMSE.hpp b/src/loss_function/LossFunctionPearsonRMSE.hpp index 7c1789b95451341eae03b1e2f8315e6cd63dc3d3..e844b6f9fbf5a96290fd7bcd5593b336d32fba5b 100644 --- a/src/loss_function/LossFunctionPearsonRMSE.hpp +++ b/src/loss_function/LossFunctionPearsonRMSE.hpp @@ -140,7 +140,7 @@ public: * @param inds Indexes to access the central storage area * @return The RMSE of the model */ - void set_error(const std::vector<int>& inds, int taskind, int start); + virtual void set_error(const std::vector<int>& inds, int taskind, int start); /** * @brief Set the error and return the RMSE @@ -148,7 +148,7 @@ public: * @param feats The features used to evaluate the loss function * @return The RMSE of the model */ - void set_error(const std::vector<node_ptr>& feats, int taskind, int start); + virtual void set_error(const std::vector<node_ptr>& feats, int taskind, int start); /** * @brief Perform least squares regression @@ -165,7 +165,7 @@ public: */ virtual void set_model_list(const std::vector<std::shared_ptr<Model>>& model_list); - inline LOSS_TYPE type() const {return LOSS_TYPE::PEARSON_RMSE;} + virtual inline LOSS_TYPE type() const {return LOSS_TYPE::PEARSON_RMSE;} }; #endif diff --git a/src/loss_function/utils.cpp b/src/loss_function/utils.cpp index bbbf65e39bbfa50a504ad8361ae7a4fbe5709dae..c17fb3b00bcd79eef1b9e16b229b4f36cdb4d693 100644 --- a/src/loss_function/utils.cpp +++ b/src/loss_function/utils.cpp @@ -44,15 +44,15 @@ std::shared_ptr<LossFunction> loss_function_util::get_loss_function( std::shared_ptr<LossFunction> loss_function_util::copy(std::shared_ptr<LossFunction> loss) { - if(loss->type() == LOSS_TYPE::CONVEX_HULL) + if(loss->type() == LOSS_TYPE::PEARSON_RMSE) { return std::make_shared<LossFunctionPearsonRMSE>(loss); } - else if(loss->type() == LOSS_TYPE::PEARSON_RMSE) + else if(loss->type() == LOSS_TYPE::LOG_PEARSON_RMSE) { return std::make_shared<LossFunctionLogPearsonRMSE>(loss); } - else if(loss->type() == LOSS_TYPE::LOG_PEARSON_RMSE) + else if(loss->type() == LOSS_TYPE::CONVEX_HULL) { return std::make_shared<LossFunctionConvexHull>(loss); } diff --git a/src/python/descriptor_identifier/SISSOClassifier.cpp b/src/python/descriptor_identifier/SISSOClassifier.cpp index a4857b067905b1bd1bd6f9834ce06d9703227a4e..94e8fb250c03fc8bac75595deb4fa1b81fbb1eea 100644 --- a/src/python/descriptor_identifier/SISSOClassifier.cpp +++ b/src/python/descriptor_identifier/SISSOClassifier.cpp @@ -13,7 +13,21 @@ SISSOClassifier::SISSOClassifier( int n_residual, int n_models_store ) : - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(100.0), _width(1.0e-5), _n_class(1) @@ -38,7 +52,21 @@ SISSOClassifier::SISSOClassifier( int n_residual, int n_models_store ) : - SISSO_DI(feat_space, prop_label, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, false), + SISSO_DI( + "classification", + feat_space, + 
prop_label, + prop_unit, + prop, + prop_test, + task_sizes_train, + task_sizes_test, + leave_out_inds, + n_dim, + n_residual, + n_models_store, + false + ), _c(100.0), _width(1.0e-5), _n_class(1) diff --git a/src/python/descriptor_identifier/SISSOLogRegressor.cpp b/src/python/descriptor_identifier/SISSOLogRegressor.cpp index 97ff8ad17589a54182a26eba22554b2f0d317f6c..cf7d1b193a2fba03c368482c69d9d12483f50dc1 100644 --- a/src/python/descriptor_identifier/SISSOLogRegressor.cpp +++ b/src/python/descriptor_identifier/SISSOLogRegressor.cpp @@ -31,6 +31,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(_prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); + _loss = loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } SISSOLogRegressor::SISSOLogRegressor( @@ -64,6 +72,14 @@ SISSOLogRegressor::SISSOLogRegressor( _prop_no_log(_prop) { std::transform(_prop.begin(), _prop.end(), _prop.begin(), [](double p){return std::log(p);}); + _loss = loss_function_util::get_loss_function( + "log_regression", + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } py::list SISSOLogRegressor::models_log_reg_py() diff --git a/src/python/descriptor_identifier/SISSORegressor.cpp b/src/python/descriptor_identifier/SISSORegressor.cpp index 98c507d15a6726b931572f98a466b75b44423ff6..530af82be3fa6d01395a4bc46d7453c9a46803c4 100644 --- a/src/python/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/descriptor_identifier/SISSORegressor.cpp @@ -15,6 +15,7 @@ SISSORegressor::SISSORegressor( bool fix_intercept ) : SISSO_DI( + "regression", feat_space, prop_label, prop_unit, @@ -45,6 +46,7 @@ SISSORegressor::SISSORegressor( bool fix_intercept ) : SISSO_DI( + "regression", feat_space, prop_label, prop_unit, diff --git a/src/python/descriptor_identifier/SISSO_DI.cpp b/src/python/descriptor_identifier/SISSO_DI.cpp index 4d8998cbaffab2c6bbefcf13cfddd8c6c96fe38e..c3b54f070724129b9949d63651b12395cd20b6ab 100644 --- a/src/python/descriptor_identifier/SISSO_DI.cpp +++ b/src/python/descriptor_identifier/SISSO_DI.cpp @@ -1,6 +1,7 @@ #include "descriptor_identifier/SISSO_DI/SISSO_DI.hpp" SISSO_DI::SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -32,9 +33,19 @@ SISSO_DI::SISSO_DI( _fix_intercept(fix_intercept) { node_value_arrs::initialize_d_matrix_arr(); + + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } SISSO_DI::SISSO_DI( + std::string loss_type, std::shared_ptr<FeatureSpace> feat_space, std::string prop_label, Unit prop_unit, @@ -66,4 +77,13 @@ SISSO_DI::SISSO_DI( _fix_intercept(fix_intercept) { node_value_arrs::initialize_d_matrix_arr(); + + _loss = loss_function_util::get_loss_function( + loss_type, + _prop, + _prop_test, + _task_sizes_train, + _task_sizes_test, + _fix_intercept + ); } diff --git a/src/utils/project.cpp b/src/utils/project.cpp index 376e9bbf39d908d8c08dcfdacfc2fb5d5b8f13fa..635e537ec5ec8a36d274c48e67a392cb18956a64 100644 --- a/src/utils/project.cpp +++ b/src/utils/project.cpp @@ -1,34 +1,5 @@ #include "utils/project.hpp" -void project_funcs::set_project_fxn( - const std::string project_type, - const int n_task, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project, - std::function<void(const 
double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project_no_omp -) -{ - if(project_type.compare("regression") == 0) - { - project = project_r2; - project_no_omp = project_r2_no_omp; - } - else if(project_type.compare("classification") == 0) - { - project = project_classify; - project_no_omp = project_classify_no_omp; - } - else if(project_type.compare("log_regression") == 0) - { - if(n_task > 1) - throw std::logic_error("Log Regression can not be done using multiple tasks."); - project = project_log_r2; - project_no_omp = project_log_r2_no_omp; - } - else - throw std::logic_error("Wrong projection type passed to FeatureSpace constructor."); - -} - void project_funcs::project_r( const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop ) @@ -78,140 +49,6 @@ void project_funcs::project_r( } } -void project_funcs::project_r2( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - #pragma omp parallel firstprivate(prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::fill_n(scores + start, end - start, 0.0); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&prop, &sizes, &mean_prop, &std_prop](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::r2(prop, feat->value_ptr(-1, true), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } - ); - prop += n_samp; - } - std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? 
0.0 : score;}); - - #pragma omp barrier - } -} - -void project_funcs::project_log_r2( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - #pragma omp parallel firstprivate(prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::vector<double> log_feat(n_samp, 0.0); - - std::fill_n(scores + start, end - start, 0.0); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&prop, &sizes, &mean_prop, &std_prop, &log_feat](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::log_r2(feat->value_ptr(-1, true), prop, log_feat.data(), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } - ); - prop += n_samp; - } - std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? 0.0 : score;}); - - #pragma omp barrier - } -} - -void project_funcs::project_classify( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - std::fill_n(scores, phi.size(), std::numeric_limits<double>::max()); - #pragma omp parallel firstprivate(sizes, prop) - { - int start = ( - omp_get_thread_num() * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num(), static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - int end = ( - (omp_get_thread_num() + 1) * (phi.size() / omp_get_num_threads()) + - std::min(omp_get_thread_num() + 1, static_cast<int>(phi.size()) % omp_get_num_threads()) - ); - - for(int pp = 0; pp < n_prop; ++pp) - { - ConvexHull1D convex_hull(sizes, prop); - std::transform( - phi.begin() + start, - phi.begin() + end, - scores + start, - scores + start, - [&convex_hull](node_ptr feat, double score){return std::min(convex_hull.overlap_1d(feat->value_ptr(-1, true)), score);} - ); - prop += n_samp; - #pragma omp barrier - } - std::transform(scores + start, scores + end, scores + start, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;}); - } -} - void project_funcs::project_r_no_omp( const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop ) @@ -249,99 +86,42 @@ void project_funcs::project_r_no_omp( std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 
0.0 : score;}); } -void project_funcs::project_r2_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) +void project_funcs::project_loss(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores) { - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - - std::fill_n(scores, phi.size(), 0.0); - - for(int pp = 0; pp < n_prop; ++pp) + std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); + #pragma omp parallel firstprivate(loss) { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } - std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&prop, &sizes, &mean_prop, &std_prop](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::r2(prop, feat->value_ptr(-1, true), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } + loss = loss_function_util::copy(loss); + loss->prepare_project(); + int start = ( + omp_get_thread_num() * (feats.size() / omp_get_num_threads()) + + std::min(omp_get_thread_num(), static_cast<int>(feats.size()) % omp_get_num_threads()) ); - prop += n_samp; - } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 0.0 : score;}); -} - -void project_funcs::project_log_r2_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) -{ - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::vector<double> log_feat(n_samp, 0.0); - std::vector<double> mean_prop(sizes.size(), 0.0); - std::vector<double> std_prop(sizes.size(), 0.0); - std::fill_n(scores, phi.size(), 0.0); + int end = ( + (omp_get_thread_num() + 1) * (feats.size() / omp_get_num_threads()) + + std::min(omp_get_thread_num() + 1, static_cast<int>(feats.size()) % omp_get_num_threads()) + ); - for(int pp = 0; pp < n_prop; ++pp) - { - int pos = 0; - for(int tt = 0; tt < sizes.size(); ++tt) - { - mean_prop[tt] = util_funcs::mean<double>(prop + pos, sizes[tt]); - std_prop[tt] = util_funcs::stand_dev(prop + pos, sizes[tt], mean_prop[tt]); - pos += sizes[tt]; - } std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&prop, &sizes, &mean_prop, &std_prop, &log_feat](node_ptr feat, double sc){ - return std::min( - -1 * util_funcs::log_r2(feat->value_ptr(-1, true), prop, log_feat.data(), sizes.data(), mean_prop.data(), std_prop.data(), sizes.size()), - sc - ); - } + feats.begin() + start, + feats.begin() + end, + scores + start, + [&loss](node_ptr feat){return loss->project(feat);} ); - prop += n_samp; + + #pragma omp barrier } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? 
0.0 : score;}); } -void project_funcs::project_classify_no_omp( - const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& sizes, const int n_prop -) +void project_funcs::project_loss_no_omp(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores) { - int n_samp = std::accumulate(sizes.begin(), sizes.end(), 0); - - std::fill_n(scores, phi.size(), std::numeric_limits<double>::max()); - for(int pp = 0; pp < n_prop; ++pp) - { - ConvexHull1D convex_hull(sizes, prop); - std::transform( - phi.begin(), - phi.end(), - scores, - scores, - [&convex_hull](node_ptr feat, double score){return std::min(convex_hull.overlap_1d(feat->value_ptr(-1, true)), score);} - ); - prop += n_samp; - } - std::transform(scores, scores + phi.size(), scores, [](double score){return std::isnan(score) ? std::numeric_limits<double>::infinity() : score;}); + loss->prepare_project(); + std::fill_n(scores, feats.size(), std::numeric_limits<double>::max()); + std::transform( + feats.begin(), + feats.end(), + scores, + [&loss](node_ptr feat){return loss->project(feat);} + ); } diff --git a/src/utils/project.hpp b/src/utils/project.hpp index f3ac91513bf79a8fa58a3adffe6e7850b757a9e8..3a59b51616ae955049271cb453fed3e09e3aa61a 100644 --- a/src/utils/project.hpp +++ b/src/utils/project.hpp @@ -13,27 +13,13 @@ #include "classification/ConvexHull1D.hpp" #include "feature_creation/node/Node.hpp" +#include "loss_function/utils.hpp" #ifdef PARAMETERIZE #include "nl_opt/NLOptWrapper.hpp" #endif namespace project_funcs { - /** - * @brief Set the project functions for SIS - * - * @param project_type Type of projection to perform - * @param n_task number of tasks in calculation - * @param poroject The _poroject function from the feature_space - * @param project_no_omp The _project_no_omp function from the feature_space - */ - void set_project_fxn( - const std::string project_type, - const int n_task, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project, - std::function<void(const double*, double*, const std::vector<node_ptr>&, const std::vector<int>&, const int)>& project_no_omp - ); - /** * @brief Calculate the projection scores of a set of features to a vector via Pearson correlation * @@ -45,40 +31,6 @@ namespace project_funcs */ void project_r(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop); - /** - * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination - * - * @param prop The pointer to the property vector to calculate the Coefficient of Determination against - * @param scores The pointer of the score vector - * @param phi The set of features to calculate the Coefficient of Determination of - * @param size Vector of the size of all of the tasks - * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of - */ - void project_r2(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop); - - /** - * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transformed problem) - * - * @param prop The pointer to the property vector to calculate the Coefficient of Determination against - * @param scores The pointer of the score vector - * @param phi The set of features to calculate the Coefficient of Determination of 
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
-     */
-    void project_log_r2(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
-    /**
-     * @brief Calculate projection scores for classification
-     * @details Create a 1D-convex hull for each class and calculate the projection score
-     *
-     * @param prop The classes to separate the property
-     * @param scores the vector to store the projection scores
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size list of the sizes for each task
-     * @param n_prop number of properties
-     */
-    void project_classify(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
     /**
      * @brief Calculate the projection scores of a set of features to a vector via Pearson correlation
      *
@@ -91,37 +43,22 @@ namespace project_funcs
     void project_r_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
 
     /**
-     * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination
-     *
-     * @param prop The pointer to the property vector to calculate the Coefficient of Determination against
-     * @param scores The pointer of the score vector
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
-     */
-    void project_r2_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
-
-    /**
-     * @brief Calculate the projection scores of a set of features to a vector via Coefficient of Determination (on the log transform problem)
+     * @brief Calculate the projection score of a set of features
      *
-     * @param prop The pointer to the property vector to calculate the Coefficient of Determination against
-     * @param scores The pointer of the score vector
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size Vector of the size of all of the tasks
-     * @param n_prop The number of properties to calculate the Coefficient of Determination of and return the maximum of
+     * @param loss The LossFunction used for the projection
+     * @param feats The set of features to calculate
+     * @param scores vector to output the scores to
      */
-    void project_log_r2_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
+    void project_loss(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores);
 
     /**
-     * @brief Calculate projection scores for classification
-     * @details Create a 1D-convex hull for each class and calculate the projection score
+     * @brief Calculate the projection score of a set of features without using OpenMP
      *
-     * @param prop The classes to separate the property
-     * @param scores the vector to store the projection scores
-     * @param phi The set of features to calculate the Coefficient of Determination of
-     * @param size list of the sizes for each task
-     * @param n_prop number of properties
+     * @param loss The LossFunction used for the projection
+     * @param feats The set of features to calculate
+     * @param scores vector to output the scores to
     */
-    void project_classify_no_omp(const double* prop, double* scores, const std::vector<node_ptr>& phi, const std::vector<int>& size, const int n_prop);
+    void project_loss_no_omp(std::shared_ptr<LossFunction> loss, const std::vector<node_ptr>& feats, double* scores);
 }
 
 #endif
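Usage note (illustrative sketch, not part of the patch): the removed project_r2 / project_log_r2 / project_classify helpers are replaced by a single LossFunction-driven entry point. The sketch below shows how a caller is expected to combine loss_function_util::get_loss_function with project_funcs::project_loss; the wrapper name score_features and its argument plumbing are assumptions for illustration, not code from the repository.

    #include <memory>
    #include <vector>

    #include "loss_function/utils.hpp"
    #include "utils/project.hpp"

    // Sketch: score every candidate feature against a property with the loss-based API.
    // "regression" is the loss type SISSORegressor passes to SISSO_DI in this patch;
    // "log_regression" and "classification" are the other types used above.
    std::vector<double> score_features(
        std::vector<node_ptr> feats,
        std::vector<double> prop_train,
        std::vector<double> prop_test,
        std::vector<int> task_sizes_train,
        std::vector<int> task_sizes_test
    )
    {
        std::shared_ptr<LossFunction> loss = loss_function_util::get_loss_function(
            "regression", prop_train, prop_test, task_sizes_train, task_sizes_test, false
        );

        // One projection score per feature; project_loss copies the loss object for each
        // OpenMP thread internally, so the caller only builds it once.
        std::vector<double> scores(feats.size());
        project_funcs::project_loss(loss, feats, scores.data());
        return scores;
    }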