diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp index 11c240c128c7634cb76728829766f296a2535f88..365c6abf23475de8336a04f9235248b5fb535c4b 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.cpp +++ b/src/descriptor_identifier/Model/ModelClassifier.cpp @@ -13,134 +13,14 @@ ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, _prop_test_est.reserve(_n_samp_test); // Objects needed to calculate the number of points in the overlapping regions of the convex hulls - int n_row = _n_dim + 1; - int n_class = 0; - - ClpSimplex lp_simplex; - lp_simplex.setLogLevel(0); - lp_simplex.setMoreSpecialOptions(2); - - std::vector<std::vector<int>> inds_check; - std::vector<double> lp_column_lower(_n_samp_train + 1, 0.0); - std::vector<double> lp_column_upper(_n_samp_train + 1, COIN_DBL_MAX); - std::vector<double> lp_objective(_n_samp_train + 1, 0.0); - std::vector<double> lp_elements(_n_samp_train * n_row, 1.0); - std::vector<double> lp_row_eq(n_row, 1.0); - - std::vector<int> lp_column_start(_n_samp_train + 1, 0); - std::vector<int> lp_row(_n_samp_train * (_n_dim + 1), 0); - - for(int rr = 0; rr < n_row; ++rr) - scopy_(_n_samp_train, std::vector<int>(_n_samp_train, rr).data(), 1, &lp_row[rr], n_row); - - std::iota(lp_column_start.begin(), lp_column_start.end(), 0); - std::transform(lp_column_start.begin(), lp_column_start.end(), lp_column_start.begin(), [&n_row](int ii){return ii * (n_row);}); - - // Calculate number of overlapping points by checking feasibility of a linear optimization problem - int task_start_train = 0; - int task_start_test = 0; - - for(int tt = 0; tt < _task_sizes_train.size(); ++tt) - { - std::vector<int> inds_train(_task_sizes_train[tt], 0); - std::iota(inds_train.begin(), inds_train.end(), task_start_train); - util_funcs::argsort(inds_train.data(), inds_train.data() + inds_train.size(), &prop_train[task_start_train]); - - std::vector<int> inds_test(_task_sizes_test[tt], 0); - std::iota(inds_test.begin(), inds_test.end(), task_start_test); - util_funcs::argsort(inds_test.data(), inds_test.data() + inds_test.size(), &prop_test[task_start_test]); - - std::vector<std::vector<int>> inds_class_train(1, {inds_train[0]}); - - std::vector<std::vector<int>> inds_class_test; - if(_task_sizes_test[tt] > 0) - inds_class_test = std::vector<std::vector<int>>(1, {inds_test[0]}); - - for(int ii = 1; ii < inds_train.size(); ++ii) - { - if((prop_train[inds_train[ii]] == prop_train[inds_train[ii - 1]])) - inds_class_train.back().push_back(inds_train[ii]); - else - inds_class_train.push_back({inds_train[ii]}); - } - - n_class = int(std::round(static_cast<double>(inds_class_train.size()) / static_cast<double>(_task_sizes_train.size()))); - for(int ii = 1; ii < inds_test.size(); ++ii) - { - if((prop_test[inds_test[ii]] == prop_test[inds_test[ii - 1]])) - inds_class_test.back().push_back(inds_test[ii]); - else - inds_class_test.push_back({inds_test[ii]}); - } - - if(inds_class_test.size() == 0) - inds_class_test = std::vector<std::vector<int>>(inds_class_train.size()); - - for(int c1 = 0; c1 < inds_class_train.size(); ++c1) - { - int n_col = inds_class_train[c1].size(); - - for(int dd = 0; dd < _n_dim; ++dd) - for(int ii = 0; ii < n_col; ++ii) - lp_elements[ii * n_row + dd] = _feats[dd]->value_ptr()[inds_class_train[c1][ii]]; - - for(int c2 = 0; c2 < inds_class_train.size(); ++c2) - { - if(c2 == c1) - continue; - for(auto& ind : inds_class_train[c2]) - { - std::transform(_feats.begin(), _feats.end(), 
lp_row_eq.begin(), [&ind](model_node_ptr feat){return feat->value_ptr()[ind];}); - lp_simplex.loadProblem( - n_col, - n_row, - lp_column_start.data(), - lp_row.data(), - lp_elements.data(), - lp_column_lower.data(), - lp_column_upper.data(), - lp_objective.data(), - lp_row_eq.data(), - lp_row_eq.data() - ); - lp_simplex.dual(); - _prop_train_est[ind] += lp_simplex.primalFeasible(); - _train_n_convex_overlap += lp_simplex.primalFeasible(); - } - for(auto& ind : inds_class_test[c2]) - { - std::transform(_feats.begin(), _feats.end(), lp_row_eq.begin(), [&ind](model_node_ptr feat){return feat->test_value_ptr()[ind];}); - lp_simplex.loadProblem( - n_col, - n_row, - lp_column_start.data(), - lp_row.data(), - lp_elements.data(), - lp_column_lower.data(), - lp_column_upper.data(), - lp_objective.data(), - lp_row_eq.data(), - lp_row_eq.data() - ); - lp_simplex.dual(); - _prop_test_est[ind] += lp_simplex.primalFeasible(); - _test_n_convex_overlap += lp_simplex.primalFeasible(); - } - } - } - - task_start_train += _task_sizes_train[tt]; - task_start_test += _task_sizes_test[tt]; - } - std::transform(_prop_train_est.begin(), _prop_train_est.end(), _prop_train.begin(), _train_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;}); - std::transform(_prop_test_est.begin(), _prop_test_est.end(), _prop_test.begin(), _test_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;}); + set_train_test_error(); // Perform SVM int start = 0; int start_test = 0; for(int tt = 0; tt < _task_sizes_train.size(); ++tt) { - SVMWrapper svm(n_class, _n_dim, _task_sizes_train[tt], &_prop_train[start]); + SVMWrapper svm(_n_class, _n_dim, _task_sizes_train[tt], &_prop_train[start]); std::vector<double*> node_val_ptrs(_n_dim); std::vector<double*> node_test_val_ptrs(_n_dim); @@ -184,6 +64,7 @@ ModelClassifier::ModelClassifier(std::string train_file) std::vector<std::string> split_str; std::vector<std::string> feature_expr_train = populate_model(train_file, true); + _task_sizes_test = std::vector<int>(_task_sizes_train.size(), 0); for(int ff = 0; ff < feature_expr_train.size(); ++ff) { split_str = str_utils::split_string_trim(feature_expr_train[ff]); @@ -199,6 +80,13 @@ ModelClassifier::ModelClassifier(std::string train_file) model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)); _feats.push_back(feat); } + + int file_train_n_convex_overlap = _train_n_convex_overlap; + _train_n_convex_overlap = 0; + + set_train_test_error(); + if((file_train_n_convex_overlap != _train_n_convex_overlap)) + throw std::logic_error("The file does not have the same convex overlap (" + std::to_string(file_train_n_convex_overlap) + ") as calculated here (" + std::to_string(_train_n_convex_overlap) + ")."); } ModelClassifier::ModelClassifier(std::string train_file, std::string test_file) @@ -227,6 +115,16 @@ ModelClassifier::ModelClassifier(std::string train_file, std::string test_file) _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str))); } + + int file_train_n_convex_overlap = _train_n_convex_overlap; + _train_n_convex_overlap = 0; + + int file_test_n_convex_overlap = _test_n_convex_overlap; + _test_n_convex_overlap = 0; + + set_train_test_error(); + if((file_train_n_convex_overlap != _train_n_convex_overlap) || (file_test_n_convex_overlap != _test_n_convex_overlap)) + throw std::logic_error("The file does not have the same convex overlap (" + 
std::to_string(file_train_n_convex_overlap) + ", " + std::to_string(file_test_n_convex_overlap) + ") as calculated here (" + std::to_string(_train_n_convex_overlap) + ", " + std::to_string(_test_n_convex_overlap) + ")."); } std::vector<std::string> ModelClassifier::populate_model(std::string filename, bool train) @@ -276,7 +174,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b std::getline(file_stream, line); int n_task = 0; - int _n_dim = 0; + _n_dim = 0; std::getline(file_stream, line); do { @@ -290,8 +188,7 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b } std::getline(file_stream, line); } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0); - _fix_intercept = false; - + _fix_intercept = false; std::getline(file_stream, line); for(int ff = 0; ff < _n_dim; ++ff) { @@ -342,14 +239,14 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b { _prop_train[ns] = std::stod(split_line[0]); _prop_train_est[ns] = std::stod(split_line[1]); - _train_error[ns] = _prop_train_est[ns] - _prop_train[ns]; + _train_error[ns] = 0; } else { _prop_test[ns] = std::stod(split_line[0]); _prop_test_est[ns] = std::stod(split_line[1]); - _test_error[ns] = _prop_test_est[ns] - _prop_test[ns]; + _test_error[ns] = 0; } for(int nf = 0; nf < _n_dim; ++nf) { @@ -371,9 +268,141 @@ std::vector<std::string> ModelClassifier::populate_model(std::string filename, b } file_stream.close(); + return feature_expr; } +void ModelClassifier::set_train_test_error() +{ + int n_row = _n_dim + 1; + + ClpSimplex lp_simplex; + lp_simplex.setLogLevel(0); + lp_simplex.setMoreSpecialOptions(2); + + std::vector<std::vector<int>> inds_check; + std::vector<double> lp_column_lower(_n_samp_train + 1, 0.0); + std::vector<double> lp_column_upper(_n_samp_train + 1, COIN_DBL_MAX); + std::vector<double> lp_objective(_n_samp_train + 1, 0.0); + std::vector<double> lp_elements(_n_samp_train * n_row, 1.0); + std::vector<double> lp_row_low(n_row, 1.0 - 1e-5); + std::vector<double> lp_row_up(n_row, 1.0 + 1e-5); + + std::vector<int> lp_column_start(_n_samp_train + 1, 0); + std::vector<int> lp_row(_n_samp_train * (_n_dim + 1), 0); + + for(int rr = 0; rr < n_row; ++rr) + scopy_(_n_samp_train, std::vector<int>(_n_samp_train, rr).data(), 1, &lp_row[rr], n_row); + + std::iota(lp_column_start.begin(), lp_column_start.end(), 0); + std::transform(lp_column_start.begin(), lp_column_start.end(), lp_column_start.begin(), [&n_row](int ii){return ii * (n_row);}); + + // Calculate number of overlapping points by checking feasibility of a linear optimization problem + int task_start_train = 0; + int task_start_test = 0; + + for(int tt = 0; tt < _task_sizes_train.size(); ++tt) + { + std::vector<int> inds_train(_task_sizes_train[tt], 0); + std::iota(inds_train.begin(), inds_train.end(), task_start_train); + util_funcs::argsort(inds_train.data(), inds_train.data() + inds_train.size(), &_prop_train[task_start_train]); + + std::vector<int> inds_test(_task_sizes_test[tt], 0); + std::iota(inds_test.begin(), inds_test.end(), task_start_test); + util_funcs::argsort(inds_test.data(), inds_test.data() + inds_test.size(), &_prop_test[task_start_test]); + + std::vector<std::vector<int>> inds_class_train(1, {inds_train[0]}); + + std::vector<std::vector<int>> inds_class_test; + if(_task_sizes_test[tt] > 0) + inds_class_test = std::vector<std::vector<int>>(1, {inds_test[0]}); + + for(int ii = 1; ii < inds_train.size(); ++ii) + { + 
if((_prop_train[inds_train[ii]] == _prop_train[inds_train[ii - 1]]))
+                inds_class_train.back().push_back(inds_train[ii]);
+            else
+                inds_class_train.push_back({inds_train[ii]});
+        }
+        // Update the estimated number of classes from the distinct property values in the training data
+        _n_class = int(std::round(static_cast<double>(inds_class_train.size()) / static_cast<double>(_task_sizes_train.size())));
+        for(int ii = 1; ii < inds_test.size(); ++ii)
+        {
+            if((_prop_test[inds_test[ii]] == _prop_test[inds_test[ii - 1]]))
+                inds_class_test.back().push_back(inds_test[ii]);
+            else
+                inds_class_test.push_back({inds_test[ii]});
+        }
+
+        if(inds_class_test.size() == 0)
+            inds_class_test = std::vector<std::vector<int>>(inds_class_train.size());
+
+        for(int c1 = 0; c1 < inds_class_train.size(); ++c1)
+        {
+            int n_col = inds_class_train[c1].size();
+
+            for(int dd = 0; dd < _n_dim; ++dd)
+            {
+                for(int ii = 0; ii < n_col; ++ii)
+                {
+                    lp_elements[ii * n_row + dd] = _feats[dd]->value_ptr()[inds_class_train[c1][ii]];
+                }
+            }
+
+            for(int c2 = 0; c2 < inds_class_train.size(); ++c2)
+            {
+                if(c2 == c1)
+                    continue;
+                for(auto& ind : inds_class_train[c2])
+                {
+                    std::transform(_feats.begin(), _feats.end(), lp_row_low.begin(), [&ind](model_node_ptr feat){return feat->value_ptr()[ind];});
+                    std::transform(_feats.begin(), _feats.end(), lp_row_up.begin(), [&ind](model_node_ptr feat){return feat->value_ptr()[ind];});
+                    lp_simplex.loadProblem(
+                        n_col,
+                        n_row,
+                        lp_column_start.data(),
+                        lp_row.data(),
+                        lp_elements.data(),
+                        lp_column_lower.data(),
+                        lp_column_upper.data(),
+                        lp_objective.data(),
+                        lp_row_low.data(),
+                        lp_row_up.data()
+                    );
+                    lp_simplex.dual();
+                    _train_error[ind] += lp_simplex.primalFeasible();
+                    _train_n_convex_overlap += lp_simplex.primalFeasible();
+                }
+                for(auto& ind : inds_class_test[c2])
+                {
+                    std::transform(_feats.begin(), _feats.end(), lp_row_up.begin(), [&ind](model_node_ptr feat){return feat->test_value_ptr()[ind];});
+                    std::transform(_feats.begin(), _feats.end(), lp_row_low.begin(), [&ind](model_node_ptr feat){return feat->test_value_ptr()[ind];});
+                    lp_simplex.loadProblem(
+                        n_col,
+                        n_row,
+                        lp_column_start.data(),
+                        lp_row.data(),
+                        lp_elements.data(),
+                        lp_column_lower.data(),
+                        lp_column_upper.data(),
+                        lp_objective.data(),
+                        lp_row_low.data(),
+                        lp_row_up.data()
+                    );
+                    lp_simplex.dual();
+                    _test_error[ind] += lp_simplex.primalFeasible();
+                    _test_n_convex_overlap += lp_simplex.primalFeasible();
+                }
+            }
+        }
+
+        task_start_train += _task_sizes_train[tt];
+        task_start_test += _task_sizes_test[tt];
+    }
+    std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), _train_error.begin(), [](double err, double real){return (err < 1e-10) ? -1 : real;});
+    std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), _test_error.begin(), [](double err, double real){return (err < 1e-10) ? 
-1 : real;}); +} + std::string ModelClassifier::toString() const { std::stringstream unit_rep; diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp index d2a416b87b14f678cef0eb0dfb47e6279e59dfba..030a0f09c0340ca8bf141e4fd48e5cd227cbfe27 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.hpp +++ b/src/descriptor_identifier/Model/ModelClassifier.hpp @@ -50,6 +50,8 @@ class ModelClassifier : public Model std::vector<double> _train_error; //!< The error of the model (training) std::vector<double> _test_error; //!< The error of the model (testing) + int _n_class; //!< The number of classes + int _train_n_convex_overlap; //!< Number of data points in regions where the convex hulls overlap in the training set int _test_n_convex_overlap; //!< Number of data points in regions where the convex hulls overlap for the test data set @@ -150,6 +152,7 @@ public: */ void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}); + void set_train_test_error(); // DocString: model_classn_convex_overlap_train /** * @brief The number of samples in overlapping convex hull regions (training data)
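
For reference, the overlap test that both the removed constructor code and the new set_train_test_error() rely on reduces to a small standalone feasibility check: a query point x lies in the convex hull of points {p_j} exactly when there exist coefficients lambda_j >= 0 with sum_j lambda_j = 1 and sum_j lambda_j * p_j = x. The sketch below is not part of the patch; the function name in_convex_hull and its arguments are illustrative only. It builds the same column-ordered LP that is passed to ClpSimplex::loadProblem() in the diff (one column per candidate hull point, _n_dim coordinate rows plus one normalization row) and reads the answer from primalFeasible() after a dual solve.

// Minimal sketch of the point-in-convex-hull feasibility test used above (assumes Clp is available)
#include <ClpSimplex.hpp>
#include <vector>

// hull_pts: candidate hull points, each of dimension query.size(); query: the point to test.
bool in_convex_hull(const std::vector<std::vector<double>>& hull_pts, const std::vector<double>& query)
{
    const int n_dim = static_cast<int>(query.size());
    const int n_col = static_cast<int>(hull_pts.size()); // one column (lambda_j) per hull point
    const int n_row = n_dim + 1;                         // n_dim coordinate rows + 1 normalization row

    // Column-ordered constraint matrix: column j holds p_j followed by a 1 for sum(lambda) = 1
    std::vector<double> elements(n_col * n_row, 1.0);
    std::vector<int> rows(n_col * n_row);
    std::vector<int> col_start(n_col + 1);
    for(int jj = 0; jj < n_col; ++jj)
    {
        col_start[jj] = jj * n_row;
        for(int rr = 0; rr < n_dim; ++rr)
        {
            rows[jj * n_row + rr] = rr;
            elements[jj * n_row + rr] = hull_pts[jj][rr];
        }
        rows[jj * n_row + n_dim] = n_dim; // the all-ones normalization row
    }
    col_start[n_col] = n_col * n_row;

    // lambda_j >= 0 with a zero objective: a pure feasibility problem
    std::vector<double> col_lower(n_col, 0.0);
    std::vector<double> col_upper(n_col, COIN_DBL_MAX);
    std::vector<double> objective(n_col, 0.0);

    // Row bounds: the coordinate rows must reproduce the query point; sum(lambda) must be ~1
    std::vector<double> row_lower(query);
    std::vector<double> row_upper(query);
    row_lower.push_back(1.0 - 1e-5);
    row_upper.push_back(1.0 + 1e-5);

    ClpSimplex lp;
    lp.setLogLevel(0);
    lp.loadProblem(
        n_col, n_row,
        col_start.data(), rows.data(), elements.data(),
        col_lower.data(), col_upper.data(), objective.data(),
        row_lower.data(), row_upper.data()
    );
    lp.dual(); // dual simplex, as in set_train_test_error()
    return lp.primalFeasible();
}

Because the objective is identically zero, the only question the solver answers is whether the lambda system is feasible; primalFeasible() returning true therefore means the query point sits inside (or on) the convex hull of the supplied class, which is exactly the condition counted by _train_n_convex_overlap and _test_n_convex_overlap above.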