diff --git a/src/classification/utils.hpp b/src/classification/utils.hpp index 7173210ad3f6b5125ed8008c69cc0263e372d2d8..13fef0871641867794eea0caa9055b6ffe4c09e1 100644 --- a/src/classification/utils.hpp +++ b/src/classification/utils.hpp @@ -41,7 +41,7 @@ namespace convex_hull * @param class_szs number of elements in each class * @returns The projection score for the particular feature */ - double overlap_1d(double* value, double width = 1e-3); + double overlap_1d(double* value, double width = 1e-5); } diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp index 36b1c1318a9b8aa7447741177f4ed17644be6b9e..3f06869176510abec7084b8a7d6d70b07f7d1840 100644 --- a/src/descriptor_identifier/Model/Model.hpp +++ b/src/descriptor_identifier/Model/Model.hpp @@ -135,6 +135,12 @@ public: */ virtual void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}) = 0; + // DocString: model_fix_intercept + /** + * @brief Access whether the intercept is fixed at 0 or not + */ + inline bool fix_intercept(){return _fix_intercept;} + #ifdef PY_BINDINGS // DocString: model_coefs /** @@ -176,6 +182,17 @@ public: */ inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);} + // DocString: model_task_sizes_train + /** + * @brief Access the size of the training set for each task + */ + inline py::list task_sizes_train(){return python_conv_utils::to_list<int>(_task_sizes_train);} + + // DocString: model_task_sizes_test + /** + * @brief Access the size of the test set for each task + */ + inline py::list task_sizes_test(){return python_conv_utils::to_list<int>(_task_sizes_test);} #endif }; diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp index a5d16ce8e666a68e02ffb98653c8710428015372..5056ebf57c88e5c2581b84ec6534877401ead371 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.cpp +++ 
b/src/descriptor_identifier/Model/ModelClassifier.cpp @@ -14,6 +14,8 @@ ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, // Objects needed to calculate the number of points in the overlapping regions of the convex hulls int n_row = _n_dim + 1; + int n_class = 0; + ClpSimplex lp_simplex; //!< Model used to find if a point is inside the convex hull lp_simplex.setLogLevel(0); lp_simplex.setMoreSpecialOptions(2); @@ -62,6 +64,7 @@ ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, inds_class_train.push_back({inds_train[ii]}); } + n_class = int(std::round(static_cast<double>(inds_class_train.size()) / static_cast<double>(_task_sizes_train.size()))); for(int ii = 1; ii < inds_test.size(); ++ii) { if((prop_test[inds_test[ii]] == prop_test[inds_test[ii - 1]])) @@ -69,8 +72,10 @@ ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, else inds_class_test.push_back({inds_test[ii]}); } + if(inds_class_test.size() == 0) inds_class_test = std::vector<std::vector<int>>(inds_class_train.size()); + for(int c1 = 0; c1 < inds_class_train.size(); ++c1) { int n_col = inds_class_train[c1].size(); @@ -198,38 +203,57 @@ ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, } // Perform SVM training - shark::LinearCSvmTrainer<shark::RealVector> trainer(1.0, !_fix_intercept); - trainer.setMcSvmType(shark::McSvm::WW); - trainer.stoppingCondition().maxSeconds = 0.01; // Process SVM model - shark::LinearClassifier<shark::RealVector> model; + std::shared_ptr<shark::AbstractKernelFunction<shark::RealVector>> _kernel_ptr = std::make_shared<shark::LinearKernel<shark::RealVector>>(); + shark::CSvmTrainer<shark::RealVector> trainer(_kernel_ptr.get(), 100.0, !_fix_intercept); + shark::KernelClassifier<shark::RealVector> model; shark::ZeroOneLoss<unsigned int> loss; shark::Data<unsigned int> output; + int ii_train = 0; int ii_test = 0; + for(int tt = 0; tt < _task_sizes_train.size(); 
++tt) { - trainer.stoppingCondition().maxSeconds = 0.001; - trainer.stoppingCondition().maxIterations = 1000; - trainer.stoppingCondition().minAccuracy = 0.01; + trainer.stoppingCondition().maxSeconds = 0.1; trainer.train(model, training[tt]); if(_fix_intercept) { - _coefs.push_back(std::vector<double>(_n_dim, 0.0)); - std::copy_n(model.parameterVector().begin(), _n_dim, _coefs.back().begin()); - for(int cc = 0; cc < _n_dim; ++cc) - _coefs.back()[cc] = _feats[cc]->remap_w_svm(_coefs.back()[cc]); + for(int cc = 0; cc < n_class - (n_class == 2); ++cc) + { + _coefs.push_back(std::vector<double>(_n_dim, 0.0)); + int sv = 0; + for(auto el : model.decisionFunction().basis().elements()) + { + daxpy_(_n_dim, model.parameterVector()(sv), &el(0), 1, _coefs.back().data(), 1); + ++sv; + } + for(int dd = 0; dd < _n_dim; ++dd) + { + _coefs.back()[dd] = _feats[dd]->remap_w_svm(_coefs.back()[dd]); + } + } } else { - _coefs.push_back(std::vector<double>(_n_dim + 1, 0.0)); - std::copy_n(model.parameterVector().begin(), _n_dim + 1, _coefs.back().begin()); - for(int cc = 0; cc < _n_dim; ++cc) + for(int cc = 0; cc < n_class - (n_class == 2); ++cc) { - _coefs.back()[cc] = _feats[cc]->remap_w_svm(_coefs.back()[cc]); - _coefs.back()[_n_dim] -= _feats[cc]->remap_b_svm(_coefs.back()[cc]); + _coefs.push_back(std::vector<double>(_n_dim + 1, 0.0)); + int sv = 0; + for(auto el : model.decisionFunction().basis().elements()) + { + daxpy_(_n_dim, model.parameterVector()(sv), &el(0), 1, _coefs.back().data(), 1); + ++sv; + } + _coefs.back()[_n_dim] = model.parameterVector()(sv); + + for(int dd = 0; dd < _n_dim; ++dd) + { + _coefs.back()[dd] = _feats[dd]->remap_w_svm(_coefs.back()[dd]); + _coefs.back()[_n_dim] -= _feats[dd]->remap_b_svm(_coefs.back()[dd]); + } } } output = model(training[tt].inputs()); diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp index 
735001def2695c21b02c6b9f6ab9485d51af06cd..50749f78c763370be52985e22454a0391a74cead 100644 --- a/src/descriptor_identifier/Model/ModelClassifier.hpp +++ b/src/descriptor_identifier/Model/ModelClassifier.hpp @@ -19,9 +19,8 @@ #include<iostream> #include <shark/Algorithms/Trainers/CSvmTrainer.h> -#include <shark/Algorithms/Trainers/NormalizeComponentsUnitVariance.h> +#include <shark/Models/Kernels/LinearKernel.h> #include <shark/Data/Dataset.h> -#include <shark/Models/Normalizer.h> #include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h> #include <coin/ClpSimplex.hpp> @@ -186,6 +185,33 @@ public: inline double percent_test_error(){return 100.0 * static_cast<double>(_test_n_convex_overlap) / static_cast<double>(_n_samp_test); } #ifdef PY_BINDINGS + /** + * @brief Construct a new Model with updated coefficient + * @details Copy a model but update its coefficients + * + * @param o Model to be copied + * @param new_coefs The new coefficients + */ + ModelClassifier(const ModelClassifier& o, py::list new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est); + + /** + * @brief Construct a new Model with updated coefficient + * @details Copy a model but update its coefficients + * + * @param o Model to be copied + * @param new_coefs The new coefficients + */ + ModelClassifier(const ModelClassifier& o, np::ndarray new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est); + + // DocString: model_class_to_file + /** + * @brief Convert the ModelClassifier into an output file + * + * @param filename The name of the file to output to + * @param train If true output the training data + * @param test_inds The indexes of the test set + */ + inline void to_file_py(std::string filename, bool train = true){to_file(filename, train);} // DocString: model_class_prop_train_est /** diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp index 
f5b794589f840b4c903518209afd2ec923a494c0..36c5a58c8c281712ca54b372a0764a2ad955cb33 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp @@ -26,9 +26,10 @@ SISSOClassifier::SISSOClassifier( _int_prop(prop.size()), _int_prop_test(prop_test.size()), _int_error(prop.size()), - _trainer(1.0, !_fix_intercept), - _c(1.0), - _width(1e-3), + _kernel_ptr(std::make_shared<shark::LinearKernel<shark::RealVector>>()), + _trainer(_kernel_ptr.get(), 100.0, !_fix_intercept), + _c(100.0), + _width(1.0e-5), _n_class(1), _lp_n_col(0), _lp_n_row(0) @@ -39,7 +40,6 @@ SISSOClassifier::SISSOClassifier( _lp_simplex.setLogLevel(0); _lp_simplex.setMoreSpecialOptions(8192 + 64 + 2); - _trainer.stoppingCondition().maxSeconds = 0.001; std::vector<int> inds = util_funcs::argsort(prop); std::map<double, double> class_map; @@ -56,8 +56,6 @@ SISSOClassifier::SISSOClassifier( } prop[inds[ii]] = class_num; } - if(_n_class > 2) - _trainer.setMcSvmType(shark::McSvm::WW); std::transform(prop_test.begin(), prop_test.end(), prop_test.begin(), [&class_map](double pp){return class_map[pp];}); @@ -146,11 +144,11 @@ void SISSOClassifier::set_data(std::vector<int>& inds, int start, int task) { for(int ii = 0; ii < inds.size(); ++ii) { - double max_feat = *std::max_element(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + _training[task].numberOfElements()); - double min_feat = *std::min_element(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + _training[task].numberOfElements()); + double max_feat = *std::max_element(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + _task_sizes_train[task]); + double min_feat = *std::min_element(node_value_arrs::get_d_matrix_ptr(inds[ii]) + start, node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + _task_sizes_train[task]); - 
dcopy_(shark::batchSize(batch), std::vector<double>(shark::batchSize(batch), -1.0 * min_feat).data(), 1, &batch(0, 0) + ii, inds.size()); - daxpy_(shark::batchSize(batch), 1.0 / (max_feat - min_feat), node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_els, 1, &batch(0, 0) + ii, inds.size()); + dcopy_(shark::batchSize(batch), std::vector<double>(shark::batchSize(batch), (max_feat + min_feat) / (max_feat - min_feat)).data(), 1, &batch(0, 0) + ii, inds.size()); + daxpy_(shark::batchSize(batch), 2.0 / (max_feat - min_feat), node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_els, 1, &batch(0, 0) + ii, inds.size()); } n_els += shark::batchSize(batch); } @@ -158,17 +156,18 @@ void SISSOClassifier::set_data(std::vector<int>& inds, int start, int task) void SISSOClassifier::svm(std::vector<int>& inds, double* coefs, int start, int task) { - _trainer.stoppingCondition().maxSeconds = 0.001; - _trainer.stoppingCondition().maxIterations = 1000; - _trainer.stoppingCondition().minAccuracy = 0.01; - // _trainer.stoppingCondition().maxIterations = 100; - set_data(inds, start, task); + _trainer.stoppingCondition().maxSeconds = 0.1; + set_data(inds, start, task); _trainer.train(_model, _training[task]); - std::copy_n(_model.parameterVector().begin(), inds.size() + (!_fix_intercept), coefs); - if(!_fix_intercept) - coefs[inds.size()] = _model.parameterVector()(inds.size()) / util_funcs::norm(coefs, inds.size()); + int sv = 0; + std::fill_n(coefs, inds.size(), 0.0); + for(auto el : _model.decisionFunction().basis().elements()) + { + daxpy_(inds.size(), _model.parameterVector()(sv), &el(0), 1, coefs, 1); + ++sv; + } } std::vector<std::vector<int>> SISSOClassifier::find_min_svm(std::vector<std::vector<int>>& test_inds, int n_keep) @@ -176,13 +175,11 @@ std::vector<std::vector<int>> SISSOClassifier::find_min_svm(std::vector<std::vec int n_dim = test_inds[0].size(); std::vector<std::vector<int>> to_return(n_keep); - std::vector<double> min_error(n_keep, 1.0); + 
std::vector<double> min_error(n_keep, _n_samp); std::vector<double> min_dist_error(n_keep, 1.0); std::vector<int> ind_list(n_keep, 1.0); - std::vector<double> gamma(_n_samp, 0.0); - std::vector<double> gamma_feat(_n_samp, 0.0); - std::vector<double> coefs(test_inds[0].size() + (!_fix_intercept)); + std::vector<double> coefs(test_inds[0].size()); int max_error_ind = 0; int ii = 0; @@ -195,22 +192,13 @@ std::vector<std::vector<int>> SISSOClassifier::find_min_svm(std::vector<std::vec for(int tt = 0; tt < _n_task; ++ tt) { svm(feat_inds, coefs.data(), start, tt); - double norm = util_funcs::norm(coefs.data(), n_dim); - std::fill_n(gamma.begin(), _task_sizes_train[tt], (_fix_intercept ? 0.0 : coefs[n_dim]) / norm); - std::fill_n(gamma_feat.begin(), _task_sizes_train[tt], 0.0); - int n_els = 0; - for(auto& batch: _training[tt].inputs().batches()) - { - for(int ii = 0; ii < feat_inds.size(); ++ii) - daxpy_(shark::batchSize(batch), coefs[ii] / norm, &batch(0, 0) + ii, n_dim, gamma_feat.data() + n_els, 1); - n_els += shark::batchSize(batch); - } - std::transform(gamma_feat.begin(), gamma_feat.begin() + _task_sizes_train[tt], gamma.begin(), gamma.begin(), [](double d1, double d2){return std::abs(d1) + d2;}); error += _loss.eval(_training[tt].labels(), _model(_training[tt].inputs())) * _task_sizes_train[tt]; - dist_error += *std::min_element(gamma.begin(), gamma.end()); + dist_error += 1.0 / util_funcs::norm(coefs.data(), n_dim); + start += _task_sizes_train[tt]; } + dist_error /= static_cast<double>(_n_task); if((error < min_error[max_error_ind]) || ((error == min_error[max_error_ind]) && (dist_error > min_dist_error[max_error_ind]))) @@ -226,11 +214,12 @@ std::vector<std::vector<int>> SISSOClassifier::find_min_svm(std::vector<std::vec } ++ii; } + double max_dist = 0.01 + *std::max_element(min_dist_error.begin(), min_dist_error.end()); + daxpy_(n_keep, -1.0 / max_dist, min_dist_error.data(), 1, min_error.data(), 1); + std::vector<int> sorted_socres_inds = 
util_funcs::argsort(min_error); + for(int ii = 0; ii < ind_list.size(); ++ii) - { - // std::cout << test_inds.size() << '\t' << ind_list[ii] << '\t' << test_inds[ind_list[ii]][0] << std::endl; - to_return[ii] = test_inds[ind_list[ii]]; - } + to_return[ii] = test_inds[ind_list[sorted_socres_inds[ii]]]; return to_return; } @@ -340,45 +329,55 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) std::vector<double> gamma(_prop.size(), 0.0); std::vector<double> gamma_feat(_prop.size(), 0.0); - // std::cout << "iterate" << std::endl; + do { int n_convex_overlap = get_n_overlap_lp(); - if(n_convex_overlap < min_n_convex_overlap[max_error_ind]) + int el_num = std::find(min_n_convex_overlap.begin(), min_n_convex_overlap.end(), n_convex_overlap) - min_n_convex_overlap.begin(); + + if(el_num < n_get_models) + { + min_inds[el_num].push_back(_cur_inds); + } + else if(n_convex_overlap < min_n_convex_overlap[max_error_ind]) { min_n_convex_overlap[max_error_ind] = n_convex_overlap; min_inds[max_error_ind] = {_cur_inds}; max_error_ind = std::max_element(min_n_convex_overlap.begin(), min_n_convex_overlap.end()) - min_n_convex_overlap.begin(); } - else if(n_convex_overlap == min_n_convex_overlap[max_error_ind]) - { - min_inds[max_error_ind].push_back(_cur_inds); - } } while(util_funcs::iterate(_cur_inds, _cur_inds.size(), _mpi_comm->size())); - // std::cout << "process" << std::endl; std::vector<int> inds_min(n_get_models * n_dim, -1); - if(min_inds[max_error_ind].size() > 1) + std::vector<int> sorted_overlap_inds = util_funcs::argsort(min_n_convex_overlap); + std::vector<int> min_n_convex_overlap_copy(min_n_convex_overlap); + + int ii = 0; + int nn = 0; + while((ii < n_get_models) && (nn < min_n_convex_overlap.size())) { - std::vector<std::vector<int>> test_inds(min_inds[max_error_ind]); - std::vector<int> inds_change(1, max_error_ind); - int n_keep = 1; - for(int nn = 0; nn < n_get_models ; ++nn) + if(((ii + min_inds[sorted_overlap_inds[nn]].size()) * n_dim <= 
inds_min.size())) + { + for(auto& mi : min_inds[sorted_overlap_inds[nn]]) + { + std::copy_n(mi.begin(), mi.size(), &inds_min[ii * n_dim]); + min_n_convex_overlap[ii] = min_n_convex_overlap_copy[sorted_overlap_inds[nn]]; + ++ii; + } + } + else { - if((nn != max_error_ind) && (min_n_convex_overlap[nn] == min_n_convex_overlap[max_error_ind])) + std::vector<std::vector<int>> test_inds = find_min_svm(min_inds[sorted_overlap_inds[nn]], n_get_models - ii); + int kk = 0; + while((ii < inds_min.size()) && (kk < test_inds.size())) { - test_inds.push_back(min_inds[nn][0]); - inds_change.push_back(nn); - ++n_keep; + std::copy_n(test_inds[kk].begin(), test_inds[kk].size(), &inds_min[ii * n_dim]); + min_n_convex_overlap[ii] = min_n_convex_overlap_copy[sorted_overlap_inds[nn]]; + ++ii; + ++kk; } } - test_inds = find_min_svm(test_inds, n_keep); - for(int nn = 0; nn < n_keep; ++nn) - min_inds[inds_change[nn]] = {test_inds[nn]}; + ++nn; } - // std::cout << "mpi" << std::endl; - for(int ii = 0; ii < n_get_models; ++ii) - std::copy_n(min_inds[ii][0].begin(), min_inds[ii][0].size(), &inds_min[n_dim * ii]); std::vector<int> all_min_n_convex_overlap(_mpi_comm->size() * n_get_models); std::vector<int> all_inds_min(_mpi_comm->size() * n_get_models * n_dim); @@ -387,45 +386,47 @@ void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim) mpi::all_gather(*_mpi_comm, inds_min.data(), n_get_models * n_dim, all_inds_min); std::vector<int> inds = util_funcs::argsort(all_min_n_convex_overlap); - // std::cout << all_min_n_convex_overlap.size() << '\t' << n_get_models << std::endl; - if((_mpi_comm->size() > 1) && (all_min_n_convex_overlap[inds[n_get_models - 1]] == all_min_n_convex_overlap[inds[n_get_models]])) + std::vector<int> all_inds_min_original(all_inds_min); + + ii = 1; + nn = 0; + while(nn < n_get_models) { - std::vector<std::vector<int>> test_inds; - std::vector<int> el_list; - for(int ii = 0; ii < inds.size(); ++ii) + while((ii < all_min_n_convex_overlap.size()) && 
(all_min_n_convex_overlap[inds[ii]] == all_min_n_convex_overlap[inds[nn]])) + ++ii; + if(ii - nn == 1) { - if(all_min_n_convex_overlap[inds[ii]] == all_min_n_convex_overlap[inds[n_get_models - 1]]) - { - test_inds.push_back({all_inds_min[ii * n_dim]}); - for(int dd = 1; dd < n_dim; ++dd) - test_inds.back().push_back({all_inds_min[ii * n_dim + dd]}); - el_list.push_back(ii); - } + nn = ii; + ++ii; + continue; } - // std::cout << el_list.size() << '\t' << el_list[0] << std::endl; - int n_keep = n_get_models - el_list[0]; - test_inds = find_min_svm(test_inds, n_keep); - for(int ii = 0; ii < test_inds.size(); ++ii) - std::copy_n(test_inds[ii].begin(), test_inds[ii].size(), &all_inds_min[(el_list[0] + ii) * n_dim]); + std::vector<std::vector<int>> test_inds(ii - nn, std::vector<int>(n_dim, 0)); + for(int cc = nn; cc < ii; ++cc) + std::copy_n(&all_inds_min_original[inds[cc] * n_dim], n_dim, test_inds[cc - nn].begin()); + + test_inds = find_min_svm(test_inds, std::min(ii - nn, n_get_models - nn)); + + for(int cc = 0; cc < test_inds.size(); ++cc) + std::copy_n(test_inds[cc].begin(), n_dim, &all_inds_min[(cc + nn) * n_dim]); + nn = ii; + ++ii; } - // std::cout << "create models" << std::endl; std::vector<model_node_ptr> min_nodes(n_dim); std::vector<ModelClassifier> models; - for(int rr = 0; rr < n_get_models; ++rr) { node_value_arrs::clear_temp_test_reg(); for(int ii = 0; ii < n_dim; ++ii) { - int index = all_inds_min[inds[rr] * n_dim + ii]; + int index = all_inds_min[rr * n_dim + ii]; min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit()); } models.push_back(ModelClassifier(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, 
_fix_intercept)); } + _models.push_back(models); - // std::cout << "out" << std::endl; } void SISSOClassifier::fit() diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp index fb2c461799513778f610b8dd714d224a924f6c9c..9312355fd67635cd3e71068cf77f4bb3c4eb4830 100644 --- a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp +++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp @@ -14,13 +14,15 @@ #include <shark/Data/Dataset.h> #include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h> #include <shark/Algorithms/Trainers/CSvmTrainer.h> +#include <shark/Models/Kernels/LinearKernel.h> #include <classification/prop_sorted_d_mat.hpp> #include <feature_creation/feature_space/FeatureSpace.hpp> #include <descriptor_identifier/Model/ModelClassifier.hpp> #include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp> -#include <ctime> +#include <ctime> +#include <iomanip> #ifdef PY_BINDINGS namespace np = boost::python::numpy; namespace py = boost::python; @@ -58,10 +60,12 @@ protected: std::vector<unsigned int> _int_prop; //!< Property array (training data) std::vector<unsigned int> _int_prop_test; //!< Property array (testing data) std::vector<int> _int_error; //!< Array to calculate the residuals for the models (training data) - shark::ZeroOneLoss<unsigned int> _loss; //!< The loss calculator - shark::LinearCSvmTrainer<shark::RealVector> _trainer; //!< A list of training data for each task the SVM - shark::LinearClassifier<shark::RealVector> _model; //!< Model for the SVM classification problem + std::shared_ptr<shark::AbstractKernelFunction<shark::RealVector>> _kernel_ptr; + + shark::ZeroOneLoss<unsigned int> _loss; //!< The loss calculator + shark::CSvmTrainer<shark::RealVector> _trainer; //!< A list of training data for each task the SVM + shark::KernelClassifier<shark::RealVector> _model; //!< Model for the SVM classification problem double _c; double _width; diff --git 
a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp index 426d4acc9902a305e3c4af97fdba837d8d6dd9c0..a9e57c3540303e345f374e14132db0e41ba94a6d 100644 --- a/src/python/bindings_docstring_keyed.cpp +++ b/src/python/bindings_docstring_keyed.cpp @@ -353,6 +353,9 @@ void sisso::descriptor_identifier::registerModel() .add_property("feats", &Model::feats, "@DocString_model_feats@") .add_property("coefs", &Model::coefs, "@DocString_model_coefs@") .add_property("prop_unit", &Model::prop_unit, "@DocString_model_prop_unit@") + .add_property("task_size_train", &Model::task_sizes_train, "@DocString_model_task_sizes_train@") + .add_property("task_size_test", &Model::task_sizes_test, "@DocString_model_task_sizes_test@") + .add_property("fix_intercept", &Model::fix_intercept, "@DocString_model_fix_intercept@") ; } @@ -391,8 +394,11 @@ void sisso::descriptor_identifier::registerModelClassifier() class_<ModelClassifier, bases<Model>>("ModelClassifier", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>()) .def(init<std::string>()) .def(init<std::string, std::string>()) + .def(init<ModelClassifier, py::list, np::ndarray, np::ndarray>()) + .def(init<ModelClassifier, np::ndarray, np::ndarray, np::ndarray>()) .def("__str__", &ModelClassifier::toString, "@DocString_model_class_str@") .def("__repr__", &ModelClassifier::toString, "@DocString_model_class_str@") + .def("to_file", &ModelClassifier::to_file_py, "@DocString_model_class_to_file@") .add_property("fit", &ModelClassifier::prop_train_est, "@DocString_model_class_prop_train_est@") .add_property("predict", &ModelClassifier::prop_test_est, "@DocString_model_class_prop_test_est@") .add_property("train_error", &ModelClassifier::train_error, "@DocString_model_class_train_error@") diff --git a/src/python/classification/__init__.py b/src/python/classification/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/python/classification/svm.py b/src/python/classification/svm.py new file mode 100644 index 0000000000000000000000000000000000000000..c1d54982ca521ec5cccf58dee6001820d5eeac96 --- /dev/null +++ b/src/python/classification/svm.py @@ -0,0 +1,90 @@ +from sklearn import svm +import numpy as np +from cpp_sisso import ModelClassifier + + +def update_model_svm(model, c=1.0, max_iter=100000, tol=0.01, filename=None): + """Generate a new model with an updated SVM from scikit-learn + + Args: + model (list of str, str or ModelClassifier): The model to be updated + c (float): The new c value to use + max_iter(int): The maximum number of iterations to use + tol(float): Maximum allowable error + filename (str): Filename to store the updated model + + Returns: + new_model (ModelClassifier): The updated model with better SVM parameters + """ + if isinstance(model, str): + model = ModelClassifier(model) + elif isinstance(model, list): + if ( + len(model) != 2 + or not isinstance(model[0], str) + or not isinstance(model[1], str) + ): + raise ValueError( + "If model is a list it must only contain the train/test filenames in that order."
+ ) + model = ModelClassifier(model[0], model[1]) + + start_train = 0 + start_test = 0 + updated_coefs = [] + updated_prop_train_est = [] + updated_prop_test_est = [] + + for ts_train, ts_test in zip(model.task_size_train, model.task_size_test): + X = np.column_stack( + [feat.value[start_train : start_train + ts_train] for feat in model.feats] + ) + + c0 = np.min(X, axis=0) + a = 1.0 / (np.max(X, axis=0) - c0) + + lin_clf = svm.LinearSVC(C=c, max_iter=max_iter, tol=tol) + lin_clf.fit( + a * (X - c0), model.prop_train[start_train : start_train + ts_train] + ) + + if model.fix_intercept: + updated_coefs.append( + np.column_stack((lin_clf.coef_, np.zeros(len(lin_clf.coef_)))) + ) + else: + updated_coefs.append(np.column_stack((lin_clf.coef_, lin_clf.intercept_))) + + for cc in range(len(lin_clf.coef_)): + for dd in range(model.n_dim): + updated_coefs[-1][cc][dd] = a[dd] * lin_clf.coef_[cc][dd] + updated_coefs[-1][cc][-1] -= c0[dd] * updated_coefs[-1][cc][dd] + + updated_prop_train_est.append(lin_clf.predict(a * (X - c0))) + + if ts_test > 0: + X = np.column_stack( + [ + feat.test_value[start_test : start_test + ts_test] + for feat in model.feats + ] + ) + updated_prop_test_est.append(lin_clf.predict(a * (X - c0))) + else: + updated_prop_test_est.append([]) + start_train += ts_train + start_test += ts_test + + print(updated_coefs) + + new_model = ModelClassifier( + model, + np.row_stack(updated_coefs), + np.concatenate(updated_prop_train_est), + np.concatenate(updated_prop_test_est), + ) + + if filename: + new_model.to_file(filename, True) + + return new_model diff --git a/src/python/descriptor_identifier/ModelClassifier.cpp b/src/python/descriptor_identifier/ModelClassifier.cpp new file mode 100644 index 0000000000000000000000000000000000000000..599f3a67e4e4af5a25ce89085f91d2ddf809301f --- /dev/null +++ b/src/python/descriptor_identifier/ModelClassifier.cpp @@ -0,0 +1,51 @@ +#include <descriptor_identifier/Model/ModelClassifier.hpp> + 
+ModelClassifier::ModelClassifier(const ModelClassifier& o, py::list new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est) : + Model(o), + _prop_train_est(python_conv_utils::from_ndarray<double>(prop_train_est)), + _prop_test_est(python_conv_utils::from_ndarray<double>(prop_test_est)), + _train_error(o._train_error), + _test_error(o._test_error), + _train_n_convex_overlap(o._train_n_convex_overlap), + _test_n_convex_overlap(o._test_n_convex_overlap) +{ + _coefs = {}; + for(auto& coef_list : python_conv_utils::from_list<py::list>(new_coefs)) + _coefs.push_back(python_conv_utils::from_list<double>(coef_list)); + + std::vector<int> misclassified(_n_samp_train); + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), misclassified.begin(), [](double p1, double p2){return p1 != p2;}); + _train_n_svm_misclassified = std::accumulate(misclassified.begin(), misclassified.end(), 0); + + std::vector<int> misclassified_test(_n_samp_test); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), misclassified_test.begin(), [](double p1, double p2){return p1 != p2;}); + _test_n_svm_misclassified = std::accumulate(misclassified_test.begin(), misclassified_test.end(), 0); +} + +ModelClassifier::ModelClassifier(const ModelClassifier& o, np::ndarray new_coefs, np::ndarray prop_train_est, np::ndarray prop_test_est) : + Model(o), + _prop_train_est(python_conv_utils::from_ndarray<double>(prop_train_est)), + _prop_test_est(python_conv_utils::from_ndarray<double>(prop_test_est)), + _train_error(o._train_error), + _test_error(o._test_error), + _train_n_convex_overlap(o._train_n_convex_overlap), + _test_n_convex_overlap(o._test_n_convex_overlap) +{ + _coefs = {}; + for(int ii = 0; ii < new_coefs.shape(0); ++ii) + { + // _coefs.push_back(std::vector<double>(new_coefs.shape(1))); + // for(int jj = 0; jj < new_coefs.shape(1); ++jj) + // _coefs.back()[jj] = py::extract<double>(new_coefs[ii * new_coefs.shape(0) + jj]); + + 
_coefs.push_back(python_conv_utils::from_ndarray<double>(py::extract<np::ndarray>(new_coefs[ii]))); + } + + std::vector<int> misclassified(_n_samp_train); + std::transform(_prop_train.begin(), _prop_train.end(), _prop_train_est.begin(), misclassified.begin(), [](double p1, double p2){return p1 != p2;}); + _train_n_svm_misclassified = std::accumulate(misclassified.begin(), misclassified.end(), 0); + + std::vector<int> misclassified_test(_n_samp_test); + std::transform(_prop_test.begin(), _prop_test.end(), _prop_test_est.begin(), misclassified_test.begin(), [](double p1, double p2){return p1 != p2;}); + _test_n_svm_misclassified = std::accumulate(misclassified_test.begin(), misclassified_test.end(), 0); +} diff --git a/src/python/descriptor_identifier/SISSOClassifier.cpp b/src/python/descriptor_identifier/SISSOClassifier.cpp index 3f73094b0510f050a71c6e5691c0d5b73d012e19..ac7805c9c8489fbc87f3f5f439eb7b699ecf9b66 100644 --- a/src/python/descriptor_identifier/SISSOClassifier.cpp +++ b/src/python/descriptor_identifier/SISSOClassifier.cpp @@ -26,7 +26,8 @@ SISSOClassifier::SISSOClassifier( _int_prop(_prop.size()), _int_prop_test(_prop_test.size()), _int_error(_prop.size()), - _trainer(1, !fix_intercept), + _kernel_ptr(std::make_shared<shark::LinearKernel<shark::RealVector>>()), + _trainer(_kernel_ptr.get(), 100.0, !_fix_intercept), _c(1.0), _width(1e-3), _n_class(1), @@ -158,7 +159,8 @@ SISSOClassifier::SISSOClassifier( _int_prop(_prop.size()), _int_prop_test(_prop_test.size()), _int_error(_prop.size()), - _trainer(1, !fix_intercept), + _kernel_ptr(std::make_shared<shark::LinearKernel<shark::RealVector>>()), + _trainer(_kernel_ptr.get(), 100.0, !_fix_intercept), _c(1.0) { std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop);