Commit 66507755 authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Modularize the regression

Use one fit/l0_regularization but add helper functions for the differences
parent 6b0b1741
......@@ -68,6 +68,8 @@ SVMWrapper::SVMWrapper(const SVMWrapper& o):
_model(nullptr),
_y(o._y),
_y_est(o._y_est),
_x_space(o._n_samp * (o._n_dim + 1)),
_x(o._n_samp),
_coefs(o._coefs),
_intercept(o._intercept),
_w_remap(o._w_remap),
......
......@@ -113,7 +113,7 @@ public:
*
* @param o The object to be moved
*/
SVMWrapper(SVMWrapper&& o) = delete;
SVMWrapper(SVMWrapper&& o) = default;
/**
* @brief The destructor for the SVMWrapper
......
......@@ -34,6 +34,13 @@ SISSOClassifier::SISSOClassifier(
_fix_intercept = false;
}
setup_d_mat_transfer();
int start = 0;
for(int tt = 0; tt < _n_task; ++tt)
{
_svm_vec.push_back(SVMWrapper(_c, _loss->n_class(tt), _n_dim, _task_sizes_train[tt], _loss->prop_pointer() + start));
start += _task_sizes_train[tt];
}
}
void SISSOClassifier::setup_d_mat_transfer()
......@@ -57,6 +64,21 @@ void SISSOClassifier::setup_d_mat_transfer()
}
}
// Copy every selected descriptor-matrix column into the sorted descriptor matrix,
// reordering rows from input-sample order to the (presumably class-sorted) order
// recorded in _sample_inds_to_sorted_dmat_inds.
void SISSOClassifier::transfer_d_mat_to_sorted() const
{
    // Make room for all features selected so far
    prop_sorted_d_mat::resize_sorted_d_matrix_arr(node_value_arrs::N_SELECTED);

    // Each map entry is (input-sample row -> sorted-matrix row)
    for(const auto& ind_pair : _sample_inds_to_sorted_dmat_inds)
    {
        const int input_row = ind_pair.first;
        const int sorted_row = ind_pair.second;
        dcopy_(
            node_value_arrs::N_SELECTED,
            &node_value_arrs::D_MATRIX[input_row],
            node_value_arrs::N_SAMPLES,
            &prop_sorted_d_mat::SORTED_D_MATRIX[sorted_row],
            prop_sorted_d_mat::N_SAMPLES
        );
    }
}
std::array<double, 2> SISSOClassifier::svm_error(std::vector<SVMWrapper>& svm, const std::vector<int>& feat_inds) const
{
double error = 0.0;
......@@ -74,271 +96,83 @@ std::array<double, 2> SISSOClassifier::svm_error(std::vector<SVMWrapper>& svm, c
return {error, dist_error};
}
int SISSOClassifier::get_max_error_ind(
const int n_models, const int* n_convex_overlap, const double* svm_score, const double* svm_margin, double* scores
) const
void SISSOClassifier::update_min_inds_scores(
const std::vector<int>& inds,
double score,
int max_error_ind,
std::vector<int>& min_inds_private,
std::vector<double>& min_scores_private
)
{
std::transform(
n_convex_overlap,
n_convex_overlap + n_models,
svm_score,
scores,
[this](int n_overlap, double score){return static_cast<double>(n_overlap * _n_samp * _n_class) + score;}
);
double max_dist = std::abs(*std::max_element(svm_margin, svm_margin + n_models, [](double v1, double v2){return std::abs(v1) < std::abs(v2);})) + 0.01;
std::transform(
svm_margin,
svm_margin + n_models,
scores,
scores,
[&max_dist](double margin, double score){return score + (1.0 - margin / max_dist);}
);
return std::max_element(scores, scores + n_models) - scores;
}
// Make a copy of the SVM
std::vector<SVMWrapper> svm_vec(_svm_vec);
std::array<double, 2> svm_err = svm_error(svm_vec, inds);
score += svm_err[0] + svm_err[1] / static_cast<double>(_n_samp);
void SISSOClassifier::transfer_d_mat_to_sorted() const
{
prop_sorted_d_mat::resize_sorted_d_matrix_arr(node_value_arrs::N_SELECTED);
for(auto& el : _sample_inds_to_sorted_dmat_inds)
if(score < min_scores_private[max_error_ind])
{
dcopy_(
node_value_arrs::N_SELECTED,
&node_value_arrs::D_MATRIX[el.first],
node_value_arrs::N_SAMPLES,
&prop_sorted_d_mat::SORTED_D_MATRIX[el.second],
prop_sorted_d_mat::N_SAMPLES
);
min_scores_private[max_error_ind] = score;
std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * inds.size());
}
}
void SISSOClassifier::l0_regularization(const int n_dim)
void SISSOClassifier::add_models(const std::vector<std::vector<int>> indexes)
{
const int n_get_models = std::max(_n_residual, _n_models_store);
std::vector<double> coefs(n_dim + 1, 0.0);
std::vector<int> inds(n_dim, 0);
std::vector<int> min_inds(n_get_models * n_dim, -1);
std::vector<int> min_n_convex_overlap(n_get_models, _n_samp * _n_class * _n_class);
std::vector<double> min_svm_score(n_get_models, _n_samp * _n_class * _n_class);
std::vector<double> min_svm_margin(n_get_models, -1.0);
unsigned long long int n_interactions = 1;
int n_dim_fact = 1;
for(int rr = 0; rr < n_dim; ++rr)
{
inds[rr] = _feat_space->phi_selected().size() - 1 - rr;
n_interactions *= inds[rr] + 1;
n_dim_fact *= (rr + 1);
}
n_interactions /= n_dim_fact;
int max_error_ind = 0;
transfer_d_mat_to_sorted();
if(inds.back() >= 0)
{
#pragma omp parallel firstprivate(max_error_ind, inds) shared(_loss)
{
std::shared_ptr<LossFunction> loss_copy;
#pragma omp critical
{
loss_copy = std::make_shared<LossFunctionConvexHull>(_loss);
}
_models.push_back({});
std::vector<std::vector<model_node_ptr>> min_nodes;
std::vector<int> min_inds_private(min_inds);
std::vector<int> min_n_convex_overlap_private(min_n_convex_overlap);
std::vector<double> min_svm_score_private(min_svm_score);
std::vector<double> min_svm_margin_private(min_svm_margin);
std::array<double, 2> temp_svm_error;
std::vector<double> scores(n_get_models);
std::vector<SVMWrapper> svm_vec;
int start = 0;
for(int tt = 0; tt < _n_task; ++tt)
{
svm_vec.push_back(SVMWrapper(_c, _loss->n_class(tt), _n_dim, _task_sizes_train[tt], loss_copy->prop_pointer() + start));
start += _task_sizes_train[tt];
}
unsigned long long int ii_prev = 0;
#ifdef OMP45
#pragma omp for schedule(monotonic: dynamic)
#else
#pragma omp for schedule(dynamic)
#endif
for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size()))
{
util_funcs::iterate(inds, inds.size(), ii - ii_prev);
ii_prev = ii;
int n_convex_overlap = (*loss_copy)(inds);
if(n_convex_overlap <= min_n_convex_overlap_private[max_error_ind])
{
temp_svm_error = svm_error(svm_vec, inds);
if(
(n_convex_overlap < min_n_convex_overlap_private[max_error_ind]) ||
(temp_svm_error[0] < min_svm_score_private[max_error_ind]) ||
((temp_svm_error[0] == min_svm_score_private[max_error_ind]) && (temp_svm_error[1] > min_svm_margin_private[max_error_ind]))
)
{
min_n_convex_overlap_private[max_error_ind] = n_convex_overlap;
min_svm_score_private[max_error_ind] = temp_svm_error[0];
min_svm_margin_private[max_error_ind] = temp_svm_error[1];
std::copy_n(inds.begin(), n_dim, min_inds_private.begin() + max_error_ind * n_dim);
max_error_ind = get_max_error_ind(
n_get_models,
min_n_convex_overlap_private.data(),
min_svm_score_private.data(),
min_svm_margin_private.data(),
scores.data()
);
}
}
}
#pragma omp critical
{
max_error_ind = get_max_error_ind(
n_get_models,
min_n_convex_overlap.data(),
min_svm_score.data(),
min_svm_margin.data(),
scores.data()
);
for(int ee = 0; ee < min_n_convex_overlap.size(); ++ee)
{
if(
(min_n_convex_overlap_private[ee] < min_n_convex_overlap[max_error_ind]) ||
((min_n_convex_overlap_private[ee] == min_n_convex_overlap[max_error_ind]) && (min_svm_score_private[ee] < min_svm_score[max_error_ind])) ||
((min_n_convex_overlap_private[ee] == min_n_convex_overlap[max_error_ind]) && (min_svm_score_private[ee] == min_svm_score[max_error_ind]) && (min_svm_margin_private[ee] > min_svm_margin[max_error_ind]))
)
{
min_n_convex_overlap[max_error_ind] = min_n_convex_overlap_private[ee];
min_svm_score[max_error_ind] = min_svm_score_private[ee];
min_svm_margin[max_error_ind] = min_svm_margin_private[ee];
std::copy_n(min_inds_private.begin() + ee * n_dim, n_dim, min_inds.begin() + max_error_ind * n_dim);
max_error_ind = get_max_error_ind(
n_get_models,
min_n_convex_overlap.data(),
min_svm_score.data(),
min_svm_margin.data(),
scores.data()
);
}
}
}
}
}
std::vector<int> all_min_n_convex_overlap(_mpi_comm->size() * n_get_models);
std::vector<double> all_min_svm_score(_mpi_comm->size() * n_get_models);
std::vector<double> all_min_svm_margin(_mpi_comm->size() * n_get_models);
std::vector<int> all_min_inds(_mpi_comm->size() * n_get_models * n_dim);
mpi::all_gather(*_mpi_comm, min_n_convex_overlap.data(), n_get_models, all_min_n_convex_overlap);
mpi::all_gather(*_mpi_comm, min_svm_score.data(), n_get_models, all_min_svm_score);
mpi::all_gather(*_mpi_comm, min_svm_margin.data(), n_get_models, all_min_svm_margin);
mpi::all_gather(*_mpi_comm, min_inds.data(), n_get_models * n_dim, all_min_inds);
std::vector<double> scores(all_min_svm_score);
std::transform(
scores.begin(),
scores.end(),
all_min_n_convex_overlap.begin(),
scores.begin(),
[this](double score, int n_overlap){return score + n_overlap * _n_samp;}
);
double max_dist = *std::max_element(all_min_svm_margin.begin(), all_min_svm_margin.end()) + 0.01;
std::transform(
all_min_svm_margin.begin(),
all_min_svm_margin.end(),
scores.begin(),
scores.begin(),
[&max_dist](double margin, double score){return score + (1.0 - margin / max_dist);}
);
inds = util_funcs::argsort<double>(scores);
std::vector<std::vector<model_node_ptr>> min_nodes(n_get_models, std::vector<model_node_ptr>(n_dim));
std::vector<ModelClassifier> models;
for(int rr = 0; rr < n_get_models; ++rr)
for(auto& inds: indexes)
{
node_value_arrs::clear_temp_test_reg();
for(int ii = 0; ii < n_dim; ++ii)
min_nodes.push_back(std::vector<model_node_ptr>(inds.size()));
for(int ii = 0; ii < inds.size(); ++ii)
{
int index = all_min_inds[inds[rr] * n_dim + ii];
min_nodes[rr][ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]);
int index = inds[ii];
min_nodes.back()[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]);
}
models.push_back(
ModelClassifier(
_prop_label,
_prop_unit,
loss_function_util::copy(_loss),
min_nodes[rr],
_leave_out_inds,
_sample_ids_train,
_sample_ids_test,
_task_names
)
ModelClassifier model(
_prop_label,
_prop_unit,
loss_function_util::copy(_loss),
min_nodes.back(),
_leave_out_inds,
_sample_ids_train,
_sample_ids_test,
_task_names
);
_models.back().push_back(model);
}
_models.push_back(models);
min_nodes.resize(_n_residual);
_loss->reset_projection_prop(min_nodes);
}
void SISSOClassifier::fit()
void SISSOClassifier::output_models()
{
int dd = 1;
while(
(dd <= _n_dim) &&
(*std::max_element(_loss->prop_project_pointer(), _loss->prop_project_pointer() + _loss->n_prop_project() * _n_samp) > 0.0)
)
if(_mpi_comm->rank() == 0)
{
double start = omp_get_wtime();
_feat_space->sis(_loss);
_mpi_comm->barrier();
double duration = omp_get_wtime() - start;
if(_mpi_comm->rank() == 0)
for(int rr = 0; rr < _n_models_store; ++rr)
{
std::cout << "Time for SIS: " << duration << " s" << std::endl;
}
start = omp_get_wtime();
l0_regularization(dd);
_mpi_comm->barrier();
duration = omp_get_wtime() - start;
if(_mpi_comm->rank() == 0)
{
std::cout << "Time for l0-norm: " << duration << " s" << std::endl;
for(int rr = 0; rr < _n_models_store; ++rr)
_models.back()[rr].to_file("models/train_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat");
if(_leave_out_inds.size() > 0)
{
_models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat");
if(_leave_out_inds.size() > 0)
{
_models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false);
}
_models.back()[rr].to_file(
"models/test_dim_" + std::to_string(_models.size()) + "_model_" + std::to_string(rr) + ".dat", false
);
}
}
++dd;
}
if(dd <= _n_dim)
}
/**
 * @brief If true calculate the model for the dimension dd
 *
 * Stops early when every sample is already correctly classified (the projection
 * property has no positive entries left), warning if the requested dimension
 * was not reached.
 *
 * @param dd Dimension of the model to train
 * @return true if the requested dimension should be calculated
 */
bool SISSOClassifier::continue_calc(int dd)
{
    // Continue while below the requested dimension and some samples remain misclassified
    bool cont = (
        (dd <= _n_dim) &&
        (*std::max_element(_loss->prop_project_pointer(), _loss->prop_project_pointer() + _loss->n_prop_project() * _n_samp) > 0.0)
    );
    if(!cont && (dd <= _n_dim))
    {
        // All points separated before reaching _n_dim: warn instead of continuing
        std::cerr << "WARNING: All points separated before reaching the requested dimension." << std::endl;
    }
    return cont;
}
......@@ -42,6 +42,7 @@ class SISSOClassifier: public SISSOSolver
{
protected:
std::vector<std::vector<ModelClassifier>> _models; //!< List of models
std::vector<SVMWrapper> _svm_vec; //!< Vector storing the SVMWrappers for the problem
std::map<int, int> _sample_inds_to_sorted_dmat_inds; //!< map from input sample inds to the SORTED_D_MATRIX_INDS
......@@ -79,34 +80,54 @@ public:
std::array<double, 2> svm_error(std::vector<SVMWrapper>& svm, const std::vector<int>& feat_inds) const;
/**
* @brief Gets the max error index for the classification problem
* @brief Sort the property vector by class and store the mapped indexes to _sample_inds_to_sorted_dmat_inds
*/
void setup_d_mat_transfer();
/**
* @brief If true calculate the model for the dimension dd
*
* @param n_models number of models to be stored
* @param n_convex_overlap number of points in the overlapping convex hull regions (in all models)
* @param svm_score the number of points misclassified by SVM (in all models)
* @param svm_margin The margin of the SVM model (in all models)
* @param scores the pointer to the scores array
* @return The index with the maximum error
* @param dd Dimension of the model to train
* @return true if the requested dimension should be calculated.
*/
int get_max_error_ind(const int n_models, const int* n_convex_overlap, const double* svm_score, const double* svm_margin, double* scores) const;
inline bool continue_calc(int dd);
/**
* @brief Sort the property vector by class and store the mapped indexes to _sample_inds_to_sorted_dmat_inds
* @brief Output the models to files and copy the residuals
*/
void setup_d_mat_transfer();
void output_models();
/**
* @brief Perform any steps that need to be done to initialize the regularization
*/
inline void setup_regulairzation()
{
transfer_d_mat_to_sorted();
}
/**
* @brief Perform an l0-Regularization to find the best n_dim dimensional model
* @brief Set the min_scores and min_inds vectors given a score and max_error_ind
*
* @param n_dim The number of features to use in the linear model
* @param inds The current set of indexes
* @param score The score for the current set of indexes
* @param max_error_ind The current index of the maximum score among the best models
* @param min_inds_private Current list of feature indexes for the best models
* @param min_scores_private Current list of the scores of the best models
*/
void l0_regularization(const int n_dim);
void update_min_inds_scores(
const std::vector<int>& inds,
double score,
int max_error_ind,
std::vector<int>& min_inds_private,
std::vector<double>& min_scores_private
);
// DocString: sisso_class_fit
/**
* @brief Iteratively run SISSO on the FeatureSpace and Property vector until the maximum dimensionality is reached
* @brief Create a Model for a given set of features and store them in _models
*
* @param indexes Vector storing all of the indexes of features in _feat_space->phi_selected() to use for the model
*/
void fit();
virtual void add_models(const std::vector<std::vector<int>> indexes);
/**
* @brief The selected models (n_dim, n_models_store)
......
......@@ -87,106 +87,3 @@ void SISSOLogRegressor::add_models(const std::vector<std::vector<int>> indexes)
min_nodes.resize(_n_residual);
_loss->reset_projection_prop(min_nodes);
}
// Perform an exhaustive l0-regularized search over all n_dim-sized combinations of the
// selected features, keeping the best (lowest-loss) models, and store them via add_models.
// Work is distributed across MPI ranks (strided by rank) and OpenMP threads.
void SISSOLogRegressor::l0_regularization(const int n_dim)
{
    // Track enough best models for both residual updates and model storage
    int n_get_models = std::max(_n_residual, _n_models_store);
    std::vector<int> inds(n_dim, 0);                     // current combination of feature indexes
    std::vector<int> min_inds(n_get_models * n_dim, -1); // flattened index sets of the best models
    // Initialize the best errors with the norm of the property (the no-model baseline)
    std::vector<double> min_errors(n_get_models, util_funcs::norm(_loss->prop_pointer(), _n_samp));

    // n_interactions = C(n_selected, n_dim): the number of combinations to evaluate
    unsigned long long int n_interactions = 1;
    int n_dim_fact = 1;
    for(int rr = 0; rr < n_dim; ++rr)
    {
        // Iteration starts from the highest selected-feature indexes and counts down
        inds[rr] = _feat_space->phi_selected().size() - 1 - rr;
        n_interactions *= inds[rr] + 1;
        n_dim_fact *= (rr + 1);
    }
    n_interactions /= n_dim_fact;

    // Only search if there are at least n_dim selected features
    if(inds.back() >= 0)
    {
        #pragma omp parallel shared(min_inds, min_errors, n_interactions, n_get_models) firstprivate(inds)
        {
            // Each thread evaluates with its own copy of the loss function
            std::shared_ptr<LossFunction> loss_copy;
            #pragma omp critical
            {
                loss_copy = std::make_shared<LossFunctionLogPearsonRMSE>(_loss);
            }
            // Index of the worst model among the thread-local best models
            int max_error_ind = 0;
            std::vector<int> min_inds_private(min_inds);
            std::vector<double> min_errors_private(min_errors);
            unsigned long long int ii_prev = 0;
#ifdef OMP45
            #pragma omp for schedule(monotonic: dynamic)
#else
            #pragma omp for schedule(dynamic)
#endif
            for(unsigned long long int ii = _mpi_comm->rank(); ii < n_interactions; ii += static_cast<unsigned long long int>(_mpi_comm->size()))
            {
                // Advance the combination iterator by the gap since the last evaluated index
                util_funcs::iterate(inds, inds.size(), ii - ii_prev);
                double loss = (*loss_copy)(inds);
                if(loss < -1.0)
                {
                    // Sentinel from the loss: dgels received an invalid parameter (likely a NaN)
                    std::string err_msg = "A parameter passed to dgels was invalid. This is likely from a NaN in the descriptor matrix. The features that caused this are: ";
                    for(auto& ind : inds)
                    {
                        err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", ";
                    }
                    throw std::logic_error(err_msg.substr(0, err_msg.size() - 2) + ".");
                }
                else if(loss < 0.0)
                {
                    // Sentinel for a rank-deficient descriptor matrix: warn but continue searching
                    std::string err_msg = "Descriptor matrix is not full-rank. The features that caused this are: ";
                    for(auto& ind : inds)
                    {
                        err_msg += std::to_string(ind) + ": " + _feat_space->phi_selected()[ind]->expr() + ", ";
                    }
                    std::cerr << err_msg.substr(0, err_msg.size() - 2) << "." << std::endl;
                }
                else if(loss < min_errors_private[max_error_ind])
                {
                    // New model beats the worst of the thread-local best: replace that slot
                    min_errors_private[max_error_ind] = loss;
                    std::copy_n(inds.begin(), inds.size(), min_inds_private.begin() + max_error_ind * n_dim);
                    max_error_ind = std::max_element(min_errors_private.begin(), min_errors_private.end()) - min_errors_private.begin();
                }
                ii_prev = ii;
            }
            // Merge the thread-local best models into the shared lists
            #pragma omp critical
            {
                max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin();
                for(int ee = 0; ee < n_get_models; ++ee)
                {
                    if(min_errors_private[ee] < min_errors[max_error_ind])
                    {
                        min_errors[max_error_ind] = min_errors_private[ee];
                        std::copy_n(min_inds_private.begin() + ee * n_dim, n_dim, min_inds.begin() + max_error_ind * n_dim);
                        max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin();
                    }
                }
            }
        }
    }

    // Gather each rank's best models and pick the overall best across all ranks
    std::vector<double> all_min_error(_mpi_comm->size() * n_get_models);
    std::vector<int> all_min_inds(_mpi_comm->size() * n_get_models * n_dim);
    mpi::all_gather(*_mpi_comm, min_errors.data(), n_get_models, all_min_error);
    mpi::all_gather(*_mpi_comm, min_inds.data(), n_get_models * n_dim, all_min_inds);

    // Sort all gathered models by error and extract the n_get_models best index sets
    inds = util_funcs::argsort<double>(all_min_error);
    std::vector<std::vector<int>> indexes(n_get_models, std::vector<int>(n_dim));
    for(int rr = 0; rr < n_get_models; ++rr)
    {
        node_value_arrs::clear_temp_test_reg();
        for(int ii = 0; ii < n_dim; ++ii)
        {
            indexes[rr][ii] = all_min_inds[inds[rr] * n_dim + ii];
        }
    }
    // Build and store the models for the selected index sets
    add_models(indexes);
}
......@@ -66,13 +66,6 @@ public:
*/
void output_models();
/**
* @brief Perform an l0-Regularization to find the best n_dim dimensional model
*
* @param n_dim The number of features to use in the linear model
*/
void l0_regularization(const int n_dim);