Commit 7786a5a3 authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Merge branch 'data_overwrite_error' into 'joss'

Use inner_product of Standardized Values instead of r for calculating overlap

See merge request tpurcell/cpp_sisso!37
parents 34492b28 6f8b8658
...@@ -63,14 +63,15 @@ if(EXTERNAL_BOOST) ...@@ -63,14 +63,15 @@ if(EXTERNAL_BOOST)
message(STATUS "Using external boost") message(STATUS "Using external boost")
set(EXTERNAL_BOOST TRUE) set(EXTERNAL_BOOST TRUE)
else(EXTERNAL_BOOST) else(EXTERNAL_BOOST)
if(NOT DEFINED EXTERNAL_BUILD_N_PROCS)
set(EXTERNAL_BUILD_N_PROCS 1 CACHE STRING "Number of processes to use when building Boost")
endif()
message(STATUS "Building boost wth ${EXTERNAL_BUILD_N_PROCS} process(es)") message(STATUS "Building boost wth ${EXTERNAL_BUILD_N_PROCS} process(es)")
include( ExternalProject ) include( ExternalProject )
set(EXTERNAL_BOOST FALSE) set(EXTERNAL_BOOST FALSE)
endif() endif()
if(NOT DEFINED EXTERNAL_BUILD_N_PROCS)
set(EXTERNAL_BUILD_N_PROCS 1 CACHE STRING "Number of processes to use when building Boost")
endif()
# Check for FindOpenMP # Check for FindOpenMP
find_package(OpenMP REQUIRED) find_package(OpenMP REQUIRED)
if (OPENMP_FOUND) if (OPENMP_FOUND)
......
...@@ -573,6 +573,110 @@ void FeatureSpace::generate_non_param_feats( ...@@ -573,6 +573,110 @@ void FeatureSpace::generate_non_param_feats(
} }
} }
#ifdef PARAMETERIZE
int FeatureSpace::reorder_by_n_params(std::vector<node_ptr>& feat_set, int start)
{
    // Reorder feat_set[start:] so that parameter-free features come first,
    // using the (stable) argsort of each feature's parameter count.
    const int n_new_feats = feat_set.size() - start;

    // Parameter count for every feature generated this rung
    std::vector<int> n_params_per_feat(n_new_feats);
    for(int ff = 0; ff < n_new_feats; ++ff)
    {
        n_params_per_feat[ff] = feat_set[start + ff]->n_params();
    }

    // Permutation that orders the new features by ascending parameter count
    std::vector<int> order = util_funcs::argsort<int>(n_params_per_feat);

    // Apply the permutation through a temporary copy of the tail of feat_set
    std::vector<node_ptr> original_tail(feat_set.begin() + start, feat_set.end());
    for(int ff = 0; ff < n_new_feats; ++ff)
    {
        feat_set[start + ff] = original_tail[order[ff]];
    }

    // Restore consecutive feature indices after the shuffle
    for(int ff = start; ff < static_cast<int>(feat_set.size()); ++ff)
    {
        feat_set[ff]->reindex(ff);
    }

    // Return the index one past the last feature with zero parameters
    int n_no_params = 0;
    for(int n_param : n_params_per_feat)
    {
        n_no_params += (n_param == 0);
    }
    return start + n_no_params;
}
#endif
void FeatureSpace::remove_duplicate_features(std::vector<node_ptr>& feat_set, int start)
{
    // Remove features in feat_set[start:] whose standardized values duplicate
    // those of another feature with a near-identical projection score.
    // Duplicates are detected via the inner product of standardized values
    // (not the Pearson r), per the merge this change belongs to.

    // Score every feature by projecting the training property onto it
    std::vector<double> scores(feat_set.size(), 0.0);
    project_funcs::project_r(_prop_train.data(), scores.data(), feat_set, _task_sizes_train, 1);

    // Only the features generated during this rung (index >= start) are screened
    scores.erase(scores.begin(), scores.begin() + start);
    if(scores.size() == 0)
    {
        throw std::logic_error("No features created during this rung (" + std::to_string(feat_set.back()->rung() + 1) + ")");
    }

    // Sort candidates by score so potential duplicates are adjacent
    std::vector<int> inds = util_funcs::argsort<double>(scores);
    std::vector<int> del_inds;

    node_value_arrs::clear_temp_reg();
    for(int sc = 0; sc < scores.size() - 1; ++sc)
    {
#ifdef PARAMETERIZE
        // Features with fitted parameters are never flagged as duplicates
        if(feat_set[inds[sc] + start]->n_params() > 0)
        {
            continue;
        }
#endif
        // Squared L2 norm of this feature's standardized values
        // (inner product of the vector with itself)
        double* val_ptr = feat_set[start + inds[sc]]->stand_value_ptr();
        double base_val = std::abs(
            std::inner_product(
                val_ptr,
                val_ptr + _n_samp_train,
                val_ptr,
                0.0
            )
        );

        // Compare only against features whose score is within 1e-7 of this one;
        // scores are sorted ascending, so scanning forward until the gap exceeds
        // the tolerance covers all potential duplicates.
        int sc2 = sc + 1;
        while((sc2 < scores.size()) && (scores[inds[sc2]] - scores[inds[sc]] < 1e-7))
        {
            // |<a,a>| - |<a,b>| vanishes when the standardized vectors match
            // (up to sign); stand_value_ptr(true) uses a separate temp register
            // so val_ptr stays valid.
            double comp = std::abs(
                base_val -
                std::abs(
                    std::inner_product(
                        val_ptr,
                        val_ptr + _n_samp_train,
                        feat_set[start + inds[sc2]]->stand_value_ptr(true),
                        0.0
                    )
                )
            );
            // Per-sample tolerance: normalize the difference by the sample count
            if(comp / static_cast<double>(_n_samp_train) < 1e-10)
            {
                // Store the NEGATED index so the ascending argsort below yields
                // largest-index-first deletion order
                del_inds.push_back(-1 * (inds[sc] + start));
                break;
            }
            ++sc2;
        }
    }

    // Erase from the highest index down so earlier erasures do not shift
    // the positions of features still to be removed
    inds = util_funcs::argsort<int>(del_inds);
    for(int ii = 0; ii < inds.size(); ++ii)
    {
        // del_inds entries are negative, so begin() - del_inds[...] == begin() + index
        feat_set.erase(feat_set.begin() - del_inds[inds[ii]]);
    }

    // Reindex the surviving features so indices are consecutive again
    for(int ff = start; ff < feat_set.size(); ++ff)
    {
        feat_set[ff]->reindex(ff);
    }
}
void FeatureSpace::generate_feature_space( void FeatureSpace::generate_feature_space(
std::vector<node_ptr>& feat_set, std::vector<node_ptr>& feat_set,
std::vector<int>& start_rung, std::vector<int>& start_rung,
...@@ -669,7 +773,6 @@ void FeatureSpace::generate_feature_space( ...@@ -669,7 +773,6 @@ void FeatureSpace::generate_feature_space(
if((nn < _max_rung) || (nn <= _n_rung_store) || (_mpi_comm->size() == 1)) if((nn < _max_rung) || (nn <= _n_rung_store) || (_mpi_comm->size() == 1))
{ {
int new_phi_size; int new_phi_size;
int phi_size_start = feat_set.size();
if(_mpi_comm->rank() == 0) if(_mpi_comm->rank() == 0)
{ {
std::vector<std::vector<node_ptr>> next_phi_gathered; std::vector<std::vector<node_ptr>> next_phi_gathered;
...@@ -679,7 +782,6 @@ void FeatureSpace::generate_feature_space( ...@@ -679,7 +782,6 @@ void FeatureSpace::generate_feature_space(
{ {
feat_set.insert(feat_set.end(), next_phi_vec.begin(), next_phi_vec.end()); feat_set.insert(feat_set.end(), next_phi_vec.begin(), next_phi_vec.end());
} }
new_phi_size = feat_set.size();
// Sort the features to ensure consistent feature spaces for all MPI/OpenMP configurations // Sort the features to ensure consistent feature spaces for all MPI/OpenMP configurations
std::sort( std::sort(
...@@ -694,118 +796,44 @@ void FeatureSpace::generate_feature_space( ...@@ -694,118 +796,44 @@ void FeatureSpace::generate_feature_space(
feat_set.end(), feat_set.end(),
[&feat_ind](node_ptr n){n->reindex(feat_ind); ++feat_ind;} [&feat_ind](node_ptr n){n->reindex(feat_ind); ++feat_ind;}
); );
if(nn < _max_rung)
mpi::broadcast(*_mpi_comm, new_phi_size, 0); {
remove_duplicate_features(feat_set, start_rung.back());
for(int bb = 0; bb <= (new_phi_size - phi_size_start) / 10000; ++bb) }
#ifdef PARAMETERIZE
if(!reparam)
{ {
mpi::broadcast(*_mpi_comm, &feat_set[phi_size_start + bb * 10000], std::min(10000, new_phi_size - phi_size_start - bb * 10000), 0); int no_param_sz = reorder_by_n_params(feat_set, start_rung.back());
mpi::broadcast(*_mpi_comm, no_param_sz, 0);
_end_no_params.push_back(no_param_sz);
} }
#endif
new_phi_size = feat_set.size();
mpi::broadcast(*_mpi_comm, new_phi_size, 0);
mpi::broadcast(*_mpi_comm, &feat_set[start_rung.back()], new_phi_size - start_rung.back(), 0);
} }
else else
{ {
mpi::gather(*_mpi_comm, next_phi, 0); mpi::gather(*_mpi_comm, next_phi, 0);
mpi::broadcast(*_mpi_comm, new_phi_size, 0);
feat_set.resize(new_phi_size);
for(int bb = 0; bb <= (new_phi_size - phi_size_start) / 10000; ++bb) #ifdef PARAMETERIZE
if(!reparam)
{ {
mpi::broadcast(*_mpi_comm, &feat_set[phi_size_start + bb * 10000], std::min(10000, new_phi_size - phi_size_start - bb * 10000), 0); int no_param_sz;
mpi::broadcast(*_mpi_comm, no_param_sz, 0);
_end_no_params.push_back(no_param_sz);
} }
} #endif
if(phi_size_start == new_phi_size) mpi::broadcast(*_mpi_comm, new_phi_size, 0);
feat_set.resize(new_phi_size);
mpi::broadcast(*_mpi_comm, &feat_set[start_rung.back()], new_phi_size - start_rung.back(), 0);
}
if(start_rung.back() == feat_set.size())
{ {
throw std::logic_error("No features created during this rung (" + std::to_string(nn) + ")"); throw std::logic_error("No features created during this rung (" + std::to_string(nn) + ")");
} }
node_value_arrs::clear_temp_reg();
if(nn < _max_rung)
{
// Remove identical features
_scores.resize(feat_set.size());
_mpi_comm->barrier();
project_funcs::project_r(_prop_train.data(), _scores.data(), feat_set, _task_sizes_train, 1);
_scores.erase(_scores.begin(), _scores.begin() + start_rung[start_rung.size() - 1]);
inds = util_funcs::argsort<double>(_scores);
std::vector<int> del_inds;
_mpi_comm->barrier();
node_value_arrs::clear_temp_reg();
for(int sc = 0; sc < _scores.size() - 1; ++sc)
{
#ifdef PARAMETERIZE
if(feat_set[inds[sc] + start_rung.back()]->n_params() > 0)
{
continue;
}
#endif
if(_scores[inds[sc]] > -1e-10)
{
double base_val = std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
_n_samp_train
)
);
for(int sc2 = sc + 1; sc2 < _scores.size(); ++sc2)
{
double comp = std::abs(
base_val - std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc2]]->value_ptr(0, true),
_n_samp_train
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start_rung.back()));
break;
}
}
}
else if(_scores[inds[sc + 1]] - _scores[inds[sc]] < 1e-10)
{
double base_val = std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
_n_samp_train
)
);
double comp = std::abs(
base_val - std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc + 1]]->value_ptr(0, true),
_n_samp_train
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start_rung.back()));
}
}
}
inds = util_funcs::argsort<int>(del_inds);
for(int ii = 0; ii < inds.size(); ++ii)
{
feat_set.erase(feat_set.begin() - del_inds[inds[ii]]);
}
// Reindex
for(int ff = start_rung.back(); ff < feat_set.size(); ++ff)
{
feat_set[ff]->reindex(ff);
}
}
node_value_arrs::clear_temp_reg(); node_value_arrs::clear_temp_reg();
if(!reparam) if(!reparam)
{ {
...@@ -960,49 +988,19 @@ void FeatureSpace::generate_feature_space( ...@@ -960,49 +988,19 @@ void FeatureSpace::generate_feature_space(
} }
} }
} }
#pragma omp parallel for
for(int ff = start_rung.back(); ff < feat_set.size(); ++ff) for(int ff = start_rung.back(); ff < feat_set.size(); ++ff)
{ {
feat_set[ff]->reindex(ff + n_feat_below_rank, ff); feat_set[ff]->reindex(ff + n_feat_below_rank, ff);
feat_set[ff]->set_value();
feat_set[ff]->set_test_value();
} }
}
#ifdef PARAMETERIZE #ifdef PARAMETERIZE
if(!reparam) if(!reparam)
{
// Reorder features based on the number of parameters they have (none goes first)
std::vector<int> feat_n_params(feat_set.size() - start_rung.back());
std::transform(
feat_set.begin() + start_rung.back(),
feat_set.end(),
feat_n_params.begin(),
[](node_ptr feat){return feat->n_params();}
);
inds = util_funcs::argsort<int>(feat_n_params);
next_phi.resize(feat_n_params.size());
std::copy_n(feat_set.begin() + start_rung.back(), feat_n_params.size(), next_phi.begin());
std::transform(
inds.begin(),
inds.end(),
feat_set.begin() + start_rung.back(),
[&next_phi](int ind){return next_phi[ind];}
);
for(int ff = start_rung.back(); ff < feat_set.size(); ++ff)
{ {
feat_set[ff]->reindex(ff); int no_param_sz = reorder_by_n_params(feat_set, start_rung.back());
feat_set[ff]->set_value(); _end_no_params.push_back(no_param_sz);
} }
#endif
// Set how many features have no parameters
_end_no_params.push_back(
start_rung.back() +
std::count_if(feat_n_params.begin(), feat_n_params.end(), [](int n_param){return n_param == 0;})
);
} }
#endif
} }
} }
...@@ -1019,12 +1017,14 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std: ...@@ -1019,12 +1017,14 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
int worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin(); int worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
double worst_score = scores_sel[worst_score_ind]; double worst_score = scores_sel[worst_score_ind];
int num_valid_feats = phi_sel.size();
#pragma omp parallel firstprivate(worst_score, worst_score_ind, scores_sel_all) #pragma omp parallel firstprivate(worst_score, worst_score_ind, scores_sel_all)
{ {
std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(loss); std::shared_ptr<LossFunction> loss_copy = loss_function_util::copy(loss);
std::vector<node_ptr> phi_sel_private(phi_sel); std::vector<node_ptr> phi_sel_private(phi_sel);
std::vector<double> scores_sel_private(scores_sel); std::vector<double> scores_sel_private(scores_sel);
int index_base = _phi.size() + _n_sis_select * (omp_get_thread_num() + _mpi_comm->size()); int index_base = _phi.size() + 2 * _n_sis_select * (omp_get_thread_num() + _mpi_comm->size());
int num_valid_feats_private = num_valid_feats;
#ifdef PARAMETERIZE #ifdef PARAMETERIZE
std::shared_ptr<NLOptimizer> optimizer; std::shared_ptr<NLOptimizer> optimizer;
...@@ -1044,14 +1044,15 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std: ...@@ -1044,14 +1044,15 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
} }
#endif #endif
auto start = _phi.begin() + _start_rung.back() + _mpi_comm->rank();
#ifdef OMP45 #ifdef OMP45
#pragma omp for schedule(monotonic: dynamic) #pragma omp for schedule(monotonic: dynamic)
#else #else
#pragma omp for schedule(dynamic) #pragma omp for schedule(dynamic)
#endif #endif
for(auto feat = _phi.begin() + _start_rung.back() + _mpi_comm->rank(); feat < _phi.end(); feat += _mpi_comm->size()) for(auto feat = start; feat < _phi.end(); feat += _mpi_comm->size())
{ {
unsigned long int feat_ind = _phi.size() + _n_sis_select * (omp_get_num_threads() + _mpi_comm->size()); unsigned long int feat_ind = _phi.size() + 2 * _n_sis_select * (omp_get_num_threads() + _mpi_comm->size());
node_value_arrs::clear_temp_reg_thread(); node_value_arrs::clear_temp_reg_thread();
std::vector<node_ptr> generated_phi; std::vector<node_ptr> generated_phi;
...@@ -1089,9 +1090,10 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std: ...@@ -1089,9 +1090,10 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
while((ii < inds.size()) && ((scores[inds[ii]] < worst_score) || (phi_sel_private.size() < _n_sis_select))) while((ii < inds.size()) && ((scores[inds[ii]] < worst_score) || (phi_sel_private.size() < _n_sis_select)))
{ {
node_value_arrs::clear_temp_reg_thread();
double cur_score = scores[inds[ii]]; double cur_score = scores[inds[ii]];
bool valid_feat = _is_valid( int valid_feat = _is_valid(
generated_phi[inds[ii]]->value_ptr(0), generated_phi[inds[ii]]->stand_value_ptr(),
_n_samp_train, _n_samp_train,
_cross_cor_max, _cross_cor_max,
scores_sel_all, scores_sel_all,
...@@ -1099,27 +1101,27 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std: ...@@ -1099,27 +1101,27 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
node_value_arrs::N_SELECTED - _n_sis_select, node_value_arrs::N_SELECTED - _n_sis_select,
0 0
); );
valid_feat = valid_feat && _is_valid_feat_list( if(valid_feat > 0)
generated_phi[inds[ii]]->value_ptr(0),
_n_samp_train,
_cross_cor_max,
phi_sel_private,
scores_sel_private,
cur_score
);
if(valid_feat)
{ {
if(scores_sel_private.size() == _n_sis_select) valid_feat = _is_valid_feat_list(
generated_phi[inds[ii]]->stand_value_ptr(),
_n_samp_train,
_cross_cor_max,
phi_sel_private,
scores_sel_private,
cur_score
);
if((valid_feat > 0) && (num_valid_feats_private >= _n_sis_select))
{ {
generated_phi[inds[ii]]->reindex(index_base + worst_score_ind); generated_phi[inds[ii]]->reindex(index_base + worst_score_ind);
generated_phi[inds[ii]]->set_value();
phi_sel_private[worst_score_ind] = generated_phi[inds[ii]]; phi_sel_private[worst_score_ind] = generated_phi[inds[ii]];
scores_sel_private[worst_score_ind] = cur_score; scores_sel_private[worst_score_ind] = cur_score;
} }
else else if(valid_feat != 0)
{ {
num_valid_feats_private += valid_feat > 0;
generated_phi[inds[ii]]->reindex(index_base + scores_sel_private.size()); generated_phi[inds[ii]]->reindex(index_base + scores_sel_private.size());
generated_phi[inds[ii]]->set_value();
phi_sel_private.push_back(generated_phi[inds[ii]]); phi_sel_private.push_back(generated_phi[inds[ii]]);
scores_sel_private.push_back(cur_score); scores_sel_private.push_back(cur_score);
} }
...@@ -1132,38 +1134,88 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std: ...@@ -1132,38 +1134,88 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
#pragma omp critical #pragma omp critical
{ {
index_base = _phi.size() + _n_sis_select * _mpi_comm->rank(); index_base = _phi.size() + 2 * _n_sis_select * _mpi_comm->rank();
worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin(); worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
for(int sc = 0; sc < scores_sel_private.size(); ++sc) for(int sc = 0; sc < scores_sel_private.size(); ++sc)
{ {
if( node_value_arrs::clear_temp_reg_thread();
((phi_sel.size() < _n_sis_select) || (scores_sel_private[sc] < scores_sel[worst_score_ind])) && if(scores_sel_private[sc] > scores_sel[worst_score_ind])
_is_valid_feat_list(phi_sel_private[sc]->value_ptr(), _n_samp_train, _cross_cor_max, phi_sel, scores_sel, scores_sel_private[sc])
)
{ {
if(phi_sel.size() == _n_sis_select) continue;
}
int valid_feat = _is_valid_feat_list(
phi_sel_private[sc]->stand_value_ptr(),
_n_samp_train,
_cross_cor_max,
phi_sel,
scores_sel,
scores_sel_private[sc]
);
if((valid_feat > 0) && (num_valid_feats >= _n_sis_select))
{
scores_sel[worst_score_ind] = scores_sel_private[sc];
phi_sel[worst_score_ind] = phi_sel_private[sc];
if(phi_sel[worst_score_ind]->rung() == _max_rung)
{ {
scores_sel[worst_score_ind] = scores_sel_private[sc]; phi_sel[worst_score_ind]->reindex(index_base + worst_score_ind);
phi_sel[worst_score_ind] = phi_sel_private[sc];
if(phi_sel[worst_score_ind]->rung() == _max_rung)
{
phi_sel[worst_score_ind]->reindex(index_base + worst_score_ind);
}
} }
else worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
}
else if(valid_feat != 0)
{
num_valid_feats += (valid_feat > 0);
scores_sel.push_back(scores_sel_private[sc]);
phi_sel.push_back(phi_sel_private[sc]);
if(phi_sel.back()->rung() == _max_rung)
{ {
scores_sel.push_back(scores_sel_private[sc]); phi_sel.back()->reindex(index_base + phi_sel.size());
phi_sel.push_back(phi_sel_private[sc]);
if(phi_sel.back()->rung() == _max_rung)
{
phi_sel.back()->reindex(index_base + phi_sel.size());
}
} }
worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin(); worst_score_ind = std::max_element(scores_sel.begin(), scores_sel.end()) - scores_sel.begin();
}