Commit a8f3e7db authored by Thomas Purcell
Browse files

Move del inds to pre-broadcast

Hopefully will remove the data overwrite issue
parent e1924308
......@@ -573,6 +573,92 @@ void FeatureSpace::generate_non_param_feats(
}
}
void FeatureSpace::remove_duplicate_features(std::vector<node_ptr>& feat_set, int start)
{
std::vector<double> scores(feat_set.size(), 0.0);
project_funcs::project_r(_prop_train.data(), scores.data(), feat_set, _task_sizes_train, 1);
scores.erase(scores.begin(), scores.begin() + start);
std::vector<int> inds = util_funcs::argsort<double>(scores);
std::vector<int> del_inds;
node_value_arrs::clear_temp_reg();
for(int sc = 0; sc < scores.size() - 1; ++sc)
{
#ifdef PARAMETERIZE
if(feat_set[inds[sc] + start]->n_params() > 0)
{
continue;
}
#endif
double* val_ptr = feat_set[start + inds[sc]]->stand_value_ptr();
double base_val = std::abs(
std::inner_product(
val_ptr,
val_ptr + _n_samp_train,
val_ptr,
0.0
)
);
if(scores[inds[sc]] > -1e-7)
{
// If score is 0.0 then check against all other 0.0 values
for(int sc2 = 0; sc2 < sc; ++sc2)
{
double comp = 1.0 / static_cast<double>(_n_samp_train) * std::abs(
base_val -
std::abs(
std::inner_product(
val_ptr,
val_ptr + _n_samp_train,
feat_set[start + inds[sc2]]->stand_value_ptr(true),
0.0
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start));
break;
}
}
}
else if(scores[inds[sc + 1]] - scores[inds[sc]] < 1e-7)
{
// Otherwise just compare against the closest neighbor
double comp = 1.0 / static_cast<double>(_n_samp_train) * std::abs(
base_val -
std::abs(
std::inner_product(
val_ptr,
val_ptr + _n_samp_train,
feat_set[start + inds[sc + 1]]->stand_value_ptr(true),
0.0
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start));
}
}
}
inds = util_funcs::argsort<int>(del_inds);
for(int ii = 0; ii < inds.size(); ++ii)
{
feat_set.erase(feat_set.begin() - del_inds[inds[ii]]);
}
// Reindex
for(int ff = start; ff < feat_set.size(); ++ff)
{
feat_set[ff]->reindex(ff);
}
}
void FeatureSpace::generate_feature_space(
std::vector<node_ptr>& feat_set,
std::vector<int>& start_rung,
......@@ -669,7 +755,6 @@ void FeatureSpace::generate_feature_space(
if((nn < _max_rung) || (nn <= _n_rung_store) || (_mpi_comm->size() == 1))
{
int new_phi_size;
int phi_size_start = feat_set.size();
if(_mpi_comm->rank() == 0)
{
std::vector<std::vector<node_ptr>> next_phi_gathered;
......@@ -679,7 +764,6 @@ void FeatureSpace::generate_feature_space(
{
feat_set.insert(feat_set.end(), next_phi_vec.begin(), next_phi_vec.end());
}
new_phi_size = feat_set.size();
// Sort the features to ensure consistent feature spaces for all MPI/OpenMP configurations
std::sort(
......@@ -694,118 +778,27 @@ void FeatureSpace::generate_feature_space(
feat_set.end(),
[&feat_ind](node_ptr n){n->reindex(feat_ind); ++feat_ind;}
);
mpi::broadcast(*_mpi_comm, new_phi_size, 0);
for(int bb = 0; bb <= (new_phi_size - phi_size_start) / 10000; ++bb)
if(nn < _max_rung)
{
mpi::broadcast(*_mpi_comm, &feat_set[phi_size_start + bb * 10000], std::min(10000, new_phi_size - phi_size_start - bb * 10000), 0);
remove_duplicate_features(feat_set, start_rung.back());
}
new_phi_size = feat_set.size();
mpi::broadcast(*_mpi_comm, new_phi_size, 0);
mpi::broadcast(*_mpi_comm, &feat_set[start_rung.back()], new_phi_size - start_rung.back(), 0);
}
else
{
mpi::gather(*_mpi_comm, next_phi, 0);
mpi::broadcast(*_mpi_comm, new_phi_size, 0);
feat_set.resize(new_phi_size);
for(int bb = 0; bb <= (new_phi_size - phi_size_start) / 10000; ++bb)
{
mpi::broadcast(*_mpi_comm, &feat_set[phi_size_start + bb * 10000], std::min(10000, new_phi_size - phi_size_start - bb * 10000), 0);
}
feat_set.resize(new_phi_size);
mpi::broadcast(*_mpi_comm, &feat_set[start_rung.back()], new_phi_size - start_rung.back(), 0);
}
if(phi_size_start == new_phi_size)
if(start_rung.back() == feat_set.size())
{
throw std::logic_error("No features created during this rung (" + std::to_string(nn) + ")");
}
node_value_arrs::clear_temp_reg();
if(nn < _max_rung)
{
// Remove identical features
_scores.resize(feat_set.size());
_mpi_comm->barrier();
project_funcs::project_r(_prop_train.data(), _scores.data(), feat_set, _task_sizes_train, 1);
_scores.erase(_scores.begin(), _scores.begin() + start_rung[start_rung.size() - 1]);
inds = util_funcs::argsort<double>(_scores);
std::vector<int> del_inds;
_mpi_comm->barrier();
node_value_arrs::clear_temp_reg();
for(int sc = 0; sc < _scores.size() - 1; ++sc)
{
#ifdef PARAMETERIZE
if(feat_set[inds[sc] + start_rung.back()]->n_params() > 0)
{
continue;
}
#endif
if(_scores[inds[sc]] > -1e-10)
{
double base_val = std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
_n_samp_train
)
);
for(int sc2 = sc + 1; sc2 < _scores.size(); ++sc2)
{
double comp = std::abs(
base_val - std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc2]]->value_ptr(0, true),
_n_samp_train
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start_rung.back()));
break;
}
}
}
else if(_scores[inds[sc + 1]] - _scores[inds[sc]] < 1e-10)
{
double base_val = std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
_n_samp_train
)
);
double comp = std::abs(
base_val - std::abs(
util_funcs::r(
feat_set[start_rung.back() + inds[sc]]->value_ptr(),
feat_set[start_rung.back() + inds[sc + 1]]->value_ptr(0, true),
_n_samp_train
)
)
);
if(comp < 1e-10)
{
del_inds.push_back(-1 * (inds[sc] + start_rung.back()));
}
}
}
inds = util_funcs::argsort<int>(del_inds);
for(int ii = 0; ii < inds.size(); ++ii)
{
feat_set.erase(feat_set.begin() - del_inds[inds[ii]]);
}
// Reindex
for(int ff = start_rung.back(); ff < feat_set.size(); ++ff)
{
feat_set[ff]->reindex(ff);
}
}
node_value_arrs::clear_temp_reg();
if(!reparam)
{
......@@ -1046,12 +1039,13 @@ void FeatureSpace::generate_and_project(std::shared_ptr<LossFunction> loss, std:
}
#endif
auto start = _phi.begin() + _start_rung.back() + _mpi_comm->rank();
#ifdef OMP45
#pragma omp for schedule(monotonic: dynamic)
#else
#pragma omp for schedule(dynamic)
#endif
for(auto feat = _phi.begin() + _start_rung.back() + _mpi_comm->rank(); feat < _phi.end(); feat += _mpi_comm->size())
for(auto feat = start; feat < _phi.end(); feat += _mpi_comm->size())
{
unsigned long int feat_ind = _phi.size() + 2 * _n_sis_select * (omp_get_num_threads() + _mpi_comm->size());
......
......@@ -146,6 +146,14 @@ public:
*/
void initialize_fs_output_files() const;
/**
* @brief Remove duplicate features from the feature space
*
* @param feat_set Feature space to remove the duplicates from (modified in place)
* @param start Index of the first feature in feat_set that may be removed; features before it are kept
*/
void remove_duplicate_features(std::vector<node_ptr>& feat_set, int start);
/**
* @brief Populate _phi using _phi_0 and the allowed operators up to (_max_rung - _n_rung_generate)^th rung
*/
......
......@@ -59,6 +59,18 @@ void mpi_reduce_op::set_op(std::string project_type, double cross_cor_max, int n
std::vector<node_sc_pair> mpi_reduce_op::select_top_feats(std::vector<node_sc_pair> in_vec_1, std::vector<node_sc_pair> in_vec_2)
{
for(int ff = 0; ff < in_vec_1.size(); ++ff)
{
std::cout << ff << '\t' << in_vec_1[ff]._feat->feat(0)->arr_ind() << '\t' << in_vec_1[ff]._feat->expr() << std::endl;
}
std::cout << std::endl;
for(int ff = 0; ff < in_vec_2.size(); ++ff)
{
std::cout << ff << '\t' << in_vec_2[ff]._feat->feat(0)->arr_ind() << '\t' << in_vec_2[ff]._feat->expr() << std::endl;
}
std::cout << std::endl;
// Set up an output vector
std::vector<node_sc_pair> out_vec;
out_vec.reserve(N_SIS_SELECT);
......@@ -79,5 +91,12 @@ std::vector<node_sc_pair> mpi_reduce_op::select_top_feats(std::vector<node_sc_pa
}
++ff;
}
for(int ff = 0; ff < out_vec.size(); ++ff)
{
std::cout << ff << '\t' << out_vec[ff]._feat->feat(0)->arr_ind() << '\t' << out_vec[ff]._feat->expr() << std::endl;
}
std::cout << std::endl;
return out_vec;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment