Commit eac6b4d0 authored by Thomas Purcell's avatar Thomas Purcell
Browse files

Cleaned up all of the non-node files

parent 5f820471
......@@ -10,46 +10,66 @@ Unit::Unit(std::map<std::string, double> dct) :
/**
 * @brief Build a Unit by parsing a textual expression such as "m * s^-2" or "kg / m^3"
 *
 * @details Spaces and any capitalization of "unitless" are stripped first. The remaining
 *          string is split on '*' and '/'; every token has the form "name" or "name^power".
 *          A token that follows a '/' contributes the negative of its power. Tokens whose
 *          name starts with '1' (e.g. the "1" in "1/s") are skipped, and repeated names
 *          have their exponents summed.
 *
 * @param unit_str The string representation of the unit
 *
 * @throws Whatever std::stod throws if the text after '^' is not a valid number
 */
Unit::Unit(std::string unit_str) :
    _dct()
{
    // Remove spaces and Unitless markers from the string
    boost::ireplace_all(unit_str, " ", "");
    boost::ireplace_all(unit_str, "Unitless", "");
    boost::ireplace_all(unit_str, "unitless", "");

    // Nothing left to parse: the unit dictionary stays empty
    if(unit_str.empty())
    {
        return;
    }

    // Split the unit based on multiplication/division keys
    std::vector<std::string> split_unit;
    boost::algorithm::split(split_unit, unit_str, boost::algorithm::is_any_of("*/"));

    // Sign applied to each token's exponent: the first token is always positive,
    // every later token takes its sign from the operator directly in front of it
    std::vector<double> signs;
    signs.reserve(split_unit.size());
    signs.push_back(1.0);
    for(const char op : unit_str)
    {
        if(op == '*')
        {
            signs.push_back(1.0);
        }
        else if(op == '/')
        {
            signs.push_back(-1.0);
        }
    }

    // Create the unit dictionary with the correct exponents
    for(std::size_t tt = 0; (tt < split_unit.size()) && (tt < signs.size()); ++tt)
    {
        std::vector<std::string> get_power = str_utils::split_string_trim(split_unit[tt], "^");

        // A leading, trailing or doubled operator yields an empty token; skip it so that
        // no empty-named unit ends up in the dictionary
        if(get_power.empty() || get_power[0].empty())
        {
            continue;
        }

        // Tokens starting with '1' act as a placeholder and carry no unit
        if(get_power[0][0] == '1')
        {
            continue;
        }

        const double exponent = (get_power.size() > 1) ? std::stod(get_power[1]) : 1.0;
        _dct[get_power[0]] += signs[tt] * exponent;
    }
}
......@@ -64,22 +84,32 @@ std::string Unit::toString() const
keys.reserve(_dct.size());
for(auto& el : _dct)
{
keys.push_back(el.first);
}
std::sort(keys.begin(), keys.end());
for(auto& key : keys)
{
if(_dct.at(key) == 1)
{
unit_rep << " * " << key;
}
else if(_dct.at(key) > 0)
{
unit_rep << " * " << key << "^" << _dct.at(key);
}
else if(_dct.at(key) < 0)
{
unit_rep << " * " << key << "^-" << std::abs(_dct.at(key));
}
}
if(unit_rep.str().size() == 0)
{
return "Unitless";
}
return unit_rep.str().substr(3);
}
......@@ -91,22 +121,32 @@ std::string Unit::toLatexString() const
keys.reserve(_dct.size());
for(auto& el : _dct)
{
keys.push_back(el.first);
}
std::sort(keys.begin(), keys.end());
for(auto& key : keys)
{
if(_dct.at(key) == 1)
{
unit_rep << key;
}
else if(_dct.at(key) > 0)
{
unit_rep << key << "$^\\text{" << _dct.at(key) << "}$";
}
else if(_dct.at(key) < 0)
{
unit_rep << key << "$^\\text{-" << std::abs(_dct.at(key)) << "}$";
}
}
if(unit_rep.str().size() == 0)
{
return "Unitless";
}
return unit_rep.str();
}
......@@ -117,9 +157,13 @@ Unit Unit::operator*(Unit unit_2)
for(auto& el : unit_2.dct())
{
if(to_out.count(el.first) > 0)
{
to_out[el.first] += el.second;
}
else
{
to_out[el.first] = el.second;
}
}
return Unit(to_out);
}
......@@ -130,9 +174,13 @@ Unit Unit::operator/(Unit unit_2)
for(auto& el : unit_2.dct())
{
if(to_out.count(el.first) > 0)
{
to_out[el.first] -= el.second;
}
else
{
to_out[el.first] = -1.0 * el.second;
}
}
return Unit(to_out);
......@@ -143,9 +191,13 @@ Unit& Unit::operator*=(Unit unit_2)
for(auto& el : unit_2.dct())
{
if(_dct.count(el.first) > 0)
{
_dct[el.first] += el.second;
}
else
{
_dct[el.first] = el.second;
}
}
return *this;
}
......@@ -155,9 +207,13 @@ Unit& Unit::operator/=(Unit unit_2)
for(auto& el : unit_2.dct())
{
if(_dct.count(el.first) > 0)
{
_dct[el.first] -= el.second;
}
else
{
_dct[el.first] = -1.0 * el.second;
}
}
return *this;
......@@ -167,10 +223,14 @@ Unit Unit::operator^(double power)
{
    // Work on a copy of the exponent dictionary; the result is returned as a new Unit
    std::map<std::string, double> scaled = dct();

    // NOTE(review): a power of exactly zero returns the exponents unscaled rather than
    // zeroing them - confirm this is the intended behavior
    if(power != 0.0)
    {
        // Raising a unit to a power multiplies every stored exponent by that power
        for(auto& entry : scaled)
        {
            entry.second *= power;
        }
    }
    return Unit(scaled);
}
......@@ -178,7 +238,9 @@ Unit Unit::inverse()
{
    // Copy the exponent dictionary and flip the sign of every exponent
    std::map<std::string, double> negated = dct();
    for(auto& entry : negated)
    {
        entry.second *= -1.0;
    }
    return Unit(negated);
}
......@@ -187,15 +249,21 @@ bool Unit::equal(Unit unit_2) const
for(auto& el : unit_2.dct())
{
if((_dct.count(el.first) == 0) && (el.second != 0))
{
return false;
}
}
for(auto& el : _dct)
{
if((unit_2.dct().count(el.first) == 0) && (el.second != 0))
{
return false;
}
else if(unit_2.dct()[el.first] != el.second)
{
return false;
}
}
if(unit_2.dct().size() == 0)
......@@ -203,7 +271,9 @@ bool Unit::equal(Unit unit_2) const
for(auto& el : _dct)
{
if((_dct.count(el.first) != 0) && (el.second != 0))
{
return false;
}
}
}
return true;
......
......@@ -23,17 +23,21 @@ InputParser::InputParser(pt::ptree IP, std::string fn, std::shared_ptr<MPI_Inter
_max_param_depth(IP.get<int>("max_feat_param_depth", _max_rung)),
_fix_intercept(IP.get<bool>("fix_intercept", false))
{
// Read the data file
std::ifstream data_stream;
std::string line;
data_stream.open(_data_file, std::ios::in);
// Get the header line for the names of all primary features
std::getline(data_stream, line);
std::vector<std::string> split_line;
boost::algorithm::split(split_line, line, boost::algorithm::is_any_of(","));
if(split_line.size() == 1)
{
throw std::logic_error("data_file is not a comma separated file");
}
// Separate names/units from the headers
std::vector<std::string> headers;
std::vector<Unit> units;
for(int dd = 1; dd < split_line.size(); ++dd)
......@@ -60,34 +64,48 @@ InputParser::InputParser(pt::ptree IP, std::string fn, std::shared_ptr<MPI_Inter
}
}
// Get the column corresponding to the task ID
int taskind = 0;
while((taskind < headers.size()) && (headers[taskind] != _task_key))
{
++taskind;
}
// Get the tasks IDs for each row
std::map<std::string, std::vector<int>> tasks;
if(taskind >= headers.size())
{
// If no task id then everything is in a task called none
while (std::getline(data_stream, line))
{
++_n_samp;
}
tasks["none"] = std::vector<int>(_n_samp);
std::iota(tasks["none"].begin(), tasks["none"].end(), 0);
}
else
{
// Otherwise correctly assign each row to a task
while (std::getline(data_stream, line))
{
boost::algorithm::split(split_line, line, [](char c) {return c==',';});
if(tasks.count(split_line[taskind+1]) > 0)
{
tasks[split_line[taskind+1]].push_back(_n_samp);
}
else
{
tasks[split_line[taskind+1]] = {_n_samp};
}
++_n_samp;
}
}
// Make the train-test split
double leave_out_frac = IP.get<double>("leave_out_frac", static_cast<double>(IP.get<int>("n_leave_out", 0)) / static_cast<double>(_n_samp) );
if((_leave_out_inds.size() == 0) && leave_out_frac > 0.0)
{
// If only a fraction is given, create a test set that pulls equally from all tasks
if(comm->rank() == 0)
{
unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
......@@ -108,24 +126,32 @@ InputParser::InputParser(pt::ptree IP, std::string fn, std::shared_ptr<MPI_Inter
start += _task_sizes_test.back();
}
}
// Broadcast the task sizes/leave out indexes
mpi::broadcast(*comm, _leave_out_inds, 0);
mpi::broadcast(*comm, _task_sizes_test, 0);
mpi::broadcast(*comm, _task_sizes_train, 0);
}
else if(_leave_out_inds.size() > 0)
{
// If row numbers of the test set samples are passed use those and set up the correct task sizes
for(auto& el : tasks)
{
int n_test = 0;
for(auto& ind: el.second)
{
if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [ind](int i1){return i1 == ind;}))
{
++n_test;
}
}
_task_sizes_test.push_back(n_test);
_task_sizes_train.push_back(el.second.size() - n_test);
}
}
else
{
// Everything is training
for(auto& el : tasks)
{
_task_sizes_test.push_back(0);
......@@ -133,16 +159,6 @@ InputParser::InputParser(pt::ptree IP, std::string fn, std::shared_ptr<MPI_Inter
}
}
std::vector<std::string> fix_intercept_problem_a = {
"add",
"sub",
"cb",
"sq",
"six_pow",
"sqrt",
"cbrt"
};
if((_opset.size() == 0) && (_param_opset.size() == 0))
{
_opset = {
......@@ -166,11 +182,20 @@ InputParser::InputParser(pt::ptree IP, std::string fn, std::shared_ptr<MPI_Inter
};
}
// Generate a feature space from the data file
generate_feature_space(comm, headers, units, tasks, taskind);
}
void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, std::vector<std::string> headers, std::vector<Unit> units, std::map<std::string, std::vector<int>> tasks, int taskind)
void InputParser::generate_feature_space(
std::shared_ptr<MPI_Interface> comm,
std::vector<std::string> headers,
std::vector<Unit> units,
std::map<std::string,
std::vector<int>> tasks,
int taskind
)
{
// Open the data file
std::string line;
std::ifstream data_stream;
data_stream.open(_data_file, std::ios::in);
......@@ -178,11 +203,23 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
int number_of_lines = 0;
// Get the sample id's
std::vector<std::string> samples;
std::vector<int> task_size;
std::vector<std::vector<double>> data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0)));
std::vector<std::vector<double>> test_data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0)));
// Initialize the initial data objects
std::vector<std::vector<double>> data(
headers.size(),
std::vector<double>(
std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0)
)
);
std::vector<std::vector<double>> test_data(
headers.size(),
std::vector<double>(
std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0)
)
);
std::vector<std::string> bad_samples;
......@@ -193,63 +230,98 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
while (std::getline(data_stream, line))
{
std::vector<std::string> split_line;
boost::algorithm::split(split_line, line, [](char c) {return c==',';});
boost::algorithm::split(split_line, line, [](char c) {return c == ',';});
samples.push_back(split_line[0]);
// Check that each row has the same number of columns as the header line
if(split_line.size() != headers.size() + 1)
{
bad_samples.push_back(split_line[0]);
}
if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&cur_line](int ind){return ind == cur_line;}))
{
// This data point is in the validation dataset
// Find where the row should be in the data structure
n_test_samp = 0;
for(auto& task : tasks)
{
int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin();
for(int ii = 0; ii < task_ind; ++ii)
{
if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];}))
{
++n_test_samp;
}
}
if(task_ind < task.second.size())
{
break;
}
}
// Add the point to the appropriate location in the test data structure
for(int ii = 1; ii < split_line.size(); ++ii)
if(ii-1 != taskind)
test_data[ii-1][n_test_samp] = std::stod(split_line[ii]);
{
if(ii - 1 != taskind)
{
test_data[ii - 1][n_test_samp] = std::stod(split_line[ii]);
}
}
}
else
{
// This data point is in the training dataset
n_train_samp = 0;
// Find where the row should be in the data structure
for(auto& task : tasks)
{
int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin();
for(int ii = 0; ii < task_ind; ++ii)
{
if(std::none_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];}))
{
++n_train_samp;
}
}
if(task_ind < task.second.size())
{
break;
}
}
// Add the point to the appropriate location in the training data structure
for(int ii = 1; ii < split_line.size(); ++ii)
{
if(ii-1 != taskind)
{
data[ii-1][n_train_samp] = std::stod(split_line[ii]);
}
}
}
++cur_line;
}
// If there are any rows with a different number of columns than the header line throw an error
if(bad_samples.size() > 0)
{
std::string msg = "The data for the following samples are incomplete: ";
for(auto& sample : bad_samples)
{
msg += sample + ", ";
}
throw std::logic_error(msg.substr(0, msg.size() - 2));
}
// Find which column corresponds to the target property
int propind = 0;
while((headers[propind] != _prop_key) && (propind < headers.size()))
{
++propind;
}
if(propind >= headers.size())
{
......@@ -257,6 +329,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
}
else
{
// Get the relevant property information associated with that column
_prop_unit = Unit(units[propind]);
_prop_label = headers[propind];
_prop_train = std::vector<double>(data[propind].size(), 0.0);
......@@ -265,15 +338,20 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
std::copy_n(data[propind].begin(), data[propind].size(), _prop_train.begin());
std::copy_n(test_data[propind].begin(), test_data[propind].size(), _prop_test.begin());
// Remove that column from the data sets to not make a feature with the property's data
data.erase(data.begin() + propind);
test_data.erase(test_data.begin() + propind);
headers.erase(headers.begin() + propind);
units.erase(units.begin() + propind);
}
// If the task is in a column to the right of the property then subtract 1 from the index as one of the columns was removed
if(taskind > propind)
{
--taskind;
}
// Remove the task column to not make a feature with that data
if(taskind < headers.size())
{
data.erase(data.begin() + taskind);
......@@ -282,12 +360,33 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
units.erase(units.begin() + taskind);
}
// Initialize the central data storage area
node_value_arrs::initialize_values_arr(_prop_train.size(), _prop_test.size(), headers.size(), _max_rung);
// Create \Phi_0 of primary features
std::vector<node_ptr> phi_0;
for(int ff = 0; ff < headers.size(); ++ff)
{
phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff]));
}
_feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _param_opset, _prop_train, _task_sizes_train, _calc_type, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _cross_cor_max, _l_bound, _u_bound);
// Create the feature space
_feat_space = std::make_shared<FeatureSpace>(
comm,
phi_0,
_opset,
_param_opset,
_prop_train,
_task_sizes_train,
_calc_type,
_max_rung,
_n_sis_select,
_max_store_rung,
_n_rung_generate,
_cross_cor_max,
_l_bound,
_u_bound
);
}
void stripComments(std::string& filename)
......@@ -299,7 +398,7 @@ void stripComments(std::string& filename)
std::ofstream inputcopy;
inputcopy.open(newfn);
//search for '//', delete everything following, print remainder to new file
//search for '//' or '#', delete everything following, print remainder to new file
std::string line;
int found, found2, find_py_comment;
while (std::getline(inputfile,line))
......@@ -308,11 +407,17 @@ void stripComments(std::string& filename)
found2 = line.find('/', found+1);
find_py_comment = line.find('#');
if (found != line.npos && found2 == found+1)
{
inputcopy << line.erase(found, line.length()) << std::endl;
}
else if(find_py_comment != std::string::npos)
{
inputcopy << line.erase(find_py_comment, line.length()) << std::endl;
}
else
{
inputcopy << line << std::endl;
}
}
inputcopy.close();
//update filename;
......
......@@ -95,7 +95,14 @@ public:
* @param tasks map where keys are the task names and values are the row indices of the samples in each task
* @param taskind index of the column that corresponds to the task column
*/
void generate_feature_space(std::shared_ptr<MPI_Interface> comm, std::vector<std::string> headers, std::vector<Unit> units, std::map<std::string, std::vector<int>> tasks, int taskind);