InputParser.cpp 10.3 KB
Newer Older
Thomas Purcell's avatar
Thomas Purcell committed
1
2
#include <inputs/InputParser.hpp>

3
/**
 * @brief Read the run settings from a property tree, scan the data file and build the feature space
 *
 * The constructor
 *   1. reads the scalar/vector options from IP,
 *   2. parses the header row of the data file into feature names and units
 *      (the first column is skipped; it is used as the sample label elsewhere in this file),
 *   3. groups the sample rows by the value found in the "Task" column (one group "none" if absent),
 *   4. decides which samples are left out as a test set, either from "leave_out_inds" or by
 *      randomly drawing a fraction of every task on rank 0 and broadcasting the result,
 *   5. falls back to a default operator set if none was given, and
 *   6. calls generate_feature_space() to load the data.
 *
 * @param IP   Property tree holding the input options
 * @param fn   Stored in _filename
 * @param comm MPI communicator wrapper used for the rank query and the broadcasts
 *
 * @throws std::logic_error if the data file cannot be opened, is not comma separated, has a
 *         malformed header entry, has a row without a Task entry, or if the requested
 *         leave-out fraction is larger than 1
 */
InputParser::InputParser(boost::property_tree::ptree IP, std::string fn, std::shared_ptr<MPI_Interface> comm) :
    _opset(as_vector<std::string>(IP, "opset")),
    _filename(fn),
    _data_file(IP.get<std::string>("data_file", "data.csv")),
    _prop_key(IP.get<std::string>("property_key", "prop")),
    _leave_out_inds(as_vector<int>(IP, "leave_out_inds")),
    _l_bound(IP.get<double>("min_abs_feat_val", 1e-50)),
    _u_bound(IP.get<double>("max_abs_feat_val", 1e50)),
    _n_dim(IP.get<int>("desc_dim")),
    // NOTE(review): this is the only key read with a leading underscore - confirm it is intended
    _n_sis_select(IP.get<int>("_n_sis_select")),
    _max_rung(IP.get<int>("max_rung")),
    // NOTE(review): relies on _max_rung being declared before _max_store_rung in the class - confirm
    _max_store_rung(IP.get<int>("n_rung_store", _max_rung - 1)),
    _n_rung_generate(IP.get<int>("n_rung_generate", 0)),
    _n_samp(0),
    _n_residuals(IP.get<int>("n_residual", 1))
{
    std::ifstream data_stream;
    data_stream.open(_data_file, std::ios::in);
    // Without this check a missing file would be reported as "not a comma separated file"
    if(!data_stream.is_open())
        throw std::logic_error("Unable to open data_file \"" + _data_file + "\"");

    std::string line;
    std::getline(data_stream, line);

    std::vector<std::string> split_line;
    boost::algorithm::split(split_line, line, boost::algorithm::is_any_of(","));
    if(split_line.size() == 1)
        throw std::logic_error("data_file is not a comma separated file");

    // Parse every header entry after the first column into a name and an optional "(unit)"
    std::vector<std::string> headers;
    std::vector<Unit> units;
    for(std::size_t dd = 1; dd < split_line.size(); ++dd)
    {
        std::vector<std::string> name_unit_split;
        boost::algorithm::split(name_unit_split, split_line[dd], boost::algorithm::is_any_of("()"));
        if(name_unit_split.size() == 1)
        {
            // No parentheses: the entry is a bare name with a default-constructed Unit
            boost::algorithm::trim(split_line[dd]);
            headers.push_back(split_line[dd]);
            units.push_back(Unit());
        }
        else if(name_unit_split.size() == 3)
        {
            // "name (unit)" splits into {name, unit, trailing text}
            boost::algorithm::trim(name_unit_split[0]);
            boost::algorithm::trim(name_unit_split[1]);

            headers.push_back(name_unit_split[0]);
            units.push_back(name_unit_split[1]);
        }
        else
        {
            throw std::logic_error("Invalid feature name \"" + split_line[dd] + "\" in header of file");
        }
    }

    // Locate the "Task" column; taskind == headers.size() means there is none
    const int n_headers = static_cast<int>(headers.size());
    int taskind = 0;
    while((taskind < n_headers) && (headers[taskind] != "Task"))
        ++taskind;

    // Map each task label to the (0-based) row indexes of its samples
    std::map<std::string, std::vector<int>> tasks;
    if(taskind >= n_headers)
    {
        while(std::getline(data_stream, line))
            ++_n_samp;

        std::vector<int>& all_samples = tasks["none"];
        all_samples.resize(_n_samp);
        std::iota(all_samples.begin(), all_samples.end(), 0);
    }
    else
    {
        // +1 because headers does not contain the first column of the file
        const std::size_t task_col = static_cast<std::size_t>(taskind) + 1;
        while(std::getline(data_stream, line))
        {
            boost::algorithm::split(split_line, line, [](char c) {return c==',';});
            // Short or blank rows would otherwise be indexed out of range
            if(split_line.size() <= task_col)
                throw std::logic_error("Row " + std::to_string(_n_samp) + " of data_file has no entry in the Task column");

            tasks[split_line[task_col]].push_back(_n_samp);
            ++_n_samp;
        }
    }

    // "leave_out_frac" takes precedence; otherwise derive the fraction from "n_leave_out".
    // Guard the division so that an empty data file gives 0 instead of NaN/inf.
    const double default_frac = (_n_samp > 0) ? static_cast<double>(IP.get<int>("n_leave_out", 0)) / static_cast<double>(_n_samp) : 0.0;
    const double leave_out_frac = IP.get<double>("leave_out_frac", default_frac);

    if(_leave_out_inds.empty() && leave_out_frac > 0.0)
    {
        // A fraction above 1 would request more test samples than a task contains
        if(leave_out_frac > 1.0)
            throw std::logic_error("The fraction of samples to leave out can not be larger than 1");

        // Only rank 0 draws the random test set so that all ranks agree after the broadcast
        if(comm->rank() == 0)
        {
            unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
            for(auto& el : tasks)
            {
                const int n_test = static_cast<int>(std::round(leave_out_frac * el.second.size()));
                _task_sizes_test.push_back(n_test);
                _task_sizes_train.push_back(el.second.size() - n_test);

                std::vector<int> inds(el.second);
                std::shuffle(inds.begin(), inds.end(), std::default_random_engine(seed));

                // Append the first n_test shuffled indexes and keep this task's slice sorted
                const std::size_t start = _leave_out_inds.size();
                _leave_out_inds.insert(_leave_out_inds.end(), inds.begin(), inds.begin() + n_test);
                std::sort(_leave_out_inds.begin() + start, _leave_out_inds.end());
            }
        }
        mpi::broadcast(*comm, _leave_out_inds, 0);
        mpi::broadcast(*comm, _task_sizes_test, 0);
        mpi::broadcast(*comm, _task_sizes_train, 0);
    }
    else if(!_leave_out_inds.empty())
    {
        // Explicit test indexes were given: count how many fall into each task
        for(auto& el : tasks)
        {
            int n_test = 0;
            for(const int ind : el.second)
                if(std::find(_leave_out_inds.begin(), _leave_out_inds.end(), ind) != _leave_out_inds.end())
                    ++n_test;

            _task_sizes_test.push_back(n_test);
            _task_sizes_train.push_back(el.second.size() - n_test);
        }
    }
    else
    {
        // Nothing is left out: every sample is a training sample
        for(auto& el : tasks)
        {
            _task_sizes_test.push_back(0);
            _task_sizes_train.push_back(el.second.size());
        }
    }

    // Operator set used when the input does not list any
    if(_opset.empty())
    {
        _opset = {
            "exp",
            "neg_exp",
            "inv",
            "sq",
            "cb",
            "six_pow",
            "sqrt",
            "cbrt",
            "log",
            "abs",
            "sin",
            "cos",
            "add",
            "sub",
            "abs_diff",
            "mult",
            "div"
        };
    }

    generate_feature_space(comm, headers, units, tasks, taskind);
}

Thomas Purcell's avatar
Thomas Purcell committed
155
void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, std::vector<std::string> headers, std::vector<Unit> units, std::map<std::string, std::vector<int>> tasks, int taskind)
Thomas Purcell's avatar
Thomas Purcell committed
156
157
158
159
{
    std::string line;
    std::ifstream data_stream;
    data_stream.open(_data_file, std::ios::in);
Thomas Purcell's avatar
Thomas Purcell committed
160
161
    std::getline(data_stream, line);

Thomas Purcell's avatar
Thomas Purcell committed
162
163
164
    int number_of_lines = 0;

    std::vector<std::string> samples;
Thomas Purcell's avatar
Thomas Purcell committed
165
    std::vector<int> task_size;
Thomas Purcell's avatar
Thomas Purcell committed
166

Thomas Purcell's avatar
Thomas Purcell committed
167
168
    std::vector<std::vector<double>> data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_train.begin(), _task_sizes_train.end(), 0)));
    std::vector<std::vector<double>> test_data(headers.size(), std::vector<double>(std::accumulate(_task_sizes_test.begin(), _task_sizes_test.end(), 0)));
Thomas Purcell's avatar
Thomas Purcell committed
169
170
171
172

    std::vector<std::string> bad_samples;

    int cur_line = 0;
173
174
    int n_train_samp = 0;
    int n_test_samp = 0;
Thomas Purcell's avatar
Thomas Purcell committed
175

Thomas Purcell's avatar
Thomas Purcell committed
176
177
    while (std::getline(data_stream, line))
    {
Thomas Purcell's avatar
Thomas Purcell committed
178
        std::vector<std::string> split_line;
Thomas Purcell's avatar
Thomas Purcell committed
179
180
181
182
183
184
        boost::algorithm::split(split_line, line, [](char c) {return c==',';});
        samples.push_back(split_line[0]);

        if(split_line.size() != headers.size() + 1)
            bad_samples.push_back(split_line[0]);

185
186
        if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&cur_line](int ind){return ind == cur_line;}))
        {
Thomas Purcell's avatar
Thomas Purcell committed
187
188
189
190
191
192
193
194
195
196
197
198
            n_test_samp = 0;
            for(auto& task : tasks)
            {
                int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin();

                for(int ii = 0; ii < task_ind; ++ii)
                    if(std::any_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];}))
                        ++n_test_samp;

                if(task_ind < task.second.size())
                    break;
            }
199
            for(int ii = 1; ii < split_line.size(); ++ii)
Thomas Purcell's avatar
Thomas Purcell committed
200
201
                if(ii-1 != taskind)
                    test_data[ii-1][n_test_samp] = std::stod(split_line[ii]);
202
203
204
        }
        else
        {
Thomas Purcell's avatar
Thomas Purcell committed
205
206
207
208
209
210
211
212
213
214
215
216
217
            n_train_samp = 0;
            for(auto& task : tasks)
            {

                int task_ind = std::find(task.second.begin(), task.second.end(), cur_line) - task.second.begin();

                for(int ii = 0; ii < task_ind; ++ii)
                    if(std::none_of(_leave_out_inds.begin(), _leave_out_inds.end(), [&ii, &task](int ind){return ind == task.second[ii];}))
                        ++n_train_samp;

                if(task_ind < task.second.size())
                    break;
            }
218
            for(int ii = 1; ii < split_line.size(); ++ii)
Thomas Purcell's avatar
Thomas Purcell committed
219
220
                if(ii-1 != taskind)
                    data[ii-1][n_train_samp] = std::stod(split_line[ii]);
221
        }
Thomas Purcell's avatar
Thomas Purcell committed
222
223
224
225
226
227
228
229
230
231
232
        ++cur_line;
    }

    if(bad_samples.size() > 0)
    {
        std::string msg = "The data for the following samples are incomplete: ";
        for(auto& sample : bad_samples)
            msg += sample + ", ";
        throw std::logic_error(msg.substr(0, msg.size() - 2));
    }

Thomas Purcell's avatar
Thomas Purcell committed
233
234
235
    int propind = 0;
    while((headers[propind] != _prop_key) && (propind < headers.size()))
        ++propind;
Thomas Purcell's avatar
Thomas Purcell committed
236

Thomas Purcell's avatar
Thomas Purcell committed
237
    if(propind >= headers.size())
238
    {
Thomas Purcell's avatar
Thomas Purcell committed
239
        throw std::logic_error("_propkey not found in data_file");
240
    }
Thomas Purcell's avatar
Thomas Purcell committed
241
    else
242
    {
Thomas Purcell's avatar
Thomas Purcell committed
243
244
245
246
247
        _prop_train = std::vector<double>(data[propind].size(), 0.0);
        _prop_test = std::vector<double>(test_data[propind].size(), 0.0);

        std::copy_n(data[propind].begin(), data[propind].size(), _prop_train.begin());
        std::copy_n(test_data[propind].begin(), test_data[propind].size(), _prop_test.begin());
248

Thomas Purcell's avatar
Thomas Purcell committed
249
250
251
252
253
254
255
256
        data.erase(data.begin() + propind);
        test_data.erase(test_data.begin() + propind);
        headers.erase(headers.begin() + propind);
        units.erase(units.begin() + propind);
    }

    if(taskind > propind)
        --taskind;
257

Thomas Purcell's avatar
Thomas Purcell committed
258
259
260
261
262
263
    if(taskind < headers.size())
    {
        data.erase(data.begin() + taskind);
        test_data.erase(test_data.begin() + taskind);
        headers.erase(headers.begin() + taskind);
        units.erase(units.begin() + taskind);
264
    }
Thomas Purcell's avatar
Thomas Purcell committed
265

266
    node_value_arrs::initialize_values_arr(_prop_train.size(), _prop_test.size(), headers.size());
Thomas Purcell's avatar
Thomas Purcell committed
267
268
    std::vector<node_ptr> phi_0;
    for(int ff = 0; ff < headers.size(); ++ff)
269
        phi_0.push_back(std::make_shared<FeatureNode>(ff, headers[ff], data[ff], test_data[ff], units[ff]));
Thomas Purcell's avatar
Thomas Purcell committed
270

271
    _feat_space = std::make_shared<FeatureSpace>(comm, phi_0, _opset, _prop_train, _task_sizes_train, _max_rung, _n_sis_select, _max_store_rung, _n_rung_generate, _l_bound, _u_bound);
Thomas Purcell's avatar
Thomas Purcell committed
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
}

/**
 * @brief Copy a file to "stripped_<name>" with all "//" line comments removed
 *
 * Every line of the input is written to the new file; if a line contains "//", that marker and
 * everything after it is dropped. The copy is created next to the input file, i.e. the
 * "stripped_" prefix is added to the file name and not to a leading directory.
 * The marker is searched for textually, so a "//" inside a quoted value is also cut.
 *
 * @param filename Path of the file to process; replaced by the path of the stripped copy
 */
void stripComments(std::string& filename)
{
    // Build the output path by prefixing only the file-name component
    const std::string::size_type sep = filename.find_last_of("/\\");
    const std::string newfn = (sep == std::string::npos)
        ? "stripped_" + filename
        : filename.substr(0, sep + 1) + "stripped_" + filename.substr(sep + 1);

    //Open input and output file
    std::fstream inputfile;
    inputfile.open(filename);
    std::ofstream inputcopy;
    inputcopy.open(newfn);

    //search for '//', delete everything following, print remainder to new file
    std::string line;
    while (std::getline(inputfile, line))
    {
        // Look for the two-character marker itself so that an earlier single '/' in the
        // line (e.g. inside a path) does not hide a comment that follows it
        const std::string::size_type comment_start = line.find("//");
        if (comment_start != std::string::npos)
            line.erase(comment_start);

        inputcopy << line << '\n';
    }
    inputcopy.close();

    //update filename;
    filename = newfn;
}