import_dataframe.py 15.6 KB
Newer Older
Thomas Purcell's avatar
Thomas Purcell committed
1
# Copyright 2021 Thomas A. R. Purcell
2
#
Thomas Purcell's avatar
Thomas Purcell committed
3
4
5
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
6
#
Thomas Purcell's avatar
Thomas Purcell committed
7
#     http://www.apache.org/licenses/LICENSE-2.0
8
#
Thomas Purcell's avatar
Thomas Purcell committed
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions used to convert a data.csv file into a primary feature space

Functions:

get_unit: Get the unit from a header
strip_units: Strip the units of the column names in a DataFrame
extract_col: Get the property vector, label and unit from df
read_csv: Create initial feature set from a csv file
"""
23
import math
Thomas Purcell's avatar
Thomas Purcell committed
24
25
26
import numpy as np
import pandas as pd

Thomas Purcell's avatar
Thomas Purcell committed
27
28
29
30
31
32
33
from sissopp import initialize_values_arr

try:
    from sissopp import initialize_param_storage
except ImportError:
    pass

34
from sissopp._sisso import Unit, Inputs, FeatureNode, initialize_values_arr
Thomas Purcell's avatar
Thomas Purcell committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83


def get_unit(header):
    """Extract the Unit encoded in a column header.

    The unit is expected inside parentheses before any ":" qualifier,
    e.g. "energy (eV): description" yields Unit("eV").

    Args:
        header (str): Column header to get the unit of

    Returns:
        Unit: The unit parsed from the header; a dimensionless Unit when the
            header carries no parenthesized unit or says "unitless"
    """
    try:
        before_colon = header.split(":")[0]
        unit_str = before_colon.split("(")[1].split(")")[0].replace(" ", "")
        return Unit() if unit_str.lower() == "unitless" else Unit(unit_str)
    except IndexError:
        # No "(...)" present in the header: treat the column as unitless
        return Unit()


def strip_units(df):
    """Return a copy of the DataFrame with unit annotations removed from column names.

    Args:
        df (pd.DataFrame or str): The DataFrame (or path of a csv file, read
            with its first column as the index) whose columns are renamed

    Returns:
        pd.DataFrame: A new DataFrame whose column names keep only the text
            before the first "(" (stripped of surrounding whitespace)
    """
    if isinstance(df, str):
        df = pd.read_csv(df, index_col=0)

    # Everything before the first "(" is the bare feature label
    renamed = {name: name.split("(")[0].strip() for name in df.columns}
    return df.rename(columns=renamed)


def extract_col(df, key, drop_col=True):
    """Get the property vector, label and unit from df

    Args:
        df (pd.DataFrame): DataFrame storing the primary feature space and property vector
        key (str): The key of the column to extract from df (units in
            parentheses are ignored when matching)
        drop_col (bool): If True then drop the column from the DataFrame

    Returns:
        tuple: A tuple containing:
            - col_data (np.ndarray): The data of the column
            - col_label (str): The label for the column
            - col_unit (Unit): The unit of the column

    Raises:
        ValueError: If key matches zero or more than one column of df
    """
    # Compare bare labels so "energy" matches the column "energy (eV)"
    col_label = key.split("(")[0].strip()
    col_ind = np.where([col.split("(")[0].strip() == col_label for col in df.columns])[
        0
    ]

    if len(col_ind) == 1:
        col_ind = col_ind[0]
    elif len(col_ind) > 1:
        # Bug fix: the parameter is named `key`, not `prop_key`; the original
        # code raised a NameError here instead of the intended ValueError.
        raise ValueError(f"The property key {key} appears more than once in df.")
    else:
        raise ValueError(f"The property key {key} does not appear in df.")

    col_unit = get_unit(df.columns[col_ind])
    col_data = df[df.columns[col_ind]].to_numpy()

    if drop_col:
        # Drop in place so the caller's DataFrame no longer holds the column
        df.drop(df.columns[col_ind], axis=1, inplace=True)

    return col_data, col_label, col_unit


def read_csv(
    df,
    prop_key,
    inputs=None,
    cols="all",
    task_key=None,
    leave_out_frac=0.0,
    leave_out_inds=None,
    max_rung=None,
):
    """Read a data.csv file and populate inputs with the relevant information

    The property (and, when given, task) columns are removed from the
    DataFrame; every remaining column becomes a primary feature (a
    FeatureNode in phi_0). The samples are split into training and test sets
    either from an explicit list of test indices (``leave_out_inds``) or by
    randomly drawing ``leave_out_frac`` of each task's samples.

    Args:
        df (str or pandas.DataFrame): The DataFrame or csv file of the initial feature set
        prop_key (str): The key corresponding to which column in the csv file the property is stored in
        inputs (Inputs): The inputs object for the calculation (copied, not modified in place)
        cols (list or str): The columns to include in the initial feature set
        task_key (str): The key corresponding to which column in the csv file the task differentiation is stored in
        leave_out_frac (float): The fraction (as a decimal) of indices to leave out of the calculations
        leave_out_inds (list): List of indices to pull from the training data to act as a test set
        max_rung (int): Maximum rung of a feature

    Returns:
        inputs (Inputs): The updated inputs object for the calculation

    Raises:
        ValueError: If no maximum rung is defined either directly or via inputs
    """
    if inputs:
        # Work on a copy so the caller's Inputs object is left untouched
        inputs_out = Inputs(inputs)
        inputs_out.clear_data()

        # NOTE(review): max_rung == 0 is falsy, so an explicit rung of 0 is
        # replaced by inputs.max_rung here (and rejected below if that is
        # also 0) — confirm rung 0 is never a valid setting for this path
        if not max_rung:
            max_rung = inputs.max_rung
    else:
        inputs_out = Inputs()

    if not max_rung:
        raise ValueError("Maximum rung for the calculation is not defined.")

    # Load csv file (first column becomes the sample-id index)
    if not isinstance(df, pd.DataFrame):
        df = pd.read_csv(str(df), index_col=0)
    else:
        # Copy so the extract_col calls below do not mutate the caller's frame
        df = df.copy()

    # Extract the Property and Task Columns (both are dropped from df)
    prop, prop_label, prop_unit = extract_col(df, prop_key)
    if task_key:
        task, _, _ = extract_col(df, task_key)
    else:
        # No task column: every sample belongs to a single task named "all"
        task = np.array(["all"] * len(prop))

    # Map out which index belongs to which task and get the size of each task
    task_map = {}
    task_names, task_sizes = np.unique(task, return_counts=True)
    task_sizes = task_sizes.astype(np.int32)

    for kk, key in enumerate(task_names):
        task_map[key] = np.where(task == key)[0].astype(np.int32)
        assert task_sizes[kk] == len(task_map[key])

    # The remaining feature data must be purely numeric; this raises if any
    # string columns are still present in the DataFrame
    df = df.astype(float)

    # Split the data into training and test sets
    if not leave_out_inds:
        leave_out_inds = []
        if leave_out_frac > 0.0:
            # Randomly draw ceil(frac * task_size) test samples from each task
            task_sizes_test = [int(math.ceil(ts * leave_out_frac)) for ts in task_sizes]

            for kk, key in enumerate(task_names):
                leave_out_inds += list(
                    np.random.choice(task_map[key], task_sizes_test[kk], False).astype(
                        np.int32
                    )
                )
        else:
            task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32))
    else:
        # Explicit test indices: check they are consistent with leave_out_frac
        assert (leave_out_frac == 0.0) or (
            int(round(len(df) * leave_out_frac)) == len(leave_out_inds)
        )

        # Count how many of the requested test indices fall inside each task
        task_sizes_test = list(np.zeros(len(task_names), dtype=np.int32))
        for kk, key in enumerate(task_names):
            left_out = [ind for ind in leave_out_inds if ind in task_map[key]]
            task_sizes_test[kk] = len(left_out)

    task_sizes_train = [int(ts - tst) for ts, tst in zip(task_sizes, task_sizes_test)]
    train_inds = []

    # Training indices are everything not left out, grouped task by task
    for val in task_map.values():
        train_inds += [ind for ind in val if ind not in leave_out_inds]

    # Set the cols list and initialize central storage
    # NOTE(review): cols is only used for its length below; passing a strict
    # subset of df.columns does not filter df, so phi_0 still receives every
    # column — confirm whether cols is meant to restrict the feature set
    if cols == "all":
        cols = df.columns.tolist()

    initialize_values_arr(task_sizes_train, task_sizes_test, len(cols), max_rung)
    try:
        # Only present when sissopp is built with parameterization support
        initialize_param_storage()
    except NameError:
        pass

    # Create Primary Feature Space: one FeatureNode per remaining column
    phi_0 = []
    for feat_ind, col in enumerate(df.columns):
        data, label, unit = extract_col(df, col, False)
        phi_0.append(
            FeatureNode(
                feat_ind,
                label.replace(" ", "_"),
                data[train_inds],
                data[leave_out_inds],
                unit,
            )
        )

    # Copy everything into the Inputs object and return it
    inputs_out.phi_0 = phi_0
    inputs_out.max_rung = max_rung
    inputs_out.sample_ids_train = list(df.index[train_inds].to_numpy().astype(str))
    inputs_out.sample_ids_test = list(df.index[leave_out_inds].to_numpy().astype(str))
    inputs_out.prop_key = prop_key
    inputs_out.prop_label = prop_label
    inputs_out.prop_unit = Unit(prop_unit)
    inputs_out.prop_train = prop[train_inds].flatten()
    inputs_out.prop_test = prop[leave_out_inds].flatten()
    inputs_out.task_names = list(task_names)
    inputs_out.task_sizes_train = task_sizes_train
    inputs_out.task_sizes_test = task_sizes_test
    inputs_out.leave_out_inds = leave_out_inds
    inputs_out.leave_out_frac = len(leave_out_inds) / len(prop)

    return inputs_out
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446


def create_inputs(
    df=None,
    cols="all",
    calc_type="regression",
    phi_0=None,
    sample_ids_train=None,
    sample_ids_test=None,
    task_names=None,
    prop_key=None,
    prop_label="Property",
    prop_unit=Unit(),
    prop_train=None,
    prop_test=None,
    task_key=None,
    task_sizes_train=None,
    task_sizes_test=None,
    leave_out_frac=0.0,
    leave_out_inds=None,
    allowed_ops="all",
    allowed_param_ops=None,
    cross_cor_max=1.0,
    l_bound=1e-50,
    u_bound=1e50,
    n_dim=1,
    max_rung=0,
    n_rung_store=0,
    n_rung_generate=0,
    n_sis_select=1,
    n_residual=1,
    n_models_store=1,
    max_param_depth=None,
    nlopt_seed=42,
    fix_intercept=False,
    global_param_opt=True,
    reparam_residual=False,
):
    """Create the Inputs object for the calculation (used for creating the FeatureSpace and SISSOSolver)

    Args:
        df (str or pandas.DataFrame): The DataFrame or csv file of the initial feature set
        cols (list or str): The columns to include in the initial feature set
        calc_type (str): The type of LossFunction to use when projecting the features onto a property
        phi_0 (list of FeatureNodes): A list of FeatureNodes for the primary feature space
        sample_ids_train (list): A list storing all sample ids for the training samples
        sample_ids_test (list): A list storing all sample ids for the test samples
        task_names (list): A list storing the ID of the task names
        prop_key (str): Key used to find the property column in the data file
        prop_label (str): The label of the property
        prop_unit (Unit): The Unit of the property
        prop_train (np.ndarray): The value of the property to evaluate the loss function against for the training set
        prop_test (np.ndarray): The value of the property to evaluate the loss function against for the test set
        task_key (str): Key used to find the task column in the data file
        task_sizes_train (list): Number of training samples per task
        task_sizes_test (list): Number of testing samples per task
        leave_out_frac (float): The fraction (as a decimal) of indices to leave out of the calculations
        leave_out_inds (list): List of indexes from the initial data file in the test set
        allowed_ops (list): A list containing all allowed operator strings
        allowed_param_ops (list): A list containing all allowed operator strings for operators with free parameters
        cross_cor_max (float): Maximum cross-correlation used for selecting features
        l_bound (float): The lower bound for the maximum absolute value of the features
        u_bound (float): The upper bound for the maximum absolute value of the features
        n_dim (int): The maximum number of features allowed in the linear model
        max_rung (int): Maximum rung for the feature creation
        n_rung_store (int): The number of rungs to calculate and store the value of the features for all samples
        n_rung_generate (int): Either 0 or 1, and is the number of rungs to generate on the fly during SIS
        n_sis_select (int): Number of features to select during each SIS iteration
        n_residual (int): Number of residuals to pass to the next sis model
        n_models_store (int): The number of models to output to files
        max_param_depth (int): The maximum depth in the binary expression tree to set non-linear optimization
        nlopt_seed (int): The seed used for the nlOpt library
        fix_intercept (bool): If true the bias term is fixed at 0
        global_param_opt (bool): True if global optimization is requested for non-linear optimization of parameters (Can break reproducibility)
        reparam_residual (bool): If True then reparameterize features using the residuals of each model

    Returns:
        inputs (Inputs): The updated inputs object for the calculation

    Raises:
        ValueError: If neither a DataFrame nor (phi_0, prop_train, task_sizes_train) are given
    """
    if allowed_ops == "all":
        allowed_ops = [
            "add",
            "sub",
            "mult",
            "div",
            "abs_diff",
            "inv",
            "abs",
            "cos",
            "sin",
            "exp",
            "neg_exp",
            "log",
            "sq",
            "sqrt",
            "cb",
            "cbrt",
            "six_pow",
        ]
    inputs = Inputs()

    # Set values that have well defined defaults
    inputs.allowed_ops = allowed_ops
    inputs.calc_type = calc_type
    inputs.cross_cor_max = cross_cor_max
    inputs.fix_intercept = fix_intercept
    inputs.global_param_opt = global_param_opt
    inputs.l_bound = l_bound
    inputs.max_rung = max_rung
    inputs.n_dim = n_dim
    inputs.n_models_store = n_models_store
    inputs.reparam_residual = reparam_residual
    inputs.n_residual = n_residual
    inputs.n_rung_store = n_rung_store
    inputs.n_rung_generate = n_rung_generate
    inputs.n_sis_select = n_sis_select
    inputs.nlopt_seed = nlopt_seed
    inputs.u_bound = u_bound

    # Set values with no stand alone defaults
    if task_key is not None:
        inputs.task_key = task_key

    if allowed_param_ops is not None:
        inputs.allowed_param_ops = allowed_param_ops

    if max_param_depth is not None:
        inputs.max_param_depth = max_param_depth
    else:
        # By default allow parameterization down to the full depth of the tree
        inputs.max_param_depth = max_rung

    # Add items that can be read from a DataFrame or from the items passed here
    # Bug fix: `if df:` raises ValueError for a pandas DataFrame ("The truth
    # value of a DataFrame is ambiguous"); compare against None explicitly.
    if df is not None:
        inputs = read_csv(
            df,
            prop_key,
            inputs,
            cols,
            task_key,
            leave_out_inds=leave_out_inds,
            max_rung=max_rung,
        )
    else:
        if not phi_0:
            raise ValueError("If no DataFrame is passed then phi_0 must be passed")

        # Bug fix: `not prop_train` raises ValueError for a multi-element
        # numpy array; check for None explicitly.
        if prop_train is None:
            raise ValueError("If no DataFrame is passed then prop_train must be passed")

        if not task_sizes_train:
            raise ValueError(
                "If no DataFrame is passed then task_sizes_train must be passed"
            )

        n_samp_train = np.sum(task_sizes_train, dtype=np.int32)
        if not sample_ids_train:
            sample_ids_train = [str(ii) for ii in range(n_samp_train)]

        if not task_names:
            task_names = [f"task_{ii}" for ii in range(len(task_sizes_train))]
        else:
            assert len(task_sizes_train) == len(task_names)

        if not task_sizes_test:
            task_sizes_test = [0] * len(task_sizes_train)
            # Bug fix: sample_ids_test / leave_out_inds default to None and
            # len(None) raises TypeError; `not x` accepts both None and []
            assert not sample_ids_test
            assert not leave_out_inds
        else:
            assert len(task_sizes_train) == len(task_sizes_test)

        n_samp_test = np.sum(task_sizes_test, dtype=np.int32)
        if not sample_ids_test:
            sample_ids_test = [
                str(ii) for ii in range(n_samp_train, n_samp_train + n_samp_test)
            ]
        else:
            assert len(sample_ids_test) == n_samp_test
        if not leave_out_inds:
            leave_out_inds = [
                ii for ii in range(n_samp_train, n_samp_train + n_samp_test)
            ]
        else:
            assert len(leave_out_inds) == n_samp_test

        if not prop_key:
            prop_key = f"{prop_label} ({prop_unit})"

        # Bug fix: `not prop_test` raises for a multi-element numpy array;
        # a missing prop_test is only valid when there are no test samples.
        if prop_test is None:
            assert n_samp_test == 0
            prop_test = np.zeros(0)
        else:
            assert len(prop_test) == n_samp_test

        inputs.phi_0 = phi_0
        inputs.sample_ids_train = sample_ids_train
        inputs.sample_ids_test = sample_ids_test
        inputs.prop_key = prop_key
        inputs.prop_label = prop_label
        inputs.prop_unit = prop_unit
        inputs.prop_train = prop_train
        inputs.prop_test = prop_test
        inputs.task_names = task_names
        inputs.task_sizes_train = task_sizes_train
        inputs.task_sizes_test = task_sizes_test
        inputs.leave_out_inds = leave_out_inds
        inputs.leave_out_frac = n_samp_test / (n_samp_train + n_samp_test)

    return inputs