Add basic error analysis functions

basic analysis scripts completed

Add basic error analysis functions
6ec76bd3 · Thomas Purcell · 3041b3ab · 6ec76bd3 · 6ec76bd3 · 6ec76bd3
Commit 6ec76bd3 authored 5 years ago by Thomas Purcell
--- a/src/descriptor_identifier/Model/Model.cpp
+++ b/src/descriptor_identifier/Model/Model.cpp
@@ -123,7 +123,6 @@ Model::Model(std::string train_file, std::string test_file)

 std::vector<std::string> Model::populate_model(std::string filename, bool train)
 {
-
    std::ifstream file_stream;
    file_stream.open(filename, std::ios::in);

@@ -149,7 +148,6 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
    int n_task = 0;
    int _n_dim = 0;
    std::getline(file_stream, line);
-
    do
    {
        ++n_task;
@@ -183,6 +181,7 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
        else
            _task_sizes_test.push_back(std::stoi(split_line[1]));
    }
+
    if(train)
    {
        _n_samp_train = n_samp;
@@ -197,10 +196,12 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
        _prop_test_est.resize(n_samp);
        _test_error.resize(n_samp);
    }
+
    std::getline(file_stream, line);
    std::getline(file_stream, line);
    if(!train)
        std::getline(file_stream, line);
+
    std::vector<std::vector<double>> feat_vals(_n_dim, std::vector<double>(n_samp, 0.0));
    for(int ns = 0; ns < n_samp; ++ns)
    {
@@ -224,6 +225,7 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
            feat_vals[nf][ns] = std::stod(split_line[2 + nf]);
        }
    }
+
    if(train)
    {
        _D_train.resize(_n_dim * n_samp);
@@ -236,6 +238,7 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
        for(int nf = 0; nf < _n_dim; ++nf)
            std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]);
    }
+
    return feature_expr;
 }

@@ -334,3 +337,35 @@ void Model::to_file(std::string filename, bool train, std::vector<int> test_inds
    }
    out_file_stream.close();
 }
+
+/**
+ * @brief Collect the absolute training errors in ascending order
+ * @return A sorted copy of the absolute values of _train_error; the stored errors are not modified
+ */
+std::vector<double> Model::sorted_error()
+{
+    std::vector<double> abs_err(_train_error.begin(), _train_error.end());
+    for(double& err : abs_err)
+        err = std::abs(err);
+
+    std::sort(abs_err.begin(), abs_err.end());
+    return abs_err;
+}
+
+/**
+ * @brief Collect the absolute test errors in ascending order
+ * @return A sorted copy of the absolute values of _test_error; the stored errors are not modified
+ */
+std::vector<double> Model::sorted_test_error()
+{
+    std::vector<double> abs_err(_test_error.begin(), _test_error.end());
+    for(double& err : abs_err)
+        err = std::abs(err);
+
+    std::sort(abs_err.begin(), abs_err.end());
+    return abs_err;
+}
+
+/**
+ * @brief The mean absolute percentage error of the training data
+ * @return The mean of |error / property| over the training samples (a fraction, not multiplied by 100)
+ */
+double Model::mape()
+{
+    // Element-wise |error / property|; a property value of zero is not special-cased
+    const auto abs_rel_err = [](double err, double prop){return std::abs(err / prop);};
+
+    std::vector<double> rel_err(_train_error.size(), 0.0);
+    std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), rel_err.begin(), abs_rel_err);
+    return util_funcs::mean(rel_err);
+}
+
+/**
+ * @brief The mean absolute percentage error of the test data
+ * @return The mean of |error / property| over the test samples (a fraction, not multiplied by 100)
+ */
+double Model::test_mape()
+{
+    // Element-wise |error / property|; a property value of zero is not special-cased
+    const auto abs_rel_err = [](double err, double prop){return std::abs(err / prop);};
+
+    std::vector<double> rel_err(_test_error.size(), 0.0);
+    std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), rel_err.begin(), abs_rel_err);
+    return util_funcs::mean(rel_err);
+}
--- a/src/descriptor_identifier/Model/Model.hpp
+++ b/src/descriptor_identifier/Model/Model.hpp
@@ -169,6 +169,103 @@ public:
        return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);}));
    }

+    // DocString: model_mae
+    /**
+     * @brief The mean absolute error (MAE) of the model on the training data
+     * @return The sum of the absolute training errors divided by the number of training samples
+     */
+    inline double mae()
+    {
+        const double abs_err_sum = std::accumulate(
+            _train_error.begin(), _train_error.end(), 0.0, [](double running, double err){return running + std::abs(err);}
+        );
+        return abs_err_sum / _n_samp_train;
+    }
+
+    // DocString: model_test_mae
+    /**
+     * @brief The mean absolute error (MAE) of the model on the test data
+     * @return The sum of the absolute test errors divided by the number of test samples
+     */
+    inline double test_mae()
+    {
+        const double abs_err_sum = std::accumulate(
+            _test_error.begin(), _test_error.end(), 0.0, [](double running, double err){return running + std::abs(err);}
+        );
+        return abs_err_sum / _n_samp_test;
+    }
+
+    // DocString: model_mape
+    /**
+     * @brief The mean absolute percentage error (MAPE) of the model
+     * @return The mean of |error / property| over the training data (a fraction, not multiplied by 100)
+     */
+    double mape();
+
+    // DocString: model_test_mape
+    /**
+     * @brief The mean absolute percentage error (MAPE) of the model on the test data
+     * @return The mean of |error / property| over the test data (a fraction, not multiplied by 100)
+     */
+    double test_mape();
+
+    /**
+     * @brief Sort the absolute training errors by magnitude
+     * @return The absolute values of the training errors sorted in ascending order
+     */
+    std::vector<double> sorted_error();
+
+    /**
+     * @brief Sort the absolute test errors by magnitude
+     * @return The absolute values of the test errors sorted in ascending order
+     */
+    std::vector<double> sorted_test_error();
+
+    // DocString: model_percentile_25_ae
+    /**
+     * @brief The 25th percentile of the absolute training error of the model
+     * @return The element at index floor(0.25 * n_samp_train) of the ascending absolute training errors (throws std::out_of_range if there are no training samples)
+     */
+    inline double percentile_25_ae(){return sorted_error().at(static_cast<int>(floor(_n_samp_train * 0.25)));}
+
+    // DocString: model_test_percentile_25_ae
+    /**
+     * @brief The 25th percentile of the absolute test error of the model
+     * @return The element at index floor(0.25 * n_samp_test) of the ascending absolute test errors (throws std::out_of_range if there are no test samples)
+     */
+    inline double percentile_25_test_ae(){return sorted_test_error().at(static_cast<int>(floor(_n_samp_test * 0.25)));}
+
+    // DocString: model_percentile_50_ae
+    /**
+     * @brief The 50th percentile of the absolute training error of the model
+     * @return The element at index floor(0.50 * n_samp_train) of the ascending absolute training errors (throws std::out_of_range if there are no training samples)
+     */
+    inline double percentile_50_ae(){return sorted_error().at(static_cast<int>(floor(_n_samp_train * 0.50)));}
+
+    // DocString: model_test_percentile_50_ae
+    /**
+     * @brief The 50th percentile of the absolute test error of the model
+     * @return The element at index floor(0.50 * n_samp_test) of the ascending absolute test errors (throws std::out_of_range if there are no test samples)
+     */
+    inline double percentile_50_test_ae(){return sorted_test_error().at(static_cast<int>(floor(_n_samp_test * 0.50)));}
+
+    // DocString: model_percentile_75_ae
+    /**
+     * @brief The 75th percentile of the absolute training error of the model
+     * @return The element at index floor(0.75 * n_samp_train) of the ascending absolute training errors (throws std::out_of_range if there are no training samples)
+     */
+    inline double percentile_75_ae(){return sorted_error().at(static_cast<int>(floor(_n_samp_train * 0.75)));}
+
+    // DocString: model_test_percentile_75_ae
+    /**
+     * @brief The 75th percentile of the absolute test error of the model
+     * @return The element at index floor(0.75 * n_samp_test) of the ascending absolute test errors (throws std::out_of_range if there are no test samples)
+     */
+    inline double percentile_75_test_ae(){return sorted_test_error().at(static_cast<int>(floor(_n_samp_test * 0.75)));}
+
+    // DocString: model_percentile_95_ae
+    /**
+     * @brief The 95th percentile of the absolute training error of the model
+     * @return The element at index floor(0.95 * n_samp_train) of the ascending absolute training errors (throws std::out_of_range if there are no training samples)
+     */
+    inline double percentile_95_ae(){return sorted_error().at(static_cast<int>(floor(_n_samp_train * 0.95)));}
+
+    // DocString: model_test_percentile_95_ae
+    /**
+     * @brief The 95th percentile of the absolute test error of the model
+     * @return The element at index floor(0.95 * n_samp_test) of the ascending absolute test errors (throws std::out_of_range if there are no test samples)
+     */
+    inline double percentile_95_test_ae(){return sorted_test_error().at(static_cast<int>(floor(_n_samp_test * 0.95)));}
+
+
    /**
     * @brief Convert the Model into an output file
     *

--- a/src/python/analysis/__init__.py
+++ b/src/python/analysis/__init__.py
--- a/src/python/analysis/feature_space_analysis.py
+++ b/src/python/analysis/feature_space_analysis.py
+"""Selected feature space analysis"""
+
+import numpy as np
+
+from sisso import phi_selected_from_file
+
+
+def get_prevelance_of_primary_features(sisso_file, phi_0):
+    """Compute how often each primary feature occurs in the selected feature space
+
+    Args:
+        sisso_file (str): The file listing the selected features
+        phi_0 (list): The primary features; the keys of each selected feature's
+            primary_feat_decomp are used as indexes into this list
+
+    Returns:
+        dict (str, float): Maps str(primary feature) to the fraction of selected
+            features whose primary_feat_decomp contains it
+    """
+    selected_feats = phi_selected_from_file(sisso_file, phi_0)
+    prevalence = {str(prim_feat): 0.0 for prim_feat in phi_0}
+
+    for sel_feat in selected_feats:
+        for prim_ind in sel_feat.primary_feat_decomp.keys():
+            # Each selected feature adds an equal share to every primary feature it contains
+            prevalence[str(phi_0[prim_ind])] += 1.0 / len(selected_feats)
+
+    return prevalence
--- a/src/python/bindings_docstring_keyed.cpp
+++ b/src/python/bindings_docstring_keyed.cpp
@@ -352,7 +352,20 @@ void sisso::descriptor_identifier::registerModel()
        .add_property("rmse", &Model::rmse, "@DocString_model_rmse@")
        .add_property("test_rmse", &Model::test_rmse, "@DocString_model_test_rmse@")
        .add_property("max_ae", &Model::max_ae, "@DocString_model_max_ae@")
-        .add_property("test_max_ae", &Model::test_max_ae, "@DocString_model_test_max_ae@");
+        .add_property("test_max_ae", &Model::test_max_ae, "@DocString_model_test_max_ae@")
+        .add_property("mae", &Model::mae, "@DocString_model_mae@")
+        .add_property("test_mae", &Model::test_mae, "@DocString_model_test_mae@")
+        .add_property("mape", &Model::mape, "@DocString_model_mape@")
+        .add_property("test_mape", &Model::test_mape, "@DocString_model_test_mape@")
+        .add_property("percentile_25_ae", &Model::percentile_25_ae, "@DocString_model_percentile_25_ae@")
+        .add_property("percentile_25_test_ae", &Model::percentile_25_test_ae, "@DocString_model_test_percentile_25_test_ae@")
+        .add_property("percentile_50_ae", &Model::percentile_50_ae, "@DocString_model_percentile_50_ae@")
+        .add_property("percentile_50_test_ae", &Model::percentile_50_test_ae, "@DocString_model_test_percentile_50_test_ae@")
+        .add_property("percentile_75_ae", &Model::percentile_75_ae, "@DocString_model_percentile_75_ae@")
+        .add_property("percentile_75_test_ae", &Model::percentile_75_test_ae, "@DocString_model_test_percentile_75_test_ae@")
+        .add_property("percentile_95_ae", &Model::percentile_95_ae, "@DocString_model_percentile_95_ae@")
+        .add_property("percentile_95_test_ae", &Model::percentile_95_test_ae, "@DocString_model_test_percentile_95_test_ae@")
+    ;
 }

 void sisso::descriptor_identifier::registerSISSORegressor()

--- a/src/python/conversion_utils.hpp
+++ b/src/python/conversion_utils.hpp
@@ -29,8 +29,8 @@ namespace python_conv_utils
    template<typename T>
    std::vector<T> from_list(py::list lst)
    {
-        std::vector<T> vec(len(lst));
-        for(int ll = vec.size() - 1; ll >= 0; --ll)
+        // Copy every element of the python list into a std::vector<T>, front to back
+        std::vector<T> vec(py::len(lst));
+        // Signed element count so the int index is not compared against an unsigned size
+        const int n_elem = static_cast<int>(vec.size());
+        for(int ll = 0; ll < n_elem; ++ll)
            vec[ll] = py::extract<T>(lst[ll]);
        return vec;
    }
@@ -66,9 +66,9 @@ namespace python_conv_utils
    template<typename T_ptr, typename T_base>
    std::vector<std::shared_ptr<T_ptr>> shared_ptr_vec_from_list(py::list lst)
    {
-        std::vector<std::shared_ptr<T_ptr>> vec(len(lst));
-        for(int ll = vec.size() - 1; ll >= 0; --ll)
-            vec[ll] = std::make_shared<T_base>(py::extract<T_base>(lst.pop()));
+        // Build one shared_ptr per list element; elements are read by index instead of popped off the list
+        std::vector<std::shared_ptr<T_ptr>> vec(py::len(lst));
+        // Signed element count so the int index is not compared against an unsigned size
+        const int n_elem = static_cast<int>(vec.size());
+        for(int ll = 0; ll < n_elem; ++ll)
+            vec[ll] = std::make_shared<T_base>(py::extract<T_base>(lst[ll]));
        return vec;
    }


--- a/src/python/feature_creation/node_utils.cpp
+++ b/src/python/feature_creation/node_utils.cpp
@@ -3,7 +3,6 @@
 py::list str2node::phi_selected_from_file_py(std::string filename, py::list phi_0)
 {
    std::vector<node_ptr> phi_selected = phi_selected_from_file(filename, python_conv_utils::shared_ptr_vec_from_list<Node, FeatureNode>(phi_0));
-
    py::list feat_lst;
    for(auto& feat : phi_selected)
        feat_lst.append<ModelNode>(ModelNode(feat->d_mat_ind(), feat->rung(), feat->expr(), feat->postfix_expr(), feat->value(), feat->test_value(), feat->unit()));