diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 31815159af51e6c2d9ba26abe2314797cfe29e30..9d3512c38977863ed4c6b969c88eba8c63cc445d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -30,14 +30,13 @@ set_target_properties(sisso++
     LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
 )
-message(STATUS ${Boost_LIBRARIES})
+
 target_link_libraries(sisso++ ${LAPACK_LIBRARIES} ${MPI_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} -Wl,--rpath=${LAPACK_DIR} ${Boost_LIBRARIES})
 
 install(TARGETS sisso++ DESTINATION ${CMAKE_CURRENT_LIST_DIR}/../bin/)
 
 if(USE_PYTHON)
     include(${CMAKE_CURRENT_LIST_DIR}/../cmake/TransferDocStrings.cmake)
     set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR}")
-    message(STATUS "CMAKE_INSTALL_RPATH = ${CMAKE_INSTALL_RPATH}")
     transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.cpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.cpp)
     transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.hpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.hpp)
diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp
index 1a512bb40839048c25d3701cac1cb5ac6dec1d0c..a7b940c71e44c58ee8228642a23d1a398dc74b08 100644
--- a/src/descriptor_identifier/Model/Model.cpp
+++ b/src/descriptor_identifier/Model/Model.cpp
@@ -1,6 +1,6 @@
 #include <descriptor_identifier/Model/Model.hpp>
 
-Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) :
+Model::Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) :
     _n_samp_train(feats[0]->n_samp()),
     _n_samp_test(feats[0]->n_test_samp()),
     _n_dim(feats.size()),
@@ -15,6 +15,7 @@ Model::Model(std::vector<double> prop_train, std::vector<double> prop_test, std:
     _prop_test_est(_n_samp_test, 0.0),
     _task_sizes_train(task_sizes_train),
     _task_sizes_test(task_sizes_test),
+    _prop_unit(prop_unit),
     _fix_intercept(fix_intercept)
 {
     _prop_train_est.reserve(_n_samp_train);
@@ -78,7 +79,6 @@ Model::Model(std::string train_file)
     for(int ff = 0; ff < feature_expr_train.size(); ++ff)
     {
         split_str = str_utils::split_string_trim(feature_expr_train[ff]);
-        int rung = std::stoi(split_str[0]);
 
         std::string unit_str = split_str[1];
         std::string postfix_expr = split_str[2];
@@ -134,9 +134,23 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
     std::string model_line;
     std::getline(file_stream, model_line);
 
-    // Get the error
+    // Get the property unit and error
+    std::string unit_line;
     std::string error_line;
-    std::getline(file_stream, error_line);
+
+    std::getline(file_stream, unit_line);
+    if(unit_line.substr(0, 8).compare("# RMSE: ") != 0)
+    {
+        split_line = str_utils::split_string_trim(unit_line);
+        _prop_unit = Unit(split_line.back());
+        std::getline(file_stream, error_line);
+    }
+    else
+    {
+        _prop_unit = Unit();
+        error_line = unit_line;
+    }
+
     split_line = str_utils::split_string_trim(error_line);
     double rmse = std::stod(split_line[1]);
     double max_ae = std::stod(split_line[3]);
@@ -164,6 +178,8 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
 
     if(_coefs.back().back() == 0.0)
         _fix_intercept = true;
+    else
+        _fix_intercept = false;
 
     _n_dim = n_dim + 1 - _fix_intercept;
 
@@ -245,6 +261,7 @@ std::vector<std::string> Model::populate_model(std::string filename, bool train)
         std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]);
     }
 
+    file_stream.close();
     return feature_expr;
 }
 
@@ -272,6 +289,7 @@ void Model::to_file(std::string filename, bool train, std::vector<int> test_inds
     out_file_stream.open(filename);
 
     out_file_stream << "# " << toString() << std::endl;
+    out_file_stream << "# Property Unit: " << _prop_unit.toString() << std::endl;
     if(train)
         out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl;
     else
diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp
index a44fbb961d48cd47cf5a4673c90cf182def07313..b07228be43df5b2af2f66d7bd1105cf69959bb8f 100644
--- a/src/descriptor_identifier/Model/Model.hpp
+++ b/src/descriptor_identifier/Model/Model.hpp
@@ -55,18 +55,21 @@ class Model
     std::vector<int> _task_sizes_train; //!< Number of training samples in each task
     std::vector<int> _task_sizes_test; //!< Number of testing samples in each task
+    Unit _prop_unit; //!< The Unit for the property
+
     bool _fix_intercept; //!< If true fix intercept to 0
 public:
 
     /**
      * @brief Constructor for the model
      *
+     * @param prop_unit The unit of the property
      * @param prop_train The property vector for the training samples
      * @param prop_test The property vector for the test samples
      * @param feats The features for the model
      * @param task_sizes_train Number of samples per task in the training data
      * @param task_sizes_test Number of samples per task in the test data
      */
-    Model(std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept);
+    Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept);
 
     // DocString: model_init_str
     /**
@@ -267,6 +270,12 @@ public:
      */
     inline double percentile_95_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))];}
 
+    // DocString: model_prop_unit
+    /**
+     * @brief The unit of the property
+     * @return The unit of the property
+     */
+    inline Unit prop_unit(){return _prop_unit;}
 
     /**
      * @brief Convert the Model into an output file
diff --git a/src/descriptor_identifier/SISSORegressor.cpp b/src/descriptor_identifier/SISSORegressor.cpp
index ed0a0085c371a34cdec937f171bdf56a3b8caa06..c956cdf3de0ff7ba9fd40e9a104e21c98c63a0e1 100644
--- a/src/descriptor_identifier/SISSORegressor.cpp
+++ b/src/descriptor_identifier/SISSORegressor.cpp
@@ -2,6 +2,7 @@
 
 SISSORegressor::SISSORegressor(
     std::shared_ptr<FeatureSpace> feat_space,
+    Unit prop_unit,
     std::vector<double> prop,
     std::vector<double> prop_test,
     std::vector<int> task_sizes_train,
@@ -20,6 +21,7 @@ SISSORegressor::SISSORegressor(
     _task_sizes_train(task_sizes_train),
     _task_sizes_test(task_sizes_test),
     _leave_out_inds(leave_out_inds),
+    _prop_unit(prop_unit),
     _feat_space(feat_space),
     _mpi_comm(feat_space->mpi_comm()),
     _n_samp(prop.size()),
@@ -117,7 +119,7 @@ void SISSORegressor::fit()
     {
         node_value_arrs::clear_temp_test_reg();
         model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit());
-        models.push_back(Model(_prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept));
+        models.push_back(Model(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept));
         models.back().copy_error(&residual[rr * _n_samp]);
         if(_mpi_comm->rank() == 0)
         {
@@ -218,7 +220,7 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim)
             int index = all_inds_min[inds[rr] * n_dim + ii];
             min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit());
         }
-        models.push_back(Model(_prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept));
+        models.push_back(Model(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept));
     }
 
     _models.push_back(models);
diff --git a/src/descriptor_identifier/SISSORegressor.hpp b/src/descriptor_identifier/SISSORegressor.hpp
index 22b14d3280646818bf96e371d5cbbae143503144..5320ac1f922c9e586a78a40f69dc9599c4d4ef8a 100644
--- a/src/descriptor_identifier/SISSORegressor.hpp
+++ b/src/descriptor_identifier/SISSORegressor.hpp
@@ -40,6 +40,9 @@ protected:
     std::vector<int> _task_sizes_train; //!< Number of training samples per task
     std::vector<int> _task_sizes_test; //!< Number of testing samples per task
     std::vector<int> _leave_out_inds; //!< List of indexes from the initial data file in the test set
+
+    Unit _prop_unit; //!< The Unit for the property
+
     std::shared_ptr<FeatureSpace> _feat_space; //!< Feature Space for the problem
     std::shared_ptr<MPI_Interface> _mpi_comm; //!< MPI Communicator
 
@@ -55,6 +58,7 @@ public:
     * @brief Constructor for the Regressor
     *
    * @param feat_space The feature space to run SISSO on
+     * @param prop_unit The unit of the property
     * @param prop Vector storing all data to train the SISSO models with
     * @param prpo_test Vector storing all data to test the SISSO models with
     * @param task_sizes_train Number of training samples per task
@@ -64,7 +68,7 @@ public:
     * @param n_residual Number of residuals to pass to the next SIS operation
     * @param fix_intrecept If true fix intercept to 0
     */
-    SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, bool fix_intercept=false);
+    SISSORegressor(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, bool fix_intercept=false);
 
     /**
      * @brief Get the optimal size of the working array
@@ -167,6 +171,7 @@ public:
     * @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSORegressor.cpp)
     *
     * @param feat_space The feature space to run SISSO on
+     * @param prop_unit The unit of the property
     * @param prop Vector storing all data to train the SISSO models with
     * @param prpo_test Vector storing all data to test the SISSO models with
     * @param task_sizes_train Number of training samples per task
@@ -178,6 +183,7 @@ public:
     */
    SISSORegressor(
        std::shared_ptr<FeatureSpace> feat_space,
+        Unit prop_unit,
        np::ndarray prop,
        np::ndarray prop_test,
        py::list task_sizes_train,
@@ -193,6 +199,7 @@ public:
     * @brief Constructor for the Regressor that takes in python objects (cpp definition in <python/descriptor_identifier/SISSORegressor.cpp)
     *
     * @param feat_space The feature space to run SISSO on
+     * @param prop_unit The unit of the property
     * @param prop Vector storing all data to train the SISSO models with
     * @param prpo_test Vector storing all data to test the SISSO models with
     * @param task_sizes_train Number of training samples per task
@@ -204,6 +211,7 @@ public:
     */
    SISSORegressor(
        std::shared_ptr<FeatureSpace> feat_space,
+        Unit prop_unit,
        py::list prop,
        py::list prop_test,
        py::list task_sizes_train,
diff --git a/src/feature_creation/units/Unit.cpp b/src/feature_creation/units/Unit.cpp
index ce4520ddda7dbe4e351803fd97b6ddb76c5db7a6..edce61677a16c4e3720fb7ae3cf2c2847163279d 100644
--- a/src/feature_creation/units/Unit.cpp
+++ b/src/feature_creation/units/Unit.cpp
@@ -60,21 +60,14 @@ std::string Unit::toString() const
     std::stringstream unit_rep;
     for(auto& el : _dct)
     {
-        if(el.second == 1)
+        if(el.second == 1.0)
             unit_rep << " * " << el.first;
-        else if(el.second > 0)
+        else
             unit_rep << " * " << el.first << "^" << el.second;
-        else if(el.second == -1)
-            unit_rep << " / " << el.first;
-        else if(el.second < 0)
-            unit_rep << " / " << el.first << "^" << std::abs(el.second);
     }
 
     if(unit_rep.str().size() == 0)
-        return "";
-
-    if(unit_rep.str().substr(0, 3) == " / ")
-        return "1" + unit_rep.str();
+        return "Unitless";
 
     return unit_rep.str().substr(3);
 }
diff --git a/src/inputs/InputParser.cpp b/src/inputs/InputParser.cpp
index 65d8cfd5c7d1e7492c47b0d1173aaea88be448e7..24ed6dab514b669e22329095f1c8189d75cb0c6a 100644
--- a/src/inputs/InputParser.cpp
+++ b/src/inputs/InputParser.cpp
@@ -242,6 +242,7 @@ void InputParser::generate_feature_space(std::shared_ptr<MPI_Interface> comm, st
     }
     else
     {
+        _prop_unit = Unit(units[propind]);
         _prop_train = std::vector<double>(data[propind].size(), 0.0);
         _prop_test = std::vector<double>(test_data[propind].size(), 0.0);
 
diff --git a/src/inputs/InputParser.hpp b/src/inputs/InputParser.hpp
index 08ba841e8a00f7c495f0e82cec0eefbe94de5940..29ece7ad91fec90572f186fc1b42eb3bd8d59371 100644
--- a/src/inputs/InputParser.hpp
+++ b/src/inputs/InputParser.hpp
@@ -42,6 +42,8 @@ public:
     std::vector<int> _task_sizes_train; //!< Number of samples in each task's training data
     std::vector<int> _task_sizes_test; //!< //!< Number of samples in each task's test data
 
+    Unit _prop_unit; //!< The Unit for the property
+
     std::string _filename; //!< Name of the input file
     std::string _data_file; //!< Name of the data file
     std::string _prop_key; //!< Key used to find the property column in the data file
diff --git a/src/main.cpp b/src/main.cpp
index 395c3c6f7ff78c6ede1598a7b1bd309898e53a25..e0de0f62aac004f015036201dd3d42f3ddd2717f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -33,10 +33,10 @@ int main(int argc, char const *argv[])
     mpi_setup::comm->barrier();
     duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
     if(mpi_setup::comm->rank() == 0)
-        std::cout<< "time input_parsing/Feature space generation: "<< duration << std::endl;
+        std::cout<< "time input_parsing/Feature space generation: " << duration << std::endl;
 
     node_value_arrs::initialize_d_matrix_arr();
-    SISSORegressor sisso(IP._feat_space, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._leave_out_inds, IP._n_dim, IP._n_residuals, IP._fix_intercept);
+    SISSORegressor sisso(IP._feat_space, IP._prop_unit, IP._prop_train, IP._prop_test, IP._task_sizes_train, IP._task_sizes_test, IP._leave_out_inds, IP._n_dim, IP._n_residuals, IP._fix_intercept);
     sisso.fit();
 
     if(mpi_setup::comm->rank() == 0)
@@ -49,12 +49,6 @@
             else
                 std::cout << std::endl;
             std::cout << sisso.models()[ii][0] << "\n" << std::endl;
-            // for(int jj = 0; jj < sisso.models()[ii].size(); ++jj)
-            // {
-            //     sisso.models()[ii][jj].to_file("models/train_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat");
-            //     if(IP._prop_test.size() > 0)
-            //         sisso.models()[ii][jj].to_file("models/test_dim_" + std::to_string(ii) + "_model_" + std::to_string(jj) + ".dat", false, IP._leave_out_inds);
-            // }
         }
     }
 
diff --git a/src/python/__init__.py b/src/python/__init__.py
index fa6cd111030cb40c9490e72f1ecd982bed5f8d47..54fe87f32f76662dc22db20af3436d692e14d95f 100644
--- a/src/python/__init__.py
+++ b/src/python/__init__.py
@@ -66,6 +66,7 @@ def generate_phi_0_from_csv(
 
     # Get prop
     prop_key = cols[prop_ind]
+    prop_unit = get_unit(prop_key)
    prop = df[prop_key].to_numpy()
    df = df.drop([prop_key], axis=1)
 
@@ -125,6 +126,7 @@ def generate_phi_0_from_csv(
 
     return (
         phi_0,
+        prop_unit,
         prop[train_inds].flatten(),
         prop[leave_out_inds].flatten(),
         task_sizes_train,
@@ -158,7 +160,7 @@ def generate_fs_prop(
     Returns:
         fs (FeatureSpace): The FeatureSpace for the calculation
     """
-    phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
+    phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
         df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac
     )
     if allowed_ops == "all":
@@ -263,7 +265,7 @@ def generate_fs_sr_from_csv(
         fs (FeatureSpace): The FeatureSpace for the calculation
         sr (SISSORegressor): The SISSORegressor for the calculation
     """
-    phi_0, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
+    phi_0, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds = generate_phi_0_from_csv(
         df_csv, prop_key, cols=cols, task_key=task_key, leave_out_frac=leave_out_frac
     )
 
@@ -273,6 +275,7 @@ def generate_fs_sr_from_csv(
 
     sr = SISSORegressor(
         fs,
+        prop_unit,
         prop,
         prop_test,
         task_sizes_train,
diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp
index 17dfd0c9a4ee043c4eb7132c0392e5e2621dd6cf..4907dba2b477ef0ac70c371b065d7a8a88ce17b7 100644
--- a/src/python/bindings_docstring_keyed.cpp
+++ b/src/python/bindings_docstring_keyed.cpp
@@ -333,7 +333,7 @@ void sisso::feature_creation::node::registerSixPowNode()
 
 void sisso::descriptor_identifier::registerModel()
 {
-    class_<Model>("Model", init<std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>())
+    class_<Model>("Model", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>())
         .def(init<std::string>())
         .def(init<std::string, std::string>())
         .def("__str__", &Model::toString, "@DocString_model_str@")
@@ -364,14 +364,15 @@ void sisso::descriptor_identifier::registerModel()
         .add_property("percentile_75_ae", &Model::percentile_75_ae, "@DocString_model_percentile_75_ae@")
         .add_property("percentile_75_test_ae", &Model::percentile_75_test_ae, "@DocString_model_test_percentile_75_test_ae@")
.add_property("percentile_95_ae", &Model::percentile_95_ae, "@DocString_model_percentile_95_ae@") - .add_property("percentile_95_test_ae", &Model::percentile_95_test_ae, "@DocString_model_test_percentile_95_test_ae@") + .add_property("percentile_95_ae", &Model::percentile_95_ae, "@DocString_model_percentile_95_ae@") + .add_property("prop_unit", &Model::prop_unit, "@DocString_model_prop_unit@") ; } void sisso::descriptor_identifier::registerSISSORegressor() { - class_<SISSORegressor>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, optional<bool>>()) - .def(init<std::shared_ptr<FeatureSpace>, py::list, py::list, py::list, py::list, py::list, int, int, optional<bool>>()) + class_<SISSORegressor>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, optional<bool>>()) + .def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, optional<bool>>()) .def("fit", &SISSORegressor::fit, "@DocString_sisso_reg_fit@") .add_property("prop", &SISSORegressor::prop_py, "@DocString_sisso_reg_prop_py@") .add_property("prop_test", &SISSORegressor::prop_test_py, "@DocString_sisso_reg_prop_test_py@") diff --git a/src/python/descriptor_identifier/SISSORegressor.cpp b/src/python/descriptor_identifier/SISSORegressor.cpp index f5ad50c6e3ae768021fffeca1fc90717368a0d4a..12c82530675f990686dba18e3aa8300413585aaf 100644 --- a/src/python/descriptor_identifier/SISSORegressor.cpp +++ b/src/python/descriptor_identifier/SISSORegressor.cpp @@ -2,6 +2,7 @@ SISSORegressor::SISSORegressor( std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, np::ndarray prop, np::ndarray prop_test, py::list task_sizes_train, @@ -20,6 +21,7 @@ SISSORegressor::SISSORegressor( _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), + _prop_unit(prop_unit), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), _n_samp(prop.shape(0)), @@ -42,6 +44,7 @@ SISSORegressor::SISSORegressor( SISSORegressor::SISSORegressor( std::shared_ptr<FeatureSpace> feat_space, + Unit prop_unit, py::list prop, py::list prop_test, py::list task_sizes_train, @@ -60,6 +63,7 @@ SISSORegressor::SISSORegressor( _task_sizes_train(python_conv_utils::from_list<int>(task_sizes_train)), _task_sizes_test(python_conv_utils::from_list<int>(task_sizes_test)), _leave_out_inds(python_conv_utils::from_list<int>(leave_out_inds)), + _prop_unit(prop_unit), _feat_space(feat_space), _mpi_comm(feat_space->mpi_comm()), _n_samp(py::len(prop)), diff --git a/src/python/postprocess/__init__.py b/src/python/postprocess/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..42e7b632a2d9436a48ca5f94305ccb714d556798 --- /dev/null +++ b/src/python/postprocess/__init__.py @@ -0,0 +1,240 @@ +from sisso._sisso import * + +import numpy as np +import pandas as pd + +from glob import glob + +import seaborn as sns +from matplotlib import pyplot as plt +from matplotlib.patches import PathPatch + + +def adjust_box_widths(ax, fac): + """ + Adjust the widths of a seaborn-generated boxplot. 
+ """ + # iterating through axes artists: + for c in ax.get_children(): + + # searching for PathPatches + if isinstance(c, PathPatch): + # getting current width of box: + p = c.get_path() + verts = p.vertices + verts_sub = verts[:-1] + xmin = np.min(verts_sub[:, 0]) + xmax = np.max(verts_sub[:, 0]) + xmid = 0.5 * (xmin + xmax) + xhalf = 0.5 * (xmax - xmin) + + # setting new width of box + xmin_new = xmid - fac * xhalf + xmax_new = xmid + fac * xhalf + verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new + verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new + + # setting new width of median line + for l in ax.lines: + if np.all(l.get_xdata() == [xmin, xmax]): + l.set_xdata([xmin_new, xmax_new]) + + +def violin_plot_error(dir_expr, train=True, filename=None, fig_params=None): + """Generate violin plots of the error for a model + + Args: + dir_expr (str): Regular expression for the directory list + train (bool): If True use the training error + fig_params (dict): Figure parameters + filename (str): Name of the file to store the violin plot + + Returns: + violin plot of the error + """ + if fig_params is None: + fig_params = {} + + train_model_list = sorted( + glob(dir_expr + "/models/train_*_model_0.dat"), + key=lambda s: int(s.split("/")[-1].split("_")[2]), + ) + test_model_list = sorted( + glob(dir_expr + "/models/test_*_model_0.dat"), + key=lambda s: int(s.split("/")[-1].split("_")[2]), + ) + + if len(test_model_list) == 0: + models = [Model(train_file) for train_file in train_model_list] + else: + models = [ + Model(train_file, test_file) + for train_file, test_file in zip(train_model_list, test_model_list) + ] + + if train: + error = np.array([model.train_error for model in models]).flatten() + color = "#009e73" + else: + error = np.array([model.test_error for model in models]).flatten() + color = "#56b4e9" + n_dim = int(train_model_list[-1].split("/")[-1].split("_")[2]) + dimension = np.ones(len(error) // n_dim) + + for nn in range(2, n_dim + 1): + dimension = np.hstack((dimension, np.ones(len(error) // n_dim) * nn)) + + fig, ax = plt.subplots(nrows=1, ncols=1, **fig_params) + + ax.set_xlabel("Model Dimension") + if train: + ax.set_ylabel("Absolute Training Error (" + str(models[0].prop_unit) + ")") + else: + ax.set_ylabel("Absolute Test Error (" + str(models[0].prop_unit) + ")") + ax = sns.violinplot( + x=dimension, y=np.abs(error), inner=None, palette=[color], cut=0, ax=ax + ) + sns.boxplot( + x=dimension, + y=np.abs(error), + palette=["#222222", "#222222"], + width=0.0625, + ax=ax, + boxprops={"zorder": 100}, + meanprops={"markerfacecolor": "#aaaaaa", "markeredgecolor": "#aaaaaa"}, + medianprops={"color": "#ffffff"}, + zorder=100, + showmeans=True, + showcaps=False, + showfliers=False, + ) + ax.set_xticklabels(labels=[str(nn) for nn in range(1, n_dim + 1)]) + + if filename is not None: + fig.savefig(filename) + + return ax + + +def violin_plot_split_error(dir_expr, filename=None, fig_params=None): + """Generate violin plots of the error for a model + + Args: + dir_expr (str): Regular expression for the directory list + fig_params (dict): Figure parameters + filename (str): Name of the file to store the violin plot + + Returns: + violin plot of the error + """ + if fig_params is None: + fig_params = {} + + train_model_list = sorted( + glob(dir_expr + "/models/train_*_model_0.dat"), + key=lambda s: int(s.split("/")[-1].split("_")[2]), + ) + test_model_list = sorted( + glob(dir_expr + "/models/test_*_model_0.dat"), + key=lambda s: int(s.split("/")[-1].split("_")[2]), + ) + + models = [ + 
+        Model(train_file, test_file)
+        for train_file, test_file in zip(train_model_list, test_model_list)
+    ]
+
+    train_error = np.array([model.train_error for model in models]).flatten()
+    test_error = np.array([model.test_error for model in models]).flatten()
+    error = np.hstack((train_error, test_error))
+    n_dim = int(train_model_list[-1].split("/")[-1].split("_")[2])
+
+    dimension = np.ones(len(train_error) // n_dim)
+    for nn in range(2, n_dim + 1):
+        dimension = np.hstack((dimension, np.ones(len(train_error) // n_dim) * nn))
+
+    for nn in range(1, n_dim + 1):
+        dimension = np.hstack((dimension, np.ones(len(test_error) // n_dim) * nn))
+
+    train_test = np.hstack((np.ones(len(train_error)), 2 * np.ones(len(test_error))))
+    fig, ax = plt.subplots(nrows=1, ncols=1, **fig_params)
+
+    ax.set_xlabel("Model Dimension")
+    ax.set_ylabel("Absolute Error (" + str(models[0].prop_unit) + ")")
+    sns.violinplot(
+        x=dimension,
+        y=np.abs(error),
+        hue=train_test,
+        split=True,
+        inner=None,
+        palette=["#009e73", "#56b4e9"],
+        cut=0,
+        ax=ax,
+    )
+
+    sns.boxplot(
+        x=dimension,
+        y=np.abs(error),
+        hue=train_test,
+        palette=["#222222", "#222222"],
+        width=0.5,
+        ax=ax,
+        boxprops={"zorder": 100},
+        meanprops={"markerfacecolor": "#aaaaaa", "markeredgecolor": "#aaaaaa"},
+        medianprops={"color": "#ffffff"},
+        zorder=100,
+        showmeans=True,
+        showcaps=False,
+        showfliers=False,
+    )
+    adjust_box_widths(ax, 0.25)
+    ax.set_xticklabels(labels=[str(nn) for nn in range(1, n_dim + 1)])
+    handles, labels = ax.get_legend_handles_labels()
+    ax.legend(handles[0:2], ["Train", "Test"], edgecolor="white")
+
+    if filename is not None:
+        fig.savefig(filename)
+
+    return ax
+
+
+def jackknife_cv_conv_est(dir_expr):
+    """Get the jackknife variance of the CV test error
+
+    Args:
+        dir_expr (str): Glob pattern for the directories containing the model files
+
+    Returns:
+        avg_error: The average test RMSE for each model dimension
+        variance: The jackknife estimate of the variance of the test RMSE
+    """
+    train_model_list = sorted(
+        glob(dir_expr + "/models/train_*_model_0.dat"),
+        key=lambda s: int(s.split("/")[-1].split("_")[2]),
+    )
+    test_model_list = sorted(
+        glob(dir_expr + "/models/test_*_model_0.dat"),
+        key=lambda s: int(s.split("/")[-1].split("_")[2]),
+    )
+    n_dim = int(train_model_list[-1].split("/")[-1].split("_")[2])
+
+    models = [
+        Model(train_file, test_file)
+        for train_file, test_file in zip(train_model_list, test_model_list)
+    ]
+
+    test_rmse = np.array([model.test_rmse for model in models]).reshape(n_dim, -1)
+    # leave-one-out means of the test RMSE for each model dimension
+    x_bar_i = []
+    for dim_error in test_rmse:
+        x_bar_i.append([])
+        for ii in range(len(dim_error)):
+            x_bar_i[-1].append(np.delete(dim_error, ii).mean())
+
+    x_bar_i = np.array(x_bar_i)
+    avg_error = x_bar_i.mean(axis=1)
+    variance = (
+        (test_rmse.shape[1] - 1.0)
+        / test_rmse.shape[1]
+        * np.sum((x_bar_i - avg_error.reshape(n_dim, 1)) ** 2.0, axis=1)
+    )
+    return avg_error, variance
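
Usage sketch (not part of the patch): the post-processing helpers added in `src/python/postprocess/__init__.py` expect each SISSO run directory to contain `models/train_*_model_0.dat` (and optionally matching `test_*` files), matching the glob patterns above. The directory pattern `cv_*`, the output file names, and the assumption that the module installs as `sisso.postprocess` are illustrative only.

```python
# Illustrative sketch only: the "cv_*" glob and output names are assumptions,
# not part of the patch; the functions themselves are defined in
# src/python/postprocess/__init__.py above.
from sisso.postprocess import (
    violin_plot_error,
    violin_plot_split_error,
    jackknife_cv_conv_est,
)

# Violin plot of the absolute training error vs. model dimension
violin_plot_error("cv_*", train=True, filename="train_error_violin.pdf")

# Split train/test violins in a single figure
violin_plot_split_error("cv_*", filename="split_error_violin.pdf")

# Jackknife estimate of the CV test RMSE and its variance per dimension
avg_rmse, var_rmse = jackknife_cv_conv_est("cv_*")
for dim, (mu, var) in enumerate(zip(avg_rmse, var_rmse), start=1):
    print(f"dim {dim}: test RMSE = {mu:.4f} +/- {var ** 0.5:.4f}")
```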
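A second sketch of the round trip the `Model` changes enable: `Model::to_file` now writes a property-unit header line, and `populate_model` falls back to an empty `Unit` (rendered as `Unitless`) when an older file starts directly with the `# RMSE:` line. The file path below is hypothetical, and the import path assumes the bindings module is `sisso._sisso` as in the post-processing code above.

```python
# Hypothetical model file path; Model(train_file) and the prop_unit property
# come from the bindings registered in bindings_docstring_keyed.cpp above.
from sisso._sisso import Model

model = Model("models/train_dim_2_model_0.dat")
print(model)                 # Model::toString via the bound __str__
print(str(model.prop_unit))  # unit parsed from the property-unit header,
                             # or "Unitless" for files written before this change
```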