From 6295195fac44cfe2c34984a82d462bc2de734226 Mon Sep 17 00:00:00 2001
From: Thomas Purcell <purcell@fhi-berlin.mpg.de>
Date: Sat, 5 Sep 2020 17:55:28 +0200
Subject: [PATCH] Added functionality for classification
Compiles but not tested
---
CMakeLists.txt | 40 ++
src/CMakeLists.txt | 8 +-
src/descriptor_identifier/Model/Model.cpp | 384 +----------------
src/descriptor_identifier/Model/Model.hpp | 226 +---------
.../Model/ModelClassifier.cpp | 386 +++++++++++++++++
.../Model/ModelClassifier.hpp | 197 +++++++++
.../Model/ModelRegressor.cpp | 387 ++++++++++++++++++
.../Model/ModelRegressor.hpp | 305 ++++++++++++++
.../SISSO_DI/SISSOClassifier.cpp | 252 ++++++++++++
.../SISSO_DI/SISSOClassifier.hpp | 183 +++++++++
.../SISSO_DI/SISSORegressor.cpp | 75 +++-
.../SISSO_DI/SISSORegressor.hpp | 23 +-
.../SISSO_DI/SISSO_DI.cpp | 71 ----
.../SISSO_DI/SISSO_DI.hpp | 26 +-
src/python/bindings_docstring_keyed.cpp | 88 ++--
src/python/bindings_docstring_keyed.hpp | 24 +-
.../descriptor_identifier/SISSOClassifier.cpp | 123 ++++++
.../descriptor_identifier/SISSORegressor.cpp | 13 +
src/python/descriptor_identifier/SISSO_DI.cpp | 13 -
.../test_descriptor_identifier/test_model.py | 4 +-
20 files changed, 2095 insertions(+), 733 deletions(-)
create mode 100644 src/descriptor_identifier/Model/ModelClassifier.cpp
create mode 100644 src/descriptor_identifier/Model/ModelClassifier.hpp
create mode 100644 src/descriptor_identifier/Model/ModelRegressor.cpp
create mode 100644 src/descriptor_identifier/Model/ModelRegressor.hpp
create mode 100644 src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp
create mode 100644 src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp
create mode 100644 src/python/descriptor_identifier/SISSOClassifier.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6290c3b6..cd49df28 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,8 @@
cmake_minimum_required(VERSION 3.10)
+# Include External Project options
+include( ExternalProject )
+
# set the project name
project(sisso++ VERSION 0.1 LANGUAGES CXX)
@@ -265,6 +268,43 @@ set(MPI_LIBRARIES, ${MPI_CXX_LIBRARIES})
list(GET MPI_CXX_LIBRARIES 0 MPI_LIBRARY)
get_filename_component(MPI_DIR ${MPI_LIBRARY} DIRECTORY)
+# Find the Shark libraries and includes
+# set Shark_DIR to the proper location of Shark
+# find_package(Shark)
+# if(NOT SHARK_FOUND)
+ set(SHARK_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/shark/build/")
+ set(SHARK_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/external/shark/bin/")
+ set(SHARK_INCLUDE_DIRS "${SHARK_INSTALL_DIR}/include")
+ set(SHARK_LIBRARY_DIRS "${SHARK_INSTALL_DIR}/lib")
+ set(SHARK_CMAKE_ARGS "-DBUILD_EXAMPLES=OFF;-DBUILD_TESTING=OFF;-DBUILD_SHARED_LIBS=ON;-DENABLE_OPENMP=OFF;-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER};-DCMAKE_INSTALL_PREFIX=${SHARK_INSTALL_DIR};-DCBLAS_LIBRARY_PATH=${LAPACK_LIBRARY_DIR};")
+ ExternalProject_Add(
+ external_shark
+ PREFIX "external/shark"
+ GIT_REPOSITORY "https://github.com/Shark-ML/Shark.git"
+ GIT_TAG "4.0"
+ CMAKE_ARGS "${SHARK_CMAKE_ARGS}"
+ BINARY_DIR "${SHARK_BUILD_DIR}"
+ INSTALL_DIR "${SHARK_INSTALL_DIR}"
+ )
+ add_dependencies(external_shark boost::filesystem)
+ add_dependencies(external_shark boost::system)
+ add_library( shark SHARED IMPORTED )
+ set_property( TARGET shark PROPERTY IMPORTED_LOCATION ${SHARK_LIBRARY_DIRS}/libshark.so )
+ set_property( TARGET shark PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${SHARK_INCLUDE_DIRS} )
+ set_property( TARGET shark PROPERTY INTERFACE_LINK_LIBRARIES "${Boost_LIBRARIES};${LAPACK_LIBRARIES}" )
+ add_dependencies(shark external_shark)
+ set(SHARK_LIBRARIES "${SHARK_LIBRARY_DIRS}/libshark.so")
+ # The Shark version number
+ set(SHARK_VERSION_MAJOR "4")
+ set(SHARK_VERSION_MINOR "0")
+ set(SHARK_VERSION_PATCH "0")
+
+ include_directories(${SHARK_INCLUDE_DIRS})
+# else(NOT SHARK_FOUND)
+# message(STATUS "SHARK_USE_FILE ${SHARK_USE_FILE}")
+# include(${SHARK_USE_FILE})
+# endif()
+
include_directories(${CMAKE_CURRENT_LIST_DIR}/src)
add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/src)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1faefe13..942abe34 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,7 +13,7 @@ include_directories(${CMAKE_CURRENT_LIST_DIR}/python/)
include_directories(${CMAKE_CURRENT_LIST_DIR}/utils/)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated-declarations")
-set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR})
+set(CMAKE_INSTALL_RPATH ${Boost_LIBRARY_DIRS};${LAPACK_DIR};${MPI_DIR};${SHARK_LIBRARY_DIRS})
# set(INSTALL_RPATH ${Boost_LIB_DIR})
file(GLOB_RECURSE SISSOPP_SOURCES *.cpp)
@@ -31,12 +31,12 @@ set_target_properties(sisso++
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
)
-target_link_libraries(sisso++ ${LAPACK_LIBRARIES} ${MPI_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} -Wl,--rpath=${LAPACK_DIR} ${Boost_LIBRARIES})
+target_link_libraries(sisso++ ${LAPACK_LIBRARIES} ${MPI_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} -Wl,--rpath=${LAPACK_DIR} ${Boost_LIBRARIES} ${SHARK_LIBRARIES})
install(TARGETS sisso++ DESTINATION ${CMAKE_CURRENT_LIST_DIR}/../bin/)
if(USE_PYTHON)
include(${CMAKE_CURRENT_LIST_DIR}/../cmake/TransferDocStrings.cmake)
- set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR}")
+ set(CMAKE_INSTALL_RPATH "${Boost_LIBRARY_DIRS};${PYTHON_PREFIX}/lib/;${MPI_DIR};${SHARK_LIBRARY_DIRS}")
transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.cpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.cpp)
transfer_doc_string(${CMAKE_CURRENT_LIST_DIR}/python/bindings_docstring_keyed.hpp ${CMAKE_CURRENT_LIST_DIR}/python/bindings.hpp)
@@ -59,7 +59,7 @@ if(USE_PYTHON)
PREFIX ""
SUFFIX ".so"
)
- target_link_libraries(_sisso ${MPI_LIBRARIES} -Wl,--rpath=${PYTHON_PREFIX}/lib/ ${LAPACK_LIBRARIES} ${PYTHON_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} ${Boost_LIBRARIES} ${Boost_PYTHON_LIBRARIES})
+ target_link_libraries(_sisso ${MPI_LIBRARIES} -Wl,--rpath=${PYTHON_PREFIX}/lib/ ${LAPACK_LIBRARIES} ${PYTHON_LIBRARIES} -Wl,--rpath=${Boost_LIB_DIR} ${Boost_LIBRARIES} ${Boost_PYTHON_LIBRARIES} ${SHARK_LIBRARIES})
install(TARGETS _sisso DESTINATION "${PYTHON_INSTDIR}/cpp_sisso")
install(
DIRECTORY ${CMAKE_CURRENT_LIST_DIR}/python/ DESTINATION ${PYTHON_INSTDIR}/cpp_sisso
diff --git a/src/descriptor_identifier/Model/Model.cpp b/src/descriptor_identifier/Model/Model.cpp
index b44d9965..50f0316f 100644
--- a/src/descriptor_identifier/Model/Model.cpp
+++ b/src/descriptor_identifier/Model/Model.cpp
@@ -7,391 +7,13 @@ Model::Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double>
_feats(feats),
_prop_train(prop_train),
_prop_test(prop_test),
- _train_error(_n_samp_train),
- _test_error(_n_samp_test),
_D_train(_n_samp_train * (feats.size() + (1 - fix_intercept))),
_D_test(_n_samp_test * (feats.size() + (1 - fix_intercept))),
- _prop_train_est(_n_samp_train, 0.0),
- _prop_test_est(_n_samp_test, 0.0),
_task_sizes_train(task_sizes_train),
_task_sizes_test(task_sizes_test),
_prop_unit(prop_unit),
_fix_intercept(fix_intercept)
-{
- _prop_train_est.reserve(_n_samp_train);
- _prop_test_est.reserve(_n_samp_test);
+{}
- std::vector<double> a(_D_train.size(), 1.0);
- std::vector<double> work(_D_train.size(), 0.0);
-
- int info;
- int start = 0;
-
- for(auto& sz : _task_sizes_train)
- {
- std::fill_n(a.data(), a.size(), 1.0);
- std::fill_n(_D_train.data(), _D_train.size(), 1.0);
-
- for(int ff = 0; ff < feats.size(); ++ff)
- {
- std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz);
- std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz);
- }
-
- dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, prop_train.data() + start, sz, work.data(), work.size(), &info);
-
- _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0));
-
- std::copy_n(prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs.back().data());
- dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1);
- std::transform(_prop_train_est.begin() + start, _prop_train_est.begin() + start + sz, _prop_train.data() + start, _train_error.data() + start, std::minus<double>());
-
- start += sz;
- }
-
- start = 0;
- int ii = 0;
- for(auto& sz : _task_sizes_test)
- {
- if(sz > 0)
- {
- for(int ff = 0; ff < feats.size(); ++ff)
- std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz);
-
- std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0);
- dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1);
- std::transform(_prop_test_est.begin() + start, _prop_test_est.begin() + start + sz, _prop_test.data() + start, _test_error.data() + start, std::minus<double>());
- }
- ++ii;
- start += sz;
- }
-}
-
-Model::Model(std::string train_file)
-{
- _n_samp_test = 0;
-
- std::vector<std::string> split_str;
- std::vector<std::string> feature_expr_train = populate_model(train_file, true);
-
- for(int ff = 0; ff < feature_expr_train.size(); ++ff)
- {
- split_str = str_utils::split_string_trim(feature_expr_train[ff]);
- int rung = std::stoi(split_str[0]);
- std::string unit_str = split_str[1];
- std::string postfix_expr = split_str[2];
- std::string expr = split_str[3];
-
- std::vector<double> feat_val(_n_samp_train);
- std::vector<double> feat_test_val = {};
- std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
-
- model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str));
- _feats.push_back(feat);
- }
-
-}
-
-Model::Model(std::string train_file, std::string test_file)
-{
- std::vector<std::string> split_str;
- std::vector<std::string> feature_expr_train = populate_model(train_file, true);
- std::vector<std::string> feature_expr_test = populate_model(test_file, false);
-
- for(int ff = 0; ff < feature_expr_train.size(); ++ff)
- {
- if(feature_expr_train[ff] != feature_expr_test[ff])
- throw std::logic_error("Features for train and test file do not agree");
-
- split_str = str_utils::split_string_trim(feature_expr_train[ff]);
-
- int rung = std::stoi(split_str[0]);
- std::string unit_str = split_str[1];
- std::string postfix_expr = split_str[2];
- std::string expr = split_str[3];
-
- std::vector<double> feat_val(_n_samp_train);
- std::vector<double> feat_test_val(_n_samp_test);
-
- std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
- std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data());
-
- _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)));
- }
-}
-
-std::vector<std::string> Model::populate_model(std::string filename, bool train)
-{
- std::ifstream file_stream;
- file_stream.open(filename, std::ios::in);
-
- std::vector<std::string> feature_expr;
- std::vector<std::string> split_line;
-
- // Store model line
- std::string model_line;
- std::getline(file_stream, model_line);
-
- // Get the property unit and error
- std::string unit_line;
- std::string error_line;
-
- std::getline(file_stream, unit_line);
- if(unit_line.substr(0, 8).compare("# RMSE: ") != 0)
- {
- split_line = str_utils::split_string_trim(unit_line);
- _prop_unit = Unit(split_line.back());
- std::getline(file_stream, error_line);
- }
- else
- {
- _prop_unit = Unit();
- error_line = unit_line;
- }
-
- split_line = str_utils::split_string_trim(error_line);
-
- double rmse = std::stod(split_line[1]);
- double max_ae = std::stod(split_line[3]);
-
- // Get coefficients
- std::string line;
- std::getline(file_stream, line);
- std::getline(file_stream, line);
-
- int n_task = 0;
- int n_dim = 0;
- std::getline(file_stream, line);
- do
- {
- ++n_task;
- split_line = str_utils::split_string_trim(line);
- n_dim = split_line.size() - 3;
- if(train)
- {
- _coefs.push_back(std::vector<double>(n_dim + 1, 0.0));
- std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);});
- }
- std::getline(file_stream, line);
- } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0);
-
- if(_coefs.back().back() == 0.0)
- _fix_intercept = true;
- else
- _fix_intercept = false;
-
- _n_dim = n_dim + 1 - _fix_intercept;
-
- std::getline(file_stream, line);
- for(int ff = 0; ff < n_dim; ++ff)
- {
- feature_expr.push_back(line.substr(6));
- std::getline(file_stream, line);
- }
-
- std::getline(file_stream, line);
-
- int n_samp = 0;
- for(int tt = 0; tt < n_task; ++tt)
- {
- std::getline(file_stream, line);
- split_line = str_utils::split_string_trim(line);
- n_samp += std::stoi(split_line[1]);
- if(train)
- _task_sizes_train.push_back(std::stoi(split_line[1]));
- else
- _task_sizes_test.push_back(std::stoi(split_line[1]));
- }
-
- if(train)
- {
- _n_samp_train = n_samp;
- _prop_train.resize(n_samp);
- _prop_train_est.resize(n_samp);
- _train_error.resize(n_samp);
- }
- else
- {
- _n_samp_test = n_samp;
- _prop_test.resize(n_samp);
- _prop_test_est.resize(n_samp);
- _test_error.resize(n_samp);
- }
-
- std::getline(file_stream, line);
- std::getline(file_stream, line);
- if(!train)
- std::getline(file_stream, line);
-
- std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0));
- for(int ns = 0; ns < n_samp; ++ns)
- {
- std::getline(file_stream, line);
- split_line = str_utils::split_string_trim(line);
- if(train)
- {
- _prop_train[ns] = std::stod(split_line[0]);
- _prop_train_est[ns] = std::stod(split_line[1]);
- _train_error[ns] = _prop_train_est[ns] - _prop_train[ns];
-
- }
- else
- {
- _prop_test[ns] = std::stod(split_line[0]);
- _prop_test_est[ns] = std::stod(split_line[1]);
- _test_error[ns] = _prop_test_est[ns] - _prop_test[ns];
- }
- for(int nf = 0; nf < n_dim; ++nf)
- {
- feat_vals[nf][ns] = std::stod(split_line[2 + nf]);
- }
- }
-
- if(train)
- {
- _D_train.resize(_n_dim * n_samp);
- for(int nf = 0; nf < n_dim; ++nf)
- std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]);
- }
- else
- {
- _D_test.resize(_n_dim * n_samp);
- for(int nf = 0; nf < n_dim; ++nf)
- std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]);
- }
-
- file_stream.close();
- return feature_expr;
-}
-
-std::string Model::toString() const
-{
- std::stringstream unit_rep;
- unit_rep << "c0";
- for(int ff = 0; ff < _feats.size(); ++ff)
- unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr();
- return unit_rep.str();
-}
-
-std::ostream& operator<< (std::ostream& outStream, const Model& model)
-{
- outStream << model.toString();
- return outStream;
-}
-
-void Model::to_file(std::string filename, bool train, std::vector<int> test_inds)
-{
- boost::filesystem::path p(filename.c_str());
- boost::filesystem::create_directories(p.remove_filename());
-
- std::ofstream out_file_stream = std::ofstream();
- out_file_stream.open(filename);
-
- out_file_stream << "# " << toString() << std::endl;
- out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl;
- if(train)
- out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl;
- else
- out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl;
-
- out_file_stream << "# Coefficients" << std::endl;
- out_file_stream << std::setw(10) << std::left << "# Task;";
-
- for(int cc = 0; cc < _coefs[0].size() - (1 - _fix_intercept); ++cc)
- out_file_stream << std::setw(24) << " a" + std::to_string(cc);
-
- if(!_fix_intercept)
- out_file_stream << " c0" << std::endl;
- else
- out_file_stream << std::endl;
-
- for(int cc = 0; cc < _coefs.size(); ++cc)
- {
- out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", ";
- for(auto& coeff : _coefs[cc])
- out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", ";
- out_file_stream << "\n";
- }
-
- out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl;
- for(int ff = 0; ff < _feats.size(); ++ff)
- out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl;
-
- out_file_stream << "# Number of Samples Per Task" << std::endl;
- if(train)
- {
- out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl;
- for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
- out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl;
- }
- else
- {
- out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl;
- for(int tt = 0; tt < _task_sizes_test.size(); ++tt)
- out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl;
-
- out_file_stream << "# Test Indexes: [ " << test_inds[0];
- for(int ii = 1; ii < test_inds.size(); ++ii)
- out_file_stream << ", " << test_inds[ii];
- out_file_stream << " ]" << std::endl;
- }
-
- out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)";
- for(int ff = 0; ff < _feats.size(); ++ff)
- out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value";
- out_file_stream << std::endl;
-
- if(train)
- {
- for(int ss = 0; ss < _n_samp_train; ++ss)
- {
- out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss];
- for(int ff = 0; ff < _n_dim; ++ff)
- out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss];
- out_file_stream << std::endl;
- }
- }
- else
- {
- for(int ss = 0; ss < _n_samp_test; ++ss)
- {
- out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss];
- for(int ff = 0; ff < _n_dim; ++ff)
- out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss];
- out_file_stream << std::endl;
- }
- }
- out_file_stream.close();
-}
-
-std::vector<double> Model::sorted_error()
-{
- std::vector<double> sorted_error(_train_error.size(), 0.0);
- std::copy_n(_train_error.data(), _train_error.size(), sorted_error.data());
- std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);});
- std::sort(sorted_error.begin(), sorted_error.end());
- return sorted_error;
-}
-
-std::vector<double> Model::sorted_test_error()
-{
- std::vector<double> sorted_error(_test_error.size(), 0.0);
- std::copy_n(_test_error.data(), _test_error.size(), sorted_error.data());
- std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);});
- std::sort(sorted_error.begin(), sorted_error.end());
- return sorted_error;
-}
-
-double Model::mape()
-{
- std::vector<double> percent_error(_train_error.size(), 0.0);
- std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);});
- return util_funcs::mean(percent_error);
-}
-
-double Model::test_mape()
-{
- std::vector<double> percent_error(_test_error.size(), 0.0);
- std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);});
- return util_funcs::mean(percent_error);
-}
+Model::Model()
+{}
diff --git a/src/descriptor_identifier/Model/Model.hpp b/src/descriptor_identifier/Model/Model.hpp
index b07228be..36b1c131 100644
--- a/src/descriptor_identifier/Model/Model.hpp
+++ b/src/descriptor_identifier/Model/Model.hpp
@@ -35,6 +35,7 @@ typedef std::shared_ptr<ModelNode> model_node_ptr;
*/
class Model
{
+protected:
int _n_samp_train; //!< The number of samples per feature
int _n_samp_test; //!< The number of test samples per feature
int _n_dim; //!< Dimension of the model
@@ -44,14 +45,9 @@ class Model
std::vector<std::vector<double>> _coefs; //!< Coefficients for the features
std::vector<double> _prop_train; //!< The property to be modeled (training data)
std::vector<double> _prop_test; //!< The property to be modeled (testing data)
- std::vector<double> _train_error; //!< The error of the model (training)
- std::vector<double> _test_error; //!< The error of the model (testing)
std::vector<double> _D_train; //!< The Descriptor matrix (training data)
std::vector<double> _D_test; //!< The Descriptor matrix (testing data)
- std::vector<double> _prop_train_est; //!< The estimated Property (training data)
- std::vector<double> _prop_test_est; //!< The estimated Property (testing data)
-
std::vector<int> _task_sizes_train; //!< Number of training samples in each task
std::vector<int> _task_sizes_test; //!< Number of testing samples in each task
@@ -59,6 +55,12 @@ class Model
bool _fix_intercept; //!< If true fix intercept to 0
public:
+ // DocString: model_init
+ /**
+ * @brief Base constructor for a model
+ */
+ Model();
+
/**
* @brief Constructor for the model
*
@@ -71,72 +73,33 @@ public:
*/
Model(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept);
- // DocString: model_init_str
/**
- * @brief Construct a model from a training output file
- * @details Reads in all of the data from the output file and recreates the model object
+ * @brief Copy Constructor
*
- * @param train_file Previously generated model file
+ * @param o Model to be copied
*/
- Model(std::string train_file);
+ Model(const Model& o) = default;
- // DocString: model_init_str_str
/**
- * @brief Construct a model from a training and testing output file
- * @details Reads in all of the data from the output files and recreates the model object
+ * @brief Move Constructor
*
- * @param train_file Previously generated training model output file
- * @param train_file Previously generated testing model output file
+ * @param o Model to be moved
*/
- Model(std::string train_file, std::string test_file);
+ Model(Model&& o) = default;
/**
- * @brief Read an output file and extract all relevant information
- * @details Takes in an output file and extracts all data needed to recreate the model
- *
- * @param filename Name of the output file
- * @param train If true then the output represents training data
+ * @brief Copy Assignment operator
*
- * @return [description]
+ * @param o Model to be copied
*/
- std::vector<std::string> populate_model(std::string filename, bool train);
+ Model& operator= (const Model& o) = default;
- // DocString: model_str
/**
- * @brief Convert the model to a string
-
- * @return The string representation of the model
- */
- std::string toString() const;
-
- /**
- * @brief The estimated property values
- */
- inline std::vector<double> predict(){return _prop_test_est;}
-
- /**
- * @brief Accessor function to predict property values
- */
- inline std::vector<double> predict_train(){return _prop_train_est;}
-
- /**
- * @brief Copy the error into a new array
+ * @brief Move Assignment operator
*
- * @param res pointer to the beginning of the vector to store the residual
- */
- inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);}
-
- // DocString: model_rmse
- /**
- * @brief The training rmse of the model
- */
- inline double rmse(){return util_funcs::norm(_train_error.data(), _n_samp_train) / std::sqrt(static_cast<double>(_n_samp_train));}
-
- // DocString: model_test_rmse
- /**
- * @brief The test rmse of the model
+ * @param o Model to be moved
*/
- inline double test_rmse(){return util_funcs::norm(_test_error.data(), _n_samp_test) / std::sqrt(static_cast<double>(_n_samp_test));}
+ Model& operator= (Model&& o) = default;
// DocString: model_n_samp_train
/**
@@ -156,120 +119,6 @@ public:
*/
inline int n_dim(){return _n_dim;}
- // DocString: model_max_ae
- /**
- * @brief The max Absolute error of the training data
- */
- inline double max_ae()
- {
- return std::abs(*std::max_element(_train_error.data(), _train_error.data() + _n_samp_train, [](double d1, double d2){return std::abs(d1) < std::abs(d2);}));
- }
-
- // DocString: model_test_max_ae
- /**
- * @brief The max Absolute error of the testing data
- */
- inline double test_max_ae()
- {
- return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);}));
- }
-
- // DocString: model_mae
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- inline double mae(){return std::accumulate(_train_error.begin(), _train_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_train;}
-
- // DocString: model_test_mae
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- inline double test_mae(){return std::accumulate(_test_error.begin(), _test_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_test;}
-
- // DocString: model_mape
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- double mape();
-
- // DocString: model_test_mape
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- double test_mape();
-
- /**
- * @brief Sort the training error based on magnitude
- * @return The error vector sorted
- */
- std::vector<double> sorted_error();
-
- /**
- * @brief Sort the training test_error based on magnitude
- * @return The test_error vector sorted
- */
- std::vector<double> sorted_test_error();
-
- // DocString: model_percentile_25_ae
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- inline double percentile_25_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.25))];}
-
- // DocString: model_test_percentile_25_ae
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- inline double percentile_25_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.25))];}
-
- // DocString: model_percentile_50_ae
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- inline double percentile_50_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.50))];}
-
- // DocString: model_test_percentile_50_ae
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- inline double percentile_50_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.50))];}
-
- // DocString: model_percentile_75_ae
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- inline double percentile_75_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.75))];}
-
- // DocString: model_test_percentile_75_ae
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- inline double percentile_75_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.75))];}
-
- // DocString: model_percentile_95_ae
- /**
- * @brief The mean absolute error of the model
- * @return The mean absolute error of the training data
- */
- inline double percentile_95_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.95))];}
-
- // DocString: model_test_percentile_95_ae
- /**
- * @brief The mean absolute test error of the model
- * @return The mean absolute error of the test data
- */
- inline double percentile_95_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))];}
-
// DocString: model_prop_unit
/**
* @brief The unit of the property
@@ -284,7 +133,7 @@ public:
* @param train If true output the training data
* @param test_inds The indexes of the test set
*/
- void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {});
+ virtual void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}) = 0;
#ifdef PY_BINDINGS
// DocString: model_coefs
@@ -313,20 +162,6 @@ public:
return feat_lst;
}
- // DocString: model_prop_train_est
- /**
- * @brief The estimation of the property
- * @return _prop_train_est as a numpy ndarray
- */
- inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);}
-
- // DocString: model_prop_test_est
- /**
- * @brief The estimation of the properties test values
- * @return _prop_test_est as a numpy ndarray
- */
- inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);}
-
// DocString: model_prop_train
/**
* @brief The training data property to be learned
@@ -341,28 +176,7 @@ public:
*/
inline np::ndarray prop_test(){return python_conv_utils::to_ndarray<double>(_prop_test);}
- // DocString: model_train_error
- /**
- * @brief The training error
- * @return _train_error as a numpy ndarray
- */
- inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);}
-
- // DocString: model_test_error
- /**
- * @brief The test error
- * @return _test_error as a numpy ndarray
- */
- inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);}
#endif
};
-/**
- * @brief Print a model to an string stream
- *
- * @param outStream The output stream the model is to be printed
- * @param model The model to be printed
- */
-std::ostream& operator<< (std::ostream& outStream, const Model& model);
-
#endif
\ No newline at end of file
diff --git a/src/descriptor_identifier/Model/ModelClassifier.cpp b/src/descriptor_identifier/Model/ModelClassifier.cpp
new file mode 100644
index 00000000..7108e9ed
--- /dev/null
+++ b/src/descriptor_identifier/Model/ModelClassifier.cpp
@@ -0,0 +1,386 @@
+#include <descriptor_identifier/Model/ModelClassifier.hpp>
+
+ModelClassifier::ModelClassifier(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) :
+ Model(prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept),
+ _prop_train_est(_n_samp_train, 0.0),
+ _prop_test_est(_n_samp_test, 0.0),
+ _train_error(_n_samp_train),
+ _test_error(_n_samp_test)
+{
+ _prop_train_est.reserve(_n_samp_train);
+ _prop_test_est.reserve(_n_samp_test);
+
+ std::vector<shark::LabeledData<shark::RealVector, unsigned int>> training(_task_sizes_train.size());
+ std::vector<shark::LabeledData<shark::RealVector, unsigned int>> test(_task_sizes_test.size());
+ std::vector<unsigned int> prop_to_copy;
+ int n_els = 0;
+ for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+ {
+ shark::Data<shark::RealVector> data(_task_sizes_train[tt], shark::RealVector(_n_dim));
+ shark::Data<unsigned int> labels(_task_sizes_train[tt], 0);
+
+        prop_to_copy.resize(_task_sizes_train[tt]);
+        std::transform(_prop_train.begin() + n_els, _prop_train.begin() + n_els + _task_sizes_train[tt], prop_to_copy.begin(), [](double p){return static_cast<unsigned int>(p);});
+
+ int n_copied = 0;
+ for(auto& batch : labels.batches())
+ {
+ std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0));
+ n_els += shark::batchSize(batch);
+ n_copied += shark::batchSize(batch);
+ }
+ training[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels);
+ }
+
+ n_els = 0;
+ for(int tt = 0; tt < _task_sizes_test.size(); ++tt)
+ {
+ shark::Data<shark::RealVector> data(_task_sizes_test[tt], shark::RealVector(_n_dim));
+ shark::Data<unsigned int> labels(_task_sizes_test[tt], 0);
+
+        prop_to_copy.resize(_task_sizes_test[tt]);
+        std::transform(_prop_test.begin() + n_els, _prop_test.begin() + n_els + _task_sizes_test[tt], prop_to_copy.begin(), [](double p){return static_cast<unsigned int>(p);});
+
+ int n_copied = 0;
+ for(auto& batch : labels.batches())
+ {
+ std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0));
+ n_els += shark::batchSize(batch);
+ n_copied += shark::batchSize(batch);
+ }
+ test[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels);
+ }
+
+ shark::LinearCSvmTrainer<shark::RealVector> trainer(1.0, !_fix_intercept);
+ shark::LinearClassifier<shark::RealVector> model;
+ shark::ZeroOneLoss<unsigned int> loss;
+ shark::Data<unsigned int> output;
+ int ii_train = 0;
+ int ii_test = 0;
+ for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+ {
+ trainer.train(model, training[tt]);
+ output = model(training[tt].inputs());
+ for(auto element: output.elements())
+ {
+ _prop_train_est[ii_train] = static_cast<double>(element);
+ ++ii_train;
+ }
+
+ output = model(test[tt].inputs());
+ for(auto element: output.elements())
+ {
+            _prop_test_est[ii_test] = static_cast<double>(element);
+ ++ii_test;
+ }
+ if(_fix_intercept)
+ {
+ _coefs.push_back(std::vector<double>(_n_dim, 0.0));
+ std::copy_n(model.parameterVector().begin(), _n_dim, _coefs.back().begin());
+ }
+ else
+ {
+ _coefs.push_back(std::vector<double>(_n_dim + 1, model.bias()(0)));
+ std::copy_n(model.parameterVector().begin(), _n_dim, _coefs.back().begin());
+ }
+ }
+ std::transform(_prop_train_est.begin(), _prop_train_est.end(), _prop_train.begin(), _train_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;});
+ std::transform(_prop_test_est.begin(), _prop_test_est.end(), _prop_test.begin(), _test_error.begin(), [](double est, double real){return (std::abs(est - real) < 1e-10) ? -1 : real;});
+}
+
+ModelClassifier::ModelClassifier(std::string train_file)
+{
+ _n_samp_test = 0;
+
+ std::vector<std::string> split_str;
+ std::vector<std::string> feature_expr_train = populate_model(train_file, true);
+
+ for(int ff = 0; ff < feature_expr_train.size(); ++ff)
+ {
+ split_str = str_utils::split_string_trim(feature_expr_train[ff]);
+ int rung = std::stoi(split_str[0]);
+ std::string unit_str = split_str[1];
+ std::string postfix_expr = split_str[2];
+ std::string expr = split_str[3];
+
+ std::vector<double> feat_val(_n_samp_train);
+ std::vector<double> feat_test_val = {};
+ std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
+
+ model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str));
+ _feats.push_back(feat);
+ }
+
+}
+
+ModelClassifier::ModelClassifier(std::string train_file, std::string test_file)
+{
+ std::vector<std::string> split_str;
+ std::vector<std::string> feature_expr_train = populate_model(train_file, true);
+ std::vector<std::string> feature_expr_test = populate_model(test_file, false);
+
+ for(int ff = 0; ff < feature_expr_train.size(); ++ff)
+ {
+ if(feature_expr_train[ff] != feature_expr_test[ff])
+ throw std::logic_error("Features for train and test file do not agree");
+
+ split_str = str_utils::split_string_trim(feature_expr_train[ff]);
+
+ int rung = std::stoi(split_str[0]);
+ std::string unit_str = split_str[1];
+ std::string postfix_expr = split_str[2];
+ std::string expr = split_str[3];
+
+ std::vector<double> feat_val(_n_samp_train);
+ std::vector<double> feat_test_val(_n_samp_test);
+
+ std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
+ std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data());
+
+ _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)));
+ }
+}
+
+std::vector<std::string> ModelClassifier::populate_model(std::string filename, bool train)
+{
+ std::ifstream file_stream;
+ file_stream.open(filename, std::ios::in);
+
+ std::vector<std::string> feature_expr;
+ std::vector<std::string> split_line;
+
+ // Store model line
+ std::string model_line;
+ std::getline(file_stream, model_line);
+
+ // Get the property unit and error
+ std::string unit_line;
+ std::string error_line;
+
+ std::getline(file_stream, unit_line);
+ if(unit_line.substr(0, 16).compare("# Percent Error:") != 0)
+ {
+ split_line = str_utils::split_string_trim(unit_line);
+ _prop_unit = Unit(split_line.back());
+ std::getline(file_stream, error_line);
+ }
+ else
+ {
+ _prop_unit = Unit();
+ error_line = unit_line;
+ }
+
+ split_line = str_utils::split_string_trim(error_line);
+
+ double percent_error = std::stod(split_line[1]);
+
+ // Get coefficients
+ std::string line;
+ std::getline(file_stream, line);
+ std::getline(file_stream, line);
+
+ int n_task = 0;
+ int n_dim = 0;
+ std::getline(file_stream, line);
+ do
+ {
+ ++n_task;
+ split_line = str_utils::split_string_trim(line);
+ n_dim = split_line.size() - 3;
+ if(train)
+ {
+ _coefs.push_back(std::vector<double>(n_dim + 1, 0.0));
+ std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);});
+ }
+ std::getline(file_stream, line);
+ } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0);
+
+ if(_coefs.back().back() == 0.0)
+ _fix_intercept = true;
+ else
+ _fix_intercept = false;
+
+    _n_dim = n_dim;
+
+ std::getline(file_stream, line);
+ for(int ff = 0; ff < n_dim; ++ff)
+ {
+ feature_expr.push_back(line.substr(6));
+ std::getline(file_stream, line);
+ }
+
+ std::getline(file_stream, line);
+
+ int n_samp = 0;
+ for(int tt = 0; tt < n_task; ++tt)
+ {
+ std::getline(file_stream, line);
+ split_line = str_utils::split_string_trim(line);
+ n_samp += std::stoi(split_line[1]);
+ if(train)
+ _task_sizes_train.push_back(std::stoi(split_line[1]));
+ else
+ _task_sizes_test.push_back(std::stoi(split_line[1]));
+ }
+
+ if(train)
+ {
+ _n_samp_train = n_samp;
+ _prop_train.resize(n_samp);
+ _prop_train_est.resize(n_samp);
+ _train_error.resize(n_samp);
+ }
+ else
+ {
+ _n_samp_test = n_samp;
+ _prop_test.resize(n_samp);
+ _prop_test_est.resize(n_samp);
+ _test_error.resize(n_samp);
+ }
+
+ std::getline(file_stream, line);
+ std::getline(file_stream, line);
+ if(!train)
+ std::getline(file_stream, line);
+
+ std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0));
+ for(int ns = 0; ns < n_samp; ++ns)
+ {
+ std::getline(file_stream, line);
+ split_line = str_utils::split_string_trim(line);
+ if(train)
+ {
+ _prop_train[ns] = std::stod(split_line[0]);
+ _prop_train_est[ns] = std::stod(split_line[1]);
+ _train_error[ns] = _prop_train_est[ns] - _prop_train[ns];
+
+ }
+ else
+ {
+ _prop_test[ns] = std::stod(split_line[0]);
+ _prop_test_est[ns] = std::stod(split_line[1]);
+ _test_error[ns] = _prop_test_est[ns] - _prop_test[ns];
+ }
+ for(int nf = 0; nf < n_dim; ++nf)
+ {
+ feat_vals[nf][ns] = std::stod(split_line[2 + nf]);
+ }
+ }
+
+ if(train)
+ {
+ _D_train.resize(_n_dim * n_samp);
+ for(int nf = 0; nf < n_dim; ++nf)
+ std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]);
+ }
+ else
+ {
+ _D_test.resize(_n_dim * n_samp);
+ for(int nf = 0; nf < n_dim; ++nf)
+ std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]);
+ }
+
+ file_stream.close();
+ return feature_expr;
+}
+
+std::string ModelClassifier::toString() const
+{
+ std::stringstream unit_rep;
+ unit_rep << "[" << _feats[0]->expr();
+ for(int ff = 1; ff < _feats.size(); ++ff)
+ unit_rep << ", " << _feats[ff]->expr();
+ unit_rep << "]";
+ return unit_rep.str();
+}
+
+std::ostream& operator<< (std::ostream& outStream, const ModelClassifier& model)
+{
+ outStream << model.toString();
+ return outStream;
+}
+
+void ModelClassifier::to_file(std::string filename, bool train, std::vector<int> test_inds)
+{
+ boost::filesystem::path p(filename.c_str());
+ boost::filesystem::create_directories(p.remove_filename());
+
+ std::ofstream out_file_stream = std::ofstream();
+ out_file_stream.open(filename);
+
+ out_file_stream << "# " << toString() << std::endl;
+ out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl;
+ if(train)
+ out_file_stream << "# Percent Error: " << std::setprecision(15) << percent_train_error() << std::endl;
+ else
+ out_file_stream << "# Percent Error: " << std::setprecision(15) << percent_test_error() << std::endl;
+
+ out_file_stream << "# Plane Divider" << std::endl;
+ out_file_stream << std::setw(10) << std::left << "# Task";
+
+ for(int cc = 0; cc < _coefs[0].size() - (1 - _fix_intercept); ++cc)
+ out_file_stream << std::setw(24) << " w" + std::to_string(cc);
+
+ if(!_fix_intercept)
+ out_file_stream << " b" << std::endl;
+ else
+ out_file_stream << std::endl;
+
+ for(int cc = 0; cc < _coefs.size(); ++cc)
+ {
+ out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", ";
+ for(auto& coeff : _coefs[cc])
+ out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", ";
+ out_file_stream << "\n";
+ }
+
+ out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl;
+ for(int ff = 0; ff < _feats.size(); ++ff)
+ out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl;
+
+ out_file_stream << "# Number of Samples Per Task" << std::endl;
+ if(train)
+ {
+ out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl;
+ for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+ out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl;
+ }
+ else
+ {
+ out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl;
+ for(int tt = 0; tt < _task_sizes_test.size(); ++tt)
+ out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl;
+
+ out_file_stream << "# Test Indexes: [ " << test_inds[0];
+ for(int ii = 1; ii < test_inds.size(); ++ii)
+ out_file_stream << ", " << test_inds[ii];
+ out_file_stream << " ]" << std::endl;
+ }
+
+ out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)";
+ for(int ff = 0; ff < _feats.size(); ++ff)
+ out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value";
+ out_file_stream << std::endl;
+
+ if(train)
+ {
+ for(int ss = 0; ss < _n_samp_train; ++ss)
+ {
+ out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss];
+ for(int ff = 0; ff < _n_dim; ++ff)
+ out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss];
+ out_file_stream << std::endl;
+ }
+ }
+ else
+ {
+ for(int ss = 0; ss < _n_samp_test; ++ss)
+ {
+ out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss];
+ for(int ff = 0; ff < _n_dim; ++ff)
+ out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss];
+ out_file_stream << std::endl;
+ }
+ }
+ out_file_stream.close();
+}
diff --git a/src/descriptor_identifier/Model/ModelClassifier.hpp b/src/descriptor_identifier/Model/ModelClassifier.hpp
new file mode 100644
index 00000000..889b8945
--- /dev/null
+++ b/src/descriptor_identifier/Model/ModelClassifier.hpp
@@ -0,0 +1,197 @@
+/** @file descriptor_identifier/Model/ModelClassifier.hpp
+ *  @brief Object to store the models generated from SISSO
+ *
+ * Creates a ModelClassifier generated from SISSO and the corresponding output file.
+ * It also has functionality to read in an output file to regenerate the model.
+ *
+ * @author Thomas A. R. Purcell (tpurcell)
+ * @bug No known bugs.
+ */
+#ifndef MODEL_CLASSIFIER
+#define MODEL_CLASSIFIER
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/filesystem.hpp>
+
+#include<iomanip>
+#include<fstream>
+#include<iostream>
+
+#include <shark/Data/Dataset.h>
+#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h>
+#include <shark/Algorithms/Trainers/CSvmTrainer.h>
+
+#include <descriptor_identifier/Model/Model.hpp>
+#include <feature_creation/node/ModelNode.hpp>
+#include <utils/string_utils.hpp>
+
+#ifdef PY_BINDINGS
+ namespace np = boost::python::numpy;
+ namespace py = boost::python;
+#endif
+
+typedef std::shared_ptr<ModelNode> model_node_ptr;
+
+// DocString: cls_model_class
+/**
+ * @brief Class to store the models found from SISSO
+ *
+ */
+class ModelClassifier : public Model
+{
+ std::vector<double> _prop_train_est; //!< The estimated Property (training data)
+ std::vector<double> _prop_test_est; //!< The estimated Property (testing data)
+
+ std::vector<double> _train_error; //!< The error of the model (training)
+ std::vector<double> _test_error; //!< The error of the model (testing)
+
+public:
+ /**
+ * @brief Constructor for the model
+ *
+ * @param prop_unit The unit of the property
+ * @param prop_train The property vector for the training samples
+ * @param prop_test The property vector for the test samples
+ * @param feats The features for the model
+ * @param task_sizes_train Number of samples per task in the training data
+ * @param task_sizes_test Number of samples per task in the test data
+ */
+ ModelClassifier(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept);
+
+ // DocString: model_class_init_str
+ /**
+ * @brief Construct a model from a training output file
+ * @details Reads in all of the data from the output file and recreates the model object
+ *
+ * @param train_file Previously generated model file
+ */
+ ModelClassifier(std::string train_file);
+
+ // DocString: model_class_init_str_str
+ /**
+ * @brief Construct a model from a training and testing output file
+ * @details Reads in all of the data from the output files and recreates the model object
+ *
+ * @param train_file Previously generated training model output file
+     * @param test_file Previously generated testing model output file
+ */
+ ModelClassifier(std::string train_file, std::string test_file);
+
+
+ /**
+ * @brief Copy Constructor
+ *
+ * @param o ModelClassifier to be copied
+ */
+ ModelClassifier(const ModelClassifier& o) = default;
+
+ /**
+ * @brief Move Constructor
+ *
+ * @param o ModelClassifier to be moved
+ */
+ ModelClassifier(ModelClassifier&& o) = default;
+
+ /**
+ * @brief Copy Assignment operator
+ *
+ * @param o ModelClassifier to be copied
+ */
+ ModelClassifier& operator= (const ModelClassifier& o) = default;
+
+ /**
+ * @brief Move Assignment operator
+ *
+ * @param o ModelClassifier to be moved
+ */
+ ModelClassifier& operator= (ModelClassifier&& o) = default;
+
+ /**
+ * @brief Read an output file and extract all relevant information
+ * @details Takes in an output file and extracts all data needed to recreate the model
+ *
+ * @param filename Name of the output file
+ * @param train If true then the output represents training data
+ *
+ * @return [description]
+ */
+ std::vector<std::string> populate_model(std::string filename, bool train);
+
+ // DocString: model_class_str
+ /**
+ * @brief Convert the model to a string
+
+ * @return The string representation of the model
+ */
+ std::string toString() const;
+
+ /**
+ * @brief Copy the error into a new array
+ *
+ * @param res pointer to the beginning of the vector to store the residual
+ */
+ inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);}
+
+ /**
+ * @brief Convert the ModelClassifier into an output file
+ *
+ * @param filename The name of the file to output to
+ * @param train If true output the training data
+ * @param test_inds The indexes of the test set
+ */
+ void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {});
+
+ // DocString: model_class_precent_train_error
+ /**
+ * @brief Percent of all training samples misclassified
+ */
+    inline double percent_train_error(){return 100.0 * std::count_if(_train_error.begin(), _train_error.end(), [](double te){return te >= 0;}) / _n_samp_train; }
+
+ // DocString: model_class_precent_test_error
+ /**
+     * @brief Percent of all test samples misclassified
+     */
+    inline double percent_test_error(){return 100.0 * std::count_if(_test_error.begin(), _test_error.end(), [](double te){return te >= 0;}) / _n_samp_test; }
+
+ #ifdef PY_BINDINGS
+
+ // DocString: model_class_prop_train_est
+ /**
+ * @brief The estimation of the property
+ * @return _prop_train_est as a numpy ndarray
+ */
+ inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);}
+
+ // DocString: model_class_prop_test_est
+ /**
+ * @brief The estimation of the properties test values
+ * @return _prop_test_est as a numpy ndarray
+ */
+ inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);}
+
+ // DocString: model_class_train_error
+ /**
+ * @brief The training error
+ * @return _train_error as a numpy ndarray
+ */
+ inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);}
+
+ // DocString: model_class_test_error
+ /**
+ * @brief The test error
+ * @return _test_error as a numpy ndarray
+ */
+ inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);}
+ #endif
+};
+
+/**
+ * @brief Print a model to an string stream
+ *
+ * @param outStream The output stream the model is to be printed
+ * @param model The model to be printed
+ */
+std::ostream& operator<< (std::ostream& outStream, const ModelClassifier& model);
+
+#endif
\ No newline at end of file
diff --git a/src/descriptor_identifier/Model/ModelRegressor.cpp b/src/descriptor_identifier/Model/ModelRegressor.cpp
new file mode 100644
index 00000000..3e5116db
--- /dev/null
+++ b/src/descriptor_identifier/Model/ModelRegressor.cpp
@@ -0,0 +1,387 @@
+#include <descriptor_identifier/Model/ModelRegressor.hpp>
+
+ModelRegressor::ModelRegressor(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept) :
+ Model(prop_unit, prop_train, prop_test, feats, task_sizes_train, task_sizes_test, fix_intercept),
+ _prop_train_est(_n_samp_train, 0.0),
+ _prop_test_est(_n_samp_test, 0.0),
+ _train_error(_n_samp_train),
+ _test_error(_n_samp_test)
+{
+ _prop_train_est.reserve(_n_samp_train);
+ _prop_test_est.reserve(_n_samp_test);
+
+ std::vector<double> a(_D_train.size(), 1.0);
+ std::vector<double> work(_D_train.size(), 0.0);
+
+ int info;
+ int start = 0;
+
+ std::copy_n(_D_train.begin(), _D_train.size(), a.begin());
+ for(auto& sz : _task_sizes_train)
+ {
+ std::fill_n(a.data(), a.size(), 1.0);
+ std::fill_n(_D_train.data(), _D_train.size(), 1.0);
+
+ for(int ff = 0; ff < feats.size(); ++ff)
+ {
+ std::copy_n(feats[ff]->value_ptr() + start, sz, _D_train.data() + ff * sz);
+ std::copy_n(feats[ff]->value_ptr() + start, sz, a.data() + ff * sz);
+ }
+
+ dgels_('N', sz, _n_dim + 1 - _fix_intercept, 1, a.data(), sz, prop_train.data() + start, sz, work.data(), work.size(), &info);
+
+ _coefs.push_back(std::vector<double>(_n_dim + (!fix_intercept), 0.0));
+
+ std::copy_n(prop_train.begin() + start, _n_dim + 1 - _fix_intercept, _coefs.back().data());
+ dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_train.data(), sz, _coefs.back().data(), 1, 0.0, _prop_train_est.data() + start, 1);
+ std::transform(_prop_train_est.begin() + start, _prop_train_est.begin() + start + sz, _prop_train.data() + start, _train_error.data() + start, std::minus<double>());
+
+ start += sz;
+ }
+
+ start = 0;
+ int ii = 0;
+ for(auto& sz : _task_sizes_test)
+ {
+ if(sz > 0)
+ {
+ for(int ff = 0; ff < feats.size(); ++ff)
+ std::copy_n(feats[ff]->test_value().data() + start, sz, _D_test.data() + ff * sz);
+
+ std::fill_n(_D_test.data() + feats.size() * sz, sz, 1.0);
+ dgemv_('N', sz, _n_dim + 1 - _fix_intercept, 1.0, _D_test.data(), sz, _coefs[ii].data(), 1, 0.0, _prop_test_est.data() + start, 1);
+ std::transform(_prop_test_est.begin() + start, _prop_test_est.begin() + start + sz, _prop_test.data() + start, _test_error.data() + start, std::minus<double>());
+ }
+ ++ii;
+ start += sz;
+ }
+}
+
+ModelRegressor::ModelRegressor(std::string train_file)
+{
+ _n_samp_test = 0;
+
+ std::vector<std::string> split_str;
+ std::vector<std::string> feature_expr_train = populate_model(train_file, true);
+
+ for(int ff = 0; ff < feature_expr_train.size(); ++ff)
+ {
+ split_str = str_utils::split_string_trim(feature_expr_train[ff]);
+ int rung = std::stoi(split_str[0]);
+ std::string unit_str = split_str[1];
+ std::string postfix_expr = split_str[2];
+ std::string expr = split_str[3];
+
+ std::vector<double> feat_val(_n_samp_train);
+ std::vector<double> feat_test_val = {};
+ std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
+
+ model_node_ptr feat = std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str));
+ _feats.push_back(feat);
+ }
+
+}
+
+ModelRegressor::ModelRegressor(std::string train_file, std::string test_file)
+{
+ std::vector<std::string> split_str;
+ std::vector<std::string> feature_expr_train = populate_model(train_file, true);
+ std::vector<std::string> feature_expr_test = populate_model(test_file, false);
+
+ for(int ff = 0; ff < feature_expr_train.size(); ++ff)
+ {
+ if(feature_expr_train[ff] != feature_expr_test[ff])
+ throw std::logic_error("Features for train and test file do not agree");
+
+ split_str = str_utils::split_string_trim(feature_expr_train[ff]);
+
+ int rung = std::stoi(split_str[0]);
+ std::string unit_str = split_str[1];
+ std::string postfix_expr = split_str[2];
+ std::string expr = split_str[3];
+
+ std::vector<double> feat_val(_n_samp_train);
+ std::vector<double> feat_test_val(_n_samp_test);
+
+ std::copy_n(&_D_train[ff * _n_samp_train], _n_samp_train, feat_val.data());
+ std::copy_n(&_D_test[ff * _n_samp_test], _n_samp_test, feat_test_val.data());
+
+ _feats.push_back(std::make_shared<ModelNode>(ff, rung, expr, postfix_expr, feat_val, feat_test_val, Unit(unit_str)));
+ }
+}
+
+std::vector<std::string> ModelRegressor::populate_model(std::string filename, bool train)
+{
+ std::ifstream file_stream;
+ file_stream.open(filename, std::ios::in);
+
+ std::vector<std::string> feature_expr;
+ std::vector<std::string> split_line;
+
+ // Store model line
+ std::string model_line;
+ std::getline(file_stream, model_line);
+
+ // Get the property unit and error
+ std::string unit_line;
+ std::string error_line;
+
+ std::getline(file_stream, unit_line);
+ if(unit_line.substr(0, 8).compare("# RMSE: ") != 0)
+ {
+ split_line = str_utils::split_string_trim(unit_line);
+ _prop_unit = Unit(split_line.back());
+ std::getline(file_stream, error_line);
+ }
+ else
+ {
+ _prop_unit = Unit();
+ error_line = unit_line;
+ }
+
+ split_line = str_utils::split_string_trim(error_line);
+
+ double rmse = std::stod(split_line[1]);
+ double max_ae = std::stod(split_line[3]);
+
+ // Get coefficients
+ std::string line;
+ std::getline(file_stream, line);
+ std::getline(file_stream, line);
+
+ int n_task = 0;
+ int n_dim = 0;
+ std::getline(file_stream, line);
+ do
+ {
+ ++n_task;
+ split_line = str_utils::split_string_trim(line);
+ n_dim = split_line.size() - 3;
+ if(train)
+ {
+ _coefs.push_back(std::vector<double>(n_dim + 1, 0.0));
+ std::transform(split_line.begin() + 1, split_line.end()-1, _coefs.back().data(), [](std::string s){return std::stod(s);});
+ }
+ std::getline(file_stream, line);
+ } while(line.substr(0, 39).compare("# Feature Rung, Units, and Expressions") != 0);
+
+ if(_coefs.back().back() == 0.0)
+ _fix_intercept = true;
+ else
+ _fix_intercept = false;
+
+    _n_dim = n_dim;
+
+ std::getline(file_stream, line);
+ for(int ff = 0; ff < n_dim; ++ff)
+ {
+ feature_expr.push_back(line.substr(6));
+ std::getline(file_stream, line);
+ }
+
+ std::getline(file_stream, line);
+
+ int n_samp = 0;
+ for(int tt = 0; tt < n_task; ++tt)
+ {
+ std::getline(file_stream, line);
+ split_line = str_utils::split_string_trim(line);
+ n_samp += std::stoi(split_line[1]);
+ if(train)
+ _task_sizes_train.push_back(std::stoi(split_line[1]));
+ else
+ _task_sizes_test.push_back(std::stoi(split_line[1]));
+ }
+
+ if(train)
+ {
+ _n_samp_train = n_samp;
+ _prop_train.resize(n_samp);
+ _prop_train_est.resize(n_samp);
+ _train_error.resize(n_samp);
+ }
+ else
+ {
+ _n_samp_test = n_samp;
+ _prop_test.resize(n_samp);
+ _prop_test_est.resize(n_samp);
+ _test_error.resize(n_samp);
+ }
+
+ std::getline(file_stream, line);
+ std::getline(file_stream, line);
+ if(!train)
+ std::getline(file_stream, line);
+
+ std::vector<std::vector<double>> feat_vals(n_dim, std::vector<double>(n_samp, 0.0));
+ for(int ns = 0; ns < n_samp; ++ns)
+ {
+ std::getline(file_stream, line);
+ split_line = str_utils::split_string_trim(line);
+ if(train)
+ {
+ _prop_train[ns] = std::stod(split_line[0]);
+ _prop_train_est[ns] = std::stod(split_line[1]);
+ _train_error[ns] = _prop_train_est[ns] - _prop_train[ns];
+
+ }
+ else
+ {
+ _prop_test[ns] = std::stod(split_line[0]);
+ _prop_test_est[ns] = std::stod(split_line[1]);
+ _test_error[ns] = _prop_test_est[ns] - _prop_test[ns];
+ }
+ for(int nf = 0; nf < n_dim; ++nf)
+ {
+ feat_vals[nf][ns] = std::stod(split_line[2 + nf]);
+ }
+ }
+
+ if(train)
+ {
+ _D_train.resize(_n_dim * n_samp);
+ for(int nf = 0; nf < n_dim; ++nf)
+ std::copy_n(feat_vals[nf].data(), n_samp, &_D_train[nf * n_samp]);
+ }
+ else
+ {
+ _D_test.resize(_n_dim * n_samp);
+ for(int nf = 0; nf < n_dim; ++nf)
+ std::copy_n(feat_vals[nf].data(), n_samp, &_D_test[nf * n_samp]);
+ }
+
+ file_stream.close();
+ return feature_expr;
+}
+
+std::string ModelRegressor::toString() const
+{
+ std::stringstream unit_rep;
+ unit_rep << "c0";
+ for(int ff = 0; ff < _feats.size(); ++ff)
+ unit_rep << " + a" << std::to_string(ff) << " * " << _feats[ff]->expr();
+ return unit_rep.str();
+}
+
+std::ostream& operator<< (std::ostream& outStream, const ModelRegressor& model)
+{
+ outStream << model.toString();
+ return outStream;
+}
+
+void ModelRegressor::to_file(std::string filename, bool train, std::vector<int> test_inds)
+{
+ boost::filesystem::path p(filename.c_str());
+ boost::filesystem::create_directories(p.remove_filename());
+
+ std::ofstream out_file_stream = std::ofstream();
+ out_file_stream.open(filename);
+
+ out_file_stream << "# " << toString() << std::endl;
+ out_file_stream << "# Property of the Unit: " << _prop_unit.toString() << std::endl;
+ if(train)
+ out_file_stream << "# RMSE: " << std::setprecision(15) << rmse() << "; Max AE: " << max_ae() << std::endl;
+ else
+ out_file_stream << "# RMSE: " << std::setprecision(15) << test_rmse() << "; Max AE: " << test_max_ae() << std::endl;
+
+ out_file_stream << "# Coefficients" << std::endl;
+ out_file_stream << std::setw(10) << std::left << "# Task";
+
+ for(int cc = 0; cc < _coefs[0].size() - (1 - _fix_intercept); ++cc)
+ out_file_stream << std::setw(24) << " a" + std::to_string(cc);
+
+ if(!_fix_intercept)
+ out_file_stream << " c0" << std::endl;
+ else
+ out_file_stream << std::endl;
+
+ for(int cc = 0; cc < _coefs.size(); ++cc)
+ {
+ out_file_stream << std::setw(10) << std::left << "# " + std::to_string(cc) + ", ";
+ for(auto& coeff : _coefs[cc])
+ out_file_stream << std::setprecision(15) << std::scientific << std::right << std::setw(22) << coeff << std::setw(2) << ", ";
+ out_file_stream << "\n";
+ }
+
+ out_file_stream << "# Feature Rung, Units, and Expressions" << std::endl;
+ for(int ff = 0; ff < _feats.size(); ++ff)
+ out_file_stream << std::setw(6) << std::left << "# " + std::to_string(ff) + ", " << std::to_string(_feats[ff]->rung()) + ", " << std::setw(50) << _feats[ff]->unit().toString() + ", " << _feats[ff]->postfix_expr() + "," << _feats[ff]->expr() << std::endl;
+
+ out_file_stream << "# Number of Samples Per Task" << std::endl;
+ if(train)
+ {
+ out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_train" << std::endl;
+ for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+ out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_train[tt] << std::endl;
+ }
+ else
+ {
+ out_file_stream << std::setw(10) << std::left << "# Task;" << std::setw(24) << "n_mats_test" << std::endl;
+ for(int tt = 0; tt < _task_sizes_test.size(); ++tt)
+ out_file_stream << std::left << std::setw(10) << "# " + std::to_string(tt) + ", " << std::left << std::setw(22) << _task_sizes_test[tt] << std::endl;
+
+ out_file_stream << "# Test Indexes: [ " << test_inds[0];
+ for(int ii = 1; ii < test_inds.size(); ++ii)
+ out_file_stream << ", " << test_inds[ii];
+ out_file_stream << " ]" << std::endl;
+ }
+
+ out_file_stream << "\n" << std::setw(24) << std::left << "#Property Value" << std::setw(24) << " Property Value (EST)";
+ for(int ff = 0; ff < _feats.size(); ++ff)
+ out_file_stream << std::setw(24) << " Feature " + std::to_string(ff) + " Value";
+ out_file_stream << std::endl;
+
+ if(train)
+ {
+ for(int ss = 0; ss < _n_samp_train; ++ss)
+ {
+ out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_train[ss] << std::setw(2) << ", " << std::setw(22) << _prop_train_est[ss];
+ for(int ff = 0; ff < _n_dim; ++ff)
+ out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->value()[ss];
+ out_file_stream << std::endl;
+ }
+ }
+ else
+ {
+ for(int ss = 0; ss < _n_samp_test; ++ss)
+ {
+ out_file_stream << std::right << std::setw(22) << std::setprecision(15) << std::scientific << _prop_test[ss] << std::setw(2) << ", " << std::setw(22) << _prop_test_est[ss];
+ for(int ff = 0; ff < _n_dim; ++ff)
+ out_file_stream << std::right << std::setw(2) << ", " << std::setw(22) << std::setprecision(15) << _feats[ff]->test_value()[ss];
+ out_file_stream << std::endl;
+ }
+ }
+ out_file_stream.close();
+}
+
+std::vector<double> ModelRegressor::sorted_error()
+{
+ std::vector<double> sorted_error(_train_error.size(), 0.0);
+ std::copy_n(_train_error.data(), _train_error.size(), sorted_error.data());
+ std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);});
+ std::sort(sorted_error.begin(), sorted_error.end());
+ return sorted_error;
+}
+
+std::vector<double> ModelRegressor::sorted_test_error()
+{
+ std::vector<double> sorted_error(_test_error.size(), 0.0);
+ std::copy_n(_test_error.data(), _test_error.size(), sorted_error.data());
+ std::transform(sorted_error.begin(), sorted_error.end(), sorted_error.begin(), [](double e){return std::abs(e);});
+ std::sort(sorted_error.begin(), sorted_error.end());
+ return sorted_error;
+}
+
+double ModelRegressor::mape()
+{
+ std::vector<double> percent_error(_train_error.size(), 0.0);
+ std::transform(_train_error.begin(), _train_error.end(), _prop_train.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);});
+ return util_funcs::mean(percent_error);
+}
+
+double ModelRegressor::test_mape()
+{
+ std::vector<double> percent_error(_test_error.size(), 0.0);
+ std::transform(_test_error.begin(), _test_error.end(), _prop_test.begin(), percent_error.begin(), [](double e, double p){return std::abs(e / p);});
+ return util_funcs::mean(percent_error);
+}
diff --git a/src/descriptor_identifier/Model/ModelRegressor.hpp b/src/descriptor_identifier/Model/ModelRegressor.hpp
new file mode 100644
index 00000000..4a28d58f
--- /dev/null
+++ b/src/descriptor_identifier/Model/ModelRegressor.hpp
@@ -0,0 +1,305 @@
+/** @file descriptor_identifier/Model/ModelRegressor.hpp
+ * @brief Object to store the models generated from SISSO
+ *
+ * Creates a ModelRegressor generated from SISSO and the corresponding output file.
+ * It also has functionality to read in an output file to regenerate the model.
+ *
+ * @author Thomas A. R. Purcell (tpurcell)
+ * @bug No known bugs.
+ */
+#ifndef MODEL_REGRESSOR
+#define MODEL_REGRESSOR
+
+#include <boost/algorithm/string.hpp>
+#include <boost/algorithm/string/trim.hpp>
+#include <boost/filesystem.hpp>
+
+#include<iomanip>
+#include<fstream>
+#include<iostream>
+
+#include <descriptor_identifier/Model/Model.hpp>
+#include <feature_creation/node/ModelNode.hpp>
+#include <utils/string_utils.hpp>
+
+#ifdef PY_BINDINGS
+ namespace np = boost::python::numpy;
+ namespace py = boost::python;
+#endif
+
+typedef std::shared_ptr<ModelNode> model_node_ptr;
+
+// DocString: cls_model_reg
+/**
+ * @brief Class to store the models found from SISSO
+ *
+ */
+class ModelRegressor : public Model
+{
+ std::vector<double> _prop_train_est; //!< The estimated Property (training data)
+ std::vector<double> _prop_test_est; //!< The estimated Property (testing data)
+
+ std::vector<double> _train_error; //!< The error of the model (training)
+ std::vector<double> _test_error; //!< The error of the model (testing)
+
+public:
+ /**
+ * @brief Constructor for the model
+ *
+ * @param prop_unit The unit of the property
+ * @param prop_train The property vector for the training samples
+ * @param prop_test The property vector for the test samples
+ * @param feats The features for the model
+ * @param task_sizes_train Number of samples per task in the training data
+ * @param task_sizes_test Number of samples per task in the test data
+ */
+ ModelRegressor(Unit prop_unit, std::vector<double> prop_train, std::vector<double> prop_test, std::vector<model_node_ptr> feats, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, bool fix_intercept);
+
+ // DocString: model_reg_init_str
+ /**
+ * @brief Construct a model from a training output file
+ * @details Reads in all of the data from the output file and recreates the model object
+ *
+ * @param train_file Previously generated model file
+ */
+ ModelRegressor(std::string train_file);
+
+ // DocString: model_reg_init_str_str
+ /**
+ * @brief Construct a model from a training and testing output file
+ * @details Reads in all of the data from the output files and recreates the model object
+ *
+ * @param train_file Previously generated training model output file
+ * @param test_file Previously generated testing model output file
+ */
+ ModelRegressor(std::string train_file, std::string test_file);
+
+ /**
+ * @brief Copy Constructor
+ *
+ * @param o ModelRegressor to be copied
+ */
+ ModelRegressor(const ModelRegressor& o) = default;
+
+ /**
+ * @brief Move Constructor
+ *
+ * @param o ModelRegressor to be moved
+ */
+ ModelRegressor(ModelRegressor&& o) = default;
+
+ /**
+ * @brief Copy Assignment operator
+ *
+ * @param o ModelRegressor to be copied
+ */
+ ModelRegressor& operator= (const ModelRegressor& o) = default;
+
+ /**
+ * @brief Move Assignment operator
+ *
+ * @param o ModelRegressor to be moved
+ */
+ ModelRegressor& operator= (ModelRegressor&& o) = default;
+
+ /**
+ * @brief Read an output file and extract all relevant information
+ * @details Takes in an output file and extracts all data needed to recreate the model
+ *
+ * @param filename Name of the output file
+ * @param train If true then the output represents training data
+ *
+ * @return The lines of the output file describing the model's features
+ */
+ std::vector<std::string> populate_model(std::string filename, bool train);
+
+ // DocString: model_reg_str
+ /**
+ * @brief Convert the model to a string
+
+ * @return The string representation of the model
+ */
+ std::string toString() const;
+
+ /**
+ * @brief Copy the error into a new array
+ *
+ * @param res pointer to the beginning of the vector to store the residual
+ */
+ inline void copy_error(double* res){std::copy_n(_train_error.data(), _n_samp_train, res);}
+
+ // DocString: model_reg_rmse
+ /**
+ * @brief The training rmse of the model
+ */
+ inline double rmse(){return util_funcs::norm(_train_error.data(), _n_samp_train) / std::sqrt(static_cast<double>(_n_samp_train));}
+
+ // DocString: model_reg_test_rmse
+ /**
+ * @brief The test rmse of the model
+ */
+ inline double test_rmse(){return util_funcs::norm(_test_error.data(), _n_samp_test) / std::sqrt(static_cast<double>(_n_samp_test));}
+
+ // DocString: model_reg_max_ae
+ /**
+ * @brief The max Absolute error of the training data
+ */
+ inline double max_ae()
+ {
+ return std::abs(*std::max_element(_train_error.data(), _train_error.data() + _n_samp_train, [](double d1, double d2){return std::abs(d1) < std::abs(d2);}));
+ }
+
+ // DocString: model_reg_test_max_ae
+ /**
+ * @brief The max Absolute error of the testing data
+ */
+ inline double test_max_ae()
+ {
+ return std::abs(*std::max_element(_test_error.data(), _test_error.data() + _n_samp_test, [](double d1, double d2){return std::abs(d1) < std::abs(d2);}));
+ }
+
+ // DocString: model_reg_mae
+ /**
+ * @brief The mean absolute error of the model
+ * @return The mean absolute error of the training data
+ */
+ inline double mae(){return std::accumulate(_train_error.begin(), _train_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_train;}
+
+ // DocString: model_reg_test_mae
+ /**
+ * @brief The mean absolute test error of the model
+ * @return The mean absolute error of the test data
+ */
+ inline double test_mae(){return std::accumulate(_test_error.begin(), _test_error.end(), 0.0, [](double total, double e){return total + std::abs(e);}) / _n_samp_test;}
+
+ // DocString: model_reg_mape
+ /**
+ * @brief The mean absolute percent error of the model
+ * @return The mean absolute percent error of the training data
+ */
+ double mape();
+
+ // DocString: model_reg_test_mape
+ /**
+ * @brief The mean absolute percent test error of the model
+ * @return The mean absolute percent error of the test data
+ */
+ double test_mape();
+
+ /**
+ * @brief Sort the training error based on magnitude
+ * @return The error vector sorted
+ */
+ std::vector<double> sorted_error();
+
+ /**
+ * @brief Sort the test error based on magnitude
+ * @return The test_error vector sorted
+ */
+ std::vector<double> sorted_test_error();
+
+ // DocString: model_reg_percentile_25_ae
+ /**
+ * @brief The 25th percentile absolute error of the model (training)
+ * @return The 25th percentile absolute error of the training data
+ */
+ inline double percentile_25_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.25))];}
+
+ // DocString: model_reg_test_percentile_25_ae
+ /**
+ * @brief The 25th percentile absolute test error of the model
+ * @return The 25th percentile absolute error of the test data
+ */
+ inline double percentile_25_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.25))];}
+
+ // DocString: model_reg_percentile_50_ae
+ /**
+ * @brief The 50th percentile (median) absolute error of the model (training)
+ * @return The 50th percentile absolute error of the training data
+ */
+ inline double percentile_50_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.50))];}
+
+ // DocString: model_reg_test_percentile_50_ae
+ /**
+ * @brief The 50th percentile (median) absolute test error of the model
+ * @return The 50th percentile absolute error of the test data
+ */
+ inline double percentile_50_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.50))];}
+
+ // DocString: model_reg_percentile_75_ae
+ /**
+ * @brief The 75th percentile absolute error of the model (training)
+ * @return The 75th percentile absolute error of the training data
+ */
+ inline double percentile_75_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.75))];}
+
+ // DocString: model_reg_test_percentile_75_ae
+ /**
+ * @brief The 75th percentile absolute test error of the model
+ * @return The 75th percentile absolute error of the test data
+ */
+ inline double percentile_75_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.75))];}
+
+ // DocString: model_reg_percentile_95_ae
+ /**
+ * @brief The 95th percentile absolute error of the model (training)
+ * @return The 95th percentile absolute error of the training data
+ */
+ inline double percentile_95_ae(){return sorted_error()[static_cast<int>(floor(_n_samp_train * 0.95))];}
+
+ // DocString: model_reg_test_percentile_95_ae
+ /**
+ * @brief The 95th percentile absolute test error of the model
+ * @return The 95th percentile absolute error of the test data
+ */
+ inline double percentile_95_test_ae(){return sorted_test_error()[static_cast<int>(floor(_n_samp_test * 0.95))];}
+
+ /**
+ * @brief Convert the ModelRegressor into an output file
+ *
+ * @param filename The name of the file to output to
+ * @param train If true output the training data
+ * @param test_inds The indexes of the test set
+ */
+ void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {});
+
+ #ifdef PY_BINDINGS
+ // DocString: model_reg_prop_train_est
+ /**
+ * @brief The estimation of the property
+ * @return _prop_train_est as a numpy ndarray
+ */
+ inline np::ndarray prop_train_est(){return python_conv_utils::to_ndarray<double>(_prop_train_est);}
+
+ // DocString: model_reg_prop_test_est
+ /**
+ * @brief The estimation of the properties test values
+ * @return _prop_test_est as a numpy ndarray
+ */
+ inline np::ndarray prop_test_est(){return python_conv_utils::to_ndarray<double>(_prop_test_est);}
+
+ // DocString: model_reg_train_error
+ /**
+ * @brief The training error
+ * @return _train_error as a numpy ndarray
+ */
+ inline np::ndarray train_error(){return python_conv_utils::to_ndarray<double>(_train_error);}
+
+ // DocString: model_reg_test_error
+ /**
+ * @brief The test error
+ * @return _test_error as a numpy ndarray
+ */
+ inline np::ndarray test_error(){return python_conv_utils::to_ndarray<double>(_test_error);}
+ #endif
+};
+
+/**
+ * @brief Print a model to an string stream
+ *
+ * @param outStream The output stream the model is to be printed
+ * @param model The model to be printed
+ */
+std::ostream& operator<< (std::ostream& outStream, const ModelRegressor& model);
+
+#endif
\ No newline at end of file
diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp
new file mode 100644
index 00000000..d24d8d5f
--- /dev/null
+++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.cpp
@@ -0,0 +1,252 @@
+#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp>
+
+SISSOClassifier::SISSOClassifier(
+ std::shared_ptr<FeatureSpace> feat_space,
+ Unit prop_unit,
+ std::vector<double> prop,
+ std::vector<double> prop_test,
+ std::vector<int> task_sizes_train,
+ std::vector<int> task_sizes_test,
+ std::vector<int> leave_out_inds,
+ int n_dim,
+ int n_residual,
+ int n_models_store,
+ bool fix_intercept
+):
+ SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept),
+ _training(task_sizes_test.size()),
+ _int_prop(prop.size()),
+ _int_prop_test(prop_test.size()),
+ _int_error(prop.size()),
+ _trainer(1, !fix_intercept),
+ _c(1.0)
+{
+ _trainer.stoppingCondition().maxSeconds = 0.01;
+
+ double min_prop = *std::min_element(prop.begin(), prop.end());
+ if(min_prop < 0.0)
+ {
+ std::transform(prop.begin(), prop.end(), prop.begin(), [&min_prop](double pp){return pp - min_prop;});
+ std::transform(prop_test.begin(), prop_test.end(), prop_test.begin(), [&min_prop](double pp){return pp - min_prop;});
+ }
+ std::vector<bool> not_int(prop.size());
+ std::transform(prop.begin(), prop.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (training set) must be an integer");
+
+ not_int.resize(prop_test.size());
+ std::transform(prop_test.begin(), prop_test.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (test set) must be an integer");
+
+ for(auto& pt : prop_test)
+ if(std::none_of(prop.begin(), prop.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;}))
+ throw std::logic_error("A class in the property vector (test set) is not in the training set.");
+
+ std::transform(_prop.begin(), _prop.end(), _prop.begin(), [&min_prop](double pp){return pp - min_prop;});
+ std::transform(_prop_test.begin(), _prop_test.end(), _prop_test.begin(), [&min_prop](double pp){return pp - min_prop;});
+
+ for(int pp = 0; pp < _prop.size(); ++pp)
+ _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp]));
+
+ for(int pp = 0; pp < _prop_test.size(); ++pp)
+ _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp]));
+
+}
+
+void SISSOClassifier::setup_training_data(int n_dim)
+{
+ std::vector<unsigned int> prop_to_copy(*std::max_element(_task_sizes_train.begin(), _task_sizes_train.end()), 0);
+ int n_els = 0;
+ for(int tt = 0; tt < _task_sizes_train.size(); ++tt)
+ {
+ shark::Data<shark::RealVector> data(_task_sizes_train[tt], shark::RealVector(n_dim));
+ shark::Data<unsigned int> labels(_task_sizes_train[tt], 0);
+
+ for(int ii = 0; ii < _task_sizes_train[tt]; ++ii)
+ prop_to_copy[ii] = static_cast<unsigned int>(_prop[ii + n_els]);
+
+ int n_copied = 0;
+ for(auto& batch : labels.batches())
+ {
+ std::copy_n(prop_to_copy.begin() + n_copied, shark::batchSize(batch), &batch(0));
+ n_els += shark::batchSize(batch);
+ n_copied += shark::batchSize(batch);
+ }
+ _training[tt] = shark::LabeledData<shark::RealVector, unsigned int>(data, labels);
+ }
+}
+
+void SISSOClassifier::set_data(std::vector<int>& inds, int start, int task)
+{
+ int n_els = 0;
+ for(auto& batch: _training[task].inputs().batches())
+ {
+ for(int ii = 0; ii < inds.size(); ++ii)
+ dcopy_(shark::batchSize(batch), node_value_arrs::get_d_matrix_ptr(inds[ii]) + start + n_els, 1, &batch(0, 0) + ii, inds.size());
+ n_els += shark::batchSize(batch);
+ }
+}
+
+void SISSOClassifier::svm(std::vector<int>& inds, double* coefs, int start, int task)
+{
+ set_data(inds, start, task);
+ _trainer.train(_model, _training[task]);
+ std::copy_n(_model.parameterVector().begin(), inds.size(), coefs);
+ std::copy_n(_model.bias().begin(), _model.bias().size(), coefs + inds.size());
+}
+
+void SISSOClassifier::l0_norm(std::vector<double>& prop, int n_dim)
+{
+ int n_get_models = std::max(_n_residual, _n_models_store);
+ std::vector<double> coefs(n_dim + 1, 0.0);
+
+ std::vector<int> inds(n_dim, 0);
+ std::vector<int> inds_min(n_get_models * _n_dim, -1);
+
+ std::vector<double> min_errors(n_get_models, util_funcs::norm(_prop.data(), _n_samp));
+ std::vector<double> min_dist_errors(n_get_models, util_funcs::norm(_prop.data(), _n_samp));
+
+ for(int rr = 0; rr < n_dim; ++rr)
+ inds[rr] = _feat_space->phi_selected().size() - 1 - rr;
+ util_funcs::iterate(inds, inds.size(), _mpi_comm->rank());
+ int max_error_ind = 0;
+
+ std::vector<double> gamma(_prop.size(), 0.0);
+ std::vector<double> gamma_feat(_prop.size(), 0.0);
+ do {
+ int start = 0;
+ double error = 0.0;
+ double dist_error = 0.0;
+ // for(auto& sz : _task_sizes_train)
+ for(int tt = 0; tt < _task_sizes_train.size(); ++ tt)
+ {
+ svm(inds, coefs.data(), start, tt);
+ double norm = util_funcs::norm(coefs.data(), n_dim);
+ std::fill_n(gamma.begin(), _task_sizes_train[tt], (_fix_intercept ? 0.0 : coefs[n_dim]) / norm);
+ int n_els = 0;
+ for(auto& batch: _training[tt].inputs().batches())
+ {
+ for(int ii = 0; ii < inds.size(); ++ii)
+ daxpy_(_task_sizes_train[tt], coefs[ii] / norm, &batch(0, 0) + ii, n_dim, gamma_feat.data() + n_els, 1);
+ n_els += shark::batchSize(batch);
+ }
+ std::transform(gamma_feat.begin(), gamma_feat.begin() + _task_sizes_train[tt], gamma.begin(), gamma.begin(), [](double d1, double d2){return std::abs(d1) + d2;});
+
+ error += _loss.eval(_training[tt].labels(), _model(_training[tt].inputs())) * _task_sizes_train[tt];
+ dist_error += *std::min_element(gamma.begin(), gamma.end());
+ start += _task_sizes_train[tt];
+ }
+ dist_error /= static_cast<double>(_task_sizes_train.size());
+
+ if((error < min_errors[max_error_ind]) || ((error == min_errors[max_error_ind]) && (dist_error > min_dist_errors[max_error_ind])))
+ {
+ min_errors[max_error_ind] = error;
+ min_dist_errors[max_error_ind] = dist_error;
+ std::copy_n(inds.begin(), inds.size(), inds_min.begin() + max_error_ind * n_dim);
+
+ double max_dist = *std::max_element(min_dist_errors.begin(), min_dist_errors.end()) + 0.01;
+ daxpy_(n_get_models, -1.0 / max_dist, min_dist_errors.data(), 1, min_errors.data(), 1);
+ max_error_ind = std::max_element(min_errors.begin(), min_errors.end()) - min_errors.begin();
+ daxpy_(n_get_models, 1.0 / max_dist, min_dist_errors.data(), 1, min_errors.data(), 1);
+ }
+ } while(util_funcs::iterate(inds, inds.size(), _mpi_comm->size()));
+
+ std::vector<double> all_min_error(_mpi_comm->size() * n_get_models);
+ std::vector<double> all_min_dist_error(_mpi_comm->size() * n_get_models);
+ std::vector<int> all_inds_min(_mpi_comm->size() * n_get_models * n_dim);
+
+ mpi::all_gather(*_mpi_comm, min_errors.data(), n_get_models, all_min_error);
+ mpi::all_gather(*_mpi_comm, min_dist_errors.data(), n_get_models, all_min_dist_error);
+ mpi::all_gather(*_mpi_comm, inds_min.data(), n_get_models * n_dim, all_inds_min);
+
+ inds = util_funcs::argsort(all_min_error);
+
+ std::vector<model_node_ptr> min_nodes(n_dim);
+ std::vector<ModelClassifier> models;
+
+ for(int rr = 0; rr < n_get_models; ++rr)
+ {
+ node_value_arrs::clear_temp_test_reg();
+ for(int ii = 0; ii < n_dim; ++ii)
+ {
+ int index = all_inds_min[inds[rr] * n_dim + ii];
+ min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit());
+ }
+ models.push_back(ModelClassifier(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept));
+ }
+
+ _models.push_back(models);
+}
+
+
+void SISSOClassifier::fit()
+{
+ std::vector<double> residual(_n_samp * _n_residual);
+ std::clock_t start;
+ double duration;
+ start = std::clock();
+
+ _feat_space->sis(_prop);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for SIS: " << duration << std::endl;
+
+ start = std::clock();
+
+ std::vector<ModelClassifier> models;
+ for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr)
+ {
+ node_value_arrs::clear_temp_test_reg();
+ model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit());
+ models.push_back(ModelClassifier(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept));
+ if(rr < _n_residual)
+ models.back().copy_error(&residual[rr * _n_samp]);
+
+ if((_mpi_comm->rank() == 0) && (rr < _n_models_store))
+ {
+ models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat");
+ if(_leave_out_inds.size() > 0)
+ models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
+ }
+ }
+ _models.push_back(models);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for Model: " << duration << std::endl;
+
+ for(int dd = 2; dd <= _n_dim; ++dd)
+ {
+ start = std::clock();
+ _feat_space->sis(residual);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for SIS: " << duration << std::endl;
+
+ start = std::clock();
+ l0_norm(_prop, dd);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ {
+ std::cout << "Time for l0-norm: " << duration << std::endl;
+
+ for(int rr = 0; rr < _n_models_store; ++rr)
+ {
+ _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat");
+ if(_leave_out_inds.size() > 0)
+ _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
+ }
+ }
+ for(int rr = 0; rr < _n_residual; ++rr)
+ _models.back()[rr].copy_error(&residual[rr * _n_samp]);
+ }
+}
+
diff --git a/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp
new file mode 100644
index 00000000..6e46c76c
--- /dev/null
+++ b/src/descriptor_identifier/SISSO_DI/SISSOClassifier.hpp
@@ -0,0 +1,183 @@
+/** @file descriptor_identifier/SISSO_DI/SISSOClassifier.hpp
+ * @brief Perform SISSO on a previously generated Feature Space
+ *
+ * Takes in a feature space and performs SISSO on it while storing the selected features in _models
+ *
+ * @author Thomas A. R. Purcell (tpurcell)
+ * @bug No known bugs.
+ */
+#ifndef SISSO_CLASSIFIER
+#define SISSO_CLASSIFIER
+
+#include <shark/Data/Dataset.h>
+#include <shark/ObjectiveFunctions/Loss/ZeroOneLoss.h>
+#include <shark/Algorithms/Trainers/CSvmTrainer.h>
+
+#include <feature_creation/feature_space/FeatureSpace.hpp>
+#include <descriptor_identifier/Model/ModelClassifier.hpp>
+#include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp>
+#include <ctime>
+
+#ifdef PY_BINDINGS
+ namespace np = boost::python::numpy;
+ namespace py = boost::python;
+#endif
+
+// DocString: cls_sisso_class
+/**
+ * @brief SISSO Classifier class, performs the SISSO algorithm for classification and stores all selected models
+ *
+ */
+class SISSOClassifier: public SISSO_DI
+{
+protected:
+ std::vector<std::vector<ModelClassifier>> _models; //!< List of models
+
+ std::vector<shark::LabeledData<shark::RealVector, unsigned int>> _training; //!< Training data set for all tasks
+
+ std::vector<unsigned int> _int_prop; //!< Property array (training data)
+ std::vector<unsigned int> _int_prop_test; //!< Property array (testing data)
+ std::vector<int> _int_error; //!< Array to calculate the residuals for the models (training data)
+ shark::ZeroOneLoss<unsigned int> _loss; //!< The loss calculator
+
+ shark::LinearCSvmTrainer<shark::RealVector> _trainer; //!< Trainer used to fit the linear C-SVM on each task's training data
+ shark::LinearClassifier<shark::RealVector> _model; //!< Model for the SVM classification problem
+
+ double _c; //!< The SVM cost (C) hyperparameter
+
+public:
+ /**
+ * @brief Constructor for the classifier
+ *
+ * @param feat_space The feature space to run SISSO on
+ * @param prop_unit The unit of the property
+ * @param prop Vector storing all data to train the SISSO models with
+ * @param prop_test Vector storing all data to test the SISSO models with
+ * @param task_sizes_train Number of training samples per task
+ * @param task_sizes_test Number of testing samples per task
+ * @param leave_out_inds List of indexes from the initial data file in the test set
+ * @param n_dim Maximum dimensionality of the generated models
+ * @param n_residual Number of residuals to pass to the next SIS operation
+ * @param n_models_store Number of features to store in files
+ * @param fix_intercept If true fix intercept to 0
+ */
+ SISSOClassifier(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, int n_models_store, bool fix_intercept=false);
+
+ /**
+ * @brief Set up the Shark training data sets (inputs and labels) for each task
+ *
+ * @param n_dim Number of features used as inputs to the SVM
+ *
+ */
+ void setup_training_data(int n_dim);
+
+ /**
+ * @brief Train the SVM for a given set of features and extract the coefficients
+ *
+ * @param inds Feature indexes to get the model of
+ * @param coeffs Coefficients for the model
+ * @param start The offset into the descriptor matrix for this task's samples
+ * @param task The task index whose training data is used
+ */
+ void svm(std::vector<int>& inds, double* coeffs, int start, int task);
+
+ /**
+ * @brief Set the data object for the given set of inds
+ *
+ * @param inds indexes of the selected features
+ * @param start The offset into the descriptor matrix for this task's samples
+ * @param task The task index whose data set is populated
+ */
+ void set_data(std::vector<int>& inds, int start, int task);
+
+ /**
+ * @brief Perform the l0 normalization for a property or the residual
+ *
+ * @param prop The property to fit
+ * @param n_dim the dimensionality of the model
+ */
+ void l0_norm(std::vector<double>& prop, int n_dim);
+
+ // DocString: sisso_class_fit
+ /**
+ * @brief Perform SISSO to generate the models
+ * @details Iteratively perform SISSO on the Feature space and property until the model dimensionality is equal to _n_dim
+ */
+ void fit();
+
+ /**
+ * @brief Accessor function for models
+ */
+ inline std::vector<std::vector<ModelClassifier>> models(){return _models;}
+
+ // Python interface functions
+ #ifdef PY_BINDINGS
+ /**
+ * @brief Constructor for the classifier that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOClassifier.cpp)
+ *
+ * @param feat_space The feature space to run SISSO on
+ * @param prop_unit The unit of the property
+ * @param prop Vector storing all data to train the SISSO models with
+ * @param prop_test Vector storing all data to test the SISSO models with
+ * @param task_sizes_train Number of training samples per task
+ * @param task_sizes_test Number of testing samples per task
+ * @param leave_out_inds List of indexes from the initial data file in the test set
+ * @param n_dim Maximum dimensionality of the generated models
+ * @param n_residual Number of residuals to pass to the next SIS operation
+ * @param n_models_store Number of features to store in files
+ * @param fix_intercept If true fix intercept to 0
+ */
+ SISSOClassifier(
+ std::shared_ptr<FeatureSpace> feat_space,
+ Unit prop_unit,
+ np::ndarray prop,
+ np::ndarray prop_test,
+ py::list task_sizes_train,
+ py::list task_sizes_test,
+ py::list leave_out_inds,
+ int n_dim,
+ int n_residual,
+ int n_models_store,
+ bool fix_intercept=false
+ );
+
+ // DocString: sisso_class_init_list
+ /**
+ * @brief Constructor for the classifier that takes in python objects (cpp definition in <python/descriptor_identifier/SISSOClassifier.cpp)
+ *
+ * @param feat_space The feature space to run SISSO on
+ * @param prop_unit The unit of the property
+ * @param prop Vector storing all data to train the SISSO models with
+ * @param prop_test Vector storing all data to test the SISSO models with
+ * @param task_sizes_train Number of training samples per task
+ * @param task_sizes_test Number of testing samples per task
+ * @param leave_out_inds List of indexes from the initial data file in the test set
+ * @param n_dim Maximum dimensionality of the generated models
+ * @param n_residual Number of residuals to pass to the next SIS operation
+ * @param n_models_store Number of features to store in files
+ * @param fix_intercept If true fix intercept to 0
+ */
+ SISSOClassifier(
+ std::shared_ptr<FeatureSpace> feat_space,
+ Unit prop_unit,
+ py::list prop,
+ py::list prop_test,
+ py::list task_sizes_train,
+ py::list task_sizes_test,
+ py::list leave_out_inds,
+ int n_dim,
+ int n_residual,
+ int n_models_store,
+ bool fix_intercept=false
+ );
+
+ // DocString: sisso_class_models_py
+ /**
+ * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSOClassifier.cpp)
+ * @return models as a python list
+ */
+ py::list models_py();
+ #endif
+};
+
+#endif
diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp
index e7009a78..828129eb 100644
--- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp
+++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.cpp
@@ -125,7 +125,7 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim)
inds = util_funcs::argsort(all_min_error);
std::vector<model_node_ptr> min_nodes(n_dim);
- std::vector<Model> models;
+ std::vector<ModelRegressor> models;
for(int rr = 0; rr < n_get_models; ++rr)
{
@@ -135,8 +135,79 @@ void SISSORegressor::l0_norm(std::vector<double>& prop, int n_dim)
int index = all_inds_min[inds[rr] * n_dim + ii];
min_nodes[ii] = std::make_shared<ModelNode>(_feat_space->phi_selected()[index]->arr_ind(), _feat_space->phi_selected()[index]->rung(), _feat_space->phi_selected()[index]->expr(), _feat_space->phi_selected()[index]->postfix_expr(), _feat_space->phi_selected()[index]->value(), _feat_space->phi_selected()[index]->test_value(), _feat_space->phi_selected()[index]->unit());
}
- models.push_back(Model(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept));
+ models.push_back(ModelRegressor(_prop_unit, _prop, _prop_test, min_nodes, _task_sizes_train, _task_sizes_test, _fix_intercept));
}
_models.push_back(models);
}
+
+
+void SISSORegressor::fit()
+{
+ std::vector<double> residual(_n_samp * _n_residual);
+ std::clock_t start;
+ double duration;
+ start = std::clock();
+
+ _feat_space->sis(_prop);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for SIS: " << duration << std::endl;
+
+ start = std::clock();
+
+ std::vector<ModelRegressor> models;
+ for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr)
+ {
+ node_value_arrs::clear_temp_test_reg();
+ model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit());
+ models.push_back(ModelRegressor(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept));
+ if(rr < _n_residual)
+ models.back().copy_error(&residual[rr * _n_samp]);
+
+ if((_mpi_comm->rank() == 0) && (rr < _n_models_store))
+ {
+ models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat");
+ if(_leave_out_inds.size() > 0)
+ models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
+ }
+ }
+ _models.push_back(models);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for Model: " << duration << std::endl;
+
+ for(int dd = 2; dd <= _n_dim; ++dd)
+ {
+ start = std::clock();
+ _feat_space->sis(residual);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ std::cout << "Time for SIS: " << duration << std::endl;
+
+ start = std::clock();
+ l0_norm(_prop, dd);
+
+ _mpi_comm->barrier();
+ duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
+ if(_mpi_comm->rank() == 0)
+ {
+ std::cout << "Time for l0-norm: " << duration << std::endl;
+
+ for(int rr = 0; rr < _n_models_store; ++rr)
+ {
+ _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat");
+ if(_leave_out_inds.size() > 0)
+ _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
+ }
+ }
+ for(int rr = 0; rr < _n_residual; ++rr)
+ _models.back()[rr].copy_error(&residual[rr * _n_samp]);
+ }
+}
diff --git a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp
index ee62cb5f..939994df 100644
--- a/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp
+++ b/src/descriptor_identifier/SISSO_DI/SISSORegressor.hpp
@@ -10,7 +10,7 @@
#define SISSO_REGRESSOR
#include <feature_creation/feature_space/FeatureSpace.hpp>
-#include <descriptor_identifier/Model/Model.hpp>
+#include <descriptor_identifier/Model/ModelRegressor.hpp>
#include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp>
#include <ctime>
@@ -27,6 +27,8 @@
class SISSORegressor: public SISSO_DI
{
protected:
+ std::vector<std::vector<ModelRegressor>> _models; //!< List of models
+
std::vector<double> _a; //!< A matrix for least squares
std::vector<double> _b; //!< Solution array for least squares
std::vector<double> _work; //!< The work array for least squares problems
@@ -94,6 +96,18 @@ public:
*/
void l0_norm(std::vector<double>& prop, int n_dim);
+ // DocString: sisso_reg_fit
+ /**
+ * @brief Perform SISSO to generate the models
+ * @details Iteratively perform SISSO on the Feature space and property until the model dimensionality is equal to _n_dim
+ */
+ void fit();
+
+ /**
+ * @brief Accessor function for models
+ */
+ inline std::vector<std::vector<ModelRegressor>> models(){return _models;}
+
// Python interface functions
#ifdef PY_BINDINGS
/**
@@ -154,6 +168,13 @@ public:
int n_models_store,
bool fix_intercept=false
);
+
+ // DocString: sisso_reg_models_py
+ /**
+ * @brief The selected models (cpp definition in python/descriptor_identifier/SISSORegressor.cpp)
+ * @return models as a python list
+ */
+ py::list models_py();
#endif
};
diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp
index 9e9dabe7..3556902b 100644
--- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp
+++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.cpp
@@ -28,74 +28,3 @@ SISSO_DI::SISSO_DI(
_n_models_store(n_models_store),
_fix_intercept(fix_intercept)
{}
-
-
-void SISSO_DI::fit()
-{
- std::vector<double> residual(_n_samp * _n_residual);
- std::clock_t start;
- double duration;
- start = std::clock();
-
- _feat_space->sis(_prop);
-
- _mpi_comm->barrier();
- duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
- if(_mpi_comm->rank() == 0)
- std::cout << "Time for SIS: " << duration << std::endl;
-
- start = std::clock();
-
- std::vector<Model> models;
- for(int rr = 0; rr < std::max(_n_residual, _n_models_store); ++rr)
- {
- node_value_arrs::clear_temp_test_reg();
- model_node_ptr model_feat = std::make_shared<ModelNode>(_feat_space->phi_selected()[rr]->arr_ind(), _feat_space->phi_selected()[rr]->rung(), _feat_space->phi_selected()[rr]->expr(), _feat_space->phi_selected()[rr]->postfix_expr(), _feat_space->phi_selected()[rr]->value(), _feat_space->phi_selected()[rr]->test_value(), _feat_space->phi_selected()[rr]->unit());
- models.push_back(Model(_prop_unit, _prop, _prop_test, {model_feat}, _task_sizes_train, _task_sizes_test, _fix_intercept));
- if(rr < _n_residual)
- models.back().copy_error(&residual[rr * _n_samp]);
-
- if((_mpi_comm->rank() == 0) && (rr < _n_models_store))
- {
- models.back().to_file("models/train_dim_1_model_" + std::to_string(rr) + ".dat");
- if(_leave_out_inds.size() > 0)
- models.back().to_file("models/test_dim_1_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
- }
- }
- _models.push_back(models);
-
- _mpi_comm->barrier();
- duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
- if(_mpi_comm->rank() == 0)
- std::cout << "Time for Model: " << duration << std::endl;
-
- for(int dd = 2; dd <= _n_dim; ++dd)
- {
- start = std::clock();
- _feat_space->sis(residual);
-
- _mpi_comm->barrier();
- duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
- if(_mpi_comm->rank() == 0)
- std::cout << "Time for SIS: " << duration << std::endl;
-
- start = std::clock();
- l0_norm(_prop, dd);
-
- _mpi_comm->barrier();
- duration = ( std::clock() - start ) / (double) CLOCKS_PER_SEC;
- if(_mpi_comm->rank() == 0)
- {
- std::cout << "Time for l0-norm: " << duration << std::endl;
-
- for(int rr = 0; rr < _n_models_store; ++rr)
- {
- _models.back()[rr].to_file("models/train_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat");
- if(_leave_out_inds.size() > 0)
- _models.back()[rr].to_file("models/test_dim_" + std::to_string(dd) + "_model_" + std::to_string(rr) + ".dat", false, _leave_out_inds);
- }
- }
- for(int rr = 0; rr < _n_residual; ++rr)
- _models.back()[rr].copy_error(&residual[rr * _n_samp]);
- }
-}
diff --git a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp
index ddf46e76..44075754 100644
--- a/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp
+++ b/src/descriptor_identifier/SISSO_DI/SISSO_DI.hpp
@@ -26,8 +26,6 @@
class SISSO_DI
{
protected:
- std::vector<std::vector<Model>> _models; //!< List of models
-
std::vector<double> _prop; //!< Property array (training data)
std::vector<double> _prop_test; //!< Property array (testing data)
std::vector<double> _error; //!< Array to calculate the residuals for the models (training data)
@@ -65,21 +63,11 @@ public:
*/
SISSO_DI(std::shared_ptr<FeatureSpace> feat_space, Unit prop_unit, std::vector<double> prop, std::vector<double> prop_test, std::vector<int> task_sizes_train, std::vector<int> task_sizes_test, std::vector<int> leave_out_inds, int n_dim, int n_residual, int n_models_store, bool fix_intercept=false);
- /**
- * @brief Set the residual for the next step
- *
- * @param inds indexes of the selected features
- * @param coeffs Coefficients of the model
- * @param start The index in the property and feature vectors start copying into _b and _a
- * @param n_samp number of samples to perform least squares optimization on
- */
- virtual void set_error(std::vector<int>& inds, double* coeffs, int start, int n_samp) = 0;
-
/**
* @brief Perform SISSO to generate the models
* @details Iteratively pefrom SISSO on the Feature space and property until the model dimensionality is equal to _n_dim
*/
- void fit();
+ virtual void fit() = 0;
/**
* @brief Preform the l0 normalization for a property or the residual
@@ -109,11 +97,6 @@ public:
*/
inline std::vector<double> error(){return _error;}
- /**
- * @brief Acessor function for models
- */
- inline std::vector<std::vector<Model>> models(){return _models;}
-
// DocString: sisso_di_n_samp
/**
* @brief The number of samples per feature
@@ -199,13 +182,6 @@ public:
bool fix_intercept=false
);
- // DocString: sisso_di_models_py
- /**
- * @brief The selected models (cpp definition in <python/descriptor_identifier/SISSO_DI.cpp)
- * @return models as a python list
- */
- py::list models_py();
-
// DocString: sisso_di_prop_py
/**
* @brief The property to be learned
diff --git a/src/python/bindings_docstring_keyed.cpp b/src/python/bindings_docstring_keyed.cpp
index b9c06e41..f055ea4b 100644
--- a/src/python/bindings_docstring_keyed.cpp
+++ b/src/python/bindings_docstring_keyed.cpp
@@ -5,8 +5,11 @@ using namespace boost::python;
void sisso::register_all()
{
sisso::descriptor_identifier::registerModel();
+ sisso::descriptor_identifier::registerModelRegressor();
+ sisso::descriptor_identifier::registerModelClassifier();
sisso::descriptor_identifier::registerSISSO_DI();
sisso::descriptor_identifier::registerSISSORegressor();
+ sisso::descriptor_identifier::registerSISSOClassifier();
sisso::feature_creation::registerFeatureSpace();
sisso::feature_creation::registerUnit();
sisso::feature_creation::node::registerNode();
@@ -341,49 +344,69 @@ void sisso::feature_creation::node::registerSixPowNode()
void sisso::descriptor_identifier::registerModel()
{
- class_<Model>("Model", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>())
- .def(init<std::string>())
- .def(init<std::string, std::string>())
- .def("__str__", &Model::toString, "@DocString_model_str@")
- .def("__repr__", &Model::toString, "@DocString_model_str@")
+ class_<sisso::descriptor_identifier::Model_Wrap, boost::noncopyable>("Model", no_init)
.add_property("n_samp_train", &Model::n_samp_train, "@DocString_model_n_samp_train@")
.add_property("n_samp_test", &Model::n_samp_test, "@DocString_model_n_samp_test@")
.add_property("n_dim", &Model::n_dim, "@DocString_model_n_dim@")
- .add_property("fit", &Model::prop_train_est, "@DocString_model_prop_train_est@")
- .add_property("predict", &Model::prop_test_est, "@DocString_model_prop_test_est@")
.add_property("prop_train", &Model::prop_train, "@DocString_model_prop_train@")
.add_property("prop_test", &Model::prop_test, "@DocString_model_prop_test@")
- .add_property("train_error", &Model::train_error, "@DocString_model_train_error@")
- .add_property("test_error", &Model::test_error, "@DocString_model_test_error@")
.add_property("feats", &Model::feats, "@DocString_model_feats@")
.add_property("coefs", &Model::coefs, "@DocString_model_coefs@")
- .add_property("rmse", &Model::rmse, "@DocString_model_rmse@")
- .add_property("test_rmse", &Model::test_rmse, "@DocString_model_test_rmse@")
- .add_property("max_ae", &Model::max_ae, "@DocString_model_max_ae@")
- .add_property("test_max_ae", &Model::test_max_ae, "@DocString_model_test_max_ae@")
- .add_property("mae", &Model::mae, "@DocString_model_mae@")
- .add_property("test_mae", &Model::test_mae, "@DocString_model_test_mae@")
- .add_property("mape", &Model::mape, "@DocString_model_mape@")
- .add_property("test_mape", &Model::test_mape, "@DocString_model_test_mape@")
- .add_property("percentile_25_ae", &Model::percentile_25_ae, "@DocString_model_percentile_25_ae@")
- .add_property("percentile_25_test_ae", &Model::percentile_25_test_ae, "@DocString_model_test_percentile_25_test_ae@")
- .add_property("percentile_50_ae", &Model::percentile_50_ae, "@DocString_model_percentile_50_ae@")
- .add_property("percentile_50_test_ae", &Model::percentile_50_test_ae, "@DocString_model_test_percentile_50_test_ae@")
- .add_property("percentile_75_ae", &Model::percentile_75_ae, "@DocString_model_percentile_75_ae@")
- .add_property("percentile_75_test_ae", &Model::percentile_75_test_ae, "@DocString_model_test_percentile_75_test_ae@")
- .add_property("percentile_95_ae", &Model::percentile_95_ae, "@DocString_model_percentile_95_ae@")
- .add_property("percentile_95_test_ae", &Model::percentile_95_ae, "@DocString_model_test_percentile_95_ae@")
.add_property("prop_unit", &Model::prop_unit, "@DocString_model_prop_unit@")
;
}
+void sisso::descriptor_identifier::registerModelRegressor()
+{
+ class_<ModelRegressor, bases<Model>>("ModelRegressor", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>())
+ .def(init<std::string>())
+ .def(init<std::string, std::string>())
+ .def("__str__", &ModelRegressor::toString, "@DocString_model_reg_str@")
+ .def("__repr__", &ModelRegressor::toString, "@DocString_model_reg_str@")
+ .add_property("fit", &ModelRegressor::prop_train_est, "@DocString_model_reg_prop_train_est@")
+ .add_property("predict", &ModelRegressor::prop_test_est, "@DocString_model_reg_prop_test_est@")
+ .add_property("train_error", &ModelRegressor::train_error, "@DocString_model_reg_train_error@")
+ .add_property("test_error", &ModelRegressor::test_error, "@DocString_model_reg_test_error@")
+ .add_property("rmse", &ModelRegressor::rmse, "@DocString_model_reg_rmse@")
+ .add_property("test_rmse", &ModelRegressor::test_rmse, "@DocString_model_reg_test_rmse@")
+ .add_property("max_ae", &ModelRegressor::max_ae, "@DocString_model_reg_max_ae@")
+ .add_property("test_max_ae", &ModelRegressor::test_max_ae, "@DocString_model_reg_test_max_ae@")
+ .add_property("mae", &ModelRegressor::mae, "@DocString_model_reg_mae@")
+ .add_property("test_mae", &ModelRegressor::test_mae, "@DocString_model_reg_test_mae@")
+ .add_property("mape", &ModelRegressor::mape, "@DocString_model_reg_mape@")
+ .add_property("test_mape", &ModelRegressor::test_mape, "@DocString_model_reg_test_mape@")
+ .add_property("percentile_25_ae", &ModelRegressor::percentile_25_ae, "@DocString_model_reg_percentile_25_ae@")
+ .add_property("percentile_25_test_ae", &ModelRegressor::percentile_25_test_ae, "@DocString_model_reg_test_percentile_25_test_ae@")
+ .add_property("percentile_50_ae", &ModelRegressor::percentile_50_ae, "@DocString_model_reg_percentile_50_ae@")
+ .add_property("percentile_50_test_ae", &ModelRegressor::percentile_50_test_ae, "@DocString_model_reg_test_percentile_50_test_ae@")
+ .add_property("percentile_75_ae", &ModelRegressor::percentile_75_ae, "@DocString_model_reg_percentile_75_ae@")
+ .add_property("percentile_75_test_ae", &ModelRegressor::percentile_75_test_ae, "@DocString_model_reg_test_percentile_75_test_ae@")
+ .add_property("percentile_95_ae", &ModelRegressor::percentile_95_ae, "@DocString_model_reg_percentile_95_ae@")
+ .add_property("percentile_95_test_ae", &ModelRegressor::percentile_95_test_ae, "@DocString_model_reg_test_percentile_95_ae@")
+ ;
+}
+
+void sisso::descriptor_identifier::registerModelClassifier()
+{
+ class_<ModelClassifier, bases<Model>>("ModelClassifier", init<Unit, std::vector<double>, std::vector<double>, std::vector<model_node_ptr>, std::vector<int>, std::vector<int>, bool>())
+ .def(init<std::string>())
+ .def(init<std::string, std::string>())
+ .def("__str__", &ModelClassifier::toString, "@DocString_model_class_str@")
+ .def("__repr__", &ModelClassifier::toString, "@DocString_model_class_str@")
+ .add_property("fit", &ModelClassifier::prop_train_est, "@DocString_model_class_prop_train_est@")
+ .add_property("predict", &ModelClassifier::prop_test_est, "@DocString_model_class_prop_test_est@")
+ .add_property("train_error", &ModelClassifier::train_error, "@DocString_model_class_train_error@")
+ .add_property("test_error", &ModelClassifier::test_error, "@DocString_model_class_test_error@")
+ .add_property("percent_error", &ModelClassifier::percent_train_error, "@DocString_model_class_precent_train_error@")
+ .add_property("percent_test_error", &ModelClassifier::percent_test_error, "@DocString_model_class_precent_test_error@")
+ ;
+}
+
void sisso::descriptor_identifier::registerSISSO_DI()
{
class_<sisso::descriptor_identifier::SISSO_DI_Wrap, boost::noncopyable>("SISSO_DI", no_init)
- .def("fit", &SISSO_DI::fit, "@DocString_sisso_reg_fit@")
.add_property("prop", &SISSO_DI::prop_py, "@DocString_sisso_reg_prop_py@")
.add_property("prop_test", &SISSO_DI::prop_test_py, "@DocString_sisso_reg_prop_test_py@")
- .add_property("models", &SISSO_DI::models_py, "@DocString_sisso_reg_models_py@")
.add_property("n_samp", &SISSO_DI::n_samp, "@DocString_sisso_reg_n_samp@")
.add_property("n_dim", &SISSO_DI::n_dim, "@DocString_sisso_reg_n_dim@")
.add_property("n_residual", &SISSO_DI::n_residual, "@DocString_sisso_reg_n_residual@")
@@ -397,6 +420,17 @@ void sisso::descriptor_identifier::registerSISSO_DI()
void sisso::descriptor_identifier::registerSISSORegressor()
{
class_<SISSORegressor, bases<SISSO_DI>>("SISSORegressor", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, optional<bool>>())
+ .def("fit", &SISSORegressor::fit, "@DocString_sisso_reg_fit@")
+ .def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, optional<bool>>())
+ .add_property("models", &SISSORegressor::models_py, "@DocString_sisso_reg_models_py@")
+ ;
+}
+
+void sisso::descriptor_identifier::registerSISSOClassifier()
+{
+ class_<SISSOClassifier, bases<SISSO_DI>>("SISSOClassifier", init<std::shared_ptr<FeatureSpace>, Unit, np::ndarray, np::ndarray, py::list, py::list, py::list, int, int, int, optional<bool>>())
+ .def("fit", &SISSOClassifier::fit, "@DocString_sisso_class_fit@")
.def(init<std::shared_ptr<FeatureSpace>, Unit, py::list, py::list, py::list, py::list, py::list, int, int, int, optional<bool>>())
+ .add_property("models", &SISSOClassifier::models_py, "@DocString_sisso_class_models_py@")
;
}
diff --git a/src/python/bindings_docstring_keyed.hpp b/src/python/bindings_docstring_keyed.hpp
index ecf45ded..05048f02 100644
--- a/src/python/bindings_docstring_keyed.hpp
+++ b/src/python/bindings_docstring_keyed.hpp
@@ -9,6 +9,7 @@
#include <descriptor_identifier/SISSO_DI/SISSO_DI.hpp>
#include <descriptor_identifier/SISSO_DI/SISSORegressor.hpp>
+#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp>
#include <feature_creation/feature_space/FeatureSpace.hpp>
#include <python/feature_creation/node_utils.hpp>
@@ -200,8 +201,13 @@ namespace sisso
{
struct SISSO_DI_Wrap : SISSO_DI, py::wrapper<SISSO_DI>
{
- inline void set_error(){this->get_override("set_error")();}
inline void l0_norm(){this->get_override("l0_norm")();}
+ inline void fit(){this->get_override("fit")();}
+ };
+
+ struct Model_Wrap : Model, py::wrapper<Model>
+ {
+ inline void to_file(std::string filename, bool train = true, std::vector<int> test_inds = {}){this->get_override("to_file")(filename, train, test_inds);}
};
/**
@@ -209,6 +215,16 @@ namespace sisso
*/
static void registerModel();
+ /**
+ * @brief Register the ModelRegressor object for conversion to a python object
+ */
+ static void registerModelRegressor();
+
+ /**
+ * @brief Register the ModelClassifier object for conversion to a python object
+ */
+ static void registerModelClassifier();
+
/**
* @brief Register the SISSORegressor object for conversion to a python object
*/
@@ -218,6 +234,12 @@ namespace sisso
* @brief Register the SISSORegressor object for conversion to a python object
*/
static void registerSISSORegressor();
+
+
+ /**
+ * @brief Register the SISSOClassifier object for conversion to a python object
+ */
+ static void registerSISSOClassifier();
}
}
diff --git a/src/python/descriptor_identifier/SISSOClassifier.cpp b/src/python/descriptor_identifier/SISSOClassifier.cpp
new file mode 100644
index 00000000..89332a00
--- /dev/null
+++ b/src/python/descriptor_identifier/SISSOClassifier.cpp
@@ -0,0 +1,123 @@
+#include <descriptor_identifier/SISSO_DI/SISSOClassifier.hpp>
+
+SISSOClassifier::SISSOClassifier(
+ std::shared_ptr<FeatureSpace> feat_space,
+ Unit prop_unit,
+ np::ndarray prop,
+ np::ndarray prop_test,
+ py::list task_sizes_train,
+ py::list task_sizes_test,
+ py::list leave_out_inds,
+ int n_dim,
+ int n_residual,
+ int n_models_store,
+ bool fix_intercept
+) :
+ SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept),
+ _training(_task_sizes_train.size()),
+ _int_prop(_prop.size()),
+ _int_prop_test(_prop_test.size()),
+ _int_error(_prop.size()),
+ _trainer(1, !fix_intercept),
+ _c(1.0)
+{
+ std::vector<double> prop_vec = python_conv_utils::from_ndarray<double>(prop);
+ std::vector<double> prop_test_vec = python_conv_utils::from_ndarray<double>(prop_test);
+
+ _trainer.stoppingCondition().maxSeconds = 0.01;
+
+ double min_prop = *std::min_element(prop_vec.begin(), prop_vec.end());
+ if(min_prop < 0.0)
+ {
+ std::transform(prop_vec.begin(), prop_vec.end(), prop_vec.begin(), [&min_prop](double pp){return pp - min_prop;});
+ std::transform(prop_test_vec.begin(), prop_test_vec.end(), prop_test_vec.begin(), [&min_prop](double pp){return pp - min_prop;});
+ }
+ std::vector<bool> not_int(prop_vec.size());
+ std::transform(prop_vec.begin(), prop_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (training set) must be an integer");
+
+ not_int.resize(prop_test_vec.size());
+ std::transform(prop_test_vec.begin(), prop_test_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (test set) must be an integer");
+
+ for(auto& pt : prop_test_vec)
+ if(std::none_of(prop_vec.begin(), prop_vec.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;}))
+ throw std::logic_error("A class in the property vector (test set) is not in the training set.");
+
+ std::transform(_prop.begin(), _prop.end(), _prop.begin(), [&min_prop](double pp){return pp - min_prop;});
+ std::transform(_prop_test.begin(), _prop_test.end(), _prop_test.begin(), [&min_prop](double pp){return pp - min_prop;});
+
+ for(int pp = 0; pp < _prop.size(); ++pp)
+ _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp]));
+
+ for(int pp = 0; pp < _prop_test.size(); ++pp)
+ _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp]));
+}
+
+SISSOClassifier::SISSOClassifier(
+ std::shared_ptr<FeatureSpace> feat_space,
+ Unit prop_unit,
+ py::list prop,
+ py::list prop_test,
+ py::list task_sizes_train,
+ py::list task_sizes_test,
+ py::list leave_out_inds,
+ int n_dim,
+ int n_residual,
+ int n_models_store,
+ bool fix_intercept
+) :
+ SISSO_DI(feat_space, prop_unit, prop, prop_test, task_sizes_train, task_sizes_test, leave_out_inds, n_dim, n_residual, n_models_store, fix_intercept),
+ _training(_task_sizes_train.size()),
+ _int_prop(_prop.size()),
+ _int_prop_test(_prop_test.size()),
+ _int_error(_prop.size()),
+ _trainer(1, !fix_intercept),
+ _c(1.0)
+{
+ std::vector<double> prop_vec = python_conv_utils::from_list<double>(prop);
+ std::vector<double> prop_test_vec = python_conv_utils::from_list<double>(prop_test);
+
+ _trainer.stoppingCondition().maxSeconds = 0.01;
+
+ double min_prop = *std::min_element(prop_vec.begin(), prop_vec.end());
+ if(min_prop < 0.0)
+ {
+ std::transform(prop_vec.begin(), prop_vec.end(), prop_vec.begin(), [&min_prop](double pp){return pp - min_prop;});
+ std::transform(prop_test_vec.begin(), prop_test_vec.end(), prop_test_vec.begin(), [&min_prop](double pp){return pp - min_prop;});
+ }
+ std::vector<bool> not_int(prop_vec.size());
+ std::transform(prop_vec.begin(), prop_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (training set) must be an integer");
+
+ not_int.resize(prop_test_vec.size());
+ std::transform(prop_test_vec.begin(), prop_test_vec.end(), not_int.begin(), [](double pp){return std::abs(pp - std::round(pp)) >= 1e-10;});
+ if(std::any_of(not_int.begin(), not_int.end(), [](bool b){return b;}))
+ throw std::logic_error("For classification the property (test set) must be an integer");
+
+ for(auto& pt : prop_test_vec)
+ if(std::none_of(prop_vec.begin(), prop_vec.end(), [&pt](double pp){return std::abs(pp - pt) < 1e-10;}))
+ throw std::logic_error("A class in the property vector (test set) is not in the training set.");
+
+ for(int pp = 0; pp < _prop.size(); ++pp)
+ _int_prop[pp] = static_cast<unsigned int>(std::round(_prop[pp] - min_prop));
+
+ for(int pp = 0; pp < _prop_test.size(); ++pp)
+ _int_prop_test[pp] = static_cast<unsigned int>(std::round(_prop_test[pp] - min_prop));
+}
+
+py::list SISSOClassifier::models_py()
+{
+ py::list model_list;
+ for(auto& mod_list : _models)
+ {
+ py::list lst;
+ for(auto& mod : mod_list)
+ lst.append<ModelClassifier>(mod);
+ model_list.append<py::list>(lst);
+ }
+ return model_list;
+}
diff --git a/src/python/descriptor_identifier/SISSORegressor.cpp b/src/python/descriptor_identifier/SISSORegressor.cpp
index 96a3bc45..d7bccdd3 100644
--- a/src/python/descriptor_identifier/SISSORegressor.cpp
+++ b/src/python/descriptor_identifier/SISSORegressor.cpp
@@ -54,3 +54,16 @@ SISSORegressor::SISSORegressor(
_lwork = get_opt_lwork(_n_dim + 1);
_work = std::vector<double>(_lwork, 0.0);
}
+
+py::list SISSORegressor::models_py()
+{
+ py::list model_list;
+ for(auto& mod_list : _models)
+ {
+ py::list lst;
+ for(auto& mod : mod_list)
+ lst.append<ModelRegressor>(mod);
+ model_list.append<py::list>(lst);
+ }
+ return model_list;
+}
diff --git a/src/python/descriptor_identifier/SISSO_DI.cpp b/src/python/descriptor_identifier/SISSO_DI.cpp
index b8d8bf6f..3a4456b9 100644
--- a/src/python/descriptor_identifier/SISSO_DI.cpp
+++ b/src/python/descriptor_identifier/SISSO_DI.cpp
@@ -61,16 +61,3 @@ SISSO_DI::SISSO_DI(
{
node_value_arrs::initialize_d_matrix_arr();
}
-
-py::list SISSO_DI::models_py()
-{
- py::list model_list;
- for(auto& mod_list : _models)
- {
- py::list lst;
- for(auto& mod : mod_list)
- lst.append<Model>(mod);
- model_list.append<py::list>(lst);
- }
- return model_list;
-}
diff --git a/tests/test_descriptor_identifier/test_model.py b/tests/test_descriptor_identifier/test_model.py
index bbd682c4..0f156dba 100644
--- a/tests/test_descriptor_identifier/test_model.py
+++ b/tests/test_descriptor_identifier/test_model.py
@@ -1,4 +1,4 @@
-from cpp_sisso import Model
+from cpp_sisso import ModelRegressor
from pathlib import Path
import numpy as np
@@ -12,7 +12,7 @@ parent = Path(__file__).parent
def test_model():
- model = Model(
+ model = ModelRegressor(
str(parent / "model_files/train.dat"), str(parent / "model_files/test.dat")
)
assert np.all(np.abs(model.fit - model.prop_train) < 1e-7)
--
GitLab